-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCleaning_DeReKo.Rmd
332 lines (287 loc) · 24.7 KB
/
Cleaning_DeReKo.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
---
title: "dereko_cleaning"
author: "Abu Bakr"
date: "2023-12-11"
output: html_document
---
## `Einlesen der Zeitungsartikel` aus PDF-Dateien, die nach der Suchanfrage im DeReKo durch Cosmas II generiert wurden
```{r}
library(tidyverse)
library(pdftools)
# ---- Helpers for the PDF exports ----

# Assign a running segment index to every row based on its font name.
#
# Rows whose font name contains "Bold" and rows whose font name does not are
# grouped into alternating runs. The index starts at 1 and increases by one
# every time the boldness flips; the state before the first row counts as
# non-bold, so odd indices are non-bold runs and even indices are bold runs.
#
# font_name: character vector of font names, one per row.
# Returns a numeric vector of the same length.
segment_index <- function(font_name) {
  is_bold <- str_detect(font_name, "Bold")
  1 + cumsum(is_bold != dplyr::lag(is_bold, default = FALSE))
}

# Read one PDF export, stack the per-page tables and tag each row with its
# segment index.
#
# path:      path to the PDF file.
# last_row:  last row (after stacking all pages) to keep.
# first_row: first row to keep; rows before it are discarded.
# Returns the sliced table with an additional numeric column `ind`.
read_cosmas_tokens <- function(path, last_row, first_row = 96) {
  pdf_data(path, font_info = TRUE) |>
    bind_rows() |>
    slice(first_row:last_row) |>
    mutate(ind = segment_index(font_name))
}

# Collapse the rows of every segment into one string and put each non-bold
# segment ("article") next to the bold segment that follows it ("source").
#
# tokens: table with the columns `text` and `ind`, as returned by
#         read_cosmas_tokens().
# Returns one row per pair with the columns `group`, `article` and `source`.
collapse_cosmas_articles <- function(tokens) {
  tokens |>
    group_by(ind) |>
    summarize(text = str_c(text, collapse = " ")) |>
    mutate(
      type = if_else(ind %% 2 != 0, "article", "source"),
      # Segments 1 and 2 form pair 1, segments 3 and 4 form pair 2, and so on.
      # Deriving the pair from `ind` avoids hard-coding the number of pairs
      # per file.
      group = as.integer((ind + 1) %/% 2)
    ) |>
    select(-ind) |>
    pivot_wider(id_cols = group, names_from = type, values_from = text)
}

## 1
data_w_font1 <- read_cosmas_tokens(
  "/Users/abuzuzu/Downloads/w1d.pdf",
  last_row = 538818
)
collapsed_tbl1 <- collapse_cosmas_articles(data_w_font1)

### 2 ###
data_w_font2 <- read_cosmas_tokens(
  "/Users/abuzuzu/Downloads/w2d.pdf",
  last_row = 190252
)
collapsed_tbl2 <- collapse_cosmas_articles(data_w_font2)

### 3 ###
data_w_font3 <- read_cosmas_tokens(
  "/Users/abuzuzu/Downloads/w3d.pdf",
  last_row = 141471
)
collapsed_tbl3 <- collapse_cosmas_articles(data_w_font3)

### 4 ###
data_w_font4 <- read_cosmas_tokens(
  "/Users/abuzuzu/Downloads/w4d.pdf",
  last_row = 137496
)
collapsed_tbl4 <- collapse_cosmas_articles(data_w_font4)

# Stack the four files; `group` is only unique within a file, so it is replaced
# by a running `id` (stored as character by rownames_to_column()).
tbl_complete <- bind_rows(
  collapsed_tbl1,
  collapsed_tbl2,
  collapsed_tbl3,
  collapsed_tbl4
) |>
  select(-group) |>
  rownames_to_column(var = "id")
```
## `Erstellung eines Korpus`, um die Kürzel auslesen zu können
```{r Korpus erstellen und zum Dataframe hinzufügen}
corpus_pattern <- "(AAN01|AAN02|AAN03|AAN04|AAN05|AAN06|AAN07|AAN08|AAN09|AAN10|AAN11|AAN12|AAN13|AAN14|AAN15|AAN16|AAN17|AAN18|AAN19|AAN20|AAN21|AAN22|AAN23|
ABM00|ABM01|ABM02|ABM03|ABM04|ABM05|ABM06|ABM07|ABM08|ABM09|ABM10|ABM11|ABM12|ABM13|ABM14|ABM15|ABM16|ABM17|ABM18|ABM19|ABM20|ABM21|ABM22|ABM23|
ABO00|ABO01|ABO02|ABO03|ABO04|ABO05|ABO06|ABO07|ABO08|ABO09|ABO10|ABO11|ABO12|ABO13|ABO14|ABO15|ABO16|ABO17|ABO18|ABO19|ABO20|ABO21|ABO22|ABO23|
AZM00|AZM01|AZM02|AZM03|AZM04|AZM05|AZM06|AZM07|AZM08|AZM09|AZM10|AZM11|AZM12|AZM13|AZM14|AZM15|AZM16|AZM17|AZM18|AZM19|AZM20|AZM21|AZM22|AZM23|
BDZ00|BDZ01|BDZ02|BDZ03|BDZ04|BDZ05|BDZ06|BDZ07|BDZ08|BDZ09|BDZ10|BDZ11|BDZ12|BDZ13|BDZ14|BDZ15|BDZ16|BDZ17|BDZ18|BDZ19|BDZ20|BDZ21|BDZ22|BDZ23|
ART00|ART01|ART02|ART03|ART04|ART05|ART06|ART07|ART08|ART09|ART10|ART11|ART12|ART13|ART14|ART15|ART16|ART17|ART18|ART19|ART20|ART21|ART22|ART23|
B60|B61|B62|B63|B64|B65|B66|B67|B68|B69|B70|B71|B72|B73|B74|B75|B76|B77|B78|B79|B80|B81|B82|B83|B84|B85|B86|B87|B88|B89|B90|B91|B92|B93|B94|B95|B96|B97|B98|B99|B00|B01|B02|B03|B04|B05|B06|B07|B08|B09|B10|B11|B12|B13|B14|B15|B16|B17|B18|B19|B20|B21|B22|B23|
|BEE00|BEE01|BEE02|BEE03|BEE04|BEE05|BEE06|BEE07|BEE08|BEE09|BEE10|BEE11|BEE12|BEE13|BEE14|BEE15|BEE16|BEE17|BEE18|BEE19|BEE20|BEE21|BEE22|BEE23|BIH|
|BKU00|BKU01|BKU02|BKU03|BKU04|BKU05|BKU06|BKU07|BKU08|BKU09|BKU10|BKU11|BKU12|BKU13|BKU14|BKU15|BKU16|BKU17|BKU18|BKU19|BKU20|BKU21|BKU22|BKU23|
|BNA00|BNA01|BNA02|BNA03|BNA04|BNA05|BNA06|BNA07|BNA08|BNA09|BNA10|BNA11|BNA12|BNA13|BNA14|BNA15|BNA16|BNA17|BNA18|BNA19|BNA20|BNA21|BNA22|BNA23|
|BOZ00|BOZ01|BOZ02|BOZ03|BOZ04|BOZ05|BOZ06|BOZ07|BOZ08|BOZ09|BOZ10|BOZ11|BOZ12|BOZ13|BOZ14|BOZ15|BOZ16|BOZ17|BOZ18|BOZ19|BOZ20|BOZ21|BOZ22|BOZ23|
|BRU00|BRU01|BRU02|BRU03|BRU04|BRU05|BRU06|BRU07|BRU08|BRU09|BRU10|BRU11|BRU12|BRU13|BRU14|BRU15|BRU16|BRU17|BRU18|BRU19|BRU20|BRU21|BRU22|BRU23|
|BRZ00|BRZ01|BRZ02|BRZ03|BRZ04|BRZ05|BRZ06|BRZ07|BRZ08|BRZ09|BRZ10|BRZ11|BRZ12|BRZ13|BRZ14|BRZ15|BRZ16|BRZ17|BRZ18|BRZ19|BRZ20|BRZ21|BRZ22|BRZ23|
|BUE00|BUE01|BUE02|BUE03|BUE04|BUE05|BUE06|BUE07|BUE08|BUE09|BUE10|BUE11|BUE12|BUE13|BUE14|BUE15|BUE16|BUE17|BUE18|BUE19|BUE20|BUE21|BUE22|BUE23|
|BZE00|BZE01|BZE02|BZE03|BZE04|BZE05|BZE06|BZE07|BZE08|BZE09|BZE10|BZE11|BZE12|BZE13|BZE14|BZE15|BZE16|BZE17|BZE18|BZE19|BZE20|BZE21|BZE22|BZE23|
|BZG00|BZG01|BZG02|BZG03|BZG04|BZG05|BZG06|BZG07|BZG08|BZG09|BZG10|BZG11|BZG12|BZG13|BZG14|BZG15|BZG16|BZG17|BZG18|BZG19|BZG20|BZG21|BZG22|BZG23|
|CAP00|CAP01|CAP02|CAP03|CAP04|CAP05|CAP06|CAP07|CAP08|CAP09|CAP10|CAP11|CAP12|CAP13|CAP14|CAP15|CAP16|CAP17|CAP18|CAP19|CAP20|CAP21|CAP22|CAP23|
|CTB00|CTB01|CTB02|CTB03|CTB04|CTB05|CTB06|CTB07|CTB08|CTB09|CTB10|CTB11|CTB12|CTB13|CTB14|CTB15|CTB16|CTB17|CTB18|CTB19|CTB20|CTB21|CTB22|CTB23|
|DAZ00|DAZ01|DAZ02|DAZ03|DAZ04|DAZ05|DAZ06|DAZ07|DAZ08|DAZ09|DAZ10|DAZ11|DAZ12|DAZ13|DAZ14|DAZ15|DAZ16|DAZ17|DAZ18|DAZ19|DAZ20|DAZ21|DAZ22|DAZ23|
|DEC99|DEC00|DEC01|DEC02|DEC03|DEC04|DEC05|DEC06|DEC07|DEC08|DEC09|DEC10|DEC11|DEC12|DEC13|DEC14|DEC15|DEC16|DEC17|DEC18|DEC19|DEC20|DEC21|DEC22|DEC23|
|DIV|
|DKI00|DKI01|DKI02|DKI03|DKI04|DKI05|DKI06|DKI07|DKI08|DKI09|DKI10|DKI11|DKI12|DKI13|DKI14|DKI15|DKI16|DKI17|DKI18|DKI19|DKI20|DKI21|DKI22|DKI23|
|DNN00|DNN01|DNN02|DNN03|DNN04|DNN05|DNN06|DNN07|DNN08|DNN09|DNN10|DNN11|DNN12|DNN13|DNN14|DNN15|DNN16|DNN17|DNN18|DNN19|DNN20|DNN21|DNN22|DNN23|
|DPA00|DPA01|DPA02|DPA03|DPA04|DPA05|DPA06|DPA07|DPA08|DPA09|DPA10|DPA11|DPA12|DPA13|DPA14|DPA15|DPA16|DPA17|DPA18|DPA19|DPA20|DPA21|DPA22|DPA23|
|DPR00|DPR01|DPR02|DPR03|DPR04|DPR05|DPR06|DPR07|DPR08|DPR09|DPR10|DPR11|DPR12|DPR13|DPR14|DPR15|DPR16|DPR17|DPR18|DPR19|DPR20|DPR21|DPR22|DPR23|
|EFT00|EFT01|EFT02|EFT03|EFT04|EFT05|EFT06|EFT07|EFT08|EFT09|EFT10|EFT11|EFT12|EFT13|EFT14|EFT15|EFT16|EFT17|EFT18|EFT19|EFT20|EFT21|EFT22|EFT23|
|ERK|
|F60|F61|F62|F63|F64|F65|F66|F67|F68|F69|F70|F71|F72|F73|F74|F75|F76|F77|F78|F79|F80|F81|F82|F83|F84|F85|F86|F87|F88|F89|F90|F91|F92|F93|F94|F95|F96|F97|F98|F99|F00|F01|F02|F03|F04|F05|F06|F07|F08|F09|F10|F11|F12|F13|F14|F15|F16|F17|F18|F19|F20|F21|F22|F23|
|FNP00|FNP01|FNP02|FNP03|FNP04|FNP05|FNP06|FNP07|FNP08|FNP09|FNP10|FNP11|FNP12|FNP13|FNP14|FNP15|FNP16|FNP17|FNP18|FNP19|FNP20|FNP21|FNP22|FNP23|
|FRT00|FRT01|FRT02|FRT03|FRT04|FRT05|FRT06|FRT07|FRT08|FRT09|FRT10|FRT11|FRT12|FRT13|FRT14|FRT15|FRT16|FRT17|FRT18|FRT19|FRT20|FRT21|FRT22|FRT23|
|FAZ00|FAZ01|FAZ02|FAZ03|FAZ04|FAZ05|FAZ06|FAZ07|FAZ08|FAZ09|FAZ10|FAZ11|FAZ12|FAZ13|FAZ14|FAZ15|FAZ16|FAZ17|FAZ18|FAZ19|FAZ20|FAZ21|FAZ22|FAZ23|
|FOC00|FOC01|FOC02|FOC03|FOC04|FOC05|FOC06|FOC07|FOC08|FOC09|FOC10|FOC11|FOC12|FOC13|FOC14|FOC15|FOC16|FOC17|FOC18|FOC19|FOC20|FOC21|FOC22|FOC23|
|FPC00|FPC01|FPC02|FPC03|FPC04|FPC05|FPC06|FPC07|FPC08|FPC09|FPC10|FPC11|FPC12|FPC13|FPC14|FPC15|FPC16|FPC17|FPC18|FPC19|FPC20|FPC21|FPC22|FPC23|
|GEO00|GEO01|GEO02|GEO03|GEO04|GEO05|GEO06|GEO07|GEO08|GEO09|GEO10|GEO11|GEO12|GEO13|GEO14|GEO15|GEO16|GEO17|GEO18|GEO19|GEO20|GEO21|GEO22|GEO23|
|GES00|GES01|GES02|GES03|GES04|GES05|GES06|GES07|GES08|GES09|GES10|GES11|GES12|GES13|GES14|GES15|GES16|GES17|GES18|GES19|GES20|GES21|GES22|GES23|
|GNG00|GNG01|GNG02|GNG03|GNG04|GNG05|GNG06|GNG07|GNG08|GNG09|GNG10|GNG11|GNG12|GNG13|GNG14|GNG15|GNG16|GNG17|GNG18|GNG19|GNG20|GNG21|GNG22|GNG23|
|GTA00|GTA01|GTA02|GTA03|GTA04|GTA05|GTA06|GTA07|GTA08|GTA09|GTA10|GTA11|GTA12|GTA13|GTA14|GTA15|GTA16|GTA17|GTA18|GTA19|GTA20|GTA21|GTA22|GTA23|
|GTB00|GTB01|GTB02|GTB03|GTB04|GTB05|GTB06|GTB07|GTB08|GTB09|GTB10|GTB11|GTB12|GTB13|GTB14|GTB15|GTB16|GTB17|GTB18|GTB19|GTB20|GTB21|GTB22|GTB23|
H40|H41|H42|H43|H44|H45|H46|H47|H48|H49|H50|H51|H52|H53|H54|H55|H56|H57|H58|H59|H60|H61|H62|H63|H64|H65|H66|H67|H68|H69|H70|H71|H72|H73|H74|H75|H76|H77|H78|H79|H80|H81|H82|H83|H84|H85|H86|H87|H88|H89|H90|H91|H92|H93|H94|H95|H96|H97|H98|H99|H00|H01|H02|H03|H04|H05|H06|H07|H08|H09|H10|H11|H12|H13|H14|H15|H16|H17|H18|H19|H20|H21|H22|H23|
|HAA99|HAA00|HAA01|HAA02|HAA03|HAA04|HAA05|HAA06|HAA07|HAA08|HAA09|HAA10|HAA11|HAA12|HAA13|HAA14|HAA15|HAA16|HAA17|HAA18|HAA19|HAA20|HAA21|HAA22|HAA23|
|HAB00|HAB01|HAB02|HAB03|HAB04|HAB05|HAB06|HAB07|HAB08|HAB09|HAB10|HAB11|HAB12|HAB13|HAB14|HAB15|HAB16|HAB17|HAB18|HAB19|HAB20|HAB21|HAB22|HAB23|
|HAZ00|HAZ01|HAZ02|HAZ03|HAZ04|HAZ05|HAZ06|HAZ07|HAZ08|HAZ09|HAZ10|HAZ11|HAZ12|HAZ13|HAZ14|HAZ15|HAZ16|HAZ17|HAZ18|HAZ19|HAZ20|HAZ21|HAZ22|HAZ23|
|HES|
|HFZ00|HFZ01|HFZ02|HFZ03|HFZ04|HFZ05|HFZ06|HFZ07|HFZ08|HFZ09|HFZ10|HFZ11|HFZ12|HFZ13|HFZ14|HFZ15|HFZ16|HFZ17|HFZ18|HFZ19|HFZ20|HFZ21|HFZ22|HFZ23|
|HKR00|HKR01|HKR02|HKR03|HKR04|HKR05|HKR06|HKR07|HKR08|HKR09|HKR10|HKR11|HKR12|HKR13|HKR14|HKR15|HKR16|HKR17|HKR18|HKR19|HKR20|HKR21|HKR22|HKR23|
|HMP00|HMP01|HMP02|HMP03|HMP04|HMP05|HMP06|HMP07|HMP08|HMP09|HMP10|HMP11|HMP12|HMP13|HMP14|HMP15|HMP16|HMP17|HMP18|HMP19|HMP20|HMP21|HMP22|HMP23|
|HNA00|HNA01|HNA02|HNA03|HNA04|HNA05|HNA06|HNA07|HNA08|HNA09|HNA10|HNA11|HNA12|HNA13|HNA14|HNA15|HNA16|HNA17|HNA18|HNA19|HNA20|HNA21|HNA22|HNA23|
|HST00|HST01|HST02|HST03|HST04|HST05|HST06|HST07|HST08|HST09|HST10|HST11|HST12|HST13|HST14|HST15|HST16|HST17|HST18|HST19|HST20|HST21|HST22|HST23|
|HTB00|HTB01|HTB02|HTB03|HTB04|HTB05|HTB06|HTB07|HTB08|HTB09|HTB10|HTB11|HTB12|HTB13|HTB14|HTB15|HTB16|HTB17|HTB18|HTB19|HTB20|HTB21|HTB22|HTB23|
|HZZ00|HZZ01|HZZ02|HZZ03|HZZ04|HZZ05|HZZ06|HZZ07|HZZ08|HZZ09|HZZ10|HZZ11|HZZ12|HZZ13|HZZ14|HZZ15|HZZ16|HZZ17|HZZ18|HZZ19|HZZ20|HZZ21|HZZ22|HZZ23|
|JUE00|JUE01|JUE02|JUE03|JUE04|JUE05|JUE06|JUE07|JUE08|JUE09|JUE10|JUE11|JUE12|JUE13|JUE14|JUE15|JUE16|JUE17|JUE18|JUE19|JUE20|JUE21|JUE22|JUE23|
|KAZ00|KAZ01|KAZ02|KAZ03|KAZ04|KAZ05|KAZ06|KAZ07|KAZ08|KAZ09|KAZ10|KAZ11|KAZ12|KAZ13|KAZ14|KAZ15|KAZ16|KAZ17|KAZ18|KAZ19|KAZ20|KAZ21|KAZ22|KAZ23|
|KN22|
|KSA00|KSA01|KSA02|KSA03|KSA04|KSA05|KSA06|KSA07|KSA08|KSA09|KSA10|KSA11|KSA12|KSA13|KSA14|KSA15|KSA16|KSA17|KSA18|KSA19|KSA20|KSA21|KSA22|KSA23|
|KXP00|KXP01|KXP02|KXP03|KXP04|KXP05|KXP06|KXP07|KXP08|KXP09|KXP10|KXP11|KXP12|KXP13|KXP14|KXP15|KXP16|KXP17|KXP18|KXP19|KXP20|KXP21|KXP22|KXP23|
|L40|L41|L42|L43|L44|L45|L46|L47|L48|L49|L50|L51|L52|L53|L54|L55|L56|L57|L58|L59|L60|L61|L62|L63|L64|L65|L66|L67|L68|L69|L70|L71|L72|L73|L74|L75|L76|L77|L78|L79|L80|L81|L82|L83|L84|L85|L86|L87|L88|L89|L90|L91|L92|L93|L94|L95|L96|L97|L98|L99|L00|L01|L02|L03|L04|L05|L06|L07|L08|L09|L10|L11|L12|L13|L14|L15|L16|L17|L18|L19|L20|L21|L22|L23|
|LAN00|LAN01|LAN02|LAN03|LAN04|LAN05|LAN06|LAN07|LAN08|LAN09|LAN10|LAN11|LAN12|LAN13|LAN14|LAN15|LAN16|LAN17|LAN18|LAN19|LAN20|LAN21|LAN22|LAN23|
|LAZ00|LAZ01|LAZ02|LAZ03|LAZ04|LAZ05|LAZ06|LAZ07|LAZ08|LAZ09|LAZ10|LAZ11|LAZ12|LAZ13|LAZ14|LAZ15|LAZ16|LAZ17|LAZ18|LAZ19|LAZ20|LAZ21|LAZ22|LAZ23|
|LMD00|LMD01|LMD02|LMD03|LMD04|LMD05|LMD06|LMD07|LMD08|LMD09|LMD10|LMD11|LMD12|LMD13|LMD14|LMD15|LMD16|LMD17|LMD18|LMD19|LMD20|LMD21|LMD22|LMD23|
|LN00|LN01|LN02|LN03|LN04|LN05|LN06|LN07|LN08|LN09|LN10|LN11|LN12|LN13|LN14|LN15|LN16|LN17|LN18|LN19|LN20|LN21|LN22|LN23|
|LRU00|LRU01|LRU02|LRU03|LRU04|LRU05|LRU06|LRU07|LRU08|LRU09|LRU10|LRU11|LRU12|LRU13|LRU14|LRU15|LRU16|LRU17|LRU18|LRU19|LRU20|LRU21|LRU22|LRU23|
|M60|M61|M62|M63|M64|M65|M66|M67|M68|M69|M70|M71|M72|M73|M74|M75|M76|M77|M78|M79|M80|M81|M82|M83|M84|M85|M86|M87|M88|M89|M90|M91|M92|M93|M94|M95|M96|M97|M98|M99|M00|M01|M02|M03|M04|M05|M06|M07|M08|M09|M10|M11|M12|M13|M14|M15|M16|M17|M18|M19|M20|M21|M22|M23|
|MAG00|MAG01|MAG02|MAG03|MAG04|MAG05|MAG06|MAG07|MAG08|MAG09|MAG10|MAG11|MAG12|MAG13|MAG14|MAG15|MAG16|MAG17|MAG18|MAG19|MAG20|MAG21|MAG22|MAG23|
|MK1|
|MME00|MME01|MME02|MME03|MME04|MME05|MME06|MME07|MME08|MME09|MME10|MME11|MME12|MME13|MME14|MME15|MME16|MME17|MME18|MME19|MME20|MME21|MME22|MME23|
|MPO95|MPO96|MPO97|MPO98|MPO99|MPO00|MPO01|MPO02|MPO03|MPO04|MPO05|MPO06|MPO07|MPO08|MPO09|MPO10|MPO11|MPO12|MPO13|MPO14|MPO15|MPO16|MPO17|MPO18|MPO19|MPO20|MPO21|MPO22|MPO23|
|MSP95|MSP96|MSP97|MSP98|MSP99|MSP00|MSP01|MSP02|MSP03|MSP04|MSP05|MSP06|MSP07|MSP08|MSP09|MSP10|MSP11|MSP12|MSP13|MSP14|MSP15|MSP16|MSP17|MSP18|MSP19|MSP20|MSP21|MSP22|MSP23|
|MTK00|MTK01|MTK02|MTK03|MTK04|MTK05|MTK06|MTK07|MTK08|MTK09|MTK10|MTK11|MTK12|MTK13|MTK14|MTK15|MTK16|MTK17|MTK18|MTK19|MTK20|MTK21|MTK22|MTK23|
|MUV00|MUV01|MUV02|MUV03|MUV04|MUV05|MUV06|MUV07|MUV08|MUV09|MUV10|MUV11|MUV12|MUV13|MUV14|MUV15|MUV16|MUV17|MUV18|MUV19|MUV20|MUV21|MUV22|MUV23|
|MZE00|MZE01|MZE02|MZE03|MZE04|MZE05|MZE06|MZE07|MZE08|MZE09|MZE10|MZE11|MZE12|MZE13|MZE14|MZE15|MZE16|MZE17|MZE18|MZE19|MZE20|MZE21|MZE22|MZE23|
|NBK00|NBK01|NBK02|NBK03|NBK04|NBK05|NBK06|NBK07|NBK08|NBK09|NBK10|NBK11|NBK12|NBK13|NBK14|NBK15|NBK16|NBK17|NBK18|NBK19|NBK20|NBK21|NBK22|NBK23|NGABI|NGTAG|
|NNN00|NNN01|NNN02|NNN03|NNN04|NNN05|NNN06|NNN07|NNN08|NNN09|NNN10|NNN11|NNN12|NNN13|NNN14|NNN15|NNN16|NNN17|NNN18|NNN19|NNN20|NNN21|NNN22|NNN23|
|NNP00|NNP01|NNP02|NNP03|NNP04|NNP05|NNP06|NNP07|NNP08|NNP09|NNP10|NNP11|NNP12|NNP13|NNP14|NNP15|NNP16|NNP17|NNP18|NNP19|NNP20|NNP21|NNP22|NNP23|
|NKU00|NKU01|NKU02|NKU03|NKU04|NKU05|NKU06|NKU07|NKU08|NKU09|NKU10|NKU11|NKU12|NKU13|NKU14|NKU15|NKU16|NKU17|NKU18|NKU19|NKU20|NKU21|NKU22|NKU23|
|NOZ00|NOZ01|NOZ02|NOZ03|NOZ04|NOZ05|NOZ06|NOZ07|NOZ08|NOZ09|NOZ10|NOZ11|NOZ12|NOZ13|NOZ14|NOZ15|NOZ16|NOZ17|NOZ18|NOZ19|NOZ20|NOZ21|NOZ22|NOZ23|
|NUN60|NUN61|NUN62|NUN63|NUN64|NUN65|NUN66|NUN67|NUN68|NUN69|NUN70|NUN71|NUN72|NUN73|NUN74|NUN75|NUN76|NUN77|NUN78|NUN79|NUN80|NUN81|NUN82|NUN83|NUN84|NUN85|NUN86|NUN87|NUN88|NUN89|NUN90|NUN91|NUN92|NUN93|NUN94|NUN95|NUN96|NUN97|NUN98|NUN99|
|NUN00|NUN01|NUN02|NUN03|NUN04|NUN05|NUN06|NUN07|NUN08|NUN09|NUN10|NUN11|NUN12|NUN13|NUN14|NUN15|NUN16|NUN17|NUN18|NUN19|NUN20|NUN21|NUN22|NUN23|
|NWE00|NWE01|NWE02|NWE03|NWE04|NWE05|NWE06|NWE07|NWE08|NWE09|NWE10|NWE11|NWE12|NWE13|NWE14|NWE15|NWE16|NWE17|NWE18|NWE19|NWE20|NWE21|NWE22|NWE23|
|OSZ00|OSZ01|OSZ02|OSZ03|OSZ04|OSZ05|OSZ06|OSZ07|OSZ08|OSZ09|OSZ10|OSZ11|OSZ12|OSZ13|OSZ14|OSZ15|OSZ16|OSZ17|OSZ18|OSZ19|OSZ20|OSZ21|OSZ22|OSZ23|
|PMM00|PMM01|PMM02|PMM03|PMM04|PMM05|PMM06|PMM07|PMM08|PMM09|PMM10|PMM11|PMM12|PMM13|PMM14|PMM15|PMM16|PMM17|PMM18|PMM19|PMM20|PMM21|PMM22|PMM23|
|PBT|PBB|PBW|PBE|PHB|PHE|PHH|PMV|PNW|PRP|PSN|PSH|PSL|PST|PTH|
|PNN00|PNN01|PNN02|PNN03|PNN04|PNN05|PNN06|PNN07|PNN08|PNN09|PNN10|PNN11|PNN12|PNN13|PNN14|PNN15|PNN16|PNN17|PNN18|PNN19|PNN20|PNN21|PNN22|PNN23|
|PNP00|PNP01|PNP02|PNP03|PNP04|PNP05|PNP06|PNP07|PNP08|PNP09|PNP10|PNP11|PNP12|PNP13|PNP14|PNP15|PNP16|PNP17|PNP18|PNP19|PNP20|PNP21|PNP22|PNP23|
|R60|R61|R62|R63|R64|R65|R66|R67|R68|R69|R70|R71|R72|R73|R74|R75|R76|R77|R78|R79|R80|R81|R82|R83|R84|R85|R86|R87|R88|R89|R90|R91|R92|R93|R94|R95|R96|R97|R98|R99|R00|R01|R02|R03|R04|R05|R06|R07|R08|R09|R10|R11|R12|R13|R14|R15|R16|R17|R18|R19|R20|R21|R22|R23
|REI|
|RGA00|RGA01|RGA02|RGA03|RGA04|RGA05|RGA06|RGA07|RGA08|RGA09|RGA10|RGA11|RGA12|RGA13|RGA14|RGA15|RGA16|RGA17|RGA18|RGA19|RGA20|RGA21|RGA22|RGA23|
|RHP00|RHP01|RHP02|RHP03|RHP04|RHP05|RHP06|RHP07|RHP08|RHP09|RHP10|RHP11|RHP12|RHP13|RHP14|RHP15|RHP16|RHP17|RHP18|RHP19|RHP20|RHP21|RHP22|RHP23|
|RN00|RN01|RN02|RN03|RN04|RN05|RN06|RN07|RN08|RN09|RN10|RN11|RN12|RN13|RN14|RN15|RN16|RN17|RN18|RN19|RN20|RN21|RN22|RN23|
|RHZ00|RHZ01|RHZ02|RHZ03|RHZ04|RHZ05|RHZ06|RHZ07|RHZ08|RHZ09|RHZ10|RHZ11|RHZ12|RHZ13|RHZ14|RHZ15|RHZ16|RHZ17|RHZ18|RHZ19|RHZ20|RHZ21|RHZ22|RHZ23|
|RLN00|RLN01|RLN02|RLN03|RLN04|RLN05|RLN06|RLN07|RLN08|RLN09|RLN10|RLN11|RLN12|RLN13|RLN14|RLN15|RLN16|RLN17|RLN18|RLN19|RLN20|RLN21|RLN22|RLN23|
|RPO00|RPO01|RPO02|RPO03|RPO04|RPO05|RPO06|RPO07|RPO08|RPO09|RPO10|RPO11|RPO12|RPO13|RPO14|RPO15|RPO16|RPO17|RPO18|RPO19|RPO20|RPO21|RPO22|RPO23|
|RUE00|RUE01|RUE02|RUE03|RUE04|RUE05|RUE06|RUE07|RUE08|RUE09|RUE10|RUE11|RUE12|RUE13|RUE14|RUE15|RUE16|RUE17|RUE18|RUE19|RUE20|RUE21|RUE22|RUE23|
|S40|S41|S42|S43|S44|S45|S46|S47|S48|S49|S50|S51|S52|S53|S54|S55|S56|S57|S58|S59|S60|S61|S62|S63|S64|S65|S66|S67|S68|S69|S70|S71|S72|S73|S74|S75|S76|S77|S78|S79|S80|S81|S82|S83|S84|S85|S86|S87|S88|S89|S90|S91|S92|S93|S94|S95|S96|S97|S98|S99|S00|S01|S02|S03|S04|S05|S06|S07|S08|S09|S10|S11|S12|S13|S14|S15|S16|S17|S18|S19|S20|S21|S22|S23|
|SOA00|SOA01|SOA02|SOA03|SOA04|SOA05|SOA06|SOA07|SOA08|SOA09|SOA10|SOA11|SOA12|SOA13|SOA14|SOA15|SOA16|SOA17|SOA18|SOA19|SOA20|SOA21|SOA22|SOA23|
|SOL00|SOL01|SOL02|SOL03|SOL04|SOL05|SOL06|SOL07|SOL08|SOL09|SOL10|SOL11|SOL12|SOL13|SOL14|SOL15|SOL16|SOL17|SOL18|SOL19|SOL20|SOL21|SOL22|SOL23|
|SPK|
|STB00|STB01|STB02|STB03|STB04|STB05|STB06|STB07|STB08|STB09|STB10|STB11|STB12|STB13|STB14|STB15|STB16|STB17|STB18|STB19|STB20|STB21|STB22|STB23|
|STE96|STE97|STE98|STE99|STE00|STE01|STE02|STE03|STE04|STE05|STE06|STE07|STE08|STE09|STE10|STE11|STE12|STE13|STE14|STE15|STE16|STE17|STE18|STE19|STE20|STE21|STE22|STE23|
|STN00|STN01|STN02|STN03|STN04|STN05|STN06|STN07|STN08|STN09|STN10|STN11|STN12|STN13|STN14|STN15|STN16|STN17|STN18|STN19|STN20|STN21|STN22|STN23|
|SWP00|SWP01|SWP02|SWP03|SWP04|SWP05|SWP06|SWP07|SWP08|SWP09|SWP10|SWP11|SWP12|SWP13|SWP14|SWP15|SWP16|SWP17|SWP18|SWP19|SWP20|SWP21|SWP22|SWP23|
|SZE95|SZE96|SZE97|SZE98|SZE99|SZE00|SZE01|SZE02|SZE03|SZE04|SZE05|SZE06|SZE07|SZE08|SZE09|SZE10|SZE11|SZE12|SZE13|SZE14|SZE15|SZE16|SZE17|SZE18|SZE19|SZE20|SZE21|SZE22|SZE23|
|T60|T61|T62|T63|T64|T65|T66|T67|T68|T69|T70|T71|T72|T73|T74|T75|T76|T77|T78|T79|T80|T81|T82|T83|T84|T85|T86|T87|T88|T89|T90|T91|T92|T93|T94|T95|T96|T97|T98|T99|T00|T01|T02|T03|T04|T05|T06|T07|T08|T09|T10|T11|T12|T13|T14|T15|T16|T17|T18|T19|T20|T21|T22|T23|
|THA00|THA01|THA02|THA03|THA04|THA05|THA06|THA07|THA08|THA09|THA10|THA11|THA12|THA13|THA14|THA15|THA16|THA17|THA18|THA19|THA20|THA21|THA22|THA23|
|TSP00|TSP01|TSP02|TSP03|TSP04|TSP05|TSP06|TSP07|TSP08|TSP09|TSP10|TSP11|TSP12|TSP13|TSP14|TSP15|TSP16|TSP17|TSP18|TSP19|TSP20|TSP21|TSP22|TSP23|
|U60|U61|U62|U63|U64|U65|U66|U67|U68|U69|U70|U71|U72|U73|U74|U75|U76|U77|U78|U79|U80|U81|U82|U83|U84|U85|U86|U87|U88|U89|U90|U91|U92|U93|U94|U95|U96|U97|U98|U99|U00|U01|U02|U03|U04|U05|U06|U07|U08|U09|U10|U11|U12|U13|U14|U15|U16|U17|U18|U19|U20|U21|U22|U23|
|UAN00|UAN01|UAN02|UAN03|UAN04|UAN05|UAN06|UAN07|UAN08|UAN09|UAN10|UAN11|UAN12|UAN13|UAN14|UAN15|UAN16|UAN17|UAN18|UAN19|UAN20|UAN21|UAN22|UAN23|
|VBW00|VBW01|VBW02|VBW03|VBW04|VBW05|VBW06|VBW07|VBW08|VBW09|VBW10|VBW11|VBW12|VBW13|VBW14|VBW15|VBW16|VBW17|VBW18|VBW19|VBW20|VBW21|VBW22|VBW23|
|VSW00|VSW01|VSW02|VSW03|VSW04|VSW05|VSW06|VSW07|VSW08|VSW09|VSW10|VSW11|VSW12|VSW13|VSW14|VSW15|VSW16|VSW17|VSW18|VSW19|VSW20|VSW21|VSW22|VSW23|
|WAS00|WAS01|WAS02|WAS03|WAS04|WAS05|WAS06|WAS07|WAS08|WAS09|WAS10|WAS11|WAS12|WAS13|WAS14|WAS15|WAS16|WAS17|WAS18|WAS19|WAS20|WAS21|WAS22|WAS23|
|WDD11|WPD11|
|WEO00|WEO01|WEO02|WEO03|WEO04|WEO05|WEO06|WEO07|WEO08|WEO09|WEO10|WEO11|WEO12|WEO13|WEO14|WEO15|WEO16|WEO17|WEO18|WEO19|WEO20|WEO21|WEO22|WEO23|
|W40|W41|W42|W43|W44|W45|W46|W47|W48|W49|W50|W51|W52|W53|W54|W55|W56|W57|W58|W59|W60|W61|W62|W63|W64|W65|W66|W67|W68|W69|W70|W71|W72|W73|W74|W75|W76|W77|W78|W79|W80|W81|W82|W83|W84|W85|W86|W87|W88|W89|W90|W91|W92|W93|W94|W95|W96|W97|W98|W99|W00|W01|W02|W03|W04|W05|W06|W07|W08|W09|W10|W11|W12|W13|W14|W15|W16|W17|W18|W19|W20|W21|W22|W23|
|WAM|WKD|
|ZCA00|ZCA01|ZCA02|ZCA03|ZCA04|ZCA05|ZCA06|ZCA07|ZCA08|ZCA09|ZCA10|ZCA11|ZCA12|ZCA13|ZCA14|ZCA15|ZCA16|ZCA17|ZCA18|ZCA19|ZCA20|ZCA21|ZCA22|ZCA23|
|ZGE00|ZGE01|ZGE02|ZGE03|ZGE04|ZGE05|ZGE06|ZGE07|ZGE08|ZGE09|ZGE10|ZGE11|ZGE12|ZGE13|ZGE14|ZGE15|ZGE16|ZGE17|ZGE18|ZGE19|ZGE20|ZGE21|ZGE22|ZGE23|
|Z40|Z41|Z42|Z43|Z44|Z45|Z46|Z47|Z48|Z49|Z50|Z51|Z52|Z53|Z54|Z55|Z56|Z57|Z58|Z59|Z60|Z61|Z62|Z63|Z64|Z65|Z66|Z67|Z68|Z69|Z70|Z71|Z72|Z73|Z74|Z75|Z76|Z77|Z78|Z79|Z80|Z81|Z82|Z83|Z84|Z85|Z86|Z87|Z88|Z89|Z90|Z91|Z92|Z93|Z94|Z95|Z96|Z97|Z98|Z99|Z00|Z01|Z02|Z03|Z04|Z05|Z06|Z07|Z08|Z09|Z10|Z11|Z12|Z13|Z14|Z15|Z16|Z17|Z18|Z19|Z20|Z21|Z22|Z23)"
# The pattern literal is written across many source lines, so the string
# contains raw line breaks. Inside a regular expression these are literal
# characters: a sigle that follows a line break without a leading "|" (for
# example ABM00 or H40) would only match directly after a newline, R23 would
# only match directly before one, and a bare newline would itself be a valid
# alternative. Strip all whitespace and collapse the empty alternatives ("||")
# that this leaves behind, so that every sigle matches on its own.
corpus_pattern <- corpus_pattern |>
  str_remove_all("\\s") |>
  str_replace_all("\\|{2,}", "|")
# First sigle found in each source line (NA when none is found).
tbl_complete$corpus <- str_extract(tbl_complete$source, corpus_pattern)
```
## Aufteilung der Informationen aus dem Meta-Text in `Quelle und Veröffentlichungsdatum`
```{r Aufspalten der Spalte Korpus in character und Zahl}
library(tidyr)
# Split the sigle at the letter/digit boundary into the outlet code and a
# two-digit year, expand the year to four digits (0-23 are read as 2000-2023,
# everything else as 19xx) and drop the helper column `corpus` afterwards.
tbl_complete <- tbl_complete |>
  separate(
    corpus,
    into = c("outlet", "year"),
    sep = "(?<=[A-Za-z])(?=[0-9])",
    remove = FALSE
  ) |>
  mutate(
    year = as.numeric(year),
    year = ifelse(year <= 23 & year >= 0, year + 2000, year + 1900)
  ) |>
  select(-corpus)
```
```{r Tabelle für die NAs im Korpus erstellen}
# Row positions without an outlet, and the corresponding rows for inspection.
missing_indices <- tbl_complete$outlet |>
  is.na() |>
  which()
missing_values <- tbl_complete |>
  slice(missing_indices)
```
## `Aufteilung der Zeitungsartikel` in "Überregional", "Regional" und "Sonstige" und anschließendes Entfernen aller Artikel der Kategorie "Sonstige"
```{r Einteilung in Regional, Überregional und Sonstiges}
library(dplyr)
# Outlet codes counted as regional.
# NOTE(review): "MK" is listed here and in miscellaneous_pattern, so such rows
# get both flags - confirm which list it belongs to.
regional_pattern <- c(
  "B", "RHP", "RHZ", "M", "NUZ", "NUN", "L", "HAZ", "BRZ", "NKU", "MK", "HMP",
  "BDZ", "BRU", "CTB", "DEC", "DKI", "DNN", "DPR", "FNP", "FRT", "GAZ", "HST",
  "HTB", "HZZ", "KAZ", "LAN", "LAZ", "LRU", "AAN", "ABO", "AZM", "OSZ", "STB",
  "MSP", "BKU", "GTB", "HFZ", "DAZ", "KSA", "EFT", "GTA", "BZE", "NOZ", "ABM",
  "MAG", "MTK", "MUV", "KN", "NNN", "NNP", "NWE", "PNN", "PNP", "RPO", "MZE",
  "MPO", "KXP", "MME", "NBK", "BZG", "BUE", "BNA", "HNA", "LN", "RUE", "RN",
  "SOA", "STN", "SWP", "SZE", "UAN", "THA", "HAA", "FPC", "RGA", "HKR", "RLN",
  "VBW"
)

# Outlet codes counted as supra-regional.
# NOTE(review): "FAZ" is part of corpus_pattern but appears in none of the
# three lists, so those rows end up with all flags set to 0 - confirm.
nonregional_pattern <- c(
  "F", "U", "R", "T", "Z", "S", "SOL", "LMD", "HAB", "WEO", "W", "WAS", "JUE",
  "BOZ", "TSP", "STE", "FOC"
)

# Outlet codes counted as miscellaneous (removed in a later step).
miscellaneous_pattern <- c(
  "WPD", "BIH", "PBT", "PNW", "PBE", "PMV", "PSN", "PTH", "PSL", "PSH", "PRP",
  "PHB", "PBW", "PBB", "PHE", "PHH", "PST", "NGTAG", "H85", "ERK", "PBY",
  "DIV", "WKD", "WAM", "SPK", "HES", "H", "HES", "WAM", "WDD", "WPD", "REI",
  "MK", "NGABI", "NGTAG", "DPA", "ZCW", "ZCA", "GNG", "GEO", "PMM", "VSW",
  "CAP", "ART", "BEE", "ZGE", "GES"
)

# Add one 0/1 indicator column per category.
tbl_regio <- tbl_complete |>
  mutate(
    regional = as.numeric(outlet %in% regional_pattern),
    ueberregional = as.numeric(outlet %in% nonregional_pattern),
    sonstiges = as.numeric(outlet %in% miscellaneous_pattern)
  )
```
```{r Sonstige entfernen + Zeitraum eingrenzen}
library(dplyr)
# Keep only rows that are not flagged as miscellaneous and whose year is 1998
# or later; rows with a missing year are dropped as well.
tbl_cleaned <- tbl_regio |>
  filter(sonstiges != 1, year >= 1998)
```
## `Einfügen des Veröffentlichungsdatums` auf den Tag genau
```{r Genaue Datumsangaben einfügen}
library(stringr)
# A date written as dd.mm.yyyy.
date_pattern <- "\\b\\d{2}\\.\\d{2}\\.\\d{4}\\b"

# Take the first date found in the source line and cut it into year, month and
# day (all kept as character strings; NA when no date is found). `year`
# replaces the value derived from the sigle. Afterwards keep only the columns
# needed downstream and renumber the rows.
tbl_cleaned <- tbl_cleaned |>
  mutate(
    date_string = str_extract(source, date_pattern),
    year = str_sub(date_string, start = -4, end = -1),
    month = str_sub(date_string, start = -7, end = -6),
    day = str_sub(date_string, start = 1, end = 2)
  ) |>
  select(text = article, year, month, day, outlet, regional) |>
  rownames_to_column("id")
```
## `Bündeln der Artikel`, wenn sie im selben Monat von derselben Zeitung veröffentlicht wurden, um künstlich längere Artikel zu generieren
```{r nach Monaten bundlen}
# Concatenate all article texts that share outlet, year, month and regional
# flag into one long text, separated by single spaces.
monthly <- summarize(
  group_by(tbl_cleaned, outlet, year, month, regional),
  text = paste(text, collapse = " "),
  .groups = "drop"
)
# Running id for the bundled texts.
monthly <- rownames_to_column(monthly, "id")
```
## Umwandlung ins `tidy-text Format`
```{r df in tidytext-Format bringen}
library(tidytext)
library(tidyverse)
library(SnowballC)
# One row per token: replace commas and full stops with spaces, tokenise, drop
# tokens containing digits, remove German stop words from three lists, stem
# the remaining tokens and finally discard a few leftover artefact tokens.
tbl_tidy <- monthly |>
  mutate(text = str_replace_all(text, "[,.]", " ")) |>
  unnest_tokens(output = token, input = text) |>
  filter(!str_detect(token, "[:digit:]")) |>
  anti_join(
    # Stack the three stop-word lists so a single anti-join removes them all.
    bind_rows(
      get_stopwords(language = "de"),
      get_stopwords(language = "de", source = "stopwords-iso"),
      get_stopwords(language = "de", source = "marimo")
    ),
    by = c("token" = "word")
  ) |>
  mutate(token = wordStem(token, language = "de")) |>
  filter(
    !(token %in% c("quot", "genios:styl", "dpa", "xyxhtmlyxy", "xyxhtmeyxy"))
  )
```