-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScript_search_COCA_ekb.R
293 lines (248 loc) · 14.3 KB
/
Script_search_COCA_ekb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# R script to search for a string or several strings in the Corpus of Contemporary American English (corpus.byu.edu/coca) and save the results to a .csv file
# (c) 2014 Earl K. Brown www-personal.ksu.edu/~ekbrown
# Thanks to Jennifer Hinnell for feedback and suggestions on this script
# This script can be freely used and modified for non-profit purposes
#####
# NOTE:
# This script requires three packages not included in base R. If you don't already have the following packages on your computer, uncomment the next line (remove the hash tag) and run that one line, and then comment it again (put the hash tag back)
# install.packages("stringr"); install.packages("RCurl"); install.packages("XML")
#####
# FOLLOW THESE STEPS TO SEARCH THE CORPUS
# All of the values set in STEPS 1-8 are read by the script below the "DON'T CHANGE ANYTHING" marker.
# STEP 1:
# Specify your credentials: the email address and password you use to log in to the Corpus. If you're not sure which email address you have registered, simply go to the Corpus in a web browser and try logging in to figure it out.
# NOTE: The password is stored in this file as plain text, so remove it before sharing the script with anyone.
login.email <- "your.name@your.institution.edu"
login.password <- "your.password"
# STEP 2:
# Specify the search string or strings. See the Corpus in a web browser for help with search syntax, including part-of-speech tags and wildcards, etc. You can specify multiple search strings by putting them into a vector, like this: c("searchstring1", "searchstring2", "searchstring3")
# NOTE: The Corpus doesn't handle apostrophes in the way one might expect. For example, you can't directly search for "I don't know"; you have to transform it to "I do n't know". If you have apostrophes in a search string, take a minute to search for it manually to see how you will need to transform it before running this script.
search.strings <- c(". You know ", ". I mean ")
# STEP 3:
# Specify the section you want to search in. To search in the entire Corpus use the option "all". If you'd like to use a specific genre of the Corpus, specify one of the following: "spok" for the spoken section, "fic" for fiction, "mag" for magazine, "news" for newspaper, "acad" for academic.
search.section <- "spok"
# STEP 4:
# Specify the maximum number of unique types to return for each search string (results shown in the upper right portion of the Corpus). For example, searching for nouns with the search string "[n*]" could potentially return tens of thousands of unique types, but you may only be interested in the 100 most frequent ones. For single-word searches that don't have any part-of-speech tags or wildcards, for example "book", this number simply needs to be at least 1.
max.types <- 100
# STEP 5:
# Specify the maximum number of results to return for each search string before moving on to the next one, if you have specified more than one search string above in "search.strings". If you have specified only one search string, this number has the same effect as the number in the next step ("max.results.total").
max.results.per.search.string <- 1000
# STEP 6:
# Specify the maximum number of total results to return, across all search strings. This number should be equal to or smaller than your keyword-in-context view limit.
max.results.total <- 10000
# STEP 7:
# Specify whether you want to retrieve the links to the expanded view for each hit, with one of two options: "yes" or "no" (upper or lower case). Once the script has finished running, in order to view the expanded context for a given hit you will need to go to the Corpus in a web browser and log in, and then copy the corresponding link from the .csv file, paste it into the address bar and press <Enter> on your keyboard.
expanded.view <- "yes"
# STEP 8:
# Specify the name of a .csv file that the results will be saved to, for example, "search_results.csv". This will overwrite any previous file with this name in your working directory. The columns in the file are separated by tabs.
output.file <- "search_results.csv"
# STEP 9:
# Now, save this script and then run the whole thing, and watch the magic happen!
##################
##################
# DON'T CHANGE ANYTHING BELOW HERE, UNLESS YOU KNOW WHAT YOU'RE DOING
# begin script
# records when the run started; Sys.time() is used instead of parsing the
# English-language output of date(), because strptime() reads "%a"/"%b" with
# the current locale's day and month names and can yield NA in other locales
start.time <- Sys.time()
# loads necessary packages and defines functions
library("stringr")
library("RCurl")
library("XML")
# defines some helper functions
# Pulls the keyword-in-context (KWIC) hits out of one results page.
#
# Args:
#   webpage:  character string with the HTML of a KWIC results page.
#   expanded: if TRUE, the link to the expanded view of each hit is appended.
#
# Returns:
#   A character vector with one tab-delimited element per hit: year, genre,
#   text source and the hit in context, followed by the expanded-view URL when
#   `expanded` is TRUE.
#
# Stops with an error if the extracted columns (or links) differ in length.
get.kwic.results <- function(webpage, expanded = FALSE) {
  # discards anything that precedes the opening <html> tag
  webpage <- substr(webpage, regexpr("<html>", webpage), nchar(webpage))
  # relabels the declared charset as UTF-8 before the page is parsed
  webpage <- sub("(charset=)windows-1252", "\\1UTF-8", webpage, ignore.case = TRUE)
  # the bold-underline markup around matched words is replaced by tabs
  webpage <- gsub("<b><u>", "\t", webpage)
  webpage <- gsub("</u></b> ", "\t", webpage)
  webpage <- gsub("</u></b>", "\t", webpage)
  # back-to-back tabs (adjacent highlighted words) become a single space,
  # presumably so that a multi-word match stays in one column
  webpage <- gsub("\t\t", " ", webpage)

  # the hits are read from the second table on the page
  res.table <- readHTMLTable(webpage, which = 2)
  year <- res.table[, 2]
  genre <- res.table[, 3]
  text.source <- gsub("[\r\n\t]", "", res.table[, 4])
  hits.in.context <- res.table[, 8]

  # sanity checks: every hit needs exactly one value in each column
  if (length(year) != length(genre)) {
    stop("For some reason, the number of years and the number of genre listings differ, but they shouldn't.")
  }
  if (length(genre) != length(text.source)) {
    stop("For some reason, the number of genre listings and the number of text names differ, but they shouldn't.")
  }
  if (length(text.source) != length(hits.in.context)) {
    stop("For some reason, the number of text names and the number of keyword-in-context results differ, but they shouldn't.")
  }

  if (!expanded) {
    return(paste(year, genre, text.source, hits.in.context, sep = "\t"))
  }

  # expanded-view links are the distinct hrefs that start with "x4.asp"
  exp.links <- getHTMLLinks(webpage)
  exp.links <- unique(grep("^x4\\.asp", exp.links, value = TRUE))
  if (length(hits.in.context) != length(exp.links)) {
    stop("For some reason, the number of links for the expanded view and the keyword-in-context results differ, but they shouldn't.")
  }
  exp.links <- paste0("http://corpus.byu.edu/coca/", exp.links)
  paste(year, genre, text.source, hits.in.context, exp.links, sep = "\t")
} # end function
# Encodes a search string or link for use in a Corpus URL.
#
# Runs of spaces become "+", and the accented Spanish letters, square
# brackets, apostrophes and commas listed below become the given
# percent-escapes. Every other character is left as it is.
#
# Args:
#   url: character vector to encode.
#
# Returns:
#   The encoded character vector.
urlEncodeCdE <- function(url) {
  # patterns[k] is replaced by replacements[k], in this order
  patterns <- c(
    " {1,}",
    "á", "é", "í", "ó", "ú", "ü", "ñ",
    "Á", "É", "Í", "Ó", "Ú", "Ü", "Ñ",
    "\\[", "\\]", "'", ","
  )
  replacements <- c(
    "+",
    "%E1", "%E9", "%ED", "%F3", "%FA", "%FC", "%F1",
    "%C1", "%C9", "%CD", "%D3", "%DA", "%DC", "%D1",
    "%5B", "%5D", "%27", "%2C"
  )
  for (k in seq_along(patterns)) {
    url <- gsub(patterns[k], replacements[k], url)
  }
  return(url)
} # end function
# Percent-encodes the characters of a string that are not safe in a URL.
#
# Args:
#   url:      character string to encode (only the first element is used).
#   reserved: if TRUE, the reserved characters ; / ? : @ = & are encoded too;
#             if FALSE (the default) they are left as they are.
#
# Returns:
#   A single character string in which every unsafe character is replaced by
#   "%" followed by the lower-case hexadecimal value of its code point.
shortURLencode <- function(url, reserved = FALSE) {
  # negated character class: anything matching it must be escaped
  OK <- paste0("[^-ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz0123456789$_.+!*'(),", if (!reserved) ";/?:@=&", "]")
  x <- strsplit(url, "")[[1L]]
  z <- grep(OK, x)
  if (length(z) > 0) {
    # "%02x" zero-pads to two hex digits, so code points below 16 (e.g. tab)
    # give a well-formed escape such as "%09" rather than "%9"
    x[z] <- vapply(
      x[z],
      function(ch) sprintf("%%%02x", utf8ToInt(ch)),
      character(1),
      USE.NAMES = FALSE
    )
  }
  paste0(x, collapse = "")
} # end function
# turns the user's "yes"/"no" choice (in any letter case) into a logical flag
expanded.view <- tolower(expanded.view)
if (expanded.view != "yes" && expanded.view != "no") {
  stop("You must specify either 'yes' or 'no' for the 'expanded.view' variable\n")
}
expanded.view <- expanded.view == "yes"
# (re)creates the output file with its tab-delimited header row; the EXPANDED
# column is only present when links to the expanded view were requested
cat(
  if (expanded.view) {
    "searchTERM\tYEAR\tSECTION\tSOURCE\tPRE\tMATCH\tFOL\tEXPANDED\n"
  } else {
    "searchTERM\tYEAR\tSECTION\tSOURCE\tPRE\tMATCH\tFOL\n"
  },
  file = output.file
)
# deletes any cookies from previous sessions in the Corpus
unlink("cookies.txt")
# creates the curl handle shared by every request below; cookies are read from
# and written to "cookies.txt" so that the login persists across requests
ch <- getCurlHandle(header = FALSE, cookiefile = "cookies.txt", cookiejar = "cookies.txt", followlocation = TRUE, autoreferer = TRUE, useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0")
# gets a cookie at the splash page
splash <- getURL("http://corpus.byu.edu/coca", curl = ch)
# logs in to the Corpus
# the credentials travel as query-string values, so reserved characters such
# as "&", "=" and "?" must be escaped too (reserved = TRUE); otherwise a
# password containing one of them would be split into separate parameters
url <- sprintf("http://corpus.byu.edu/coca/login.asp?email=%s&password=%s&e=", shortURLencode(login.email, reserved = TRUE), shortURLencode(login.password, reserved = TRUE))
login.page <- getURL(url = url, curl = ch)
# gets the numeric code the Corpus uses for the requested section
search.section <- tolower(search.section)
if (!search.section %in% c("all", "spok", "fic", "mag", "news", "acad")) {
  # the message names the variable the user actually sets in STEP 3
  stop("You must specify search.section as one of: 'all', 'spok', 'fic', 'mag', 'news', 'acad'")
}
codes.df <- data.frame(genre = c("all", "spok", "fic", "mag", "news", "acad"), code = c(0, 1, 2, 3, 4, 5))
# exact lookup of the section name (no regular-expression matching needed)
cur.code <- codes.df$code[match(search.section, codes.df$genre)]
# Main search loop: for every search string, loads the frequency page, follows
# the link of each matching type, and appends the keyword-in-context rows of
# every results page to the output file, until the per-search-string limit or
# the overall limit is reached.

# Appends at most `n.allowed` elements of `results` to the output file and
# returns the number of rows actually written.
write.capped.results <- function(results, n.allowed) {
  n.keep <- max(0, min(length(results), n.allowed))
  if (n.keep > 0) {
    cat(results[seq_len(n.keep)], file = output.file, sep = "\n", append = TRUE)
  }
  n.keep
}

# counter for running number of results (rows written so far, all strings)
all.counter <- 0
for (h in seq_along(search.strings)) {
  search.string <- search.strings[h]
  cat("Working on search string ", h, " of ", length(search.strings), ": ", search.string, "\n", sep = "")

  # gets the frequency page that lists the matching types
  url <- paste0("http://corpus.byu.edu/coca/x2.asp?chooser=seq&p=", urlEncodeCdE(search.string), "&w2=&wl=4&wr=4&r1=&r2=&ipos1=-select-&B7=SEARCH&sec1=", cur.code, "&sec2=0&sortBy=freq&sortByDo2=freq&minfreq1=freq&freq1=10&freq2=10&numhits=", max.types, "&kh=100&groupBy=words&whatshow=raw&saveList=no&changed=&corpus=coca&word=&sbs=&sbs1=&sbsreg1=&sbsr=&sbsgroup=&redidID=&ownsearch=y&compared=&holder=&whatdo=seq&waited=y&rand1=y&whatdo1=1&didRandom=n&minFreq=freq&s1=0&s2=0&s3=0&perc=mi")
  freq.page <- getURL(url = url, curl = ch)
  no.results <- length(grep("(Sorry, there are no matching records.|NO MATCHES FOR THE FOLLOWING \\'SLOTS\\')", freq.page, ignore.case = TRUE)) > 0
  if (no.results) {
    stop(paste0("There appears to be no results for '", search.string, "'"))
  }

  # each matching type is a link to its keyword-in-context page
  links <- str_match_all(freq.page, "href=\"(x3.asp\\?[^\"]+)\">([ \\.,'A-ZÁÉÍÓÚÜÑ]+)</a>")
  links.themselves <- links[[1]][, 2]
  words <- str_trim(links[[1]][, 3])
  if (length(links.themselves) != length(words)) {
    stop("The number of links and the words that those links are connected are different, and they shouldn't be")
  }

  # counter for number of results for current search string
  cur.counter <- 0
  # seq_along() skips the loop altogether when no links were found
  for (i in seq_along(words)) {
    cat("\tHit ", i, " of ", length(words), " for search term \"", search.string, "\": ", tolower(words[i]), "\n", sep = "")
    kwic.page <- getURL(paste0("http://corpus.byu.edu/coca/", urlEncodeCdE(links.themselves[i])), curl = ch)

    # figures out if there is more than one page of results
    more.than.one.page <- length(grep("<span ID=\"w_page\">PAGE</span>:</font></td>", kwic.page)) > 0
    if (more.than.one.page) {
      # the page counter reads " 1 / N " on the first page
      num.pages <- as.integer(str_match(kwic.page, " 1 / (\\d+) \r\n\r")[, 2])
      if (is.na(num.pages)) {
        stop(paste0("Could not read the number of result pages for '", words[i], "'"))
      }
    } else {
      num.pages <- 1L
    }

    for (j in seq_len(num.pages)) {
      if (more.than.one.page) {
        # progress report
        cat("\t\tPage ", j, " of ", num.pages, " of \"", tolower(words[i]), "\"\n", sep = "")
      }
      if (j == 1) {
        # the first page has already been downloaded
        cur.page <- kwic.page
      } else {
        # later pages are requested by swapping the "xx=" parameter of the
        # link for "p=<page number>"; the link is encoded in the same way as
        # the first-page request
        cur.link <- sub("xx=\\d+", paste0("p=", j), links.themselves[i])
        cur.page <- getURL(url = paste0("http://corpus.byu.edu/coca/", urlEncodeCdE(cur.link)), curl = ch)
      }
      current.results <- get.kwic.results(cur.page, expanded = expanded.view)
      current.results <- paste(search.string, current.results, sep = "\t")

      # only as many rows as BOTH limits still allow are written, and the
      # counters track the rows actually written, so rows that were discarded
      # never count against the overall limit
      room.left <- min(max.results.total - all.counter, max.results.per.search.string - cur.counter)
      n.written <- write.capped.results(current.results, room.left)
      all.counter <- all.counter + n.written
      cur.counter <- cur.counter + n.written

      if (all.counter >= max.results.total || cur.counter >= max.results.per.search.string) {
        break
      }
    } # next page, end of j loop

    if (all.counter >= max.results.total || cur.counter >= max.results.per.search.string) {
      break
    }
  } # next link

  # the overall limit ends the whole run; the per-search-string limit simply
  # lets the loop move on to the next search string
  if (all.counter >= max.results.total) {
    break
  }
} # next search string
# reports how long the run took and where the results were saved
# Sys.time() avoids parsing the English-language output of date() with
# strptime(), which depends on the current locale's day and month names
end.time <- Sys.time()
dd <- difftime(end.time, start.time)
cat("\aDone!\n")
cat("The script took", round(dd, 1), attr(dd, "units"), "\n")
cat("The results file is named \"", output.file, "\" and is in the folder \"", getwd(), "\"\n", sep="")
cat("\n\n***If this script has been helpful to you while preparing a presentation and/or a publication, please acknowledge my efforts to facilitate the extraction of results from the COCA and my willingness to share this script publicly. Thanks!\n\nEarl K. Brown, Kansas State University\n\n")
# end script