-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScript_search_CdE_ekb.R
263 lines (218 loc) · 12.1 KB
/
Script_search_CdE_ekb.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# R script to search for a string in the Corpus del Español (corpusdelespanol.org) and save the results to a .csv file
# (c) 2014 Earl K. Brown www-personal.ksu.edu/~ekbrown
# This script can be freely used and modified for non-profit purposes
#####
# NOTE:
# This script requires four packages not included in base R. If you don't already have the following packages on your computer, uncomment the next line (remove the hash tag) and run that one line, and then comment it again (put the hash tag back)
# install.packages("gsubfn"); install.packages("RCurl"); install.packages("stringr"); install.packages("XML")
#####
# FOLLOW THESE STEPS TO SEARCH THE CORPUS
# STEP 1:
# Specify your credentials. This is the email you use to login to the corpus. If you're not sure which email you have registered, simply go to the Corpus in a web browser and try logging in to figure it out.
login.email <- "your.name@your.institution.edu"
login.password <- "your.password"
# STEP 2:
# Specify the search string or strings. See the corpus in a web browser for help with search syntax, including part-of-speech tags and wildcards, etc. You can specify multiple search strings by putting them into a vector, like this: c("searchstring1", "searchstring2", "searchstring3")
search.strings <- c("me [gustar]", "me [encantar]")
# STEP 3:
# Specify which section of the corpus you want to search in, from two options: "oral" and "all". The option "oral" will search only in the oral section of the 20th century while "all" will search in all centuries, including the four sections of the 20th century. When searching, particularly when using the "all" option, care should be taken so as to not run up against your daily keyword-in-context view limit, which is based on the level of access your profile allows for.
search.section <- "oral"
# STEP 4:
# Specify the maximum number of unique types to return for each search string (results shown in the upper right portion of the Corpus). For example, searching for nouns with the search string "[n*]" could potentially return tens of thousands of unique types, but you may only be interested in the 100 most frequent ones. For single word searches that don't have any part-of-speech tags or wildcards, for example "libro", this number simply needs to be at least 1.
max.types <- 100
# STEP 5:
# Specify the maximum number of results to return. This number should be equal to or smaller than your keyword-in-context view limit.
max.results <- 1000
# STEP 6:
# Specify whether you want to retrieve the links to the expanded view for each hit, with one of two options: "yes" or "no". Once the script has finished running, in order to view the expanded context for a given hit you will need to go to the Corpus in a web browser and login in, and then copy the corresponding link from the .csv file and paste it into the address bar and press <Enter> on your keyboard.
expanded.view <- "yes"
# STEP 7:
# Specify the name of a .csv file that the results will be saved to, for example, "search_results.csv". This will overwrite any previous file with this name in your working directory.
output.file <- "search_results.csv"
# STEP 8:
# Now, save this script and then run the whole thing, and watch the magic happen!
##################
##################
# DON'T CHANGE ANYTHING BELOW HERE, UNLESS YOU KNOW WHAT YOU'RE DOING
# begin script
start.time <- strptime(date(), format = "%a %b %d %H:%M:%S %Y")
# loads necessary packages and defines functions
library("RCurl")
library("gsubfn")
library("stringr")
library("XML")
# defines some helper functions
get.kwic.results <- function(webpage, expanded = FALSE) {
# webpage <- kwic.page
webpage <- substr(webpage, regexpr("<html>", webpage), nchar(webpage))
webpage <- sub("(charset=)windows-1252", "\\1UTF-8", webpage, ignore.case = T)
webpage <- gsub("<b><u>", "\t", webpage)
webpage <- gsub("</u></b> ", "\t", webpage)
webpage <- gsub("</u></b>", "\t", webpage)
webpage <- gsub("\t\t", " ", webpage)
res.table <- readHTMLTable(webpage, which = 2)
century <- res.table[, 2]
text.source <- res.table[, 3]
text.source <- gsub("[\r\n\t]", "", text.source)
hits.in.context <- res.table[, 7]
if (length(century) != length(text.source)) stop("For some reason, the number of centuries and the number of text names differ, but they shouldn't.")
if (length(text.source) != length(hits.in.context)) stop("For some reason, the number of text names and the number of keyword-in-context results differ, but they shouldn't.")
if (expanded) {
exp.links <- getHTMLLinks(webpage)
exp.links <- grep("^x4\\.asp", exp.links, value = T)
exp.links <- unique(exp.links)
if (length(hits.in.context) != length(exp.links)) stop("For some reason, the number of links for the expanded view and the keyword-in-context results differ, but they shouldn't.")
exp.links <- paste0("http://www.corpusdelespanol.org/", exp.links)
output <- paste(century, text.source, hits.in.context, exp.links, sep = "\t")
} else {
output <- paste(century, text.source, hits.in.context, sep="\t")
}
return(output)
} # end function
urlEncodeCdE <- function(url) {
url <- gsub(" {1,}", "+", url)
url <- gsub("á", "%E1", url)
url <- gsub("é", "%E9", url)
url <- gsub("í", "%ED", url)
url <- gsub("ó", "%F3", url)
url <- gsub("ú", "%FA", url)
url <- gsub("ü", "%FC", url)
url <- gsub("ñ", "%F1", url)
url <- gsub("Á", "%C1", url)
url <- gsub("É", "%C9", url)
url <- gsub("Í", "%CD", url)
url <- gsub("Ó", "%D3", url)
url <- gsub("Ú", "%DA", url)
url <- gsub("Ü", "%DC", url)
url <- gsub("Ñ", "%D1", url)
url <- gsub("\\[", "%5B", url)
url <- gsub("\\]", "%5D", url)
return(url)
} # end function
shortURLencode = function(url, reserved = F) {
OK <- paste0("[^-ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz0123456789$_.+!*'(),", if (!reserved) ";/?:@=&", "]")
x <- strsplit(url, "")[[1L]]
z <- grep(OK, x)
if (length(z)) {
y <- sapply(x[z], function(x) paste0("%", as.hexmode(utf8ToInt(x)), collapse = ""))
x[z] <- y
}
paste0(x, collapse = "")
} # end function
expanded.view <- tolower(expanded.view)
if (expanded.view == "yes") {
expanded.view <- TRUE
} else if (expanded.view == "no") {
expanded.view <- FALSE
} else {
stop("You must specify either 'yes' or 'no' for the 'expanded.view' variable\n")
}
# creates output file for results to be saved to
if (expanded.view) {
cat("searchTERM\tYEARS\tSOURCE\tPRE\tMATCH\tFOL\tEXPANDED\n", file = output.file)
} else {
cat("searchTERM\tYEARS\tSOURCE\tPRE\tMATCH\tFOL\n", file = output.file)
}
# deletes any cookies from previous sessions in the Corpus
unlink("cookies.txt")
# creates initial curl handle
ch <- getCurlHandle(header = T, cookiefile = "cookies.txt", cookiejar = "cookies.txt", followlocation = T, autoreferer = T, useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0")
# gets first cookie on splash screen
splash.screen <- getURL("http://www.corpusdelespanol.org/", curl = ch)
# defines login url with login credentials
url <- sprintf("http://www.corpusdelespanol.org/login.asp?email=%s&password=%s&e=", shortURLencode(login.email), shortURLencode(login.password))
# logins to get cookie in order to continue working in Corpus
login.page <- getURL(url = url, curl = ch)
# sets the switch for the sections to search in
search.section <- tolower(search.section)
if (!search.section %in% c("all", "oral")) stop("You must specify the search.section variable as either 'all' or 'oral'\n")
if (search.section == "oral") {
search.section.code <- 23
} else {
search.section.code <- 0
}
# counter for running number of results
counter <- 0
for (h in 1:length(search.strings)) {
#h=1
search.string <- search.strings[h]
cat("Working on search string ", h, " of ", length(search.strings), ": ", search.string, "\n", sep = "")
# gets the third page
url <- paste0("http://www.corpusdelespanol.org/x2.asp?chooser=seq&p=", urlEncodeCdE(search.string), "&w2=&wl=4&wr=4&r1=&r2=&ipos1=-select-&B7=SEARCH&sec1=", search.section.code, "&sec2=0&sortBy=freq&sortByDo2=freq&minfreq1=freq&freq1=4&freq2=4&numhits=", max.types, "&kh=100&groupBy=words&whatshow=raw&saveList=no&changed=&corpus=cde&word=&sbs=&sbs1=&sbsreg1=&sbsr=&sbsgroup=&redidID=&ownsearch=y&compared=&holder=&whatdo=seq&waited=y&rand1=y&whatdo1=1&didRandom=n&minFreq=freq&s1=0&s2=0&s3=0&perc=mi")
freq.page <- getURL(url = url, curl = ch)
no.results <- length(grep("(Sorry, there are no matching records.|NO MATCHES FOR THE FOLLOWING \\'SLOTS\\')", freq.page, ignore.case = T)) > 0
if (no.results) {
stop(paste0("There appears to be no results for '", search.string, "'"))
}
links <- str_trim(unlist(strapplyc(freq.page, "href=\"(x3.asp\\?[^\"]+)\">([ A-ZÁÉÍÓÚÜÑ]+)</a>", backref = 2)))
links.themselves <- grep("x3\\.asp\\?", links, value = T)
words <- grep("^[ A-ZÁÉÍÓÚÜÑ]+$", links, value = T)
if (length(links.themselves) != length(words)) {
stop("The number of links and the words that those links are connected are different, and they shouldn't be")
}
for (i in 1:length(words)) {
#i=1
cat("\tHit ", i, " of ", length(words), " for search term \"", search.string, "\": ", tolower(words[i]), "\n", sep = "")
kwic.page <- getURL(paste0("http://www.corpusdelespanol.org/", urlEncodeCdE(links.themselves[i])), curl = ch)
# figures out if there is more than one page of results
more.than.one.page <- length(grep("<span ID=\"w_page\">PAGE</span>:</font></td>", kwic.page)) > 0
if (more.than.one.page) {
# true, there's more than one page of results
# gets the number of pages of results and runs a loop that many times, loading each successive page and pulling the results
num.pages <- as.numeric(strapplyc(kwic.page, " 1 / (\\d+) \r\n\r", backref=1)[[1]])
for (j in 1:num.pages) {
# j=1
# progress report
cat("\t\tPage ", j, " of ", num.pages, " of \"", tolower(words[i]), "\"\n", sep="")
if (j == 1) {
current.results <- get.kwic.results(kwic.page, expanded = expanded.view)
current.results <- paste(search.string, current.results, sep = "\t")
counter <- counter + length(current.results)
if (counter > max.results) {
current.results <- current.results[1:(length(current.results) - (counter - max.results))]
cat(current.results, file = output.file, sep="\n", append = T)
break
}
cat(current.results, file = output.file, sep="\n", append = T)
} else {
cur.link <- links.themselves[i]
cur.link <- sub("xx=\\d+", paste0("p=", j), cur.link)
fifth.url <- paste0("http://www.corpusdelespanol.org/", cur.link)
next.kwic.page <- getURL(url = fifth.url, curl = ch)
current.results <- get.kwic.results(next.kwic.page, expanded = expanded.view)
current.results <- paste(search.string, current.results, sep = "\t")
counter <- counter + length(current.results)
if (counter > max.results) {
current.results <- current.results[1:(length(current.results) - (counter - max.results))]
cat(current.results, file = output.file, sep="\n", append = T)
break
}
cat(current.results, file = output.file, sep="\n", append = T)
}
} # next page, end of j loop
} else {
# only one page of results, pulls the results from that one page
current.results <- get.kwic.results(kwic.page, expanded = expanded.view)
current.results <- paste(search.string, current.results, sep = "\t")
counter <- counter + length(current.results)
if (counter > max.results) {
current.results <- current.results[1:(length(current.results) - (counter - max.results))]
cat(current.results, file = output.file, sep="\n", append = T)
break
}
cat(current.results, file = output.file, sep="\n", append = T)
}
if (counter >= max.results) {
break
}
} # next link
if (counter >= max.results) {
break
}
} # next search string
end.time <- strptime(date(), format = "%a %b %d %H:%M:%S %Y")
dd <- difftime(end.time, start.time)
cat("\aDone!\n")
cat("The script took", round(dd, 1), attr(dd, "units"), "\n")
cat("The results file is named \"", output.file, "\" and is in the folder \"", getwd(), "\"\n", sep="")
# end script