-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
84 lines (64 loc) · 2.19 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from lxml import html
import requests
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import operator
import csv
import enchant
# English stopwords from NLTK, kept in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))
# en_US spell-check dictionary used to keep only real English words.
ENGLISH = enchant.Dict("en_US")
# Running frequency table: token -> number of times it has been seen.
vocabDict = {}
# Extra tokens dropped from the final output even if they pass spell-check.
customizedStopWords = [".", "ii"]
def encode(text):
    """Return ``text`` with every space character replaced by ``%20``.

    Only the space character is escaped; all other characters (including
    tabs and newlines) are returned unchanged.
    """
    pieces = text.split(" ")
    return "%20".join(pieces)
def getAllLinksFromASectionURLNumber(pageNum=16233, timeout=30):
    """Collect the unique question links listed on one workbook page.

    Args:
        pageNum: Value placed in the ``sheet_id`` query parameter.
        timeout: Seconds to wait for the server. ``requests`` never times
            out on its own, so without this a single stalled connection
            would hang the whole crawl.

    Returns:
        A list of absolute URLs, one per distinct ``//tr/td/a/@href`` value
        found on the page, with spaces percent-encoded. The order is not
        defined because duplicates are removed through a set.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(
        'http://grev3.kmf.com/jijing/workbookdetail?sheet_id=' + str(pageNum),
        timeout=timeout)
    links = html.fromstring(page.text).xpath('//tr/td/a/@href')
    # A set comprehension drops duplicate links before returning a list.
    return list({"http://grev3.kmf.com/" + encode(link) for link in links})
def getQuestionFromALink(url, timeout=30):
    """Fetch a question page and return its question text.

    Args:
        url: Address of the question page.
        timeout: Seconds to wait for the server. ``requests`` never times
            out on its own, so without this a single stalled connection
            would hang the whole crawl.

    Returns:
        The first text node of the first ``<div class="mb20">`` match, with
        leading and trailing newline / carriage-return characters removed.

    Raises:
        IndexError: If the page contains no matching text node.
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    matches = html.fromstring(page.text).xpath('//div[@class="mb20"]/text()')
    return matches[0].strip("\n\r")
def getChoicesFromALink(url, timeout=30):
    """Fetch a question page and return the text of its answer choices.

    Args:
        url: Address of the question page.
        timeout: Seconds to wait for the server. ``requests`` never times
            out on its own, so without this a single stalled connection
            would hang the whole crawl.

    Returns:
        A list with one string per text node directly inside a ``<span>``
        that has a ``<strong>`` child, each stripped of leading and trailing
        newline / carriage-return characters. Entries may be empty strings.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    page = requests.get(url, timeout=timeout)
    answers = html.fromstring(page.text).xpath('//span[strong]/text()')
    return [ans.strip("\n\r") for ans in answers]
def tokenizeSentence(sentence):
    """Lower-case and tokenize ``sentence``, dropping English stopwords.

    Returns the remaining tokens as a list, in their original order.
    """
    kept = []
    for token in word_tokenize(sentence.lower()):
        if token in STOPWORDS:
            continue
        kept.append(token)
    return kept
def addToVocabDict(li):
    """Add one to the module-level ``vocabDict`` count of every item in ``li``.

    Items not seen before start at a count of 1.
    """
    for word in li:
        vocabDict[word] = vocabDict.get(word, 0) + 1
def writePairListToFile(li):
    """Write the first two fields of every entry in ``li`` to ``output.csv``.

    The file is created in the current working directory and overwritten if
    it already exists. Each entry becomes one CSV row of two columns.

    Args:
        li: Iterable of indexable entries; only ``entry[0]`` and ``entry[1]``
            are written.
    """
    # newline="" lets the csv module control line endings itself (otherwise
    # extra blank rows appear on Windows); the with-block guarantees the file
    # is flushed and closed even if a row fails to serialize.
    with open("output.csv", "w", newline="") as outFile:
        csv.writer(outFile).writerows([line[0], line[1]] for line in li)
def filterPairList(li):
    """Keep only the entries whose word is valid English and not excluded.

    Args:
        li: Iterable of indexable entries whose first field is the word.

    Returns:
        A new list with the entries whose word is non-empty, is accepted by
        the ``ENGLISH`` spell-check dictionary, and is not listed in
        ``customizedStopWords``. Input order is preserved.
    """
    # The emptiness test must come first: enchant's check() raises ValueError
    # for an empty string, and stripped choice text can be empty.
    return [
        line for line in li
        if line[0]
        and ENGLISH.check(line[0])
        and line[0] not in customizedStopWords
    ]
# Main: crawl each workbook page, count the vocabulary, then export it.
for num in range(16183, 16234):
    print("Crawling...", num)
    links = getAllLinksFromASectionURLNumber(num)
    for link in links:
        # Each link is counted twice over: its question tokens, then its
        # answer choices (the choices are added as whole strings).
        addToVocabDict(tokenizeSentence(getQuestionFromALink(link)))
        addToVocabDict(getChoicesFromALink(link))

# Order the (word, count) pairs by ascending count, drop non-English and
# excluded words, and write what is left to the CSV file.
sortedVocabDict = filterPairList(
    sorted(vocabDict.items(), key=lambda pair: pair[1]))
writePairListToFile(sortedVocabDict)
print("DONE!")