-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdicocitation_mining.py
121 lines (107 loc) · 4.21 KB
/
dicocitation_mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python3
from bs4 import BeautifulSoup
import requests
import re
from time import sleep
from optparse import OptionParser
def main():
    """Command-line entry point: mine the context words of one target word.

    Parses the command line, loads the stop words, collects the context
    words of the requested word through ``word2context`` and finally writes
    them (most frequent first) to a file and/or prints them to stdout.

    Exits with status 1 when no target word is given and with status 0 when
    the search yields no word at all.
    """
    # parseOptions() reads this module-level name to build its help text,
    # so it has to exist before the options are parsed.
    global stop_words_PATH
    stop_words_PATH = 'stopwords.txt'

    (options, args) = parseOptions()

    if not options.filename:
        print('No filename, using verbose instead. '
              'use -f option if you want to store the words')
        options.verbose = True

    if options.stop_words_filename:
        stop_words_PATH = options.stop_words_filename
    stop_words = getStopwords(stop_words_PATH)

    # The delay is forwarded as given (a string when it comes from the
    # command line); word2context converts it to a float.
    delay = options.delay if options.delay else 2

    if not args:
        # Nothing to search for: stop here instead of falling through and
        # crashing on an undefined target word.
        print('Word is missing')
        raise SystemExit(1)
    targetword = args[0]

    context_list = word2context(targetword, stop_words, delay)
    if not context_list:
        # word2context returns a list, so emptiness is the "no result" case.
        print('Nothing found :(')
        raise SystemExit(0)

    if options.filename:
        with open(options.filename, 'w', encoding='utf-8') as f:
            f.writelines(wd + '\n' for wd in context_list)

    if options.verbose:
        for words in context_list:
            print(words)
def parseOptions():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``.
        ``options`` carries ``stop_words_filename``, ``filename`` and
        ``delay`` (each ``None`` when the option is absent, otherwise the raw
        string from the command line) plus the boolean ``verbose``; ``args``
        holds the positional arguments.
    """
    # main() publishes the default path in the module-level name
    # stop_words_PATH. Fall back to the same literal when it has not been
    # set yet, so the parser can be built without main() having run first.
    default_stop_words = globals().get('stop_words_PATH', 'stopwords.txt')

    usage = "usage: %prog [options] word"
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-s", "--stopwords", dest="stop_words_filename",
        help="path to the stopwords file, default={}".format(default_stop_words),
        metavar="STOP_WORDS_FILE")
    parser.add_option(
        "-f", "--file", dest="filename",
        help="write words in context to FILE", metavar="FILE")
    parser.add_option(
        "-w", "--wait", dest="delay",
        help="delay (in seconds) between requests for being polite",
        metavar="DELAY")
    parser.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose", default=False,
        help="Print all words sorted to stdout")
    return parser.parse_args()
def getStopwords(stop_words_PATH):
    """Load the stop words stored one per line in a UTF-8 text file.

    Args:
        stop_words_PATH: path of the stop-words file.

    Returns:
        A set holding every line of the file with surrounding whitespace
        stripped. When the file does not exist a message is printed and the
        empty string ``''`` is returned instead of a set.
    """
    try:
        with open(stop_words_PATH, 'r', encoding='utf-8') as handle:
            return {line.strip() for line in handle}
    except FileNotFoundError:
        print('Stop words file probably missing, deal with your mess')
        return ''
# Make soup if page contains stuff
def word2Soup(word, i=1, reg_stop=re.compile(' 0 citation'), timeout=10):
    """Fetch one result page for ``word`` and parse it.

    Args:
        word: the search keyword, sent as the ``motcle`` query parameter.
        i: value sent as the ``base`` query parameter (the caller uses it as
            the page number).
        reg_stop: compiled pattern; when it matches the response text the
            page is treated as having no result.
        timeout: seconds to wait for the server before ``requests`` raises
            a timeout error, so a stalled connection cannot hang forever.

    Returns:
        A ``BeautifulSoup`` document for the page, or ``None`` when the
        server answers with an HTTP error status or the page matches
        ``reg_stop``.
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or spaces inside the word are escaped instead of corrupting the URL.
    response = requests.get(
        'https://www.dicocitations.com/citation.php',
        params={'verif_robot': '', 'motcle': word, 'base': i},
        timeout=timeout,
    )
    if not response.ok:
        # An error page never contains the stop marker; reporting "no page"
        # lets the caller stop instead of requesting pages endlessly.
        print('request failed with HTTP status {}'.format(response.status_code))
        return None
    if reg_stop.search(response.text):
        return None
    return BeautifulSoup(response.text, 'html.parser')
def word2context(target_word, stop_words, delay):
    """Collect the words linked on the result pages of ``target_word``.

    Pages are requested one after the other through ``word2Soup`` until it
    returns ``None``. On each page every ``<a>`` element whose ``href``
    contains ``mot=`` contributes the text following that marker, split on
    ``_`` and casefolded.

    Args:
        target_word: the word to search for; it is excluded from the result.
        stop_words: container tested with ``in``; words found in it are
            skipped.
        delay: pause in seconds between two page requests (anything
            ``float()`` accepts).

    Returns:
        The list of distinct collected words, most frequent first.

    Raises:
        ValueError: if ``delay`` cannot be converted to a float.
    """
    print('Start mining')
    reg_stop = re.compile(' 0 citation')  # helps to guess the last page
    reg_number = re.compile('[0-9]')  # helps dropping words like id=465123
    pause = float(delay)  # validated once, before any request is sent
    # Candidates are casefolded, so the target must be casefolded as well
    # for the exclusion below to work with a capitalised search word.
    target = target_word.casefold()
    context_dict = {}
    i = 1  # page number
    articles = 0  # running count of the anchors seen so far
    while True:
        print('requesting page n°: {}'.format(i))
        soup = word2Soup(target_word, i=i, reg_stop=reg_stop)
        if soup is None:
            break
        for link in soup.find_all('a'):
            # the words are hyperlinks too, which is convenient
            articles += 1
            print('citation n°{}'.format(articles))
            # the word comes after 'mot=', e.g. /truc/machin/1235&mot=amour
            pieces = str(link.get('href')).split('mot=')
            if len(pieces) < 2 or not pieces[1]:
                continue
            # split words such as 'l_amour' => ['l', 'amour']
            for token in pieces[1].split('_'):
                token = token.casefold()
                if not token:
                    # consecutive or trailing '_' yield empty strings
                    continue
                # keep the word unless it is a stop word, the searched word
                # itself, or contains a digit
                if token in stop_words or token == target:
                    continue
                if reg_number.search(token):
                    continue
                context_dict[token] = context_dict.get(token, 0) + 1
        sleep(pause)
        i += 1
    return sorted(context_dict, key=lambda v: context_dict[v], reverse=True)
# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()