-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_microservice.py
85 lines (71 loc) · 2.76 KB
/
wiki_microservice.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Clinton Lohr
# Microservice to extract data from Wikipedia
import time
import requests
from bs4 import BeautifulSoup
def request():
    """Read the pending search term from ``request.txt``.

    Only the first line of the file is used. Surrounding whitespace
    (including the trailing newline most writers leave behind) is removed
    and the remaining spaces are replaced with underscores to form a valid
    page name.

    Returns:
        The normalised search term, or an empty string when the file does
        not exist or its first line is blank.
    """
    try:
        # The context manager closes the file even if reading fails.
        with open('request.txt', 'r') as file:
            first_line = file.readline()
    except FileNotFoundError:
        # The requesting application may not have created the file yet;
        # report "no request" instead of crashing the caller.
        return ''
    # Strip before substituting so a trailing '\n' or stray padding never
    # ends up inside the page name (previously 'Mount Hood\n' was returned
    # as 'Mount_Hood\n').
    return first_line.strip().replace(' ', '_')
def get_from_Wikipedia(search_word=None, timeout=10):
    """Download an English Wikipedia page and parse it.

    Args:
        search_word: Page name appended to ``https://en.wikipedia.org/wiki/``.
            When omitted, the module-level ``searchWord`` is used, which is
            how the function behaved before the parameter existed.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``lxml`` parser. The HTTP status code is printed but not checked, so
        error pages are parsed like any other page.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            ``timeout`` seconds.
    """
    if search_word is None:
        search_word = searchWord
    link = "https://en.wikipedia.org/wiki/" + search_word
    print(link)
    # Without an explicit timeout requests waits indefinitely, which would
    # freeze the whole service on a stalled connection.
    result = requests.get(link, timeout=timeout)
    print(result.status_code, '\n')
    # Parse the raw page bytes so the caller can query tags directly.
    return BeautifulSoup(result.content, 'lxml')
def table():
    """Copy selected table rows from the parsed page into ``data``.

    Every ``<tr>`` of the module-level ``soup`` is inspected in document
    order. Rows whose text starts with one of the known labels are appended
    to the module-level ``data`` list with a single space inserted at a
    fixed offset, because some pages have no space between the label and
    the number that follows it. Scanning stops after the first 'Skiable'
    row.
    """
    # (label prefix, offset at which the separating space is inserted).
    # NOTE(review): the offsets presumably equal the length of the full
    # label text (e.g. 'Top elevation' is 13 characters) - confirm.
    row_rules = (
        ('Vertical', 8),
        ('Top', 13),
        ('Base', 14),
        ('Skiable', 12),
    )
    for row in soup.find_all('tr'):
        row_text = row.text
        rule = next(
            (candidate for candidate in row_rules
             if row_text.startswith(candidate[0])),
            None,
        )
        if rule is None:
            continue
        prefix, split_at = rule
        data.append(row_text[:split_at] + ' ' + row_text[split_at:])
        if prefix == 'Skiable':
            # Nothing of interest is collected past this row.
            break
def text(num_paragraphs=10):
    """Append the leading non-empty paragraphs of the page to ``data``.

    Reads ``<p>`` tags from the module-level ``soup`` in document order and
    appends their text to the module-level ``data`` list.

    Args:
        num_paragraphs: Maximum number of paragraphs to save. Defaults to
            10, the limit that was previously hard-coded in the body.
    """
    remaining = num_paragraphs
    if remaining <= 0:
        return
    for p_tag in soup.find_all('p'):
        paragraph = p_tag.text
        # Paragraphs of two characters or fewer are treated as empty
        # (e.g. a lone newline) and do not count towards the limit.
        if len(paragraph) <= 2:
            continue
        data.append(paragraph)
        remaining -= 1
        if remaining <= 0:
            break
def response():
    """Write the collected entries in ``data`` to ``response.txt``.

    Each entry of the module-level ``data`` list becomes one line of the
    UTF-8 encoded output file. Before writing, every entry is cleaned up:
    non-breaking spaces become ordinary spaces, embedded newlines are
    removed, and runs of spaces are collapsed to a single space.
    """
    cleaned = []
    for entry in data:
        entry = entry.replace('\xa0', ' ').replace('\n', '')
        # Collapse repeated spaces. The former replace(' ', ' ') calls
        # swapped a single space for a single space and so changed nothing.
        while '  ' in entry:
            entry = entry.replace('  ', ' ')
        cleaned.append(entry + '\n')
    # The context manager closes the file even if a write fails.
    with open('response.txt', encoding='utf-8', mode='w') as file:
        file.writelines(cleaned)
# Service loop: poll request.txt and answer every new request by writing
# response.txt. The functions above read the module-level names assigned
# here (searchWord, soup, data).
wait = ''  # the most recent request already seen; '' means none pending
while True:  # runs until the process is stopped
    searchWord = request()
    if searchWord != wait:  # the content of the request file changed
        wait = searchWord
        # An empty request (e.g. the file was cleared) only resets `wait`,
        # so the same term can be requested again later; it must not fetch
        # the bare /wiki/ URL and overwrite the previous response.
        if searchWord:
            print('new request received')
            soup = get_from_Wikipedia()
            data = [searchWord + ':']  # to store the extracted data
            table()
            text()
            response()
    # A short pause keeps the loop from spinning at full CPU and re-opening
    # the request file as fast as possible.
    time.sleep(0.1)