Info Hunter
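"""Info Hunter: search a news site, Google, and Reddit for a given topic,
pull key phrases from each result with spaCy and NLTK, and save the combined
results to a CSV file."""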
import csv
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download required NLTK resources
download('stopwords')
download('punkt')
download('wordnet')  # required by WordNetLemmatizer
# Initialize NLP tools. A trained spaCy pipeline is needed for POS tags
# (install with: python -m spacy download en_core_web_sm); a blank
# spacy.lang.en.English() pipeline would leave token.pos_ empty.
nlp = spacy.load('en_core_web_sm')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def search_news_website(query):
    """Search a news website for articles on a specific topic"""
    # Send GET request to news website search page
    resp = requests.get(f'https://www.popularnewswebsite.com/search?q={query}')
    # Extract relevant information from the response
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        results = []
        for article in soup.find_all('article'):
            title = article.h2.text
            author = article.find('span', class_='author').text
            date_published = article.find('span', class_='date').text
            url = article.a['href']
            content = ''
            # Get full article content
            try:
                article_resp = requests.get(url)
                if article_resp.status_code == 200:
                    article_soup = BeautifulSoup(article_resp.text, 'html.parser')
                    content = article_soup.find('div', class_='article-body').text
            except Exception as e:
                print(f'Error getting article content: {e}')
            # Extract key phrases from content using NLP
            doc = nlp(content)
            key_phrases = []
            for token in doc:
                if (token.text.lower() not in stop_words) and (token.pos_ in ['NOUN', 'ADJ']):
                    key_phrases.append(lemmatizer.lemmatize(token.text))
            # Add article data to results
            results.append({
                'title': title,
                'author': author,
                'date_published': date_published,
                'url': url,
                'key_phrases': key_phrases
            })
        return results
    else:
        print(f'Error searching news website: {resp.status_code}')
def search_google(query):
    """Search Google for articles and posts on a specific topic"""
    # Send GET request to Google search page
    resp = requests.get(f'https://www.google.com/search?q={query}')
    # Extract relevant information from the response
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.text, 'html.parser')
        results = []
        for item in soup.find_all('div', class_='g'):
            try:
                title = item.h3.text
                url = item.h3.a['href']
                snippet = item.find('span', class_='st').text
                # Get full article or post content
                content = ''
                try:
                    article_resp = requests.get(url)
                    if article_resp.status_code == 200:
                        article_soup = BeautifulSoup(article_resp.text, 'html.parser')
                        content = article_soup.find('body').text
                except Exception as e:
                    print(f'Error getting article content: {e}')
                # Extract key phrases from content using NLP
                doc = nlp(content)
                key_phrases = []
                for token in doc:
                    if (token.text.lower() not in stop_words) and (token.pos_ in ['NOUN', 'ADJ']):
                        key_phrases.append(lemmatizer.lemmatize(token.text))
                # Add article or post data to results
                results.append({
                    'title': title,
                    'url': url,
                    'snippet': snippet,
                    'key_phrases': key_phrases
                })
            except Exception as e:
                print(f'Error processing item: {e}')
        return results
    else:
        print(f'Error searching Google: {resp.status_code}')
def search_forums(query):
    """Search forums for discussions on a specific topic"""
    # Send GET request to Reddit search page
    resp = requests.get(f'https://www.reddit.com/search.json?q={query}')
    # Extract relevant information from the response
    if resp.status_code == 200:
        data = resp.json()
        results = []
        for item in data['data']['children']:
            title = item['data']['title']
            url = item['data']['url']
            subreddit = item['data']['subreddit']
            created_utc = item['data']['created_utc']
            content = ''
            # Get full discussion content
            try:
                discussion_resp = requests.get(url)
                if discussion_resp.status_code == 200:
                    discussion_soup = BeautifulSoup(discussion_resp.text, 'html.parser')
                    content = discussion_soup.find('div', class_='entry unvoted').text
            except Exception as e:
                print(f'Error getting discussion content: {e}')
            # Extract key phrases from content using NLP
            doc = nlp(content)
            key_phrases = []
            for token in doc:
                if (token.text.lower() not in stop_words) and (token.pos_ in ['NOUN', 'ADJ']):
                    key_phrases.append(lemmatizer.lemmatize(token.text))
            # Add discussion data to results
            results.append({
                'title': title,
                'url': url,
                'subreddit': subreddit,
                'created_utc': created_utc,
                'key_phrases': key_phrases
            })
        return results
    else:
        print(f'Error searching forums: {resp.status_code}')
def search(query):
    """Search multiple sources for information on a specific topic"""
    results = []
    # Search news website
    news_results = search_news_website(query)
    if news_results:
        results.extend(news_results)
    # Search Google
    google_results = search_google(query)
    if google_results:
        results.extend(google_results)
    # Search forums
    forums_results = search_forums(query)
    if forums_results:
        results.extend(forums_results)
    # Save results to CSV file
    df = pd.DataFrame(results)
    df.to_csv(f'{query}_results.csv', index=False)
    print(f'Results saved to {query}_results.csv')
# Example usage
search('data science')
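# The saved CSV can then be reloaded for a quick look at what was collected
# (the file name follows the f'{query}_results.csv' pattern used in search()):
#   df = pd.read_csv('data science_results.csv')
#   print(df[['title', 'url']].head())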