# news-scraper.py
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from initialize import log_message, initialize_log_file

# Load .env first so that LOG_FILE (and any other settings) defined there are
# visible to os.getenv below.
load_dotenv()
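# Reuse an existing log file when LOG_FILE is already set (e.g. exported by an
# earlier script in the pipeline); otherwise create a fresh one and export its
# path so subsequent scripts append to the same log.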
log_file = os.getenv("LOG_FILE")
if not log_file:
    log_file, _ = initialize_log_file()
    os.environ["LOG_FILE"] = log_file
def get_random_user_agent():
    """Return a random browser User-Agent string for outgoing requests."""
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    ]
    return random.choice(user_agents)
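# Many news sites block requests that carry the default python-requests
# User-Agent, so a browser string is sent instead. The pool above holds a
# single Chrome UA at the moment; the rotation only becomes meaningful once
# more real browser strings are added to the list.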
log_message(f"Starting the newsscraper script.", log_file)
log_message(f"Loading environment variables...", log_file)
load_dotenv()
news_csv_path = "data/ai-filtered-news.csv"
log_message(f"Reading the news.csv file...", log_file)
try:
news_df = pd.read_csv(news_csv_path)
log_message(
f"Successfully read the news.csv file. Found {len(news_df)} articles.",
log_file,
)
except FileNotFoundError:
log_message(f"File {news_csv_path} not found.", log_file)
exit()
log_message(
f"Adding a new column for scraped data...",
log_file,
)
news_df["scraped_data"] = np.nan
news_df["scraped_data"] = news_df["scraped_data"].astype("object")
log_message(f"New column added.", log_file)
log_message(f"Starting to scrape URLs...", log_file)
for i, row in news_df.iterrows():
    url = row["url"]
    if pd.notna(url):
        log_message(
            f"Scraping {i + 1} of {len(news_df)}: {url}",
            log_file,
        )
        for _ in range(3):
            try:
                headers = {"User-Agent": get_random_user_agent()}
                # A timeout keeps a single unresponsive site from stalling the run.
                response = requests.get(url, headers=headers, timeout=30)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.content, "lxml")
                    all_text = soup.get_text()
                    news_df.at[i, "scraped_data"] = all_text
                    log_message(
                        f"Successfully scraped {i + 1} of {len(news_df)}.",
                        log_file,
                    )
                    break
                else:
                    log_message(
                        f"Received status code {response.status_code}. Retrying...",
                        log_file,
                    )
                    time.sleep(random.uniform(1, 3))
            except Exception as e:
                log_message(
                    f"An error occurred while scraping {i + 1} of {len(news_df)}: {url}. Error: {e}",
                    log_file,
                )
                time.sleep(random.uniform(1, 3))
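
# Hypothetical addition: log a summary so the outcome of the run is visible at
# a glance (rows that failed every retry remain NaN in scraped_data).
scraped_count = int(news_df["scraped_data"].notna().sum())
log_message(f"Scraped text for {scraped_count} of {len(news_df)} articles.", log_file)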
log_message("Saving the DataFrame to a new CSV file...", log_file)
try:
    news_df.to_csv("data/news-scraped.csv", index=False, encoding="utf-8-sig")
    log_message(
        "Scraping completed. The data has been saved to 'data/news-scraped.csv'.",
        log_file,
    )
    # Also save the DataFrame to an HTML file for quick inspection in a browser.
    html_file_path = "html/news-scraped.html"
    log_message(f"Attempting to save the DataFrame to {html_file_path}...", log_file)
    # Create the output directory on a fresh checkout so to_html does not fail.
    os.makedirs(os.path.dirname(html_file_path), exist_ok=True)
    news_df.to_html(html_file_path)
    log_message(
        f"The DataFrame has been saved as an HTML file at {html_file_path}.",
        log_file,
    )
except Exception as e:
    log_message(f"An error occurred while saving the CSV or HTML file: {e}", log_file)

log_message("Finished running the news-scraper script.", log_file)