newspaper3k_scraper.py
import logging
from pprint import pprint

from newspaper import Article

from nlp_techniques import enrich_keywords, refine_summary, analyze_sentiment, extract_named_entities

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler()  # Log to console
    ]
)


def get_article_data(url: str) -> dict:
    """
    Extract structured data from an article given its URL.

    Args:
        url (str): The URL of the article to scrape.

    Returns:
        dict: A dictionary containing the extracted article data: headline, authors,
            publication date, main text, top image, keywords, summary, sentiment,
            and named entities.

    Raises:
        ValueError: If the URL is invalid or if the article download/parsing fails.
    """
    if not url or not isinstance(url, str):
        logging.error("Invalid URL provided.")
        raise ValueError("The provided URL is not valid.")

    logging.info(f"Processing URL: {url}")

    # Initialize the Article object
    article = Article(url)

    try:
        # Download and parse the article content
        article.download()
        article.parse()
    except Exception as e:
        logging.error(f"Error downloading or parsing the article: {e}")
        raise ValueError(f"Unable to process the article at {url}: {e}")

    try:
        # Run newspaper3k's built-in NLP pass (populates keywords and summary)
        article.nlp()
    except Exception as e:
        logging.warning(f"NLP processing failed for the article: {e}")
        # Proceed without keywords/summary if NLP fails

    # Extract the desired data
    data = {
        "headline": article.title or "N/A",
        "authors": article.authors or [],
        "publication_date": article.publish_date or "N/A",
        "main_text": article.text or "",
        "top_image": article.top_image or "N/A",
        "keywords": enrich_keywords(article),
        "summary": refine_summary(article.text),
        "sentiment": analyze_sentiment(article),
        "named_entities": extract_named_entities(article.text)
    }

    logging.info(f"Successfully extracted data from: {url}")
    return data


if __name__ == '__main__':
    # Example article URL
    url = "https://www.nbmcw.com/news/ceinsys-tech-wins-rs385-cr-bid-for-wainganga-nalganga-river-link-project.html#:~:text=The%20%E2%82%B9385.15%20crore%20contract,preparing%20the%20project's%20detailed%20report"

    try:
        # Fetch article data
        article_data = get_article_data(url=url)
        pprint(article_data)
    except ValueError as e:
        logging.error(f"Failed to process article: {e}")