-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle_parser.py
101 lines (87 loc) · 3.75 KB
/
article_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urljoin, urlparse
import os
class Article:
def __init__(self, url):
self.url = url
self.parse_article()
def parse_article(self):
# Send a GET request to the URL
response = requests.get(self.url)
# Create a BeautifulSoup object to parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Extract article heading
heading = soup.find('h1').text.strip() if soup.find('h1') else 'Heading not found'
# Article URL is the input URL
article_url = self.url
# Extract publish date (this might vary depending on the website structure)
publish_date = None
date_meta = soup.find('meta', property='article:published_time')
if date_meta:
publish_date = datetime.fromisoformat(date_meta['content'].split('T')[0])
else:
# Try to find a date in a common format (adjust as needed)
date_element = soup.find('time')
if date_element and date_element.has_attr('datetime'):
publish_date = datetime.fromisoformat(date_element['datetime'].split('T')[0])
# Extract all images
images = []
for img in soup.find_all('img'):
src = img.get('src')
if src:
full_src = urljoin(self.url, src)
if not os.path.splitext(urlparse(full_src).path)[1].lower() == '.gif':
images.append(full_src)
# Extract all videos
videos = []
for video in soup.find_all('video'):
src = video.get('src')
if src:
full_src = urljoin(self.url, src)
videos.append(full_src)
# Also check for iframe embedded videos (e.g., YouTube)
for iframe in soup.find_all('iframe'):
src = iframe.get('src')
# if src and ('youtube.com' in src or 'vimeo.com' in src):
if src and ('youtu' in src):
videos.append(src)
# Extract article text
article_text = ""
# Look for common article container elements
article_containers = soup.find_all(['article', 'div'], class_=['article-body', 'entry-content', 'post-content'])
if article_containers:
for container in article_containers:
paragraphs = container.find_all('p')
article_text += ' '.join([p.text for p in paragraphs])
else:
# If no specific article container is found, try to get all paragraphs
paragraphs = soup.find_all('p')
article_text = ' '.join([p.text for p in paragraphs])
# Clean up the text
article_text = ' '.join(article_text.split())
# return {
# 'heading': heading,
# 'url': article_url,
# 'publish_date': publish_date,
# 'images': images,
# 'videos': videos,
# 'text': article_text
# }
self.title = heading
self.publish_date = publish_date
self.images = images
self.videos = videos
self.text = article_text
if __name__ == "__main__":
# # Example usage
article_url = 'https://www.thisisanfield.com/2024/07/2-former-liverpool-players-are-making-identical-transfer-to-cesc-fabregas-club/'
# article_data = parse_article(article_url)
article = Article(article_url)
print(f"Heading: {article.title}")
print(f"URL: {article.url}")
print(f"Publish Date: {article.publish_date}")
print(f"Images: {article.images}")
print(f"Videos: {article.videos}")
print(f"Article Text: {article.text}")