Skip to content

Commit

Permalink
Fixed scraping issues and documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
ishan-surana committed Jun 28, 2024
1 parent cdfc828 commit 71a2039
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 61 deletions.
110 changes: 58 additions & 52 deletions MetaDataScraper/FacebookScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,56 +11,57 @@
class FacebookScraper:
"""
A class to scrape information from a public Facebook page. It does not require any authentication or API keys.
Attributes
----------
page_id : str
+ `page_id` : str
The Facebook page ID to scrape information from.
driver : webdriver.Chrome
+ `driver` : webdriver.Chrome
The Selenium WebDriver instance.
followers : str
+ `followers` : str
The followers count of the Facebook page.
post_texts : list
+ `post_texts` : list
The list of texts from the posts.
post_likes : list
+ `post_likes` : list
The list of likes count for the posts.
post_shares : list
+ `post_shares` : list
The list of shares count for the posts.
is_video : list
+ `is_video` : list
The list indicating whether the post contains a video.
video_links : list
+ `video_links` : list
The list of video links if the post contains a video.
Methods
-------
scrape(self) -> dict:
`scrape`(self) -> dict:
Initiates the scraping process and returns a dictionary with the scraped data.
Returns
-------
dict
A dictionary containing the following
- 'followers': str
The followers count of the Facebook page.
- 'post_texts': list
A list of texts from the posts.
- 'post_likes': list
A list of likes count for the posts.
- 'post_shares': list
A list of shares count for the posts.
- 'is_video': list
A list indicating whether the post contains a video.
- 'video_links': list
A list of video links if the post contains a video.
[dict]
A dictionary containing the following:
+ `followers` (str):
The followers count of the Facebook page.
+ `post_texts` (list):
A list of texts from the posts.
+ `post_likes` (list):
A list of likes count for the posts.
+ `post_shares` (list):
A list of shares count for the posts.
+ `is_video` (list):
A list indicating whether the post contains a video.
+ `video_links` (list):
A list of video links if the post contains a video.
Example
-------
To scrape a Facebook page:
from MetaDataScraper import FacebookScraper
```python
from MetaDataScraper import FacebookScraper
scraper = FacebookScraper("page_id")
data = scraper.scrape()
print(f"Followers: {data['followers']}")
print(f"Post Texts: {data['post_texts']}")
print(f"Post Likes: {data['post_likes']}")
Expand Down Expand Up @@ -188,55 +189,60 @@ def __extract_post_details(self):
c = 1
error_count = 0
while True:
xpath = self.xpath_first + str(c) + self.xpath_identifier_addum + self.xpath_last
xpath = self.xpath_first+str(c)+self.xpath_identifier_addum+self.xpath_last
if not self.driver.find_elements(By.XPATH, xpath):
error_count += 1
if error_count < 3:
print('Error extracting post', c, '\b. Retrying extraction...', end='\r')
time.sleep(5)
self.driver.execute_script("window.scrollBy(0, +20);")
continue
break
error_count = 0
# Scroll until the post is visible
self.driver.execute_script("arguments[0].scrollIntoView();", self.driver.find_elements(By.XPATH, xpath)[0])
if not self.driver.find_elements(By.XPATH, xpath):
error_count += 1
if error_count < 3:
print('Error extracting post', c, '\b. Retrying extraction...', end='\r')
time.sleep(5)
self.driver.execute_script("window.scrollBy(0, +20);")
continue
break
error_count = 0
print(" "*100, end='\r')
print('Extracting post', c, end='\r')
post_components = self.driver.find_element(By.XPATH, xpath).find_elements(By.XPATH, './*')
if len(post_components) > 2:
post_text = '\n'.join(post_components[2].text.split('\n'))
if post_components[3].text.split('\n')[0] == 'All reactions:':
post_likes = post_components[3].text.split('\n')[1]
if len(post_components[3].text.split('\n')) > 4:
post_shares = post_components[3].text.split('\n')[4].split(' ')[0]
elif len(post_components) > 4 and post_components[4].text.split('\n')[0] == 'All reactions:':
post_likes = post_components[4].text.split('\n')[1]
post_shares = post_components[4].text.split('\n')[4].split(' ')[0]
if post_components[3].text.split('\n')[0]=='All reactions:':
post_like = post_components[3].text.split('\n')[1]
if len(post_components[3].text.split('\n'))>4:
post_share = post_components[3].text.split('\n')[3].split(' ')[0]
elif len(post_components)>4 and post_components[4].text.split('\n')[0]=='All reactions:':
post_like = post_components[4].text.split('\n')[1]
post_share = post_components[4].text.split('\n')[4].split(' ')[0]
else:
post_likes = 0
post_shares = 0
post_like = 0
post_share = 0
self.post_texts.append(post_text)
self.post_likes.append(post_likes)
self.post_shares.append(post_shares)
self.post_likes.append(post_like)
self.post_shares.append(post_share)
else:
try:
post_shares = post_components[1].find_element(By.XPATH, './/*[@aria-label="Share"]').text
c += 1
post_share = post_components[1].find_element(By.XPATH, './/*[@aria-label="Share"]').text
except:
c += 2
c+=1
continue
post_likes = post_components[1].find_element(By.XPATH, './/*[@aria-label="Like"]').text
post_shares = post_components[1].find_element(By.XPATH, './/*[@aria-label="Share"]').text
post_like = post_components[1].find_element(By.XPATH, './/*[@aria-label="Like"]').text
post_share = post_components[1].find_element(By.XPATH, './/*[@aria-label="Share"]').text
time.sleep(1)
self.post_texts.append('')
self.post_likes.append(post_likes)
self.post_shares.append(post_shares)
self.post_likes.append(post_like)
self.post_shares.append(post_share)
if len(self.driver.find_elements(By.XPATH, xpath)[0].find_elements(By.TAG_NAME, 'video')) > 0:
if 'reels' in self.driver.find_elements(By.XPATH, xpath)[0].find_elements(By.TAG_NAME, 'a')[0].get_attribute('href'):
self.video_links.append('https://www.facebook.com' + self.driver.find_elements(By.XPATH, xpath)[0].find_elements(By.TAG_NAME, 'a')[0].get_attribute('href'))
self.video_links.append('https://www.facebook.com'+self.driver.find_elements(By.XPATH, xpath)[0].find_elements(By.TAG_NAME, 'a')[0].get_attribute('href'))
else:
self.video_links.append(self.driver.find_elements(By.XPATH, xpath)[0].find_elements(By.TAG_NAME, 'a')[4].get_attribute('href'))
self.is_video.append(True)
Expand All @@ -259,17 +265,17 @@ def scrape(self) -> dict:
-------
dict
A dictionary containing the following keys:
- 'followers': str
+ 'followers': str
The followers count of the Facebook page.
- 'post_texts': list
+ 'post_texts': list
A list of texts from the posts.
- 'post_likes': list
+ 'post_likes': list
A list of likes count for the posts.
- 'post_shares': list
+ 'post_shares': list
A list of shares count for the posts.
- 'is_video': list
+ 'is_video': list
A list indicating whether the post contains a video.
- 'video_links': list
+ 'video_links': list
A list of video links if the post contains a video.
Example
Expand Down
14 changes: 6 additions & 8 deletions MetaDataScraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
Classes:
--------
FacebookScraper
+ FacebookScraper
A class to scrape the follower count and post details from a public Facebook page. It does not require any authentication or API keys.
Methods:
------------------------
scrape(self) -> dict:
+ scrape(self) -> dict:
Initiates the scraping process and returns a dictionary with the scraped data.
Requirements:
Expand All @@ -26,14 +26,12 @@
Usage:
------
from MetaDataScraper import FacebookScraper
page_id = "your_facebook_page_id"
scraper = FacebookScraper(page_id)
```python
from MetaDataScraper import FacebookScraper
scraper = FacebookScraper("page_id")
data = scraper.scrape()
print(f"Followers: {data['followers']}")
print(f"Post Texts: {data['post_texts']}")
print(f"Post Likes: {data['post_likes']}")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "MetaDataScraper"
version = "0.0.1"
version = "0.0.3"
authors = [
{ name="Ishan Surana", email="ishansurana1234@gmail.com" },
]
Expand Down

0 comments on commit 71a2039

Please sign in to comment.