Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Event Source] Pedal in Tandem #40

Open
wants to merge 39 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1f509fe
feat: create script to scrap pedalintander events
surreal30 Aug 28, 2024
3da8ca5
feat: events data in json format
surreal30 Aug 28, 2024
653e179
refac: replace soup with new_page_body
surreal30 Aug 28, 2024
abbe234
fix: remove impersonate chrome
surreal30 Aug 29, 2024
789a08e
refac: use select and map for finding events and url
surreal30 Aug 29, 2024
5b136a8
refac: use select select_one where necessary
surreal30 Aug 29, 2024
b960f43
refac: use dict for metrics and options
surreal30 Aug 29, 2024
bebc246
refac: use select for fetching metrics
surreal30 Aug 30, 2024
904e915
refac: remove events from other pages
surreal30 Aug 30, 2024
06343a4
refac: remove _date()
surreal30 Aug 30, 2024
1ad08b3
fix: remove event init as array
surreal30 Aug 30, 2024
fe64820
refac: use css selector for options
surreal30 Aug 30, 2024
5d6c204
refac: convert eventDate to startDate endDate
surreal30 Aug 30, 2024
8e50f4b
feat: pick events happening in blr only
surreal30 Aug 30, 2024
33de4d5
feat: only add single day events
surreal30 Aug 31, 2024
c38239b
feat: create addOn attribute
surreal30 Aug 31, 2024
bf3ebbc
fix
surreal30 Sep 2, 2024
6ad1453
latest json
surreal30 Sep 2, 2024
a82fbc9
refac: timings
surreal30 Sep 6, 2024
5bc2ffd
fix: bugs in dates
surreal30 Sep 6, 2024
a637cd8
refac: use subtring instead of regex for location
surreal30 Sep 6, 2024
137458e
feat: add url attribute
surreal30 Sep 6, 2024
710c10d
feat: add keywords attribute
surreal30 Sep 6, 2024
2ab431c
fix: add currency attr correctly and create get_offers
surreal30 Sep 6, 2024
9e6a65c
refac: use map instead of for in make event
surreal30 Sep 6, 2024
c514cf2
feat: add @type attr
surreal30 Sep 7, 2024
1cff296
refac: conv desc to str and put bangalore[] out of func
surreal30 Sep 7, 2024
af16bb5
fix: timings issue
surreal30 Sep 7, 2024
6664e2e
fix: duration 0
surreal30 Sep 7, 2024
0666f74
refac: return map directly and add comment
surreal30 Sep 7, 2024
8c70e7b
refac: create find_location()
surreal30 Sep 7, 2024
35b7204
updated json
surreal30 Sep 7, 2024
2f1d737
feat: refac duration_in_hours()
surreal30 Sep 8, 2024
7683e6d
fix: add @context
surreal30 Sep 10, 2024
1ed048d
fix: type offers to offer
surreal30 Sep 10, 2024
a532a8f
refac: dates and offers
surreal30 Sep 10, 2024
a5f9a03
refac: location logic
surreal30 Sep 11, 2024
ee67189
json file update
surreal30 Sep 11, 2024
aea9311
fix: remove commented code
surreal30 Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 180 additions & 0 deletions out/pedalintandem.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

212 changes: 212 additions & 0 deletions src/pedalintandem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
import json
from datetime import datetime, timedelta
from curl_cffi import requests
from common.tz import IST
from bs4 import BeautifulSoup
import re

# Root URL of the Pedal in Tandem site; the "/experiences" listing path and
# the event hrefs scraped from that listing are appended to it.
BASE_URL = "https://www.pedalintandem.com"

def fetch_events_links(session):
    """Collect the link of every experience shown on the listing page.

    Downloads ``{BASE_URL}/experiences`` through ``session`` and reads the
    ``href`` of the first anchor inside each ``div.single-experience`` card.

    Args:
        session: Object exposing ``get(url)`` whose response has a ``text``
            attribute (``main`` passes a ``curl_cffi`` session).

    Returns:
        A lazy ``map`` iterator of ``href`` values. It can be consumed only
        once, and a card without an anchor raises when its item is reached.
    """
    listing_page = session.get(f"{BASE_URL}/experiences")
    listing = BeautifulSoup(listing_page.text, 'html.parser')
    cards = listing.select('div.single-experience')

    def first_link(card):
        # Each card's first <a> carries the path of the event page.
        return card.find('a')['href']

    return map(first_link, cards)

def fetch_events(event_links, session):
    """Download every event page and keep the ones that pass the filters.

    An event page is dropped when:
      * its date dropdown carries the ``disabled`` attribute,
      * its location text matches none of the area keywords below, or
      * its duration text contains a ``/``.

    Args:
        event_links: Iterable of paths that are appended to ``BASE_URL``.
        session: Object exposing ``get(url)`` whose response has ``text``.

    Returns:
        list of parsed ``BeautifulSoup`` documents, one per kept event.

    Raises:
        AttributeError: if a page lacks the location, date dropdown or
            duration element (``select_one`` returns ``None``).
    """
    # Place names (plus 'pedal' / 'pitstop') searched case-insensitively
    # anywhere inside the location text.
    area_keywords = (
        'bangalore',
        'bengaluru',
        'arkavathi',
        'avalahalli',
        'avathi',
        'devarayanadurga',
        'gunjur',
        'hennur',
        'Hesaraghatta',
        'kanakapura',
        'malleshwaram',
        'indiranagar',
        'manchanabele',
        'pedal',
        'pitstop',
        'rajankunte',
    )

    kept_pages = []
    for link in event_links:
        response = session.get(f"{BASE_URL}{link}")
        page = BeautifulSoup(response.text, 'html.parser')
        location = page.select_one('div.location').get_text().strip()
        date_selector = page.select_one('div.product-variations-varieties select')

        # Only events that can still be booked are wanted; a disabled date
        # dropdown is treated as "no upcoming dates".
        if "disabled" in date_selector.attrs:
            continue

        in_area = any(
            re.search(keyword, location, re.IGNORECASE)
            for keyword in area_keywords
        )
        if not in_area:
            continue

        duration = page.select_one('div.duration').get_text().strip()
        # Durations containing '/' are skipped (presumably multi-day
        # formats -- TODO confirm against the live site).
        if re.search('/', duration):
            continue

        kept_pages.append(page)

    return kept_pages

def make_event(event):
    """Build the JSON-serialisable record for one parsed event page.

    Start times are not parsed: every ``startdate`` is midnight of the listed
    day and ``endDate`` is that instant plus the event duration.

    Args:
        event: ``BeautifulSoup`` document of a single event page, as
            collected by ``fetch_events``.

    Returns:
        dict with the keys ``name``, ``location``, ``priceCurrency``,
        ``offers`` (option label -> price string, plus an ``addOn`` dict for
        rent/transport options), ``dates`` (list of dicts with ``startdate``,
        ``endDate`` and ``availabilityStarts`` ISO strings), ``duration``
        (hours) and ``description`` (``[text, metrics]``).

    Raises:
        AttributeError: if an expected element is missing from the page
            (``select_one`` / ``find`` returned ``None``).
        KeyError: if an ``<option>`` lacks an expected ``data-*`` attribute.
        ValueError: if a date string does not match the expected format.
    """
    heading = event.select_one('div.heading').get_text().strip()
    location = event.select_one('div.location').get_text().strip()

    cart = event.select_one('div.cart-details')

    # Split the priced variations into main offers and optional add-ons
    # (bike rent, transport).
    offers = {}
    add_ons = {}
    for option in cart.select('div.product-variations select[name="variation_id"] option'):
        label = option.get_text()
        # Drop the rupee sign; the currency is reported via "priceCurrency".
        price = option['data-price-after-discount'].replace("\u20b9", "")
        if "rent" in label or "transport" in label:
            add_ons[label] = price
        else:
            offers[label] = price
    offers['addOn'] = add_ons

    duration = event.select_one('div.duration').get_text().strip()
    duration_in_hours = convert_duration_in_hours(duration)

    # NOTE(review): fetch_events looks for 'div.product-variations-varieties'
    # while this selector uses 'div.product-variations-variety' -- confirm
    # which class the site actually renders.
    # NOTE(review): strptime returns naive datetimes (even with %Z), and
    # astimezone() on a naive value assumes the machine's local timezone, so
    # these results depend on where the scraper runs -- confirm the intent.
    dates = []
    for date_option in cart.select('div.product-variations-variety select[name="variety_id"] option'):
        booking_begin = datetime.strptime(
            date_option['data-booking-begin-at'], "%Y-%m-%d %H:%M:%S %Z"
        ).astimezone(IST)
        start = datetime.strptime(
            date_option.get_text().strip(), "%d-%b-%Y"
        ).astimezone(IST)
        end = start + timedelta(hours=duration_in_hours)

        dates.append({
            "startdate": start.isoformat(),
            "endDate": end.isoformat(),
            "availabilityStarts": booking_begin.isoformat(),
        })

    # Key/value facts shown in the active metric tiles of the page.
    metrics = {
        tile.find('p').get_text().strip(): tile.find('h3').get_text().strip()
        for tile in event.select('div.single-metric.active div.content')
    }

    description = event.select_one('div.trix-content div').get_text()

    return {
        "name": heading,
        "location": location,
        "priceCurrency": "INR",
        "offers": offers,
        "dates": dates,
        "duration": duration_in_hours,
        "description": [
            process_description(description),
            metrics,
        ],
    }

def convert_to_24_hour(start_time):
    """Convert a 12-hour clock string into decimal hours on a 24-hour clock.

    The first ``H`` or ``H:MM`` number in ``start_time`` is used. An ``am``
    or ``pm`` marker is looked for anywhere in the string, case-insensitively,
    and ``am`` is checked first. Without a marker the hour is taken as-is.

    Args:
        start_time: Free-form text such as ``"6 am"`` or ``"6:30 PM"``.

    Returns:
        Hours since midnight as a float (``"6:30 pm"`` -> ``18.5``), or
        ``None`` when ``start_time`` contains no digits.
    """
    # Extract the numeric part of the time
    time_match = re.search(r'[0-9]+(:[0-9]+)?', start_time)
    if time_match is None:
        return None

    # Split hours and minutes; the pattern guarantees digits after any ':'
    hours_text, _, minutes_text = time_match.group(0).partition(':')
    hours = int(hours_text)
    minutes = int(minutes_text) if minutes_text else 0

    # Check for 'am' or 'pm' and adjust hours accordingly
    lowered = start_time.lower()
    if 'am' in lowered:
        if hours == 12:  # 12 am is midnight
            hours = 0
    elif 'pm' in lowered:
        if hours != 12:  # 12 pm is noon and stays 12
            hours += 12

    # Convert time to decimal format
    return hours + minutes / 60.0

def convert_duration_in_hours(duration):
    """Return the upper limit of an event's duration in whole hours.

    Only the text before the first comma is examined; anything after it is
    ignored. Every number written directly before an hour unit (``hour``,
    ``hours``, ``hr``, ``hrs``, any letter case) is collected and the
    largest one is used, so ``"3 to 4 hours"`` and ``"3-4 hrs"`` both give 4.
    Fractional values are rounded up so the computed end time is never
    earlier than the advertised one.

    Args:
        duration: Free-form duration text, e.g. ``"3 to 4 hours, 6 am to 9 am"``.

    Returns:
        int number of hours, or ``0`` when no hour amount is found.
    """
    duration_range = duration.split(',')[0]

    amounts = re.findall(
        r'(\d+(?:\.\d+)?)\s*(?:hours?|hrs?)\b',
        duration_range,
        re.IGNORECASE,
    )
    if not amounts:
        return 0

    # fetch the upper limit of time duration
    upper_limit = max(float(amount) for amount in amounts)
    whole_hours = int(upper_limit)
    if upper_limit > whole_hours:
        # Round a fractional duration up to the next full hour.
        whole_hours += 1
    return whole_hours

def process_description(description):
    """Normalise separators in an event description.

    Every run of two or more hyphens becomes a single newline, and every
    non-breaking space (U+00A0) becomes a newline as well.

    Args:
        description: Raw description text taken from the event page.

    Returns:
        The description with those separators replaced by newlines.
    """
    # Hyphen runs first, then non-breaking spaces.
    return re.sub(r'-{2,}', '\n', description).replace("\u00a0", "\n")

def main():
    """Scrape the site and write all kept events to out/pedalintandem.json.

    Uses one HTTP session for the listing page and every event page, turns
    each kept page into an event record and dumps the list as indented JSON.
    """
    session = requests.Session()
    pages = fetch_events(fetch_events_links(session), session)
    events = [make_event(page) for page in pages]

    with open("out/pedalintandem.json", "w") as f:
        json.dump(events, f, indent=2)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()