# otherwebscraping.py
from django.shortcuts import render
from django.http import HttpResponse
from .models import product

import csv
import datetime
import smtplib
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Browser-like request headers so the sites are less likely to block the scraper
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Sec-Ch-Ua": "\"Not A(Brand\";v=\"99\", \"Microsoft Edge\";v=\"121\", \"Chromium\";v=\"121\"",
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": "\"Windows\"",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
    "X-Amzn-Trace-Id": "Root=1-65c58daf-3a2c6c3a7adb35726b652acb"
}
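# A minimal sketch, not part of the original flow: reusing one requests.Session
# with automatic retries is usually more reliable for repeated scraping than
# bare requests.get calls. The retry counts and status codes below are
# illustrative assumptions, not project requirements.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    session = requests.Session()
    session.headers.update(headers)
    retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.mount('http://', HTTPAdapter(max_retries=retries))
    return session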
'''
AMAZON
'''
def get_product_links_amazon(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    a_tags = soup.find_all('a', class_='a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal')
    product_link_list = []
    for tag in a_tags:
        href = tag.get('href', '')
        # Check whether href is a complete URL or a relative path
        if href.startswith('http'):
            product_link_list.append(href)
        else:
            # Relative path, so prepend the base URL
            product_link_list.append('https://www.amazon.com' + href)
    return product_link_list
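# Hedged example: the unused `time` import above suggests rate limiting was
# intended. This sketch fetches each link with a fixed pause between requests;
# the helper name and the 2-second delay are assumptions, not original code.
def crawl_links_politely(links, fetch_one, delay_seconds=2):
    results = []
    for link in links:
        results.append(fetch_one(link))
        time.sleep(delay_seconds)  # pause so we do not hammer the site
    return results

# Usage sketch:
#     crawl_links_politely(get_product_links_amazon(url), get_product_data_amazon)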
def get_product_data_amazon(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # Each lookup falls back to 'N/A' when the element is missing
    title = soup.find(id='productTitle').get_text().strip() if soup.find(id='productTitle') else 'N/A'
    price = soup.find('span', class_='a-offscreen').get_text().strip() if soup.find('span', class_='a-offscreen') else 'N/A'
    rating = soup.find('span', id='acrCustomerReviewText').get_text().strip() if soup.find('span', id='acrCustomerReviewText') else 'N/A'
    score = soup.find('span', class_='a-icon-alt').get_text().strip() if soup.find('span', class_='a-icon-alt') else 'N/A'
    review_summary = soup.find(id='product-summary').get_text().strip() if soup.find(id='product-summary') else 'N/A'
    return [url, title, price, rating, score, review_summary]
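# Hedged example: `csv` is imported above but never used, so presumably the
# scraped rows were meant to be exported. A minimal sketch, assuming the
# six-column row shape returned by get_product_data_amazon; the file name and
# header row are assumptions.
def save_products_to_csv(rows, path='products.csv'):
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['url', 'title', 'price', 'ratings', 'score', 'review_summary'])
        writer.writerows(rows)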
"""
WALMART
"""
def get_product_links_walmart(url):
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
a_tags = soup.find_all('a', class_='absolute w-100 h-100 z-1 hide-sibling-opacity')
links = [a['href'] for a in a_tags if 'href' in a.attrs and not a['href'].startswith('https')]
base_url = 'https://www.walmart.com'
full_links = [base_url + link for link in links]
return full_links
def get_product_details_walmart(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # Guard every lookup so a missing element yields 'N/A' instead of an AttributeError
    title = soup.find(id="main-title").text if soup.find(id="main-title") else 'N/A'
    price = soup.find('span', itemprop='price').text if soup.find('span', itemprop='price') else 'N/A'
    rating = soup.find('a', itemprop='ratingCount').text if soup.find('a', itemprop='ratingCount') else 'N/A'
    score = soup.find('span', class_='f7 rating-number').text if soup.find('span', class_='f7 rating-number') else 'N/A'
    image_tag = soup.find('img', class_=['noselect', 'db'])
    image = image_tag['src'] if image_tag else 'N/A'
    return [title, price, rating, score, image]
"""
BEST BUY
"""
def get_product_links_bestbuy(url):
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, "html.parser")
h4_elements = soup.find_all('h4', class_='sku-title')
links = []
for h4 in h4_elements:
a_tag = h4.find('a', href=True)
if a_tag:
link = 'https://www.bestbuy.com' + a_tag['href']
links.append(link)
return links
def get_product_details_bestbuy(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    # Guard every lookup so a missing element yields 'N/A' instead of an AttributeError
    title_tag = soup.find('h1', class_='heading-5 v-fw-regular')
    title = title_tag.text if title_tag else 'N/A'
    price_div = soup.find('div', {'data-testid': 'customer-price'})
    price = price_div.find('span', {'aria-hidden': 'true'}).text if price_div else 'N/A'
    rating_tag = soup.find('span', class_='c-reviews order-2')
    rating = rating_tag.text if rating_tag else 'N/A'
    score_tag = soup.find('span', class_="ugc-c-review-average font-weight-medium order-1")
    score = score_tag.text if score_tag else 'N/A'
    image_tag = soup.find('img', class_='primary-image')
    image = image_tag['src'] if image_tag else 'N/A'
    return [title, price, rating, score, image]
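# Hedged example: the smtplib and datetime imports above suggest email price
# alerts were planned but never written. A minimal sketch; the SMTP host,
# credentials, and addresses are placeholders, not values from this project.
def send_price_alert(product_title, price, recipient='you@example.com'):
    checked_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
    body = f"Subject: Price alert\n\n{product_title} is now {price} (checked {checked_at})."
    with smtplib.SMTP('smtp.example.com', 587) as server:
        server.starttls()  # upgrade the connection to TLS before authenticating
        server.login('alerts@example.com', 'app-password')  # placeholder credentials
        server.sendmail('alerts@example.com', recipient, body)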
"""
TARGET (using selenium)
"""
def get_product_links_target(url):
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)
# wait for page to load before finding links to products
WebDriverWait(driver, 10).until(
EC.presence_of_all_elements_located((By.XPATH, '//a[@data-test="product-title"]'))
)
product_elements = driver.find_elements(By.XPATH, '//a[@data-test="product-title"]')
product_link_list = [element.get_attribute('href') for element in product_elements]
driver.quit()
return product_link_list
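# Hedged example: both Target functions launch a visible Chrome window. A
# common refinement is to run headless; this factory is a sketch, not part of
# the original code, and the window size is an arbitrary assumption.
def make_headless_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')  # no visible browser window
    options.add_argument('--window-size=1920,1080')
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)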
def get_product_details_target(url):
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    products_info = []
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test="product-title"]'))
    )
    title = driver.find_element(By.CSS_SELECTOR, '[data-test="product-title"]').text
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test="product-price"]'))
    )
    price = driver.find_element(By.CSS_SELECTOR, '[data-test="product-price"]').text
    # Parse the ratings text only when it is present; the string looks like
    # "4.5 out of 5 stars with 123 ratings", so splitting on spaces puts the
    # score at index 0, the maximum at index 3, and the count near the end.
    try:
        rating_text = driver.find_element(By.CSS_SELECTOR, '[data-test="ratings"]').text
        parts = rating_text.split(' ')
        score = f"{parts[0]}/{parts[3]}"
        rating = parts[-2] + " ratings"
    except (NoSuchElementException, IndexError):
        score = "No rating"
        rating = "No rating"
    # NOTE: this selector is hard-coded to one specific product's alt text and
    # will only match on that product's page; guard it so other pages fall back to 'N/A'
    try:
        image_element = driver.find_element(By.CSS_SELECTOR, 'img[alt^="Logitech M240 Wireless Mouse"]')
        image = image_element.get_attribute('src')
    except NoSuchElementException:
        image = 'N/A'
    products_info.append((title, price, rating, image))
    driver.quit()
    return products_info
def index(request):
    return HttpResponse("<h1>App is running</h1>")
def add_product_data_to_db(data):
    # Maps the scraped list onto named fields; `product` is expected to expose
    # insert_one/find (a MongoDB-style collection rather than the Django ORM).
    # Adapt the field names to match your actual model definition.
    records = {
        "url": str(data[0]),
        "title": str(data[1]),
        "price": str(data[2]),
        "ratings": str(data[3]),
        "score": str(data[4]),
        "review_summary": str(data[5])
    }
    product.insert_one(records)
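# For reference, a sketch of what `.models` presumably defines, since `product`
# is used with insert_one/find (PyMongo-style). The connection string and the
# database/collection names below are assumptions, not taken from this project:
#
#     from pymongo import MongoClient
#     client = MongoClient('mongodb://localhost:27017')
#     product = client['scraper_db']['products']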
def add_product(request):
    link = 'https://www.amazon.com/s?k=razer+mouse'
    product_links = get_product_links_amazon(link)
    # Use ThreadPoolExecutor to fetch product data concurrently
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(get_product_data_amazon, product_links))
    # Optionally persist each result to the database:
    # for data in results:
    #     add_product_data_to_db(data)
    # Render the template with the scraped results instead of a bare HttpResponse
    return render(request, 'add_products.html', {'products': results})
def get_all_product(request):
    # Returns the raw cursor; note this is not an HttpResponse, so this
    # function cannot be wired directly to a URL route as-is.
    products = product.find()
    return products
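# Hedged usage sketch: wrapping the cursor in a rendered response so the data
# can actually be served by Django. The template name is an assumption.
def list_products(request):
    products = list(product.find())
    return render(request, 'list_products.html', {'products': products})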