kb.py
# -*- coding: utf-8 -*-
import scrapy

from kabum.items import KabumItem
from kabum.login import LoginSpider
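
# A minimal sketch of the KabumItem declaration this spider assumes; the real
# definition lives in kabum/items.py, and the field names below are inferred
# from the assignments in parse():
#
#     class KabumItem(scrapy.Item):
#         name = scrapy.Field()
#         price = scrapy.Field()
#         price_cash = scrapy.Field()
#         url = scrapy.Field()
#         data_id = scrapy.Field()
#         status = scrapy.Field()
#         url_photo = scrapy.Field()
#         stars = scrapy.Field()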


class KbSpider(scrapy.Spider):
    name = "kb"
    # allowed_domains only takes effect as a class attribute; as a local
    # variable inside start_requests it was silently ignored.
    allowed_domains = ["kabum.com.br"]

    def start_requests(self):
        # NOTE: LoginSpider is instantiated but never used, so this crawl
        # currently runs unauthenticated.
        logar = LoginSpider()
        urls = [
            'http://www.kabum.com.br/computadores/computadores/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
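
    # Assuming this file sits in a standard Scrapy project layout, the spider
    # runs with the stock CLI, e.g.:
    #
    #     scrapy crawl kb -o computers.json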
    def parse(self, response):
        computers = response.xpath('//div[@class="listagem-box"]')
        # In the page markup, the product containers start at div index 3.
        start = 3
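        # The extraction below addresses each product through an absolute,
        # manually indexed XPath. A relative alternative (a sketch only,
        # untested against the live markup) would query each `computer`
        # selector directly and drop the counter, e.g.:
        #
        #     name = computer.xpath(
        #         'normalize-space(div[2]/span[1]//text())').extract()[0]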
        for computer in computers:
            # These XPaths begin with //, so they are absolute: they ignore
            # the `computer` context node and select the start-th product
            # straight from the full document.
            base = '//*[@id="BlocoConteudo"]/div[2]/div/div[' + str(start) + ']'
            item = KabumItem()
            item['name'] = computer.xpath(
                'normalize-space(' + base + '/div[2]/span[1]//text())').extract()[0]
            # Prices don't always begin with "De R$ xx,xx por...": when the
            # second price div holds the installment line ("EM 12x ..."), the
            # actual price sits in the first one.
            price_query = computer.xpath(
                'normalize-space(' + base + '/div[3]/div[2]//text())').extract()[0]
            if "EM" in price_query:
                item['price'] = computer.xpath(
                    'normalize-space(' + base + '/div[3]/div[1]//text())').extract()[0]
            else:
                item['price'] = price_query
            # Likewise, the fourth price div sometimes carries a discount
            # percentage instead of the cash price.
            price_cash_query = computer.xpath(
                'normalize-space(' + base + '/div[3]/div[4]//text())').extract()[0]
            if "%" in price_cash_query:
                item['price_cash'] = computer.xpath(
                    'normalize-space(' + base + '/div[3]/div[3]//text())').extract()[0]
            else:
                item['price_cash'] = price_cash_query
            item['url'] = computer.xpath(
                'normalize-space(' + base + '/div[2]/span//@href)').extract()[0]
            item['data_id'] = computer.xpath(
                'normalize-space(' + base + '/div[2]/span//@data-id)').extract()[0]
            # Check whether the product is available for purchase: the buy
            # button image switches to an "_off" variant when it is not.
            purchase_status = computer.xpath(
                'normalize-space(' + base + '/div[4]/div/a/img/@src)').extract()[0]
            if "_off" in purchase_status:
                item['status'] = "Indisponivel"
            else:
                item['status'] = "Disponivel"
            item['url_photo'] = computer.xpath(
                base + '/div[1]/a/img/@src').extract()[0]
            # The rating widget encodes the star count as the last character
            # of its CSS class; a trailing letter means no rating yet.
            stars = computer.xpath(
                base + '/div[2]/div/ul/li[3]/div/@class').extract()[0]
            if stars[-1] == "e":
                item['stars'] = 0
            else:
                item['stars'] = stars[-1]
            start += 1
            yield item
        # Pagination is hardcoded to pages 2 and 3; the commented-out XPath
        # below was an attempt to extract the next-page link dynamically.
        # Scrapy's duplicate filter drops these URLs when later pages
        # re-yield them.
        next_pages = []
        #next_pages = response.xpath('//*[@id="BlocoConteudo"]/div[2]/div/div[2]/form/table/tbody/tr/td[6]/span/a/@href').extract()
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=2&ordem=5&limite=30")
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=3&ordem=5&limite=30")
        for i in next_pages:
            next_url = "http://www.kabum.com.br/computadores/computadores/" + i
            #self.logger.info('NEXT_URL: ' + next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
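
        # A dynamic alternative (a sketch, assuming the commented-out
        # pagination XPath above still matches the markup) would follow the
        # links the page itself exposes instead of hardcoding query strings:
        #
        #     for href in response.xpath(
        #             '//*[@id="BlocoConteudo"]/div[2]/div/div[2]'
        #             '/form/table/tbody/tr/td[6]/span/a/@href').extract():
        #         yield scrapy.Request(response.urljoin(href),
        #                              callback=self.parse)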