-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDailyScraper_BS_update.py
204 lines (159 loc) · 5.64 KB
/
DailyScraper_BS_update.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# encoding=utf8
# # Python Code for Creating Color Archive
#
# The purpose of this file is to scrape Pantone's Color of the Day website and create an archive of all the colors they've picked.
# https://www.pantone.com/colorstrology
#
# When run, this code will output four files:
# 1. Colors.csv (a new row is appended on every run)
# 2. newcolor.html
# 3. cotd.html (a local copy of the published archive page)
# 4. index.html
#### DEFINE FUNCTION
# Pages read by the scraper.
_COLORSTROLOGY_URL = (
    "https://www.pantone.com/color-intelligence/color-education/colorstrology"
)
_ARCHIVE_URL = "https://polk54.github.io/Pantone_ColorOfTheDay/"

# Without a timeout `requests` may block forever on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 30

# Markup of one archive entry. The line breaks around every placeholder are
# part of the generated file and are kept on purpose.
_COLOR_ITEM_TEMPLATE = (
    "\n"
    '<div class="color-item">\n'
    '<svg height="200" width="200">\n'
    "<rect fill=\n"
    "{hex}"
    "\n"
    'height="200" width="200" x="0" y="0">\n'
    "</rect>\n"
    "</svg>\n"
    '<div class= "date">\n'
    "<p>\n"
    "{date}"
    "\n"
    "</p>\n"
    "</div>\n"
    '<div class="text">\n'
    "<p>\n"
    "{name}"
    "\n<br/>\n"
    "{pantone_id}"
    "\n<br/>\n"
    "{hex}"
    "\n"
    "</p>\n"
    "</div>\n"
    "</div>\n"
)


def _fetch_soup(url):
    """Download *url* and return its body parsed with ``html.parser``."""
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail here instead of parsing an error page as if it were the real one.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def _require(node, description):
    """Return *node*, or raise ``ValueError`` naming what was not found."""
    if node is None:
        raise ValueError(
            "Could not find {} in the downloaded page".format(description)
        )
    return node


def _scrape_todays_color():
    """Return ``(name, pantone_id, hex_code, words)`` from the Pantone page.

    The page is expected to contain markup of this shape::

        <div class="swatch inner" title="Pantone 15-1607 Pale Mauve">
        <div class="coloredSquare" style="background-color: #C5A4A4;">
        <span class="keyLogon"> HARDWORKING </span>
        <span class="keyLogon"> CARING </span>
        <span class="keyLogon"> PREPARED </span>
    """
    soup = _fetch_soup(_COLORSTROLOGY_URL)
    outer = _require(soup.find(id="outerWrap"), 'the element with id "outerWrap"')
    body = _require(
        outer.find(id="ctl00_divBody"), 'the element with id "ctl00_divBody"'
    )
    swatch = _require(
        outer.find(class_="swatch inner"), 'the "swatch inner" element'
    )
    square = _require(
        outer.find(class_="coloredSquare"), 'the "coloredSquare" element'
    )

    # Fixed offsets: the title is assumed to look like
    # "Pantone 15-1607 Pale Mauve" and the style like
    # "background-color: #C5A4A4;" (see the sample markup above).
    title = swatch["title"]
    pantone_id = title[8:15]
    name = title[16:]
    hex_code = square["style"][18:25]

    # Strip the padding spaces around each keyword and lowercase it.
    words = [
        span.get_text().strip(" ").lower()
        for span in body.find_all("span", {"class": "keyLogon"})
    ]
    if len(words) < 3:
        raise ValueError(
            'Expected 3 "keyLogon" words on the page, found {}'.format(len(words))
        )
    return name, pantone_id, hex_code, words[:3]


def _render_color_item(date_text, name, pantone_id, hex_code):
    """Return the ``color-item`` HTML fragment for one colour.

    The values come from a scraped page, so they are HTML-escaped before
    being placed into the markup.
    """
    from html import escape

    return _COLOR_ITEM_TEMPLATE.format(
        hex=escape(hex_code),
        date=escape(date_text, quote=False),
        name=escape(name, quote=False),
        pantone_id=escape(pantone_id, quote=False),
    )


def _rebuild_index():
    """Insert ``newcolor.html`` before the first archive entry.

    Downloads the published archive, saves a copy as ``cotd.html``, and
    writes the updated page to ``index.html``.
    """
    from bs4 import BeautifulSoup

    archive = _fetch_soup(_ARCHIVE_URL)
    with open("cotd.html", "w", encoding="utf8") as cache:
        cache.write(str(archive))

    # Re-read both files through context managers so no handle stays open.
    with open("cotd.html", encoding="utf8") as cache:
        page = BeautifulSoup(cache, "html.parser")
    with open("newcolor.html", encoding="utf8") as fragment:
        new_item = BeautifulSoup(fragment, "html.parser")

    first_item = _require(
        page.find(class_="color-item"), 'an existing "color-item" element'
    )
    first_item.insert_before(new_item)

    with open("index.html", "w", encoding="utf8") as index:
        index.write(str(page))


def colorScrape():
    """Scrape Pantone's color of the day and add it to the local archive.

    Steps, in order:

    1. Read today's color (name, Pantone ID, hex code, three keywords)
       from the Colorstrology page.
    2. Append a row ``[date, name, id, hex, word1, word2, word3]`` to
       ``Colors.csv``; the date is formatted ``MM/DD/YYYY``.
    3. Write the color's HTML fragment to ``newcolor.html``.
    4. Download the published archive page, save it as ``cotd.html``,
       insert the fragment before the first ``color-item`` element, and
       write the result to ``index.html``.

    All files are written to the current working directory.

    Raises:
        requests.HTTPError: if either page answers with an error status.
        ValueError: if an expected element is missing from a page.
        KeyError: if the swatch has no ``title`` or the colored square has
            no ``style`` attribute.
    """
    import csv
    import datetime

    name, pantone_id, hex_code, words = _scrape_todays_color()
    today = datetime.date.today().strftime("%m/%d/%Y")

    # newline="" is required by the csv module; without it every row is
    # followed by a blank line on Windows.
    with open("Colors.csv", "a", newline="") as destfile:
        csv.writer(destfile).writerow([today, name, pantone_id, hex_code] + words)

    with open("newcolor.html", "w", encoding="utf8") as fragment:
        fragment.write(_render_color_item(today, name, pantone_id, hex_code))

    _rebuild_index()
#### RUN FUNCTION
#colorScrape()