-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcafe1919.py
71 lines (60 loc) · 2.31 KB
/
cafe1919.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
import urllib2
def scrape1919(url):
    """Download the menu page at ``url`` and return its parsed menu entries.

    The page is fetched with ``urllib2``, split into menu sections with a
    regular expression, and every title / sub-title / item snippet found in
    the first six sections is handed to ``parseMenu``.

    Args:
        url: Address of the menu page to download.

    Returns:
        Whatever ``parseMenu`` returns for the collected snippets (a flat
        list of strings, in page order).

    Raises:
        urllib2.URLError: if the page cannot be fetched.
    """
    response = urllib2.urlopen(url)
    try:
        page_source = response.read()
    finally:
        # Python 2 responses are not context managers; close explicitly so
        # the socket is released even when read() fails.
        response.close()

    # A menu section is either the "bibite" block (up to the combo div) or
    # any other titled block (up to its closing </tbody>), skipping the
    # "Hours" and "What's New" blocks.
    menuPageP = re.compile("(<img src=\"images/cafe1919/title_bibite.png.*?<div id=\"combo\">)|(<img src=\"images/cafe1919/title((?!Hours)(?!What's New).)*?</tbody>)", re.S)
    # Within a section: item links, section title images, beverage sub-headers.
    rawMenuP = re.compile("onmouseover=.*?>.*?</a>|alt=\".*?/><br />|\"beverageheader\">.*?</span>", re.S)

    # The scraper only looks at the first six sections ("there are six
    # pages"). Slicing instead of indexing 0..5 keeps that limit but no
    # longer raises IndexError when the page yields fewer sections.
    menuPageList = menuPageP.findall(page_source)[:6]

    menuAsList = []  # raw snippets from every section, in page order
    for pageGroups in menuPageList:
        # findall() yields one tuple of capture groups per match; joining
        # them gives a single string to search for snippets.
        menuAsList.extend(rawMenuP.findall("".join(pageGroups)))

    return parseMenu(menuAsList)
#parse the meals into readable data
def parseMenu(menuData):
    """Convert raw menu HTML snippets into a flat list of display strings.

    Every snippet in ``menuData`` is searched for three kinds of text:

    * main titles   -- the text of an ``alt="..." />`` attribute
    * sub titles    -- the text inside ``beverageheader">...</span>``
    * menu items    -- the link text that ends with ``</a>``

    Main and sub titles are emitted as ``{"title":"<text>"}`` strings.
    Menu items are emitted as their text with HTML tags and spaces removed.

    Byte-string results are then repaired and decoded: byte 0xE9 and byte
    0xAE are rewritten to their two-byte UTF-8 forms and the entry is decoded
    as UTF-8, so the returned list holds unicode text. Entries that are
    already unicode text are returned unchanged.

    Args:
        menuData: Iterable of HTML snippet strings.

    Returns:
        A new list of strings, in the order the matches were found.

    Raises:
        UnicodeDecodeError: if a byte-string entry is not valid UTF-8 after
            the two byte substitutions.
    """
    # Compiled once; the pattern does not depend on the snippet.
    rawItemP = re.compile("t=\"(.*?)\" />|beverageheader\">(.*?)</span>|\">(.*?)</a>", re.S)

    finalList = []
    for item in menuData:
        # Each match is a (main title, sub title, item) tuple in which only
        # the alternative that matched is non-empty.
        for mainTitle, subTitle, itemHtml in rawItemP.findall(item):
            title = mainTitle or subTitle
            if title:
                finalList.append("{\"title\":\"" + title + "\"}")
            else:
                itemText = re.sub("<.*?>", "", itemHtml)
                # NOTE(review): this removes every space from the item text.
                # Kept as-is; confirm whether only "&nbsp;" was meant.
                itemText = re.sub(" ", "", itemText)
                finalList.append(itemText)

    # The previous cleanup loop rebound its loop variable and threw the
    # result away, so the returned list was never cleaned. Build the
    # cleaned list explicitly instead.
    cleanedList = []
    for entry in finalList:
        if isinstance(entry, bytes):  # Python 2 ``str``
            # presumably the page is Latin-1 encoded -- TODO confirm
            entry = entry.replace(b'\xe9', b'\xc3\xa9')
            entry = entry.replace(b'\xae', b'\xc2\xae')
            entry = entry.decode("utf-8")
        cleanedList.append(entry)
    return cleanedList
def nineteen(url):
    """Return the scraped menu arranged by meal period.

    The list returned by ``scrape1919(url)`` is used for lunch, dinner and
    late night; breakfast is always an empty list.

    Args:
        url: Address of the menu page, passed straight to ``scrape1919``.

    Returns:
        A dict with the keys ``'breakfast'``, ``'lunch'``, ``'dinner'`` and
        ``'latenight'``.
    """
    menu = scrape1919(url)
    # The same list object is stored under all three keys, not copies.
    return {
        'breakfast': [],
        'lunch': menu,
        'dinner': menu,
        'latenight': menu,
    }