forked from rbaker26/class-list-extractor
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextractor.py
125 lines (101 loc) · 4.77 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from urllib.request import urlopen
# URL template for the class schedule search; the {year} and {semester}
# placeholders are filled with str.format so the term can be changed easily.
url = 'https://mysite.socccd.edu/onlineschedule/ClassFind.asp?siteID=A&termID={year}{semester}&termtype=&mktcode=CO20&header=Computer+Science'

# Term to query: `semester` is one of the numeric codes listed in `sem_map`.
year = 2018
semester = 3  # fall

# Numeric semester code -> human-readable semester name.
sem_map = {1: 'Spring', 2: 'Summer', 3: 'Fall'}
#******************************************************************************
def _clean_prereq(course_table):
    """Return the course's prerequisite text without its label, or None.

    Looks up the ``class-list-prereq`` element inside *course_table*. If the
    element is absent the result is None; otherwise it is the element's
    stripped text with a leading ``'Prerequisite: '`` label removed when
    present.
    """
    prereq_tag = course_table.find(class_='class-list-prereq')
    if prereq_tag is None:
        return None

    label = 'Prerequisite: '
    text = prereq_tag.text.strip()
    if text.startswith(label):
        # Slice the label off. str.lstrip(label) would treat the label as a
        # *set of characters* and could also eat the start of the real text.
        text = text[len(label):]
    return text


def _parse_ticket(section):
    """Build the ticket dict for one ``class-list-info-method`` element.

    The returned dict has the keys ``number``, ``status``, ``lecture``,
    ``lab`` and ``instructor``; ``lecture`` and ``lab`` are dicts with the
    keys ``day``, ``time`` and ``room``.

    Note: *section* is modified in place (``ins-method`` elements are
    destroyed and any ``extra-room`` element is detached).
    """
    # remove all small tags
    for small in section.find_all(class_='ins-method'):
        small.decompose()

    # The first child of the DAY/TIME cells is used for the lecture and the
    # last child for the lab; with a single child both refer to the same one.
    class_days = list(section.find(attrs={'title': 'DAY'}).children)
    class_times = list(section.find(attrs={'title': 'TIME'}).children)

    # Separate lab and lecture rooms
    room_tag = section.find(class_='class-list-room-text')
    extra_room = room_tag.find(class_='extra-room')
    lab_room = None
    if extra_room is not None:
        # Detach the lab room so the lecture room text read below does not
        # include it.
        extra_room.extract()
        lab_room = extra_room.text.strip()

    lecture_room = room_tag.text.strip()
    lab_time = class_times[-1].strip()

    # If no separate lab room is listed but there is a lab time, the lab is
    # in the same room as the lecture.
    if lab_room is None and lab_time:
        lab_room = lecture_room

    lecture = {
        'day': class_days[0].strip(),
        'time': class_times[0].strip(),
        'room': lecture_room,
    }
    lab = {
        'day': class_days[-1].strip(),
        'time': lab_time,
        'room': lab_room,
    }
    return {
        'number': section.find(class_='class-list-info-ticket').text.strip(),
        'status': section.find(class_='class-list-info-status').text.strip(),
        'lecture': lecture,
        'lab': lab,
        'instructor': section.find(attrs={'title': 'INSTRUCTOR'}).text.strip(),
    }


def build_dict(url):
    """Download the schedule page at *url* and return its courses as a list.

    Each entry describes one ``class-list-course-list`` element and is a dict
    with the keys:

    * ``course_id``, ``course_title``, ``course_description`` - stripped text
      of the matching elements.
    * ``course_info`` - dict with ``units`` and ``prereq`` (``prereq`` is
      None when the course lists no prerequisite).
    * ``tickets`` - list of ticket dicts, one per ``class-list-info-method``
      element (keys ``number``, ``status``, ``lecture``, ``lab``,
      ``instructor``).

    Any error raised by ``urlopen`` propagates to the caller, and a page that
    lacks one of the expected elements raises ``AttributeError``.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        page = response.read()

    # The page has a dangling /div
    # Python's html.parser doesn't handle this adequately, so we need html5lib
    soup = BeautifulSoup(page, "html5lib")

    # remove all icon tags
    for icon in soup.find_all(class_='material-icons'):
        icon.decompose()

    courses = []
    for course_table in soup.find_all(class_='class-list-course-list'):
        course_info = {
            # Drops the first 7 characters of the unit text - presumably a
            # label such as "Units: "; confirm against the live page.
            'units': course_table.find(class_='class-list-unit').text.strip()[7:],
            'prereq': _clean_prereq(course_table),
        }
        courses.append({
            'course_id': course_table.find(class_='course-id').text.strip(),
            'course_title': course_table.find(class_='class-list-course-title').text.strip(),
            'course_info': course_info,
            'course_description': course_table.find(class_='class-list-course-desc').text.strip(),
            'tickets': [
                _parse_ticket(section)
                for section in course_table.find_all(class_='class-list-info-method')
            ],
        })
    return courses
#******************************************************************************
#******************************************************************************
def ticket_list(courses):
    """Return every ticket number found in *courses* as one flat list.

    *courses* is an iterable of course dicts, each with a ``'tickets'`` list
    whose items are dicts carrying a ``'number'`` key. Numbers are returned
    in course order, then ticket order within each course. A missing
    ``'tickets'`` or ``'number'`` key raises ``KeyError``.
    """
    return [
        ticket['number']
        for course in courses
        for ticket in course['tickets']
    ]
#******************************************************************************
#******************************************************************************
if __name__ == '__main__':
    import json

    # The module-level year/semester fill the URL template and both output
    # filenames, so changing those globals retargets the whole run.
    term = {'year': year, 'semester': semester}

    courses = build_dict(url.format(**term))

    # Dump the full course structure as key-sorted, indented JSON.
    courses_path = 'courses{year}-{semester}.json'.format(**term)
    with open(courses_path, 'w') as json_out:
        json.dump(courses, json_out, sort_keys=True, indent=4)

    # Write the ticket numbers, newline-separated, to a plain text file.
    tickets = ticket_list(courses)
    tickets_path = 'tickets{year}-{semester}.txt'.format(**term)
    with open(tickets_path, 'w') as text_out:
        text_out.write('\n'.join(tickets))
#******************************************************************************