-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcourse_parser.py
304 lines (250 loc) · 9.42 KB
/
course_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
from __future__ import annotations
import json
import os
from dataclasses import asdict, dataclass, field
from enum import StrEnum, auto
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, Self
from bs4 import BeautifulSoup, Tag
if TYPE_CHECKING:
import httpx
# Course player page on the learning portal; also sent as the ``referer`` header.
COURSE_URL = "https://learnwith.campusx.in/s/courses/653f50d1e4b0d2eae855480a/take"
# Same course URL without the trailing ``/take`` segment.
# NOTE(review): presumably used as the HTTP client's base URL -- confirm at the call site.
BASE_RESOURCE_URL = "https://learnwith.campusx.in/s/courses/653f50d1e4b0d2eae855480a"

# Default request headers; the user-agent is split over two adjacent literals
# purely to keep the line short.
BASE_HEADERS = {
    "accept": "application/json, text/javascript, */*; q=0.01",
    "referer": COURSE_URL,
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_7) AppleWebKit/537.36 (KHTML, "
        "like Gecko) Chrome/111.5.0.0 Safari/507.02"
    ),
}

# Saved course page; the `.arv` infix is there so that git ignores the file.
COURSE_HTML_PATH = Path("dsmp2.arv.html")

# Directory holding the JSON data files, and the individual files inside it.
DSMP2_DATA_PATH = Path("data/dsmp2")
COURSE_TOPICS_PATH = DSMP2_DATA_PATH.joinpath("courseTopics.json")
COURSE_SUB_TOPICS_PATH = DSMP2_DATA_PATH.joinpath("courseSubTopics.json")
SUB_TOPIC_RESOURCES_PATH = DSMP2_DATA_PATH.joinpath("subTopicResources.json")
# Lives next to the raw resources file.
CLEANED_RESOURCES_PATH = SUB_TOPIC_RESOURCES_PATH.parent.joinpath("cleanedResources.json")
class ResourceType(StrEnum):
article = auto()
assessment = auto()
assignment = auto()
link = auto()
livetest = auto()
pdf = auto()
video = auto()
def fetch_sub_topic_resource(
    client: httpx.Client,
    sub_topic_id: str,
    resource_type: ResourceType,
) -> bytes:
    """Download the raw payload of one sub-topic resource.

    The request goes to ``/<type name>s/<sub_topic_id>/get``. The path is
    relative, so the client is presumably configured with a base URL
    (confirm at the call site).

    Args:
        client: HTTPX client; it must already carry cookies.
        sub_topic_id: ID of the sub-topic whose resource is requested.
        resource_type: Kind of resource; its ``name`` is pluralised with an
            ``s`` to form the first path segment.

    Returns:
        The response body as bytes.

    Raises:
        ValueError: If the client has no cookies set.
        httpx.HTTPStatusError: Raised by ``raise_for_status()`` when the
            server answers with an error status.
    """
    # Guard first: no request is sent for a client without cookies.
    if not client.cookies:
        raise ValueError("Client does not have cookies.")

    endpoint = f"/{resource_type.name}s/{sub_topic_id}/get"
    response = client.get(endpoint)
    response.raise_for_status()
    return response.content
def get_cookies():
    """Build the cookie mapping from environment variables.

    Reads ``C_UJWT`` and ``SESSION_ID`` and returns them under the cookie
    names ``c_ujwt`` and ``SESSIONID`` respectively.

    Raises:
        ValueError: If either variable is not defined; ``C_UJWT`` is
            checked first.
    """
    template = "Define '%s' as environment variable."
    # (cookie name, environment variable) pairs, in the order they are checked.
    required = (("c_ujwt", "C_UJWT"), ("SESSIONID", "SESSION_ID"))

    cookies = {}
    for cookie_name, env_name in required:
        value = os.environ.get(env_name)
        if value is None:
            raise ValueError(template % env_name)
        cookies[cookie_name] = value
    return cookies
@dataclass(kw_only=True)
class CourseTopic:
title: str
id: str
source: Tag = field(repr=False)
@staticmethod
def search(html_path: Path) -> Tag:
"""
Parses CourseTopic instances from a BeautifulSoup tag.
Yields CourseTopic instances parsed from the provided BeautifulSoup tag source.
"""
soup = BeautifulSoup(html_path.read_bytes(), "html.parser")
course_items_tag = soup.select_one("div.courseItems")
if course_items_tag:
return course_items_tag
raise ValueError("'div.courseItems' css selector not present in source.")
@classmethod
def parse(cls, source: Tag) -> Iterable[Self]:
"""
Parses CourseTopic instances from a BeautifulSoup tag.
Yields CourseTopic instances parsed from the provided BeautifulSoup tag source.
"""
yield from (
cls(
title=tag["data-title"],
id=tag["data-id"],
source=tag,
)
for tag in source.find_all("div", {"data-type": "label"})
)
@dataclass(kw_only=True)
class CourseSubTopic:
id: str
topicId: str
title: str
type: ResourceType
def __eq__(self, __value: Self) -> bool:
return self.id == __value.id
@classmethod
def parse(cls, topic: CourseTopic) -> Iterable[Self]:
"""
Parses CourseSubTopic instances from a CourseTopic BeautifulSoup tag.
Yields CourseSubTopic instances parsed from the provided CourseTopic
BeautifulSoup tag source.
"""
yield from (
cls(
id=tag["data-id"],
topicId=topic.id,
title=tag["data-title"],
type=tag["data-type"],
)
for tag in topic.source.find_all("div", {"data-type": True})
)
@classmethod
def parse_many(
cls,
topics: Iterable[CourseTopic],
) -> Iterable[tuple[CourseTopic, Iterable[Self]]]:
for topic in topics:
yield topic, cls.parse(topic)
@classmethod
def find(
cls,
course_topics: Iterable[CourseTopic],
*,
id: str | None = None,
title: str | None = None,
) -> Iterable[Self]:
"""
Parses a single CourseSubTopic from the given CourseTopics.
This allows fetching a specific CourseSubTopic by title or id from the
list of CourseTopics, by searching through their associated subtopics.
Args:
course_topics: Iterable of CourseTopic instances to search through.
title: Optional title of subtopic to find.
id: Optional id of subtopic to find.
Returns:
Iterable of matching CourseSubTopic instances.
Raises:
ValueError: If both title and id are None.
ValueError: If no matching subtopic is found.
"""
if id is None and title is None:
raise ValueError("Both 'id' and 'title' must not be None.")
for topic in course_topics:
if topic.id == id or topic.title == title:
yield from cls.parse(topic)
break
else:
raise ValueError(f"No subtopic found matching id={id} or title={title}")
@classmethod
def from_json(cls, path: str | Path) -> Iterable[Self]:
json_as_dict = json.loads(Path(path).read_bytes())
yield from (cls(**i) for i in json_as_dict)
@dataclass(kw_only=True)
class CourseVideoResource(CourseSubTopic):
    """A video sub-topic extended with details read from the video endpoint."""

    totalTime: str
    description: str = field(repr=False)
    isDescriptionHtml: bool = field(repr=False)

    @classmethod
    def fetch(cls, client: httpx.Client, sub_topic: CourseSubTopic) -> Self:
        """Fetch the video details for ``sub_topic`` and build a resource.

        The response is parsed as JSON and the details are read from its
        ``"spayee:resource"`` object. If any of the detail fields is missing,
        the problem is printed and a placeholder from :meth:`null` is
        returned instead.

        Args:
            client: HTTPX client used for the request.
            sub_topic: Sub-topic to enrich; its ``type`` must equal ``"video"``.

        Returns:
            The enriched resource, or the placeholder described above.

        Raises:
            ValueError: If ``sub_topic`` is not a video, if the response is
                not valid JSON, or if it has no ``"spayee:resource"`` key.
        """
        if sub_topic.type != "video":
            raise ValueError(f"sub_topic is not a video resource, got {sub_topic.type}")

        raw = fetch_sub_topic_resource(
            client=client,
            sub_topic_id=sub_topic.id,
            resource_type=ResourceType.video,
        )
        try:
            payload = json.loads(raw)["spayee:resource"]
        except json.JSONDecodeError as e:
            raise ValueError("Response could not be parsed as JSON.") from e
        except KeyError as e:
            raise ValueError("Bad response or missing required fields.") from e

        base_fields = asdict(sub_topic)
        try:
            details = {
                "description": payload["spayee:description"],
                "totalTime": payload["spayee:totalTime"],
                "isDescriptionHtml": payload["spayee:isDescriptionHtml"],
            }
        except KeyError as missing:
            # Best effort: report the missing field and fall back to a placeholder.
            print(f"❌ KeyError: {missing}")
            return cls.null(sub_topic)
        return cls(**base_fields, **details)

    @classmethod
    def null(cls, sub_topic: CourseSubTopic) -> Self:
        """Build a placeholder resource: empty description, ``"0"`` total time."""
        placeholder = {
            "description": "",
            "totalTime": "0",
            "isDescriptionHtml": False,
        }
        return cls(**asdict(sub_topic), **placeholder)
@dataclass(kw_only=True)
class CourseAssignmentResource(CourseSubTopic):
    """An assignment sub-topic extended with the link found in its instructions."""

    assignmentLink: str = field(repr=False)

    @classmethod
    def fetch(cls, client: httpx.Client, sub_topic: CourseSubTopic) -> Self:
        """Fetch the assignment page for ``sub_topic`` and extract its link.

        The response is parsed as HTML; the link is the ``href`` of the first
        anchor matching ``#instructions a`` (an empty string when that anchor
        has no ``href``).

        Args:
            client: HTTPX client used for the request.
            sub_topic: Sub-topic to enrich; its ``type`` must equal
                ``"assignment"``.

        Returns:
            The enriched resource.

        Raises:
            ValueError: If ``sub_topic`` is not an assignment, or if the page
                has no matching anchor.
        """
        if sub_topic.type != "assignment":
            raise ValueError(
                f"sub_topic is not an assignment resource, got {sub_topic.type}"
            )

        page = fetch_sub_topic_resource(
            client=client,
            sub_topic_id=sub_topic.id,
            resource_type=ResourceType.assignment,
        )

        base_fields = asdict(sub_topic)
        anchor = BeautifulSoup(page, "html.parser").select_one("#instructions a")
        if not anchor:
            raise ValueError("assignmentLink tag not found in source")
        # get_attribute_list always returns a list; "" is the default when href is absent.
        href = anchor.get_attribute_list("href", "")[0]
        return cls(**base_fields, assignmentLink=href)
# Maps a resource type to the dataclass used to rebuild stored records of that
# type. `load_resources` skips records whose type has no entry here.
_RESOURCE_TYPE_MAPPING: dict[ResourceType, type] = {
    ResourceType.video: CourseVideoResource,
    ResourceType.assignment: CourseAssignmentResource,
}
def load_resources() -> list[CourseSubTopic]:
    """Rebuild resource objects from the stored sub-topic resources file.

    Each stored record is turned into the class registered for its ``type``
    in ``_RESOURCE_TYPE_MAPPING``; records whose type has no registered class
    are skipped.

    Returns:
        The rebuilt resources, in file order.

    Raises:
        AttributeError: If a record's ``type`` is not a ``ResourceType``
            member name.
    """
    stored_records = json.loads(SUB_TOPIC_RESOURCES_PATH.read_bytes())

    loaded = []
    for record in stored_records:
        kind = getattr(ResourceType, record["type"])
        resource_cls = _RESOURCE_TYPE_MAPPING.get(kind)
        if not resource_cls:
            continue
        loaded.append(resource_cls(**record))
    return loaded
def filter_resources(
    sub_topics: Iterable[CourseSubTopic],
    resources: Iterable[CourseSubTopic],
) -> list[CourseSubTopic]:
    """Return the sub-topics that have no counterpart in ``resources``.

    Sub-topics are matched by ``id``, which is how ``CourseSubTopic`` defines
    equality. The ids of ``resources`` are collected once up front, so the
    result is also correct when ``resources`` is a one-shot iterable such as
    a generator, and each lookup is constant time.

    Args:
        sub_topics: Candidates to filter.
        resources: Items already available; consumed exactly once.

    Returns:
        The sub-topics whose id does not appear in ``resources``, in their
        original order.
    """
    known_ids = {resource.id for resource in resources}
    return [sub_topic for sub_topic in sub_topics if sub_topic.id not in known_ids]
def dump_resources(resources: Iterable[CourseSubTopic]) -> None:
    """Merge ``resources`` into the stored sub-topic resources file.

    The new resources are converted to dicts and appended to the records
    already in the file (if it exists). Exact duplicate records are dropped
    and the result is written back as indented JSON.

    De-duplication keeps the first occurrence of each record and preserves
    both record order and key order, so repeated runs produce stable output.

    Args:
        resources: Resources to add; every field value must be hashable.
    """
    records: list[dict] = [asdict(resource) for resource in resources]
    if SUB_TOPIC_RESOURCES_PATH.exists():
        # Existing records go first so they keep their position in the file.
        records = json.loads(SUB_TOPIC_RESOURCES_PATH.read_bytes()) + records

    seen: set[frozenset] = set()
    unique_records: list[dict] = []
    for record in records:
        # Two records are duplicates when all their key/value pairs match.
        fingerprint = frozenset(record.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_records.append(record)

    with SUB_TOPIC_RESOURCES_PATH.open("w", encoding="utf-8") as f:
        json.dump(unique_records, f, indent=2)