From 3032c841ae6ad2e34eb4b996bdd92ce9b0833c59 Mon Sep 17 00:00:00 2001 From: D0dii Date: Wed, 18 Sep 2024 14:46:21 +0200 Subject: [PATCH] feat: scrap courses --- package-lock.json | 105 +++++++++++++++++++++++++++++----- package.json | 6 +- src/lib/scrapRegistrations.ts | 101 ++++++++++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 16 deletions(-) create mode 100644 src/lib/scrapRegistrations.ts diff --git a/package-lock.json b/package-lock.json index 5740bce..4cf1e88 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "@radix-ui/themes": "^3.1.3", "@t3-oss/env-nextjs": "^0.11.0", "@tanstack/react-query": "^5.54.1", - "cheerio": "^1.0.0-rc.12", + "cheerio": "^1.0.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.1", "fetch-cookie": "^3.0.1", @@ -5733,21 +5733,25 @@ } }, "node_modules/cheerio": { - "version": "1.0.0-rc.12", - "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", - "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", "license": "MIT", "dependencies": { "cheerio-select": "^2.1.0", "dom-serializer": "^2.0.0", "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "htmlparser2": "^8.0.1", - "parse5": "^7.0.0", - "parse5-htmlparser2-tree-adapter": "^7.0.0" + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" }, "engines": { - "node": ">= 6" + "node": ">=18.17" }, "funding": { "url": "https://github.com/cheeriojs/cheerio?sponsor=1" @@ -6582,6 +6586,19 @@ "integrity": 
"sha512-L18DaJsXSUk2+42pv8mLs5jJT2hqFkFE4j21wOmgbUqsZ2hL72NsUU785g9RXgo3s0ZNgVl42TiHp3ZtOv/Vyg==", "license": "MIT" }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, "node_modules/enhanced-resolve": { "version": "5.17.1", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.17.1.tgz", @@ -8111,9 +8128,9 @@ } }, "node_modules/htmlparser2": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", - "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", "funding": [ "https://github.com/fb55/htmlparser2?sponsor=1", { @@ -8125,8 +8142,8 @@ "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "entities": "^4.4.0" + "domutils": "^3.1.0", + "entities": "^4.5.0" } }, "node_modules/human-signals": { @@ -8155,6 +8172,18 @@ "url": "https://github.com/sponsors/typicode" } }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/ignore": { "version": "5.3.1", "resolved": 
"https://registry.npmjs.org/ignore/-/ignore-5.3.1.tgz", @@ -9946,6 +9975,18 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/path-exists": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", @@ -10849,6 +10890,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, "node_modules/scheduler": { "version": "0.23.2", "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", @@ -11855,6 +11902,15 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/undici": { + "version": "6.19.8", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.19.8.tgz", + "integrity": "sha512-U8uCCl2x9TK3WANvmBavymRzxbfFYG+tAu+fgx3zxQy3qdagQqBLwJVrdyO1TBfUXvfKveMKJZhpvUYoOjM+4g==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "5.26.5", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", @@ -12157,6 +12213,27 @@ "defaults": "^1.0.3" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "license": "MIT", + 
"dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 35bca4d..2b6f07b 100644 --- a/package.json +++ b/package.json @@ -2,6 +2,7 @@ "name": "web-planner", "version": "0.1.0", "private": true, + "type": "module", "scripts": { "dev": "next dev", "build": "next build", @@ -13,7 +14,8 @@ "format:check": "prettier --check .", "knip": "knip", "typecheck": "tsc", - "prepare": "husky" + "prepare": "husky", + "scrap": "npx tsx src/lib/scrapRegistrations.ts" }, "dependencies": { "@radix-ui/react-accordion": "^1.2.0", @@ -25,7 +27,7 @@ "@radix-ui/themes": "^3.1.3", "@t3-oss/env-nextjs": "^0.11.0", "@tanstack/react-query": "^5.54.1", - "cheerio": "^1.0.0-rc.12", + "cheerio": "^1.0.0", "class-variance-authority": "^0.7.0", "clsx": "^2.1.1", "fetch-cookie": "^3.0.1", diff --git a/src/lib/scrapRegistrations.ts b/src/lib/scrapRegistrations.ts new file mode 100644 index 0000000..2b6d674 --- /dev/null +++ b/src/lib/scrapRegistrations.ts @@ -0,0 +1,101 @@ +// eslint-disable-next-line @eslint-community/eslint-comments/disable-enable-pair + +/* eslint-disable no-console */ +import * as cheerio from "cheerio"; + +const DEPARTMENTS_URL = + "https://web.usos.pwr.edu.pl/kontroler.php?_action=news/rejestracje/index"; + +const fetchData = async (url: string) => { + const response = await fetch(url); + return response; +}; + +const scrapDepartments = async () => { + const departmentsNames: string[] = []; + const departmentsUrls: string[] = []; + const response = await 
fetchData(DEPARTMENTS_URL); + if (!response.ok) { + console.log("Something went wrong in fetching departments"); + return; + } + const body = await response.text(); + const $ = cheerio.load(body); + + const departments = $("div#layout-c22").find(".autostrong").children("tr"); + departments.each((index, element) => { + departmentsNames.push($(element).find("td").html()?.trim()); + departmentsUrls.push($(element).find("a").attr("href")); + }); + return { departmentsNames, departmentsUrls }; +}; + +const scrapRegistrations = async (departmentUrl: string) => { + const registrationsNames: string[] = []; + const registrationsUrls: string[] = []; + const response = await fetchData(departmentUrl); + if (!response.ok) { + console.log("Something went wrong in fetching registrations"); + return; + } + const body = await response.text(); + const $ = cheerio.load(body); + + const registrations = $("main#layout-main-content") + .find("#layout-c22") + .find("div.usos-ui"); + const h2 = registrations.children("h2"); + const names = registrations.children("usos-link"); + h2.each((index, element) => { + registrationsNames.push($(element).text().trim()); + }); + names.each((index, element) => { + registrationsUrls.push($(element).find("a").attr("href")); + }); + return { registrationsNames, registrationsUrls }; +}; + +const scrapCourses = async (registrationUrl: string) => { + const coursesNames: string[] = []; + const coursesUrls: string[] = []; + const response = await fetchData(registrationUrl); + if (!response.ok) { + console.log("Something went wrong in fetching courses"); + return; + } + + const body = await response.text(); + const $ = cheerio.load(body); + + const courses = $("main#layout-main-content") + .find("table.wrnav") + .find("tbody") + .children("tr"); + courses.each((index, element) => { + const a = $(element).find("usos-link").find("a").attr("href"); + console.log(a); + }); +}; + 
// Sample course-search URL for a single registration, kept for manual testing:
// https://web.usos.pwr.edu.pl/kontroler.php?_action=katalog2/przedmioty/szukajPrzedmiotu&method=rej&rej_kod=W09ZARZ-SI7-24%2F25Zv&callback=g_f04839bf

/**
 * Scrapes the departments from the registrations index page and then the
 * registrations of each department, logging the collected names and URLs.
 *
 * `scrapDepartments` and `scrapRegistrations` return `undefined` when their
 * HTTP response is not OK, so both results are checked before being read;
 * destructuring them directly would throw a TypeError on a failed fetch.
 *
 * @returns A promise that resolves once every department has been processed.
 */
const main = async () => {
  const departments = await scrapDepartments();
  if (departments === undefined) {
    // scrapDepartments has already logged the failure; nothing to iterate.
    return;
  }

  const registrations: string[][] = [];
  const allRegistrationsUrls: string[][] = [];

  // Departments are processed one at a time, in listing order.
  for (const departmentUrl of departments.departmentsUrls) {
    const result = await scrapRegistrations(departmentUrl);
    // A department whose fetch failed contributes empty lists, so index i of
    // both arrays still corresponds to department i.
    registrations.push(result?.registrationsNames ?? []);
    allRegistrationsUrls.push(result?.registrationsUrls ?? []);
  }
  console.log(registrations, allRegistrationsUrls);
};

// Full crawl entry point, currently disabled:
//void main();
// Manual test of scrapCourses against a single registration:
void scrapCourses(
  "https://web.usos.pwr.edu.pl/kontroler.php?_action=katalog2/przedmioty/szukajPrzedmiotu&method=rej&rej_kod=W09ZARZ-SI7-24%2F25Zv&callback=g_f04839bf",
);