diff --git a/backend/.env.development b/backend/.env.development
index b3ee6f6..2a80a9e 100644
--- a/backend/.env.development
+++ b/backend/.env.development
@@ -11,6 +11,7 @@ DB_DATABASE=postgres
 DB_PASSWORD=postgres
 USOS_CONSUMER_KEY=key
 USOS_CONSUMER_SECRET=secret
-POLWRO_COOKIES=cookies
 SMTP_HOST=localhost
-SMTP_PORT=6060
\ No newline at end of file
+SMTP_PORT=6060
+POLWRO_USERNAME=""
+POLWRO_PASSWORD=""
\ No newline at end of file
diff --git a/backend/app/scrap-lecturers/polwro_login.ts b/backend/app/scrap-lecturers/polwro_login.ts
new file mode 100644
index 0000000..3c811e9
--- /dev/null
+++ b/backend/app/scrap-lecturers/polwro_login.ts
@@ -0,0 +1,39 @@
+// Builds a Cookie request-header value from Set-Cookie response values.
+// Keeps only the name=value pair of each cookie (attributes such as
+// Path/Expires are response-only) and enforces cookie-name uniqueness:
+// the last occurrence of a name wins, mirroring how a browser stores cookies.
+const getCookieHeader = (cookieArray: string[]): string => {
+  const cookiesByName = new Map<string, string>();
+  for (const cookie of cookieArray) {
+    const pair = cookie.split(";")[0].trim(); // "name=value" only
+    const name = pair.split("=")[0];
+    cookiesByName.set(name, pair);
+  }
+  return [...cookiesByName.values()].join("; ");
+};
+
+// Logs in to polwro.com through its login form and returns a Cookie
+// header value carrying the authenticated session cookies.
+// Throws when credentials are missing or when the login is rejected.
+export async function loginToPolwro(username: string, password: string) {
+  if (!username || !password) {
+    // Fail fast instead of POSTing empty credentials to the forum.
+    throw new Error("Polwro credentials are not configured");
+  }
+  const url = "https://polwro.com/login.php";
+  const formData = new FormData();
+  formData.append("username", username);
+  formData.append("password", password);
+  formData.append("login", "Zaloguj");
+
+  const response = await fetch(url, {
+    method: "POST",
+    body: formData,
+    redirect: "manual", // a 302 redirect is the success signal, so don't follow it
+  });
+
+  if (response.status !== 302) {
+    throw new Error("Failed to log in to Polwro");
+  }
+  return getCookieHeader(response.headers.getSetCookie());
+}
diff --git a/backend/app/scrap-lecturers/scrap_lecturers.ts b/backend/app/scrap-lecturers/scrap_lecturers.ts
index 9e35cfa..6da2a5f 100644
--- a/backend/app/scrap-lecturers/scrap_lecturers.ts
+++ b/backend/app/scrap-lecturers/scrap_lecturers.ts
@@ -5,6 +5,8 @@ import logger from "@adonisjs/core/services/logger";
 
 import env from "#start/env";
 
+import { loginToPolwro } from "./polwro_login.js";
+
 interface Lecturer {
   rating: string;
   name: string;
@@ -27,10 +29,13 @@ const CATEGORIES_STARTS_URLS = [
   "https://polwro.com/viewforum.php?f=42&topicdays=0&start=0",
 ];
 
-async function fetchLecturers(url: string, timeout = 100000) {
+async function fetchLecturers(
+  url: string,
+  authCookie: string,
+  timeout = 100000,
+) {
   const controller = new AbortController();
   const timeoutId = setTimeout(() => controller.abort(), timeout);
-
   try {
     const response = await fetch(url, {
       method: "GET",
@@ -39,9 +44,9 @@
         "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
         "Accept-Encoding": "gzip, deflate, br, zstd",
         "Accept-Language": "en-US,en;q=0.9,pl-PL;q=0.8,pl;q=0.7",
-        Cookie: env.get("POLWRO_COOKIES") ?? "",
+        Cookie: authCookie,
       },
       signal: controller.signal,
     });
 
     clearTimeout(timeoutId);
@@ -74,8 +79,8 @@ function removeTitles(data: string[]): string[] {
   return data.filter((word) => !titlesToRemove.includes(word.toLowerCase()));
 }
 
-const scrapLecturersPage = async (url: string) => {
-  const response = await fetchLecturers(url);
+const scrapLecturersPage = async (url: string, authCookie: string) => {
+  const response = await fetchLecturers(url, authCookie);
   if (!response.ok) {
     logger.error("Something went wrong in fetching lecturers");
     return;
@@ -135,11 +140,11 @@
   return { lecturers, nextPageUrl };
 };
 
-const scrapLecturersForCategory = async (url: string) => {
+const scrapLecturersForCategory = async (url: string, authCookie: string) => {
   const lecturers: Lecturer[] = [];
   let nextPage = url;
   while (nextPage !== "") {
-    const result = await scrapLecturersPage(nextPage);
+    const result = await scrapLecturersPage(nextPage, authCookie);
     if (result === undefined) {
       return lecturers;
     }
@@ -151,11 +156,18 @@
 };
 
 export const scrapLecturers = async () => {
+  const authCookie = await loginToPolwro(
+    env.get("POLWRO_USERNAME") ?? "",
+    env.get("POLWRO_PASSWORD") ?? "",
+  );
   // TODO: do not save all shit to RAM, pls
   const lecturers: Lecturer[] = [];
   for (const url of CATEGORIES_STARTS_URLS) {
     logger.info(`scraping category ${url}`);
-    const lecturersFromCategory = await scrapLecturersForCategory(url);
+    const lecturersFromCategory = await scrapLecturersForCategory(
+      url,
+      authCookie,
+    );
     lecturers.push(...lecturersFromCategory);
   }
   return lecturers;