Skip to content

Commit

Permalink
feat(backend): add Polwro logging to scraper (#140)
Browse files Browse the repository at this point in the history
Co-authored-by: Jakub Czajkowski <czaja307@users.noreply.github.com>
Co-authored-by: mini-bomba <55105495+mini-bomba@users.noreply.github.com>
  • Loading branch information
3 people authored Jan 16, 2025
1 parent 9c77170 commit 5c77215
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 11 deletions.
5 changes: 3 additions & 2 deletions backend/.env.development
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ DB_DATABASE=postgres
DB_PASSWORD=postgres
USOS_CONSUMER_KEY=key
USOS_CONSUMER_SECRET=secret
POLWRO_COOKIES=cookies
SMTP_HOST=localhost
SMTP_PORT=6060
SMTP_PORT=6060
POLWRO_USERNAME=""
POLWRO_PASSWORD=""
28 changes: 28 additions & 0 deletions backend/app/scrap-lecturers/polwro_login.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/**
 * Builds the value of a `Cookie` request header from raw `Set-Cookie` values.
 *
 * Only the entries at positions 2 and 3 (the 3rd and 4th cookies) are used;
 * every other entry is ignored. For each selected entry, everything from the
 * first ";" onwards (the cookie attributes) is dropped, leaving `name=value`.
 *
 * @param cookieArray - Raw `Set-Cookie` header values, in the order received.
 * @returns The selected `name=value` pairs joined with "; ", or an empty
 *   string when the input has fewer than three entries.
 */
const getCookieHeader = (cookieArray: string[]): string => {
  // TODO: actual cookie uniqueness enforcement
  const selectedPairs: string[] = [];
  for (const rawCookie of cookieArray.slice(2, 4)) {
    const attributesStart = rawCookie.indexOf(";");
    // No ";" means the whole value is already just the name=value part.
    selectedPairs.push(
      attributesStart === -1 ? rawCookie : rawCookie.slice(0, attributesStart),
    );
  }
  return selectedPairs.join("; ");
};

/**
 * Logs in to the Polwro forum and returns a ready-to-use `Cookie` header value.
 *
 * The credentials are posted as form data to the login endpoint with redirects
 * disabled, so the `Set-Cookie` headers of the login response itself can be
 * read. A 302 response is treated as a successful login; any other status is
 * treated as a failure.
 *
 * @param username - Polwro account name; must not be empty.
 * @param password - Polwro account password; must not be empty.
 * @returns The cookie string produced by `getCookieHeader` from the login
 *   response's `Set-Cookie` headers.
 * @throws Error when a credential is empty or the server does not answer with
 *   a 302 status.
 */
export async function loginToPolwro(
  username: string,
  password: string,
): Promise<string> {
  // Fail fast on unset credentials instead of sending a request that cannot
  // succeed (callers may pass "" when the env variables are missing).
  if (username === "" || password === "") {
    throw new Error(
      "Failed to log in to Polwro: username or password is empty",
    );
  }

  const url = "https://polwro.com/login.php";
  const formData = new FormData();
  formData.append("username", username);
  formData.append("password", password);
  formData.append("login", "Zaloguj");

  const response = await fetch(url, {
    method: "POST",
    body: formData,
    // Do not follow the redirect: the cookies are set on this response.
    redirect: "manual",
  });

  // The body is never read; cancel it so the underlying connection is
  // released instead of waiting for garbage collection.
  await response.body?.cancel();

  if (response.status !== 302) {
    throw new Error(
      `Failed to log in to Polwro (unexpected HTTP status ${response.status})`,
    );
  }
  return getCookieHeader(response.headers.getSetCookie());
}
29 changes: 20 additions & 9 deletions backend/app/scrap-lecturers/scrap_lecturers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import logger from "@adonisjs/core/services/logger";

import env from "#start/env";

import { loginToPolwro } from "./polwro_login.js";

interface Lecturer {
rating: string;
name: string;
Expand All @@ -27,10 +29,13 @@ const CATEGORIES_STARTS_URLS = [
"https://polwro.com/viewforum.php?f=42&topicdays=0&start=0",
];

async function fetchLecturers(url: string, timeout = 100000) {
async function fetchLecturers(
url: string,
authCookie: string,
timeout = 100000,
) {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeout);

try {
const response = await fetch(url, {
method: "GET",
Expand All @@ -39,9 +44,8 @@ async function fetchLecturers(url: string, timeout = 100000) {
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br, zstd",
"Accept-Language": "en-US,en;q=0.9,pl-PL;q=0.8,pl;q=0.7",
Cookie: env.get("POLWRO_COOKIES") ?? "",
Cookie: authCookie,
},

signal: controller.signal,
});
clearTimeout(timeoutId);
Expand Down Expand Up @@ -74,8 +78,8 @@ function removeTitles(data: string[]): string[] {
return data.filter((word) => !titlesToRemove.includes(word.toLowerCase()));
}

const scrapLecturersPage = async (url: string) => {
const response = await fetchLecturers(url);
const scrapLecturersPage = async (url: string, authCookie: string) => {
const response = await fetchLecturers(url, authCookie);
if (!response.ok) {
logger.error("Something went wrong in fetching lecturers");
return;
Expand Down Expand Up @@ -135,11 +139,11 @@ const scrapLecturersPage = async (url: string) => {
return { lecturers, nextPageUrl };
};

const scrapLecturersForCategory = async (url: string) => {
const scrapLecturersForCategory = async (url: string, authCookie: string) => {
const lecturers: Lecturer[] = [];
let nextPage = url;
while (nextPage !== "") {
const result = await scrapLecturersPage(nextPage);
const result = await scrapLecturersPage(nextPage, authCookie);
if (result === undefined) {
return lecturers;
}
Expand All @@ -151,11 +155,18 @@ const scrapLecturersForCategory = async (url: string) => {
};

export const scrapLecturers = async () => {
const authCookie = await loginToPolwro(
env.get("POLWRO_USERNAME") ?? "",
env.get("POLWRO_PASSWORD") ?? "",
);
// TODO: avoid accumulating every scraped lecturer in memory; process or persist results per category instead
const lecturers: Lecturer[] = [];
for (const url of CATEGORIES_STARTS_URLS) {
logger.info(`scraping category ${url}`);
const lecturersFromCategory = await scrapLecturersForCategory(url);
const lecturersFromCategory = await scrapLecturersForCategory(
url,
authCookie,
);
lecturers.push(...lecturersFromCategory);
}
return lecturers;
Expand Down

0 comments on commit 5c77215

Please sign in to comment.