Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make scraper command #94

Merged
merged 2 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions backend/adonisrc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,7 @@ export default defineConfig({
| List of modules to import before starting the application.
|
*/
preloads: [
() => import('#start/routes'),
() => import('#start/kernel'),
{
file: () => import('#start/scheduler'),
environment: ['console'],
},
],
preloads: [() => import('#start/routes'), () => import('#start/kernel')],

/*
|--------------------------------------------------------------------------
Expand Down
145 changes: 145 additions & 0 deletions backend/commands/scraper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import { BaseCommand } from '@adonisjs/core/ace'
import type { CommandOptions } from '@adonisjs/core/types/ace'

import {
scrapDepartments,
scrapRegistrations,
scrapCourses,
scrapCourseNameGroupsUrls,
scrapGroupsUrls,
scrapGroupDetails,
} from '../app/scrap-registrations/scrap_registrations.js'

/**
 * Finds the last `[...]` group in a string and returns what is inside it.
 *
 * A group is an opening `[` followed by one or more characters that are not
 * `]`, followed by a closing `]`; empty brackets (`[]`) therefore never match.
 *
 * @param input - Text to search.
 * @returns The text between the last matching pair of brackets, or `null`
 *   when the input contains no such group.
 */
function extractLastStringInBrackets(input: string): string | null {
  const bracketGroups = Array.from(input.matchAll(/\[([^\]]+)\]/g))
  if (bracketGroups.length === 0) {
    return null
  }
  // Capture group 1 holds the text without the surrounding brackets.
  return bracketGroups[bracketGroups.length - 1][1]
}

/**
 * Ace command `scraper`.
 *
 * Walks the scraped hierarchy departments -> registrations -> courses ->
 * groups and writes each level to the database through the corresponding
 * model. Departments, registrations and courses are upserted with
 * `updateOrCreate`; groups are inserted with `create`, so running the command
 * again inserts the groups again.
 */
export default class Scraper extends BaseCommand {
  static commandName = 'scraper'
  static description = 'Scrap data from usos pages and insert it to database'

  static options: CommandOptions = {
    startApp: true,
    allowUnknownFlags: false,
    staysAlive: false,
  }

  /**
   * Runs the whole scrape-and-store pipeline.
   *
   * Returns early (without error) when no departments could be scraped.
   * A failing database write rejects the returned promise.
   */
  async run() {
    // Models are imported lazily inside run() rather than at module top level.
    // NOTE(review): presumably so they load only after the app has booted
    // (`startApp: true`) - confirm.
    const { default: Department } = await import('#models/department')
    const { default: Registration } = await import('#models/registration')
    const { default: Course } = await import('#models/course')
    const { default: Group } = await import('#models/group')

    // Database ids are the code in the last pair of brackets of the scraped
    // name, falling back to the full name when there is no bracketed code.
    const idFromName = (name: string): string => extractLastStringInBrackets(name) ?? name

    console.log('Scraping departments')
    const departments = await scrapDepartments()
    if (!departments) return
    await Promise.all(
      departments.map((department) =>
        Department.updateOrCreate(
          { id: idFromName(department.name) },
          { name: department.name, url: department.url }
        )
      )
    )

    console.log('Scraping registrations')
    const registrationsPerDepartment = await Promise.all(
      departments.map(async (department) => {
        const regs = await scrapRegistrations(department.url)
        if (!regs) return []
        department.registrations = regs
        const departmentId = idFromName(department.name)
        // Await every upsert: courses written later reference these rows, and
        // an un-awaited write would surface failures as unhandled rejections.
        await Promise.all(
          regs.map((registration) =>
            Registration.updateOrCreate(
              { id: idFromName(registration.name) },
              { name: registration.name, departmentId }
            )
          )
        )
        return regs
      })
    )
    const registrations = registrationsPerDepartment.flat()
    console.log('Registrations scraped')

    console.log('Scraping courses urls')
    await Promise.all(
      registrations.map(async (registration) => {
        let urls
        try {
          urls = await scrapCourses(registration.url)
        } catch (e) {
          // Best effort: a registration whose course list cannot be scraped is
          // logged and skipped instead of aborting the whole run.
          console.log(e)
        }
        if (!urls) return
        registration.courses = urls.map((courseUrl) => {
          return { url: courseUrl, courseCode: '', groups: [], name: '' }
        })
      })
    )
    console.log('Courses urls scraped')

    console.log('Scraping courses details')
    for (const registration of registrations) {
      const registrationId = idFromName(registration.name)
      // `courses` is not assigned above when scraping the course list failed,
      // so fall back to an empty list rather than calling `.map` on it blindly.
      await Promise.all(
        (registration.courses ?? []).map(async (course) => {
          const courseCodeNameGroupsUrls = await scrapCourseNameGroupsUrls(course.url)
          if (!courseCodeNameGroupsUrls) return
          course.courseCode = courseCodeNameGroupsUrls.courseCode
          course.name = courseCodeNameGroupsUrls.courseName
          course.groups = courseCodeNameGroupsUrls.urls.map((url) => {
            return { url, groups: [] }
          })
          await Course.updateOrCreate(
            { id: courseCodeNameGroupsUrls.courseCode },
            { name: course.name, registrationId }
          )
        })
      )
    }
    console.log('Courses details scraped')

    console.log('Scraping groups details')
    for (const registration of registrations) {
      for (const course of registration.courses ?? []) {
        const urlsPerGroup = await Promise.all(
          course.groups.map((group) => scrapGroupsUrls(group.url))
        )
        // Drop anything that is not a URL string (e.g. a nullish result for a
        // group page that could not be scraped) instead of casting it away.
        const detailsUrls = urlsPerGroup
          .flat()
          .filter((url): url is string => typeof url === 'string')
        await Promise.all(
          detailsUrls.map(async (url) => {
            const details = await scrapGroupDetails(url)
            if (!details) return
            // Text fields are cut to 255 characters before being stored.
            // NOTE(review): presumably the column length limit - confirm
            // against the groups migration.
            await Group.create({
              name: details.name.slice(0, 255),
              startTime: details.startTime.slice(0, 255),
              endTime: details.endTime.slice(0, 255),
              group: details.group.slice(0, 255),
              // Collapse runs of whitespace inside the lecturer name.
              lecturer: details.lecturer.trim().replace(/\s+/g, ' ').slice(0, 255),
              week: details.week,
              day: details.day.slice(0, 255),
              type: details.type.slice(0, 255),
              courseId: course.courseCode.slice(0, 255),
              url: url.slice(0, 255),
            })
          })
        )
      }
    }
    console.log('Groups details scraped')
  }
}
2 changes: 1 addition & 1 deletion backend/database/migrations/4_create_users_table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ export default class extends BaseSchema {
async down() {
this.schema.dropTable(this.tableName)
}
}
}
2 changes: 1 addition & 1 deletion backend/database/migrations/5_create_schedules_table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export default class extends BaseSchema {
async up() {
this.schema.createTable(this.tableName, (table) => {
table.increments('id')
table.string('user_id').references('users.id').onDelete('CASCADE')
table.integer('user_id').references('users.id').onDelete('CASCADE')
table.string('name').defaultTo('Nowy plan')
table.timestamp('created_at').defaultTo('NOW()')
table.timestamp('updated_at')
Expand Down
134 changes: 0 additions & 134 deletions backend/start/scheduler.ts

This file was deleted.

Loading