Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Category descriptions #89

Merged
merged 8 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
"validate": "yarn run lint && node ./scripts/validate.js",
"test": "jest",
"upload": "node ./scripts/upload_technology.js",
"upload": "node ./scripts/bigquery_upload.js",
"convert": "node --no-warnings ./scripts/convert.js",
"build": "yarn run validate && yarn run convert && node ./scripts/build.js"
},
Expand Down
128 changes: 74 additions & 54 deletions scripts/upload_technology.js → scripts/bigquery_upload.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,12 @@ const fs = require('fs')
const path = require('path')
const { BigQuery } = require('@google-cloud/bigquery')

/**
 * Read every file in `directory`, parse each one as JSON and shallow-merge the
 * parsed objects into a single object.
 *
 * Files are merged in the order returned by `fs.readdirSync`; when two files
 * define the same top-level key, the file read later wins.
 *
 * @param {string} directory - Path of the directory holding the JSON files.
 * @returns {Object} Shallow merge of all parsed files (`{}` for an empty directory).
 */
const readJsonFiles = (directory) => {
  let merged = {}
  for (const fileName of fs.readdirSync(directory)) {
    const contents = fs.readFileSync(path.join(directory, fileName), 'utf8')
    merged = { ...merged, ...JSON.parse(contents) }
  }
  return merged
}

/**
 * Coerce a value to an array.
 *
 * @param {*} value - A string, an array, or anything else.
 * @returns {Array} `[value]` for a string, the same array instance for an
 *   array, and an empty array for every other input.
 */
const getArray = (value) => {
  if (Array.isArray(value)) {
    return value
  }
  return typeof value === 'string' ? [value] : []
}
// Module-level BigQuery client, configured with the key file at /tmp/gcp_key.json.
const bigquery = new BigQuery({
keyFilename: '/tmp/gcp_key.json',
})

/**
 * Normalise a rule value into a list of `{ name, value }` records.
 *
 * - string          -> one record named after the string, `value: null`
 * - array           -> one record per element, `value: null`
 * - non-null object -> one record per own enumerable key; entries whose
 *                      `typeof` is 'object' (objects, arrays, null) are
 *                      JSON-encoded, every other entry is converted with `String()`
 * - anything else, including `null` and `undefined` -> `[]`
 *
 * @param {*} value - Raw rule value.
 * @returns {Array<{name: *, value: (string|null)}>} Normalised records.
 */
const getRuleObject = (value) => {
  if (typeof value === 'string') {
    return [{ name: value, value: null }]
  }
  if (Array.isArray(value)) {
    return value.map((key) => ({ name: key, value: null }))
  }
  // `typeof null` is 'object' too; without this guard a null rule would throw
  // a TypeError instead of yielding an empty list.
  if (value !== null && typeof value === 'object') {
    return Object.entries(value).map(([name, entry]) => ({
      name,
      // String() rather than .toString() so an undefined entry cannot throw.
      value: typeof entry === 'object' ? JSON.stringify(entry) : String(entry),
    }))
  }
  return []
}

const loadToBigQuery = async (
data,
tableName = 'apps',
datasetName = 'wappalyzer',
writeDisposition = 'WRITE_TRUNCATE',
sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
if (!data) {
throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
}

const bigquery = new BigQuery({
keyFilename: '/tmp/gcp_key.json',
})
const schema = {
const schemas = {
technologies: {
fields: [
{ name: 'name', type: 'STRING' },
{ name: 'categories', type: 'STRING', mode: 'REPEATED' },
Expand Down Expand Up @@ -137,8 +97,58 @@ const loadToBigQuery = async (
{ name: 'script', type: 'STRING', mode: 'REPEATED' },
{ name: 'html', type: 'STRING', mode: 'REPEATED' },
],
},
categories: {
fields: [
{ name: 'name', type: 'STRING' },
{ name: 'description', type: 'STRING' },
],
},
}

/**
 * Read every file in `directory`, parse each one as JSON and shallow-merge the
 * parsed objects into a single object.
 *
 * Files are merged in the order returned by `fs.readdirSync`; when two files
 * define the same top-level key, the file read later wins.
 *
 * @param {string} directory - Path of the directory holding the JSON files.
 * @returns {Object} Shallow merge of all parsed files (`{}` for an empty directory).
 */
const readJsonFiles = (directory) => {
  let merged = {}
  for (const fileName of fs.readdirSync(directory)) {
    const contents = fs.readFileSync(path.join(directory, fileName), 'utf8')
    merged = { ...merged, ...JSON.parse(contents) }
  }
  return merged
}

/**
 * Coerce a value to an array.
 *
 * @param {*} value - A string, an array, or anything else.
 * @returns {Array} `[value]` for a string, the same array instance for an
 *   array, and an empty array for every other input.
 */
const getArray = (value) => {
  if (Array.isArray(value)) {
    return value
  }
  return typeof value === 'string' ? [value] : []
}

/**
 * Normalise a rule value into a list of `{ name, value }` records.
 *
 * - string          -> one record named after the string, `value: null`
 * - array           -> one record per element, `value: null`
 * - non-null object -> one record per own enumerable key; entries whose
 *                      `typeof` is 'object' (objects, arrays, null) are
 *                      JSON-encoded, every other entry is converted with `String()`
 * - anything else, including `null` and `undefined` -> `[]`
 *
 * @param {*} value - Raw rule value.
 * @returns {Array<{name: *, value: (string|null)}>} Normalised records.
 */
const getRuleObject = (value) => {
  if (typeof value === 'string') {
    return [{ name: value, value: null }]
  }
  if (Array.isArray(value)) {
    return value.map((key) => ({ name: key, value: null }))
  }
  // `typeof null` is 'object' too; without this guard a null rule would throw
  // a TypeError instead of yielding an empty list.
  if (value !== null && typeof value === 'object') {
    return Object.entries(value).map(([name, entry]) => ({
      name,
      // String() rather than .toString() so an undefined entry cannot throw.
      value: typeof entry === 'object' ? JSON.stringify(entry) : String(entry),
    }))
  }
  return []
}

const loadToBigQuery = async (
data,
tableName = 'technologies',
datasetName = 'wappalyzer',
writeDisposition = 'WRITE_TRUNCATE',
sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
if (!data) {
throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
}

const schema = schemas[tableName]
const options = { schema, sourceFormat, writeDisposition }
const [job] = await bigquery
.dataset(datasetName)
Expand All @@ -147,11 +157,11 @@ const loadToBigQuery = async (

if (job.status.errors && job.status.errors.length > 0) {
console.error('Errors encountered:', job.status.errors)
throw new Error('Error loading data into BigQuery')
throw new Error(`Error loading data into ${datasetName}.${tableName}`)
}

console.log(
`Loaded ${job.numRowsLoaded} rows into ${datasetName}.${tableName}...`
`Loaded ${job.statistics.load.outputRows} rows into ${datasetName}.${tableName}`
)
}

Expand Down Expand Up @@ -208,13 +218,23 @@ const main = async () => {
const transformedTechnologiesJsonL = transformedTechnologies
.map((line) => JSON.stringify(line))
.join('\n')
const filePath = './transformedTechnologies.jsonl'
fs.writeFileSync(filePath, transformedTechnologiesJsonL)

await loadToBigQuery(filePath, 'apps')

// cleanup file
fs.unlinkSync(filePath)
const technologiesFilePath = './transformedTechnologies.jsonl'
fs.writeFileSync(technologiesFilePath, transformedTechnologiesJsonL)
await loadToBigQuery(technologiesFilePath, 'technologies')
fs.unlinkSync(technologiesFilePath)

const transformedCategoriesJsonL = Object.values(categories)
.map((value) =>
JSON.stringify({
name: value.name,
description: value.description,
})
)
.join('\n')
const categoriesFilePath = './transformedCategories.jsonl'
fs.writeFileSync(categoriesFilePath, transformedCategoriesJsonL)
await loadToBigQuery(categoriesFilePath, 'categories')
fs.unlinkSync(categoriesFilePath)
}

// Entry point: run main() and pass any rejection to console.error.
// NOTE(review): the rejection is only logged here, so a failed run may still
// exit with status 0 — confirm whether CI needs a non-zero exit code.
main().catch(console.error)
Loading
Loading