# Scrape latest data (#163)
name: Scrape latest data

on:
  push:
    branches:
      - main
  workflow_dispatch:
  schedule:
    - cron: '6 2 * * *'
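    # (i.e. at 02:06 UTC every day)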
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: write
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs, as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false
jobs:
  scrape:
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - uses: actions/setup-node@v4
        with:
          node-version: '22'
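      # Node 22 ships the experimental fs.glob API that the summarize step
      # further below relies on.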
      - name: Check out this repo
        uses: actions/checkout@v4
        with:
          ref: ${{ github.head_ref }}
      - name: Prepare to parse
        run: |
          curl -o- --location https://github.com/mgdm/htmlq/releases/download/v0.4.0/htmlq-x86_64-linux.tar.gz | gunzip | tar -xf - htmlq
          chmod u+x htmlq
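      # htmlq (https://github.com/mgdm/htmlq) is "like jq, but for HTML": it
      # pulls elements out of an HTML document using CSS selectors.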
      - name: Fetch latest data
        run: |-
          curl --fail 'https://www.stolaf.edu/apps/mealtimes/index.cfm' > pipeline/1-raw.html
      - name: Extract data from HTML document
        shell: bash
        run: |
          cat ./pipeline/1-raw.html \
            | ./htmlq -- '[data-name="column_content"]' \
            | ./htmlq --remove-nodes 'div div' --text \
            | tail -n +50 > ./pipeline/2-extracted.html
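      # pipeline/2-extracted.html should now hold the page's inline Chart.js
      # setup scripts, i.e. plain JavaScript roughly of this shape (property
      # names inferred from how the steps below read the configs; `type` and
      # all values are illustrative):
      #
      #   new Chart(document.getElementById('myChart1'), {
      #     type: 'line',
      #     data: {
      #       labels: ['07:00:00', '07:15:00', /* ... */],
      #       datasets: [{label: 'Monday, May 20', data: [3, 12, /* ... */]}],
      #     },
      #   });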
      - name: Clean up after parsing
        run: rm htmlq
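      # The remaining steps run short inline Node scripts via shell heredocs;
      # --experimental-default-type=module makes Node treat the piped-in script
      # as an ES module, so top-level `await` and `import` statements work.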
      - name: Convert the extracted chart scripts to JSON
        shell: bash
        run: |
          node --experimental-default-type=module <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
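          // Stub out just enough of Chart.js and the DOM for the scraped inline
          // scripts to run under eval(): the fake getElementById returns the id
          // string itself, so each `new Chart(id, config)` call simply stashes
          // its config on globalThis under the canvas id (e.g. "myChart1").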
          class Chart { constructor(id, config) { globalThis[id] = config; } }
          let document = {getElementById: (id) => id};
          eval(await fs.readFile('./pipeline/2-extracted.html', 'utf-8'));
          let datasets = Object.entries(globalThis).filter(([key]) => key.startsWith("myChart")).map(([k, v]) => v);
          await fs.writeFile(path.join('pipeline', '3-converted.json'), JSON.stringify(datasets, null, '\t'));
          EOF
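      # dayjs is imported straight from https://esm.sh at run time; that is
      # what the --experimental-network-imports flag below enables.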
      - name: Massage the data into a consistent format
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import assert from 'node:assert/strict';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';
          import customParseFormat from 'https://esm.sh/dayjs@1.11.11/plugin/customParseFormat';
          dayjs.extend(customParseFormat);
          const now = dayjs();
          const zip = (a, b) => Array(Math.min(b.length, a.length)).fill().map((_, i) => [a[i], b[i]]);
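          // e.g. zip(['07:00:00', '07:15:00'], [3, 12]) => [['07:00:00', 3], ['07:15:00', 12]]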
          const data = JSON.parse(await fs.readFile('./pipeline/3-converted.json'));
          assert(Array.isArray(data));
          let output = data.map(dataset => {
            let {labels, datasets} = dataset.data;
            assert(datasets.length === 1);
            let {label: rawDate, data} = datasets[0];
            // "label" (aka the date) looks like "Monday, May 20"; we have to strip off the day-of-week part for Day.js to parse it successfully
            let parsedDate = dayjs(rawDate.split(',')[1], ['MMMM D', 'MMMM DD']);
            if (parsedDate.isAfter(now)) {
              // at the end of the year, we don't want to start writing to next year's December folder by accident
              parsedDate = parsedDate.subtract(1, 'year');
            }
            // "labels", the array of chart labels, re-uses '07:00:00' for both 7am and 7pm. Let's fix that by checking whether the parsed hour ever goes backwards...
            let lastHour = 0;
            for (let i = 0; i < labels.length; i++) {
              let label = labels[i];
              let parsedLabel = dayjs(label, 'hh:mm:ss');
              // give this label the full date
              parsedLabel = parsedLabel.year(parsedDate.year()).month(parsedDate.month()).date(parsedDate.date());
              let wentBackwards = lastHour > parsedLabel.hour();
              if (wentBackwards) {
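                // Subtracting lands on the previous calendar day, but only the
                // formatted time-of-day is kept below, so a repeated '01:00:00'
                // label still comes out as '13:00:00'.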
                parsedLabel = parsedLabel.subtract(12, 'hour');
              }
              labels[i] = parsedLabel.format('HH:mm:ss');
              lastHour = parsedLabel.hour();
            }
            return {date: parsedDate.format('YYYY-MM-DD'), times: Object.fromEntries(zip(labels, data))};
          });
          await fs.writeFile(path.join('pipeline', '4-massaged.json'), JSON.stringify(output, null, '\t'));
          EOF
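      # pipeline/4-massaged.json now holds one record per day, shaped like
      # (illustrative values):
      #   {"date": "2024-05-20", "times": {"07:00:00": 3, "07:15:00": 12, ...}}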
      - name: Parse data into one-file-per-day format
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import assert from 'node:assert/strict';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';
          const data = JSON.parse(await fs.readFile('./pipeline/4-massaged.json'));
          for (const {date, times} of data) {
            let parsedDate = dayjs(date);
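            // e.g. data/raw/2024/05/2024-05-20.json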
            let folderName = path.join('data', 'raw', parsedDate.format('YYYY'), parsedDate.format('MM'));
            await fs.mkdir(folderName, { recursive: true });
            await fs.writeFile(path.join(folderName, parsedDate.format('YYYY-MM-DD') + '.json'), JSON.stringify({date, times}, null, '\t'));
          }
          EOF
      - name: Summarize parsed data into hourly information
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import assert from 'node:assert/strict';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';
          import customParseFormat from 'https://esm.sh/dayjs@1.11.11/plugin/customParseFormat';
          dayjs.extend(customParseFormat);
          for await (const file of fs.glob('data/raw/**/*.json')) {
            let {date, times: inputTimes} = JSON.parse(await fs.readFile(file));
            let parsedDate = dayjs(date);
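            // Roll every sub-hourly reading up into its hour bucket, so that
            // e.g. {"07:00:00": 3, "07:15:00": 12} becomes {"07:00": 15}.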
            let times = Object.fromEntries(Array(24).fill().map((_, hour) => {
              let stringHour = String(hour).padStart(2, '0');
              let hourTotal = Object.entries(inputTimes).filter(([key]) => key.startsWith(`${stringHour}:`)).map(([_key, value]) => value).reduce((a, b) => a + b, 0);
              return [`${stringHour}:00`, hourTotal];
            }));
            let folderName = path.join('data', 'summary', parsedDate.format('YYYY'), parsedDate.format('MM'));
            await fs.mkdir(folderName, { recursive: true });
            await fs.writeFile(path.join(folderName, parsedDate.format('YYYY-MM-DD') + '.json'), JSON.stringify({date, times}, null, '\t'));
          }
          EOF
      - name: Commit and push if anything changed
        run: |-
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          timestamp=$(date -u)
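          # `git commit` exits non-zero when there is nothing to commit;
          # `|| exit 0` turns an unchanged day into a successful no-op run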
          git commit -m "Latest data: ${timestamp}" || exit 0
          git push
      - name: Setup Pages
        uses: actions/configure-pages@v5
      - name: Generate two-weeks artifact
        run: |
          jq -s . $(ls data/summary/*/*/*.json | sort -n | tail -n 14) > data/two-weeks.json
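      # `jq -s` slurps the newest 14 daily summary files into one JSON array;
      # at one file per day, that is the most recent two weeks.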
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload only the scraped data directory, not the entire repository
          path: './data/'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4