# Captured from the GitHub Actions web UI ("Workflow file for this run").
# Workflow: Scrape latest data (run #179)

# Scrapes the St. Olaf "mealtimes" page, converts the chart data embedded in it
# into per-day JSON files under data/, commits the result back to the
# repository, and publishes ./data/ to GitHub Pages.
name: Scrape latest data

on:
  push:
    branches:
      - main
  workflow_dispatch:
  schedule:
    - cron: '6 2 * * *'

# Permissions of the GITHUB_TOKEN:
#   contents: write          - the job commits and pushes the scraped data
#   pages / id-token: write  - the job deploys to GitHub Pages
permissions:
  contents: write
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run
# in progress and the latest queued one. Do NOT cancel in-progress runs: these
# production deployments should be allowed to complete.
concurrency:
  group: 'pages'
  cancel-in-progress: false

jobs:
  scrape:
    runs-on: ubuntu-latest
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      # The inline scripts below use fs.glob and two experimental Node flags
      # (--experimental-default-type, --experimental-network-imports).
      # NOTE(review): re-verify both flags before bumping node-version.
      - uses: actions/setup-node@v4
        with:
          node-version: '22'

      - name: Check out this repo
        uses: actions/checkout@v4
        with:
          # NOTE(review): github.head_ref is only populated for pull_request
          # events, which this workflow does not subscribe to; confirm this
          # input is still wanted.
          ref: ${{ github.head_ref }}

      # Downloads the htmlq binary into the working directory. It is removed
      # again in "Clean up after parsing" so that `git add -A` does not pick
      # it up.
      - name: Prepare to parse
        # An explicit `shell: bash` enables pipefail, so a failed download
        # fails the step instead of feeding an error page into tar.
        shell: bash
        run: |
          curl --fail --silent --show-error --location \
            https://github.com/mgdm/htmlq/releases/download/v0.4.0/htmlq-x86_64-linux.tar.gz \
            | tar -xzf - htmlq
          chmod u+x htmlq

      - name: Fetch latest data
        run: |
          curl --fail 'https://www.stolaf.edu/apps/mealtimes/index.cfm' > pipeline/1-raw.html

      # Keeps the text of the [data-name="column_content"] element (with
      # nested `div div` nodes removed) and drops the first 49 lines of it.
      # NOTE(review): the 49-line offset presumably skips non-script text;
      # confirm against the current page markup.
      - name: Extract data from HTML document
        shell: bash
        run: |
          ./htmlq -- '[data-name="column_content"]' < ./pipeline/1-raw.html \
            | ./htmlq --remove-nodes 'div div' --text \
            | tail -n +50 > ./pipeline/2-extracted.html

      - name: Clean up after parsing
        run: rm htmlq

      # Evaluates the extracted text as JavaScript against stub `Chart` and
      # `document` objects, and collects every chart config that was
      # registered under an id starting with "myChart".
      - name: Convert extracted chart configs to JSON
        shell: bash
        run: |
          node --experimental-default-type=module <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';

          // Stub: records the config passed to `new Chart(id, config)` on globalThis.
          class Chart {
            constructor(id, config) {
              globalThis[id] = config;
            }
          }
          // Stub: getElementById returns the id itself, so Chart receives the id string.
          let document = { getElementById: (id) => id };

          // SECURITY: this executes text scraped from a third-party page, in a job
          // whose token has `contents: write`. Direct eval is required so the
          // evaluated code can see the local Chart/document stubs.
          // TODO(review): consider extracting the data without executing it.
          eval(await fs.readFile('./pipeline/2-extracted.html', 'utf-8'));

          const datasets = Object.entries(globalThis)
            .filter(([key]) => key.startsWith('myChart'))
            .map(([, config]) => config);

          await fs.writeFile(
            path.join('pipeline', '3-converted.json'),
            JSON.stringify(datasets, null, '\t'),
          );
          EOF

      # Turns each chart config into {date: 'YYYY-MM-DD', times: {'HH:mm:ss': value}}.
      - name: Massage the data into a consistent format
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import assert from 'node:assert/strict';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';
          import customParseFormat from 'https://esm.sh/dayjs@1.11.11/plugin/customParseFormat';
          dayjs.extend(customParseFormat);

          const now = dayjs();
          // Pairs up a[i] with b[i], truncating to the shorter of the two arrays.
          const zip = (a, b) => Array(Math.min(b.length, a.length)).fill().map((_, i) => [a[i], b[i]]);

          const data = JSON.parse(await fs.readFile('./pipeline/3-converted.json', 'utf-8'));
          assert(Array.isArray(data));

          const output = data.map((dataset) => {
            const { labels, datasets } = dataset.data;
            assert(datasets.length === 1);
            const { label: rawDate, data: values } = datasets[0];

            // "label" (aka the date) looks like "Monday, May 20"; strip off the
            // day-of-week part so DayJS can parse it. Fail loudly on any other
            // shape: dayjs(undefined, ...) would otherwise silently mean "now".
            const [, monthAndDay] = String(rawDate).split(',');
            assert(monthAndDay, `unexpected chart label: ${JSON.stringify(rawDate)}`);
            let parsedDate = dayjs(monthAndDay.trim(), ['MMMM D', 'MMMM DD']);
            assert(parsedDate.isValid(), `unparseable chart date: ${JSON.stringify(rawDate)}`);

            if (parsedDate.isAfter(now)) {
              // At the end of the year, we don't want to start writing to Y+1's
              // December folder by accident.
              parsedDate = parsedDate.subtract(1, 'year');
            }

            // "labels", the array of chart labels, re-uses '07:00:00' for both
            // 7am and 7pm. Once the hour goes backwards relative to the previous
            // (already corrected) label, we are past noon: shift into the afternoon.
            let lastHour = 0;
            const fixedLabels = labels.map((label) => {
              let parsedLabel = dayjs(label, 'hh:mm:ss');
              assert(parsedLabel.isValid(), `unparseable time label: ${JSON.stringify(label)}`);
              // Give this time the full date.
              parsedLabel = parsedLabel
                .year(parsedDate.year())
                .month(parsedDate.month())
                .date(parsedDate.date());
              if (lastHour > parsedLabel.hour()) {
                parsedLabel = parsedLabel.add(12, 'hour');
              }
              lastHour = parsedLabel.hour();
              return parsedLabel.format('HH:mm:ss');
            });

            return {
              date: parsedDate.format('YYYY-MM-DD'),
              times: Object.fromEntries(zip(fixedLabels, values)),
            };
          });

          await fs.writeFile(
            path.join('pipeline', '4-massaged.json'),
            JSON.stringify(output, null, '\t'),
          );
          EOF

      # Writes data/raw/YYYY/MM/YYYY-MM-DD.json for every day in 4-massaged.json.
      - name: Parse data into one-file-per-day format
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';

          const data = JSON.parse(await fs.readFile('./pipeline/4-massaged.json', 'utf-8'));

          for (const { date, times } of data) {
            const parsedDate = dayjs(date);
            const folderName = path.join('data', 'raw', parsedDate.format('YYYY'), parsedDate.format('MM'));
            await fs.mkdir(folderName, { recursive: true });
            await fs.writeFile(
              path.join(folderName, parsedDate.format('YYYY-MM-DD') + '.json'),
              JSON.stringify({ date, times }, null, '\t'),
            );
          }
          EOF

      # For every file under data/raw/, sums the values of all entries whose
      # key starts with a given hour and writes the 24 hourly totals to
      # data/summary/YYYY/MM/YYYY-MM-DD.json.
      - name: Summarize parsed data into hourly information
        shell: bash
        run: |
          node --experimental-default-type=module --experimental-network-imports <<'EOF'
          import fs from 'node:fs/promises';
          import path from 'node:path';
          import dayjs from 'https://esm.sh/dayjs@1.11.11';

          for await (const file of fs.glob('data/raw/**/*.json')) {
            const { date, times: inputTimes } = JSON.parse(await fs.readFile(file, 'utf-8'));
            const parsedDate = dayjs(date);

            const times = Object.fromEntries(Array(24).fill().map((_, hour) => {
              const stringHour = String(hour).padStart(2, '0');
              const hourTotal = Object.entries(inputTimes)
                .filter(([key]) => key.startsWith(`${stringHour}:`))
                .map(([, value]) => value)
                .reduce((a, b) => a + b, 0);
              return [`${stringHour}:00`, hourTotal];
            }));

            const folderName = path.join('data', 'summary', parsedDate.format('YYYY'), parsedDate.format('MM'));
            await fs.mkdir(folderName, { recursive: true });
            await fs.writeFile(
              path.join(folderName, parsedDate.format('YYYY-MM-DD') + '.json'),
              JSON.stringify({ date, times }, null, '\t'),
            );
          }
          EOF

      - name: Commit and push if anything changed
        run: |
          git config user.name "Automated"
          git config user.email "actions@users.noreply.github.com"
          git add -A
          # Skip the commit only when nothing is staged; a genuine commit or
          # push failure should still fail the step.
          if git diff --cached --quiet; then
            echo "No changes to commit"
            exit 0
          fi
          git commit -m "Latest data: $(date -u)"
          git push

      - name: Setup Pages
        uses: actions/configure-pages@v5

      # Summary files are named YYYY-MM-DD.json under YYYY/MM/, so a plain
      # lexical sort is chronological; the newest 14 are slurped into one
      # array. This runs after the commit step, so two-weeks.json is only
      # part of the Pages artifact and is not committed by this run.
      - name: Generate two-weeks artifact
        run: |
          jq -s . $(ls data/summary/*/*/*.json | sort | tail -n 14) > data/two-weeks.json

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          # Upload only the data directory
          path: './data/'

      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4