Skip to content

Commit

Permalink
It's always after you publish, isn't it? Also bumping the node engine version in package.json to avoid warnings about node 18's new fetch API.
Browse files Browse the repository at this point in the history
  • Loading branch information
eaton committed Jan 28, 2023
1 parent 6c6a3e5 commit 5b28033
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 21 deletions.
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"name": "spidergram",
"version": "0.8.2",
"version": "0.8.3",
"description": "Structural analysis tools for complex web sites",
"main": "./dist/index.js",
"exports": "./dist/index.js",
"types": "./dist/index.d.ts",
"type": "module",
"engines": {
"node": "^18.0.0"
"node": "^18.1.0"
},
"bin": {
"spidergram": "./bin/run.js"
Expand Down
62 changes: 43 additions & 19 deletions src/cli/commands/urls.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import { Flags } from '@oclif/core';
import { NormalizedUrlSet } from '@autogram/url-tools';
import { CLI, Query, SgCommand, aql, HierarchyTools, TextTools } from '../../index.js';
import {
CLI,
Query,
SgCommand,
aql,
HierarchyTools,
TextTools,
} from '../../index.js';
import { URL_WITH_COMMAS_REGEX } from 'crawlee';
import { readFile } from 'fs/promises';
import minimatch from 'minimatch';
Expand Down Expand Up @@ -61,7 +68,8 @@ export default class Urls extends SgCommand {
}),
hide: Flags.string({
summary: 'URLs matching this string will be hidden from view',
description: "Both --hide and --highlight use glob-style wildcards; '**/*cnn.com*' will match content on CNN or one of its domains; '**/news*' would only display the news directory and its descendents, and so on.",
description:
"Both --hide and --highlight use glob-style wildcards; '**/*cnn.com*' will match content on CNN or one of its domains; '**/news*' would only display the news directory and its descendents, and so on.",
dependsOn: ['tree'],
required: false,
helpGroup: 'FORMAT',
Expand Down Expand Up @@ -109,11 +117,13 @@ export default class Urls extends SgCommand {
}),
};

static args = [{
name: 'input',
description: 'A database collection, local filename, or remote URL',
default: 'resources'
}]
static args = [
{
name: 'input',
description: 'A database collection, local filename, or remote URL',
default: 'resources',
},
];

async run() {
const { args, flags } = await this.parse(Urls);
Expand All @@ -126,13 +136,13 @@ export default class Urls extends SgCommand {

if (isParsableUrl(args.input)) {
const responseData = await fetch(new URL(args.input))
.then(response => response.text() )
.then(response => response.text())
.catch(reason => {
if (reason instanceof Error) this.error(reason.message);
else this.error("An error occurred loading the URL.");
else this.error('An error occurred loading the URL.');
});
rawUrls = responseData.match(URL_WITH_COMMAS_REGEX) || [];
} else if (args.input.indexOf('.') !== -1) {
rawUrls = responseData.match(URL_WITH_COMMAS_REGEX) || [];
} else if (args.input.indexOf('.') !== -1) {
const urlFile = await readFile(args.input)
.then(buffer => buffer.toString())
.catch(() => this.error(`File ${args.input} couldn't be opened`));
Expand Down Expand Up @@ -177,10 +187,16 @@ export default class Urls extends SgCommand {
summary['Hidden URLs'] = rawUrls.length - filteredUrls.length;
}
if (urls.unparsable.size) {
summary['Unparsable Urls'] = flags.unparsable ? [...urls.unparsable] : urls.unparsable.size;
summary['Unparsable Urls'] = flags.unparsable
? [...urls.unparsable]
: urls.unparsable.size;
}
if ((urls.size - webUrls.length) > 0) {
summary['Non-Web URLs'] = flags.nonweb ? [...urls].filter(url => !['https:', 'http:'].includes(url.protocol)).map(url => url.href) : urls.size - webUrls.length;
if (urls.size - webUrls.length > 0) {
summary['Non-Web URLs'] = flags.nonweb
? [...urls]
.filter(url => !['https:', 'http:'].includes(url.protocol))
.map(url => url.href)
: urls.size - webUrls.length;
}

const output: string[] = [];
Expand Down Expand Up @@ -222,11 +238,15 @@ export default class Urls extends SgCommand {
};
}

const hierarchy = new HierarchyTools.UrlHierarchyBuilder(treeOptions).add(webUrls);
const hierarchy = new HierarchyTools.UrlHierarchyBuilder(treeOptions).add(
webUrls,
);
const orphans = hierarchy.items.filter(item => item.isOrphan).length;
if (orphans > 0) {
if (flags.orphans) {
summary['Orphaned URLs'] = hierarchy.items.filter(item => item.isOrphan).map(orphan => orphan.data.url.toString());
summary['Orphaned URLs'] = hierarchy.items
.filter(item => item.isOrphan)
.map(orphan => orphan.data.url.toString());
} else {
summary['Orphaned URLs'] = orphans;
}
Expand All @@ -238,9 +258,13 @@ export default class Urls extends SgCommand {
summaryLines.push('# URL Summary');
for (const [bullet, content] of Object.entries(summary)) {
if (typeof content === 'number') {
summaryLines.push(`- **${bullet}**: ${content.toLocaleString().trim()}`);
summaryLines.push(
`- **${bullet}**: ${content.toLocaleString().trim()}`,
);
} else {
summaryLines.push(`- **${bullet}**: ${TextTools.joinOxford(content).trim()}`);
summaryLines.push(
`- **${bullet}**: ${TextTools.joinOxford(content).trim()}`,
);
}
}
output.push(summaryLines.join('\n'));
Expand Down Expand Up @@ -268,4 +292,4 @@ function isParsableUrl(input: string) {
} catch {
return false;
}
}
}

0 comments on commit 5b28033

Please sign in to comment.