From 82401590f7032d35d3b70174079f53b7b4b54c89 Mon Sep 17 00:00:00 2001 From: Zeke Sikelianos Date: Wed, 20 Nov 2019 14:57:57 -0800 Subject: [PATCH] feat: add `filter` function as an option to linkinator.check() (#120) --- README.md | 2 +- src/index.ts | 35 ++++++++++++++++++++++++--------- test/fixtures/filter/index.html | 7 +++++++ test/test.ts | 16 +++++++++++++++ 4 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 test/fixtures/filter/index.html diff --git a/README.md b/README.md index 48fe7b15..8a284e12 100644 --- a/README.md +++ b/README.md @@ -127,7 +127,7 @@ Asynchronous method that runs a site wide scan. Options come in the form of an o - `concurrency` (number) - The number of connections to make simultaneously. Defaults to 100. - `port` (number) - When the `path` is provided as a local path on disk, the `port` on which to start the temporary web server. Defaults to a random high range order port. - `recurse` (boolean) - By default, all scans are shallow. Only the top level links on the requested page will be scanned. By setting `recurse` to `true`, the crawler will follow all links on the page, and continue scanning links **on the same domain** for as long as it can go. Results are cached, so no worries about loops. -- `linksToSkip` (array) - An array of regular expression strings that should be skipped during the scan. +- `linksToSkip` (array | function) - An array of regular expression strings that should be skipped, OR an async function that's called for each link with the link URL as its only argument. Return a Promise that resolves to `true` to skip the link or `false` to check it. #### linkinator.LinkChecker() Constructor method that can be used to create a new `LinkChecker` instance. This is particularly useful if you want to receive events as the crawler crawls. 
Exposes the following events: diff --git a/src/index.ts b/src/index.ts index fdd5efbf..bee46a24 100644 --- a/src/index.ts +++ b/src/index.ts @@ -16,7 +16,7 @@ export interface CheckOptions { port?: number; path: string; recurse?: boolean; - linksToSkip?: string[]; + linksToSkip?: string[] | ((link: string) => Promise<boolean>); } export enum LinkState { @@ -137,14 +137,11 @@ export class LinkChecker extends EventEmitter { return; } - // Check for user configured links that should be skipped - const skips = opts.checkOptions - .linksToSkip!.map(linkToSkip => { - return new RegExp(linkToSkip).test(opts.url.href); - }) - .filter(match => !!match); - - if (skips.length > 0) { + // Check for a user-configured function to filter out links + if ( + typeof opts.checkOptions.linksToSkip === 'function' && + (await opts.checkOptions.linksToSkip(opts.url.href)) + ) { const result: LinkResult = { url: opts.url.href, state: LinkState.SKIPPED, @@ -155,6 +152,26 @@ export class LinkChecker extends EventEmitter { return; } + // Check for a user-configured array of link regular expressions that should be skipped + if (Array.isArray(opts.checkOptions.linksToSkip)) { + const skips = opts.checkOptions.linksToSkip + .map(linkToSkip => { + return new RegExp(linkToSkip).test(opts.url.href); + }) + .filter(match => !!match); + + if (skips.length > 0) { + const result: LinkResult = { + url: opts.url.href, + state: LinkState.SKIPPED, + parent: opts.parent, + }; + opts.results.push(result); + this.emit('link', result); + return; + } + } + // Perform a HEAD or GET request based on the need to crawl let status = 0; let state = LinkState.BROKEN; diff --git a/test/fixtures/filter/index.html b/test/fixtures/filter/index.html new file mode 100644 index 00000000..a53e1b9d --- /dev/null +++ b/test/fixtures/filter/index.html @@ -0,0 +1,7 @@ + + +I'm good +I should be filtered +I should also be filtered + + diff --git a/test/test.ts b/test/test.ts index cc64361c..ad8a537b 100644 --- a/test/test.ts +++ 
b/test/test.ts @@ -45,6 +45,22 @@ describe('linkinator', () => { ); }); + it('should skip links if passed a linksToSkip function', async () => { + const scope = nock('https://good.com') + .head('/') + .reply(200); + const results = await check({ + path: 'test/fixtures/filter', + linksToSkip: link => Promise.resolve(link.includes('filterme')), + }); + assert.ok(results.passed); + assert.strictEqual( + results.links.filter(x => x.state === LinkState.SKIPPED).length, + 2 + ); + scope.done(); + }); + it('should report broken links', async () => { const scope = nock('http://fake.local') .head('/')