Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for TypeScript #4

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions .eslintrc
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
{
"extends": ["airbnb"],
"rules": {
"no-useless-escape": 0
}
"root": true,
"extends": [
"eslint:recommended"
],
"env": {
"es2021": true,
"node": true,
"mocha": true
},
"overrides": [
{
"files": [
"**/*.ts",
"**/*.tsx"
],
"env": {
"es2021": true,
"node": true
},
"extends": [
"eslint:recommended",
"plugin:@typescript-eslint/recommended"
],
"parser": "@typescript-eslint/parser",
"parserOptions": {
"ecmaVersion": 12,
"sourceType": "module"
},
"plugins": [
"@typescript-eslint"
],
"rules": {}
}
]
}
6 changes: 6 additions & 0 deletions .prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"trailingComma": "all",
"tabWidth": 2,
"semi": true,
"singleQuote": true
}
2 changes: 2 additions & 0 deletions dist/get.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/**
 * Requests `url` with an HTTP GET and resolves with the response body as text.
 *
 * @param url - Address to request.
 * @param timeout - Forwarded unchanged as the request's `timeout` option.
 * @returns The response body. The promise rejects with
 * `Error('No robots.txt')` when the response status code is not 200.
 */
declare const getRobots: (url: string, timeout: number) => Promise<string>;
export = getRobots;
60 changes: 60 additions & 0 deletions dist/get.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"use strict";
// Compiler-emitted helper (presumably TypeScript's down-levelled `async`
// support): creates the generator via `generator.apply(thisArg, _arguments)`,
// drives it to completion and returns a promise (P, defaulting to Promise) for
// its final value. An already defined `this.__awaiter` is reused when present.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wraps a non-P value in a P so `.then` can always be called on it.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a settled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Resume the generator by throwing the rejection reason into it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Done -> resolve with the generator's return value; otherwise wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and run it up to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Compiler-emitted helper (presumably TypeScript's down-levelled generator
// support): turns `body`, a function that switches on `_.label`, into an
// iterator-like object. An already defined `this.__generator` is reused.
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_` is the state object handed to `body` on every step (current label, a
// sent() accessor that rethrows when the last op was a throw, a stack of try
// regions and a stack of pending ops). f = "currently executing" flag,
// y = iterator being delegated to, t = scratch, g = the object returned.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
// next/throw/return forward to step() with opcode 0/1/2; when Symbol exists
// the object is also its own iterator.
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Re-entrant calls while a step is running are rejected.
if (f) throw new TypeError("Generator is already executing.");
// Loop until `_` is cleared (set to 0 below) or a value is returned to the caller.
while (_) try {
// While delegating to `y`, forward the op to it and hand back its result until it is done.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// 0 / 1: the op is remembered in `t` so `_.sent()` can return or rethrow it.
case 0: case 1: t = op; break;
// 4: hand op[1] to the caller as a not-done result and advance the label.
case 4: _.label++; return { value: op[1], done: false };
// 5: start delegating to the iterator in op[1].
case 5: _.label++; y = op[1]; op = [0]; continue;
// 7: resume the op that was parked on `_.ops` and leave the current try region.
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Remaining ops are resolved against the innermost try region, if any
// (NOTE(review): opcode meanings beyond what is visible here are not documented in this file).
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
// Run the next slice of the body; it returns the next op to process.
op = body.call(thisArg, _);
// An exception thrown by the body becomes op 6 and stops any delegation.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Ops with bit 1 or bit 4 set are rethrown; anything else finishes the iteration.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
// Interop shim for default imports: a module flagged with `__esModule` is
// returned untouched, anything else is wrapped as `{ default: mod }`.
// An already defined `this.__importDefault` takes precedence.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
var got_1 = __importDefault(require("got"));
// Requests `url` with got and resolves with the response body as text.
// Rejects with Error('No robots.txt') when the status code is not 200.
// `timeout` is forwarded unchanged as got's `timeout` option.
var getRobots = function (url, timeout) { return __awaiter(void 0, void 0, void 0, function () {
var response;
return __generator(this, function (_a) {
switch (_a.label) {
// Step 0: issue the GET request and yield until it settles.
case 0: return [4 /*yield*/, (0, got_1.default)(url, {
method: 'GET',
timeout: timeout,
// Presumably stops got from throwing on error status codes so the
// explicit status check below decides the outcome - confirm against got's docs.
throwHttpErrors: false,
responseType: 'text',
})];
// Step 1: the request has settled; `_a.sent()` yields the response.
case 1:
response = _a.sent();
// Only an exact 200 counts as "robots.txt found".
if (response.statusCode !== 200)
throw new Error('No robots.txt');
return [2 /*return*/, response.body];
}
});
}); };
module.exports = getRobots;
4 changes: 4 additions & 0 deletions dist/index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import Robots from './robots';
import { RobotOptions } from './types';
/**
 * Module export: a factory that constructs a `Robots` instance.
 *
 * @param opts - Optional subset of `RobotOptions`, passed straight to the
 * `Robots` constructor.
 * @returns A new `Robots` instance.
 */
declare const _default: (opts?: Partial<RobotOptions> | undefined) => Robots;
export = _default;
6 changes: 6 additions & 0 deletions dist/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"use strict";
// Interop shim for default imports: a module flagged with `__esModule` is
// returned untouched, anything else is wrapped as `{ default: mod }`.
// An already defined `this.__importDefault` takes precedence.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
var robots_1 = __importDefault(require("./robots"));
module.exports = function (opts) { return new robots_1.default(opts); };
3 changes: 3 additions & 0 deletions dist/parser.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import { ParsedRobotsTxt } from './types';
/**
 * Parses the raw text of a robots.txt file.
 *
 * @param rawString - Contents of the robots.txt file.
 * @returns The parsed representation (`ParsedRobotsTxt`, declared in `./types`).
 */
declare const parser: (rawString: string) => ParsedRobotsTxt;
export = parser;
126 changes: 126 additions & 0 deletions dist/parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
"use strict";
// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
// Compiler-emitted helper (presumably for down-levelled array spread): returns
// `to` concatenated with the elements of `from`. An already defined
// `this.__spreadArray` is reused when present.
var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
// When `pack` is truthy (or only two arguments were passed), scan `from` for
// missing indices so holes are copied as explicit entries.
if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {
// `ar` is created lazily at the first hole; from then on every element is copied into it.
if (ar || !(i in from)) {
if (!ar) ar = Array.prototype.slice.call(from, 0, i);
ar[i] = from[i];
}
}
// No hole found (or no scan requested): fall back to a plain slice of `from`.
return to.concat(ar || Array.prototype.slice.call(from));
};
// Field names recognised by the parser. They are written in lower case because
// record fields are lower-cased before being compared against them.
var USER_AGENT = 'user-agent';
var ALLOW = 'allow';
var DISALLOW = 'disallow';
var SITEMAP = 'sitemap';
var CRAWL_DELAY = 'crawl-delay';
var HOST = 'host';
// Patterns used to tidy up the raw file contents.
// A '#' and everything after it on the same line.
var comments = /#.*$/gm;
// A plain string, so String.prototype.replace only touches its first occurrence.
var whitespace = ' ';
// One or more consecutive CR/LF characters.
var lineEndings = /[\r\n]+/g;
// "field: value" shaped fragments, used by the fallback splitter.
var recordSlices = /(\w+-)?\w+:\s\S*/g;

// Strips every '#' comment from the text.
function cleanComments(rawString) {
    var withoutComments = rawString.replace(comments, '');
    return withoutComments;
}

// Removes the first space character found, then trims both ends.
function cleanSpaces(rawString) {
    var withoutFirstSpace = rawString.replace(whitespace, '');
    return withoutFirstSpace.trim();
}

// Splits the text into lines on runs of CR/LF characters.
function splitOnLines(string) {
    var pieces = string.split(lineEndings);
    return pieces;
}
// Fallback splitter: pulls every "field: value" shaped fragment out of the
// text and cleans each one up. Text containing an "<html>" tag yields no records.
var robustSplit = function (string) {
    if (string.includes('<html>')) {
        return [];
    }
    var matched = string.match(recordSlices);
    // match() gives null when nothing matched; treat that as "no records".
    var records = matched !== null && matched !== void 0 ? matched : [];
    return __spreadArray([], records, true).map(cleanSpaces);
};
// Splits one robots.txt line into `{ field, value }`.
//   field - text before the first colon, lower-cased and trimmed.
//   value - text after the first colon, trimmed, case preserved.
// A line without any colon is not a record and yields empty field and value.
var parseRecord = function (line) {
    // The first colon is the field delimiter; later colons (e.g. in URLs)
    // belong to the value.
    var firstColonI = line.indexOf(':');
    // With no colon, indexOf returns -1 and slice(0, -1) would chop the last
    // character off and report the rest as a field ("Sitemaps" -> "sitemap"),
    // so such lines are reported as empty instead.
    if (firstColonI === -1) {
        return { field: '', value: '' };
    }
    return {
        // Fields are non-case sensitive, therefore lowercase them.
        field: line.slice(0, firstColonI).toLowerCase().trim(),
        // Values are case sensitive (e.g. urls) and therefore leave alone.
        value: line.slice(firstColonI + 1).trim(),
    };
};
// Converts a robots.txt path pattern into a RegExp (multiline flag).
// Regex metacharacters are taken literally, '*' matches any run of characters
// and a trailing '$' anchors the match to the end.
var parsePattern = function (pattern) {
    // Escape everything that is special in a regex, except '*'.
    var escaped = pattern.replace(/[-[\]/{}()+?.\\^$|]/g, '\\$&');
    // Expand the wildcard.
    var withWildcards = escaped.replace(/\*/g, '.*');
    // A trailing '$' was escaped to '\$' above; turn it back into an anchor.
    var source = withWildcards.replace(/\\\$$/, '$');
    return new RegExp(source, 'm');
};
// Builds the entry stored for an allow/disallow rule: the length of the raw
// rule text as its specificity, plus the rule compiled to a RegExp.
var groupMemberRecord = function (value) {
    var record = {
        specificity: value.length,
        path: parsePattern(value),
    };
    return record;
};
// Parses the text of a robots.txt file into a plain object.
//
// The result always has a `sitemaps` array (duplicates removed), may have a
// `host` string (the first Host record wins) and has one entry per user-agent,
// keyed by the lower-cased agent name, holding `allow`, `disallow` and
// `crawlDelay`.
//
// rawString - raw robots.txt contents.
var parser = function (rawString) {
    var lines = splitOnLines(cleanSpaces(cleanComments(rawString)));
    // Fallback to the record based split method if we find only one line.
    if (lines.length === 1) {
        lines = robustSplit(cleanComments(rawString));
    }
    var robotsObj = {
        sitemaps: [],
    };
    var hasOwn = Object.prototype.hasOwnProperty;
    // Agent whose group is currently being filled; '' means "no valid group".
    var agent = '';
    lines.forEach(function (line) {
        var record = parseRecord(line);
        // Bot names are non-case sensitive.
        var recordValue = record.value.toLowerCase();
        switch (record.field) {
            case USER_AGENT:
                if (recordValue.length === 0 ||
                    recordValue === 'sitemaps' ||
                    recordValue === HOST) {
                    // Malformed user-agent, ignore its rules. Names equal to
                    // the result's own `sitemaps` / `host` keys are rejected
                    // too: storing a group under them would clobber that data
                    // (and break the sitemap handling below).
                    agent = '';
                }
                else {
                    agent = recordValue;
                    // Only create the group the first time the agent is seen,
                    // so a group that appears again later in the file adds to
                    // the earlier rules instead of wiping them.
                    if (!hasOwn.call(robotsObj, agent)) {
                        robotsObj[agent] = {
                            allow: [],
                            disallow: [],
                            crawlDelay: 0,
                        };
                    }
                }
                break;
            // https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt#order-of-precedence-for-group-member-records
            case ALLOW:
                if (agent.length > 0 && record.value.length > 0) {
                    robotsObj[agent].allow.push(groupMemberRecord(record.value));
                }
                break;
            case DISALLOW:
                if (agent.length > 0 && record.value.length > 0) {
                    robotsObj[agent].disallow.push(groupMemberRecord(record.value));
                }
                break;
            // Non standard but supported by Google, therefore included.
            case SITEMAP:
                if (record.value.length > 0) {
                    robotsObj.sitemaps.push(record.value);
                }
                break;
            case CRAWL_DELAY:
                if (agent.length > 0) {
                    var delay = Number.parseInt(record.value, 10);
                    // A missing or non-numeric value must not leave NaN
                    // behind; keep whatever delay the group already has.
                    if (!Number.isNaN(delay)) {
                        robotsObj[agent].crawlDelay = delay;
                    }
                }
                break;
            // Non standard but included for completeness.
            case HOST:
                if (!robotsObj.host) {
                    robotsObj.host = record.value;
                }
                break;
            default:
                break;
        }
    });
    // Return only unique sitemaps.
    robotsObj.sitemaps = robotsObj.sitemaps.filter(function (val, i, s) { return s.indexOf(val) === i; });
    return robotsObj;
};
module.exports = parser;
27 changes: 27 additions & 0 deletions dist/robots.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { ParsedRobotsTxt, RobotOptions, RobotsAgent } from './types';
/**
 * Type declaration for the `Robots` class; the implementation lives in the
 * compiled `robots` module and is not part of this file.
 *
 * Most query methods are declared twice: a form taking an optional callback
 * and a `*Sync` counterpart that returns the value directly.
 */
declare class Robots {
// NOTE(review): presumably identifies the robots.txt currently in use - confirm against the implementation.
active: string;
// Parsed robots.txt data by string key (presumably the domain, cf. `isCached(domain)` - confirm).
robotsCache: Record<string, ParsedRobotsTxt>;
opts: RobotOptions;
/** @param opts - Optional subset of `RobotOptions`. */
constructor(opts?: Partial<RobotOptions>);
/** Returns a `RobotsAgent`, or `false` when none is available. */
getRecordsForAgent(): false | RobotsAgent;
canVisit(url: string, botGroup: RobotsAgent): boolean;
parseRobots(url: string, string: string): void;
fetch(link: string): Promise<ParsedRobotsTxt>;
isCached(domain: string): boolean;
useRobotsFor(url: string, callback?: () => unknown): unknown;
canCrawl(url: string, callback?: (crawlable: boolean) => unknown): unknown;
canCrawlSync(url: string): boolean;
getSitemaps(callback?: (sitemaps: string[]) => unknown): unknown;
getSitemapsSync(): string[];
getCrawlDelay(callback?: (crawlDelay: number) => unknown): unknown;
getCrawlDelaySync(): number;
getCrawlableLinks(linkArray: string[], callback?: (crawlableLinks: string[]) => unknown): unknown;
// Unlike the callback form above, the sync form also accepts a single string.
getCrawlableLinksSync(linkArray: string[] | string): string[];
// NOTE(review): the only callback-style method declared as returning a Promise
// rather than `unknown` - confirm whether the others should match.
getPreferredHost(callback?: (host?: string) => unknown): Promise<string | undefined>;
getPreferredHostSync(): string | undefined;
setUserAgent(agent: string): void;
setAllowOnNeutral(allow: boolean): void;
clearCache(): void;
}
export = Robots;
Loading