Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bracket parsing refactor and support for 「」『』<> brackets #463

Merged
merged 6 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 63 additions & 98 deletions packages/linkifyjs/src/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,31 +43,37 @@ export function init({ groups }) {
tk.SLASH,
tk.SYM,
tk.TILDE,
tk.UNDERSCORE
tk.UNDERSCORE,
]);

// Types of tokens that can follow a URL and be part of the query string
// but cannot be the very last characters
// Characters that cannot appear in the URL at all should be excluded
const qsNonAccepting = [
tk.APOSTROPHE,
tk.CLOSEANGLEBRACKET,
tk.CLOSEBRACE,
tk.CLOSEBRACKET,
tk.CLOSEPAREN,
tk.FULLWIDTH_CLOSEPAREN,
tk.COLON,
tk.COMMA,
tk.DOT,
tk.EXCLAMATION,
tk.QUERY,
tk.QUOTE,
tk.SEMI,
tk.OPENANGLEBRACKET,
tk.CLOSEANGLEBRACKET,
tk.OPENBRACE,
tk.CLOSEBRACE,
tk.CLOSEBRACKET,
tk.OPENBRACKET,
tk.OPENPAREN,
tk.FULLWIDTH_OPENPAREN,
tk.QUERY,
tk.QUOTE,
tk.SEMI
tk.CLOSEPAREN,
tk.FULLWIDTHLEFTPAREN,
tk.FULLWIDTHRIGHTPAREN,
tk.LEFTCORNERBRACKET,
tk.RIGHTCORNERBRACKET,
tk.LEFTWHITECORNERBRACKET,
tk.RIGHTWHITECORNERBRACKET,
tk.FULLWIDTHLESSTHAN,
tk.FULLWIDTHGREATERTHAN,
];

// For addresses without the mailto prefix
Expand All @@ -79,11 +85,11 @@ export function init({ groups }) {
tk.BACKSLASH,
tk.BACKTICK,
tk.CARET,
tk.CLOSEBRACE,
tk.DOLLAR,
tk.EQUALS,
tk.HYPHEN,
tk.OPENBRACE,
tk.CLOSEBRACE,
tk.PERCENT,
tk.PIPE,
tk.PLUS,
Expand All @@ -92,7 +98,7 @@ export function init({ groups }) {
tk.SLASH,
tk.SYM,
tk.TILDE,
tk.UNDERSCORE
tk.UNDERSCORE,
];

// The universal starting state.
Expand All @@ -104,7 +110,9 @@ export function init({ groups }) {
ta(Localpart, localpartAccepting, Localpart);
ta(Localpart, groups.domain, Localpart);

const Domain = makeState(), Scheme = makeState(), SlashScheme = makeState();
const Domain = makeState(),
Scheme = makeState(),
SlashScheme = makeState();
ta(Start, groups.domain, Domain); // parsed string ends with a potential domain name (A)
ta(Start, groups.scheme, Scheme); // e.g., 'mailto'
ta(Start, groups.slashscheme, SlashScheme); // e.g., 'http'
Expand Down Expand Up @@ -144,7 +152,7 @@ export function init({ groups }) {

// Final possible email states
const EmailColon = tt(Email, tk.COLON); // URL followed by colon (potential port number here)
/*const EmailColonPort = */ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port numner
/*const EmailColonPort = */ ta(EmailColon, groups.numeric, mtk.Email); // URL followed by colon and port number

// Account for dots and hyphens. Hyphens are usually parts of domain names
// (but not TLDs)
Expand Down Expand Up @@ -206,86 +214,46 @@ export function init({ groups }) {
ta(UriPrefix, qsAccepting, Url);
tt(UriPrefix, tk.SLASH, Url);

// URL, followed by an opening bracket
const UrlOpenbrace = tt(Url, tk.OPENBRACE); // URL followed by {
const UrlOpenbracket = tt(Url, tk.OPENBRACKET); // URL followed by [
const UrlOpenanglebracket = tt(Url, tk.OPENANGLEBRACKET); // URL followed by <
const UrlOpenparen = tt(Url, tk.OPENPAREN); // URL followed by (
const UrlFullwidthOpenparen = tt(Url, tk.FULLWIDTH_OPENPAREN); // URL followed by (

tt(UrlNonaccept, tk.OPENBRACE, UrlOpenbrace);
tt(UrlNonaccept, tk.OPENBRACKET, UrlOpenbracket);
tt(UrlNonaccept, tk.OPENANGLEBRACKET, UrlOpenanglebracket);
tt(UrlNonaccept, tk.OPENPAREN, UrlOpenparen);
tt(UrlNonaccept, tk.FULLWIDTH_OPENPAREN, UrlFullwidthOpenparen);

// Closing bracket component. This character WILL be included in the URL
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);
tt(UrlOpenbracket, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracket, tk.CLOSEANGLEBRACKET, Url);
tt(UrlOpenparen, tk.CLOSEPAREN, Url);
tt(UrlFullwidthOpenparen, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbrace, tk.CLOSEBRACE, Url);

// URL that beings with an opening bracket, followed by a symbols.
// Note that the final state can still be `UrlOpenbrace` (if the URL only
// has a single opening bracket for some reason).
const UrlOpenbraceQ = makeState(mtk.Url); // URL followed by { and some symbols that the URL can end it
const UrlOpenbracketQ = makeState(mtk.Url); // URL followed by [ and some symbols that the URL can end it
const UrlOpenanglebracketQ = makeState(mtk.Url); // URL followed by < and some symbols that the URL can end it
const UrlOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
const UrlFullwidthOpenparenQ = makeState(mtk.Url); // URL followed by ( and some symbols that the URL can end it
ta(UrlOpenbrace, qsAccepting, UrlOpenbraceQ);
ta(UrlOpenbracket, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracket, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparen, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparen, qsAccepting, UrlFullwidthOpenparenQ);

const UrlOpenbraceSyms = makeState(); // UrlOpenbrace followed by some symbols it cannot end it
const UrlOpenbracketSyms = makeState(); // UrlOpenbracketQ followed by some symbols it cannot end it
const UrlOpenanglebracketSyms = makeState(); // UrlOpenanglebracketQ followed by some symbols it cannot end it
const UrlOpenparenSyms = makeState(); // UrlOpenparenQ followed by some symbols it cannot end it
const UrlFullwidthOpenparenSyms = makeState(); // UrlFullwidthOpenparenQ followed by some symbols it cannot end it
ta(UrlOpenbrace, qsNonAccepting);
ta(UrlOpenbracket, qsNonAccepting);
ta(UrlOpenanglebracket, qsNonAccepting);
ta(UrlOpenparen, qsNonAccepting);
ta(UrlFullwidthOpenparen, qsNonAccepting);

// URL that begins with an opening bracket, followed by some symbols
ta(UrlOpenbraceQ, qsAccepting, UrlOpenbraceQ);
ta(UrlOpenbracketQ, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketQ, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenQ, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);
ta(UrlOpenbraceQ, qsNonAccepting, UrlOpenbraceQ);
ta(UrlOpenbracketQ, qsNonAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketQ, qsNonAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenQ, qsNonAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenQ, qsAccepting, UrlFullwidthOpenparenQ);

ta(UrlOpenbraceSyms, qsAccepting, UrlOpenbraceSyms);
ta(UrlOpenbracketSyms, qsAccepting, UrlOpenbracketQ);
ta(UrlOpenanglebracketSyms, qsAccepting, UrlOpenanglebracketQ);
ta(UrlOpenparenSyms, qsAccepting, UrlOpenparenQ);
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenQ);
ta(UrlOpenbraceSyms, qsNonAccepting, UrlOpenbraceSyms);
ta(UrlOpenbracketSyms, qsNonAccepting, UrlOpenbracketSyms);
ta(UrlOpenanglebracketSyms, qsNonAccepting, UrlOpenanglebracketSyms);
ta(UrlOpenparenSyms, qsNonAccepting, UrlOpenparenSyms);
ta(UrlFullwidthOpenparenSyms, qsAccepting, UrlFullwidthOpenparenSyms);

// Close brace/bracket to become regular URL
tt(UrlOpenbracketQ, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracketQ, tk.CLOSEANGLEBRACKET, Url);
tt(UrlOpenparenQ, tk.CLOSEPAREN, Url);
tt(UrlFullwidthOpenparenQ, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbraceQ, tk.CLOSEBRACE, Url);
tt(UrlOpenbracketSyms, tk.CLOSEBRACKET, Url);
tt(UrlOpenanglebracketSyms, tk.CLOSEANGLEBRACKET, Url);
tt(UrlFullwidthOpenparenSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
tt(UrlOpenbraceSyms, tk.CLOSEPAREN, Url);
tt(UrlOpenbraceSyms, tk.FULLWIDTH_CLOSEPAREN, Url);
const bracketPairs = [
[tk.OPENBRACE, tk.CLOSEBRACE], // {}
[tk.OPENBRACKET, tk.CLOSEBRACKET], // []
[tk.OPENPAREN, tk.CLOSEPAREN], // ()
[tk.OPENANGLEBRACKET, tk.CLOSEANGLEBRACKET], // <>
[tk.FULLWIDTHLEFTPAREN, tk.FULLWIDTHRIGHTPAREN], // ()
[tk.LEFTCORNERBRACKET, tk.RIGHTCORNERBRACKET], // 「」
[tk.LEFTWHITECORNERBRACKET, tk.RIGHTWHITECORNERBRACKET], // 『』
[tk.FULLWIDTHLESSTHAN, tk.FULLWIDTHGREATERTHAN], // <>
];

for (let i = 0; i < bracketPairs.length; i++) {
const [OPEN, CLOSE] = bracketPairs[i];
const UrlOpen = tt(Url, OPEN); // URL followed by open bracket

// Continue not accepting for open brackets
tt(UrlNonaccept, OPEN, UrlOpen);

// Closing bracket component. This character WILL be included in the URL
tt(UrlOpen, CLOSE, Url);

// URL that begins with an opening bracket, followed by symbols.
// Note that the final state can still be `UrlOpen` (if the URL has a
// single opening bracket for some reason).
const UrlOpenQ = makeState(mtk.Url);
ta(UrlOpen, qsAccepting, UrlOpenQ);

const UrlOpenSyms = makeState(); // UrlOpen followed by some symbols the URL cannot end with
ta(UrlOpen, qsNonAccepting);

// URL that begins with an opening bracket, followed by some symbols
ta(UrlOpenQ, qsAccepting, UrlOpenQ);
ta(UrlOpenQ, qsNonAccepting, UrlOpenSyms);
ta(UrlOpenSyms, qsAccepting, UrlOpenQ);
ta(UrlOpenSyms, qsNonAccepting, UrlOpenSyms);

// Close brace/bracket to become regular URL
tt(UrlOpenQ, CLOSE, Url);
tt(UrlOpenSyms, CLOSE, Url);
}

tt(Start, tk.LOCALHOST, DomainDotTld); // localhost is a valid URL state
tt(Start, tk.NL, mtk.Nl); // single new line
Expand Down Expand Up @@ -323,10 +291,7 @@ export function run(start, input, tokens) {
textTokens.push(tokens[cursor++]);
}

while (cursor < len && (
nextState = secondState || state.go(tokens[cursor].t))
) {

while (cursor < len && (nextState = secondState || state.go(tokens[cursor].t))) {
// Get the next state
secondState = null;
state = nextState;
Expand Down
42 changes: 28 additions & 14 deletions packages/linkifyjs/src/scanner.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ const NL = '\n'; // New line character
const EMOJI_VARIATION = '\ufe0f'; // Variation selector, follows heart and others
const EMOJI_JOINER = '\u200d'; // zero-width joiner

let tlds = null, utlds = null; // don't change so only have to be computed once
let tlds = null,
utlds = null; // don't change so only have to be computed once

/**
* Scanner output token:
Expand Down Expand Up @@ -55,15 +56,21 @@ export function init(customSchemes = []) {
// States for special URL symbols that accept immediately after start
tt(Start, "'", tk.APOSTROPHE);
tt(Start, '{', tk.OPENBRACE);
tt(Start, '[', tk.OPENBRACKET);
tt(Start, '<', tk.OPENANGLEBRACKET);
tt(Start, '(', tk.OPENPAREN);
tt(Start, '(', tk.FULLWIDTH_OPENPAREN);
tt(Start, '}', tk.CLOSEBRACE);
tt(Start, '[', tk.OPENBRACKET);
tt(Start, ']', tk.CLOSEBRACKET);
tt(Start, '>', tk.CLOSEANGLEBRACKET);
tt(Start, '(', tk.OPENPAREN);
tt(Start, ')', tk.CLOSEPAREN);
tt(Start, ')', tk.FULLWIDTH_CLOSEPAREN);
tt(Start, '<', tk.OPENANGLEBRACKET);
tt(Start, '>', tk.CLOSEANGLEBRACKET);
tt(Start, '(', tk.FULLWIDTHLEFTPAREN);
tt(Start, ')', tk.FULLWIDTHRIGHTPAREN);
tt(Start, '「', tk.LEFTCORNERBRACKET);
tt(Start, '」', tk.RIGHTCORNERBRACKET);
tt(Start, '『', tk.LEFTWHITECORNERBRACKET);
tt(Start, '』', tk.RIGHTWHITECORNERBRACKET);
tt(Start, '<', tk.FULLWIDTHLESSTHAN);
tt(Start, '>', tk.FULLWIDTHGREATERTHAN);
tt(Start, '&', tk.AMPERSAND);
tt(Start, '*', tk.ASTERISK);
tt(Start, '@', tk.AT);
Expand Down Expand Up @@ -122,7 +129,10 @@ export function init(customSchemes = []) {
// Generates states for top-level domains
// Note that this is most accurate when tlds are in alphabetical order
const wordjr = [[re.ASCII_LETTER, Word]];
const uwordjr = [[re.ASCII_LETTER, null], [re.LETTER, UWord]];
const uwordjr = [
[re.ASCII_LETTER, null],
[re.LETTER, UWord],
];
for (let i = 0; i < tlds.length; i++) {
fastts(Start, tlds[i], tk.TLD, tk.WORD, wordjr);
}
Expand All @@ -145,7 +155,7 @@ export function init(customSchemes = []) {
addToGroups(tk.SLASH_SCHEME, { slashscheme: true, ascii: true }, groups);

// Register custom schemes. Assumes each scheme is asciinumeric with hyphens
customSchemes = customSchemes.sort((a, b) => a[0] > b[0] ? 1 : -1);
customSchemes = customSchemes.sort((a, b) => (a[0] > b[0] ? 1 : -1));
for (let i = 0; i < customSchemes.length; i++) {
const sch = customSchemes[i][0];
const optionalSlashSlash = customSchemes[i][1];
Expand Down Expand Up @@ -233,7 +243,7 @@ export function run(start, str) {
t: latestAccepting.t, // token type/name
v: str.slice(cursor - tokenLength, cursor), // string value
s: cursor - tokenLength, // start index
e: cursor // end index (excluding)
e: cursor, // end index (excluding)
});
}

Expand All @@ -258,10 +268,14 @@ export function stringToArray(str) {
while (index < len) {
let first = str.charCodeAt(index);
let second;
let char = first < 0xd800 || first > 0xdbff || index + 1 === len
|| (second = str.charCodeAt(index + 1)) < 0xdc00 || second > 0xdfff
? str[index] // single character
: str.slice(index, index + 2); // two-index characters
let char =
first < 0xd800 ||
first > 0xdbff ||
index + 1 === len ||
(second = str.charCodeAt(index + 1)) < 0xdc00 ||
second > 0xdfff
? str[index] // single character
: str.slice(index, index + 2); // two-index characters
result.push(char);
index += char.length;
}
Expand Down
25 changes: 16 additions & 9 deletions packages/linkifyjs/src/text.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ Identifiers for token outputs from the regexp scanner
******************************************************************************/

// A valid web domain token
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN
export const WORD = 'WORD'; // only contains a-z
export const UWORD = 'UWORD'; // contains letters other than a-z, used for IDN

// Special case of word
export const LOCALHOST = 'LOCALHOST';
Expand Down Expand Up @@ -36,16 +36,24 @@ export const WS = 'WS';
export const NL = 'NL'; // \n

// Opening/closing bracket classes
// TODO: Rename OPEN -> LEFT and CLOSE -> RIGHT in v5 to fit with Unicode names
// Also rename angle brackets to LESSTHAN and GREATERTHAN
export const OPENBRACE = 'OPENBRACE'; // {
export const OPENBRACKET = 'OPENBRACKET'; // [
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
export const OPENPAREN = 'OPENPAREN'; // (
export const CLOSEBRACE = 'CLOSEBRACE'; // }
export const OPENBRACKET = 'OPENBRACKET'; // [
export const CLOSEBRACKET = 'CLOSEBRACKET'; // ]
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
export const OPENPAREN = 'OPENPAREN'; // (
export const CLOSEPAREN = 'CLOSEPAREN'; // )
export const FULLWIDTH_OPENPAREN = 'FULLWIDTH_OPENPAREN'; // (
export const FULLWIDTH_CLOSEPAREN = 'FULLWIDTH_CLOSEPAREN'; // )
export const OPENANGLEBRACKET = 'OPENANGLEBRACKET'; // <
export const CLOSEANGLEBRACKET = 'CLOSEANGLEBRACKET'; // >
export const FULLWIDTHLEFTPAREN = 'FULLWIDTHLEFTPAREN'; // (
export const FULLWIDTHRIGHTPAREN = 'FULLWIDTHRIGHTPAREN'; // )
export const LEFTCORNERBRACKET = 'LEFTCORNERBRACKET'; // 「
export const RIGHTCORNERBRACKET = 'RIGHTCORNERBRACKET'; // 」
export const LEFTWHITECORNERBRACKET = 'LEFTWHITECORNERBRACKET'; // 『
export const RIGHTWHITECORNERBRACKET = 'RIGHTWHITECORNERBRACKET'; // 』
export const FULLWIDTHLESSTHAN = 'FULLWIDTHLESSTHAN'; // <
export const FULLWIDTHGREATERTHAN = 'FULLWIDTHGREATERTHAN'; // >

// Various symbols
export const AMPERSAND = 'AMPERSAND'; // &
Expand Down Expand Up @@ -79,4 +87,3 @@ export const EMOJI = 'EMOJI';

// Default token - anything that is not one of the above
export const SYM = 'SYM';

6 changes: 4 additions & 2 deletions test/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ if [[ "$1" == "--dist" ]]; then
npm run test:coverage
npm run build:ci
npm run copy
npm run test:ci
sleep 3 # Wait for threads to exit?
if [[ "${BROWSERSTACK_USERNAME}" != "" ]] && [[ "${BROWSERSTACK_ACCESS_KEY}" != "" ]]; then
npm run test:ci
sleep 3 # Wait for threads to exit?
fi
else
# Run basic tests
echo "Running basic tests..."
Expand Down
Loading