Skip to content

Commit

Permalink
Support multiple group references for scan substring
Browse files Browse the repository at this point in the history
  • Loading branch information
Zoltan Herczeg committed Sep 3, 2024
1 parent 07ca63c commit 6e1f45b
Show file tree
Hide file tree
Showing 5 changed files with 350 additions and 132 deletions.
250 changes: 150 additions & 100 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -233,63 +233,65 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */
#define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
#define META_SCS_NAME 0x80160000u /* (*scan_substring:(<name>)... */
#define META_SCS_NUMBER 0x80170000u /* (*scan_substring:(digits)... */
#define META_DOLLAR 0x80180000u /* $ metacharacter */
#define META_DOT 0x80190000u /* . metacharacter */
#define META_ESCAPE 0x801a0000u /* \d and friends */
#define META_KET 0x801b0000u /* closing parenthesis */
#define META_NOCAPTURE 0x801c0000u /* no capture parens */
#define META_OPTIONS 0x801d0000u /* (?i) and friends */
#define META_POSIX 0x801e0000u /* POSIX class item */
#define META_POSIX_NEG 0x801f0000u /* negative POSIX class item */
#define META_RANGE_ESCAPED 0x80200000u /* range with at least one escape */
#define META_RANGE_LITERAL 0x80210000u /* range defined literally */
#define META_RECURSE 0x80220000u /* Recursion */
#define META_RECURSE_BYNAME 0x80230000u /* (?&name) */
#define META_SCRIPT_RUN 0x80240000u /* (*script_run:...) */
#define META_SCS_NEXT_NAME 0x80180000u /* Next <name> of scan_substring */
#define META_SCS_NEXT_NUMBER 0x80190000u /* Next digits of scan_substring */
#define META_DOLLAR 0x801a0000u /* $ metacharacter */
#define META_DOT 0x801b0000u /* . metacharacter */
#define META_ESCAPE 0x801c0000u /* \d and friends */
#define META_KET 0x801d0000u /* closing parenthesis */
#define META_NOCAPTURE 0x801e0000u /* no capture parens */
#define META_OPTIONS 0x801f0000u /* (?i) and friends */
#define META_POSIX 0x80200000u /* POSIX class item */
#define META_POSIX_NEG 0x80210000u /* negative POSIX class item */
#define META_RANGE_ESCAPED 0x80220000u /* range with at least one escape */
#define META_RANGE_LITERAL 0x80230000u /* range defined literally */
#define META_RECURSE 0x80240000u /* Recursion */
#define META_RECURSE_BYNAME 0x80250000u /* (?&name) */
#define META_SCRIPT_RUN 0x80260000u /* (*script_run:...) */

/* These must be kept together to make it easy to check that an assertion
is present where expected in a conditional group. */

#define META_LOOKAHEAD 0x80250000u /* (?= */
#define META_LOOKAHEADNOT 0x80260000u /* (?! */
#define META_LOOKBEHIND 0x80270000u /* (?<= */
#define META_LOOKBEHINDNOT 0x80280000u /* (?<! */
#define META_LOOKAHEAD 0x80270000u /* (?= */
#define META_LOOKAHEADNOT 0x80280000u /* (?! */
#define META_LOOKBEHIND 0x80290000u /* (?<= */
#define META_LOOKBEHINDNOT 0x802a0000u /* (?<! */

/* These cannot be conditions */

#define META_LOOKAHEAD_NA 0x80290000u /* (*napla: */
#define META_LOOKBEHIND_NA 0x802a0000u /* (*naplb: */
#define META_LOOKAHEAD_NA 0x802b0000u /* (*napla: */
#define META_LOOKBEHIND_NA 0x802c0000u /* (*naplb: */

/* These must be kept in this order, with consecutive values, and the _ARG
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
versions. */

#define META_MARK 0x802b0000u /* (*MARK) */
#define META_ACCEPT 0x802c0000u /* (*ACCEPT) */
#define META_FAIL 0x802d0000u /* (*FAIL) */
#define META_COMMIT 0x802e0000u /* These */
#define META_COMMIT_ARG 0x802f0000u /* pairs */
#define META_PRUNE 0x80300000u /* must */
#define META_PRUNE_ARG 0x80310000u /* be */
#define META_SKIP 0x80320000u /* kept */
#define META_SKIP_ARG 0x80330000u /* in */
#define META_THEN 0x80340000u /* this */
#define META_THEN_ARG 0x80350000u /* order */
#define META_MARK 0x802d0000u /* (*MARK) */
#define META_ACCEPT 0x802e0000u /* (*ACCEPT) */
#define META_FAIL 0x802f0000u /* (*FAIL) */
#define META_COMMIT 0x80300000u /* These */
#define META_COMMIT_ARG 0x80310000u /* pairs */
#define META_PRUNE 0x80320000u /* must */
#define META_PRUNE_ARG 0x80330000u /* be */
#define META_SKIP 0x80340000u /* kept */
#define META_SKIP_ARG 0x80350000u /* in */
#define META_THEN 0x80360000u /* this */
#define META_THEN_ARG 0x80370000u /* order */

/* These must be kept in groups of three adjacent values, and all together. */

#define META_ASTERISK 0x80360000u /* * */
#define META_ASTERISK_PLUS 0x80370000u /* *+ */
#define META_ASTERISK_QUERY 0x80380000u /* *? */
#define META_PLUS 0x80390000u /* + */
#define META_PLUS_PLUS 0x803a0000u /* ++ */
#define META_PLUS_QUERY 0x803b0000u /* +? */
#define META_QUERY 0x803c0000u /* ? */
#define META_QUERY_PLUS 0x803d0000u /* ?+ */
#define META_QUERY_QUERY 0x803e0000u /* ?? */
#define META_MINMAX 0x803f0000u /* {n,m} repeat */
#define META_MINMAX_PLUS 0x80400000u /* {n,m}+ repeat */
#define META_MINMAX_QUERY 0x80410000u /* {n,m}? repeat */
#define META_ASTERISK 0x80380000u /* * */
#define META_ASTERISK_PLUS 0x80390000u /* *+ */
#define META_ASTERISK_QUERY 0x803a0000u /* *? */
#define META_PLUS 0x803b0000u /* + */
#define META_PLUS_PLUS 0x803c0000u /* ++ */
#define META_PLUS_QUERY 0x803d0000u /* +? */
#define META_QUERY 0x803e0000u /* ? */
#define META_QUERY_PLUS 0x803f0000u /* ?+ */
#define META_QUERY_QUERY 0x80400000u /* ?? */
#define META_MINMAX 0x80410000u /* {n,m} repeat */
#define META_MINMAX_PLUS 0x80420000u /* {n,m}+ repeat */
#define META_MINMAX_QUERY 0x80430000u /* {n,m}? repeat */

#define META_FIRST_QUANTIFIER META_ASTERISK
#define META_LAST_QUANTIFIER META_MINMAX_QUERY
Expand Down Expand Up @@ -330,6 +332,8 @@ static unsigned char meta_extra_lengths[] = {
3, /* META_COND_VERSION */
1+SIZEOFFSET, /* META_SCS_NAME */
1+SIZEOFFSET, /* META_SCS_NUMBER */
1+SIZEOFFSET, /* META_SCS_NEXT_NAME */
1+SIZEOFFSET, /* META_SCS_NEXT_NUMBER */
0, /* META_DOLLAR */
0, /* META_DOT */
0, /* META_ESCAPE - one more for ESC_P and ESC_p */
Expand Down Expand Up @@ -1173,6 +1177,19 @@ for (;;)
pptr++;
break;

case META_SCS_NEXT_NAME:
fprintf(stderr, "META_SCS_NEXT_NAME length=%d offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
break;

case META_SCS_NEXT_NUMBER:
fprintf(stderr, "META_SCS_NEXT_NUMBER %d offset=", pptr[SIZEOFFSET]);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
pptr++;
break;

case META_MARK:
fprintf(stderr, "META (*MARK:");
goto SHOWARG;
Expand Down Expand Up @@ -4083,51 +4100,61 @@ while (ptr < ptrend)
}

ptr++;
/* Temporary variable, zero in the first iteration. */
meta = 0;

/* Handle (*scan_substring:([+-]number)... */
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
&i, &errorcode))
for (;;)
{
if (i <= 0)
/* Handle (*scan_substring:([+-]number)... */
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
&i, &errorcode))
{
errorcode = ERR15;
goto FAILED;
if (i <= 0)
{
errorcode = ERR15;
goto FAILED;
}
*parsed_pattern++ = meta ? META_SCS_NEXT_NUMBER : META_SCS_NUMBER;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
*parsed_pattern++ = i;
}
*parsed_pattern++ = META_SCS_NUMBER;
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
PUTOFFSET(offset, parsed_pattern);
*parsed_pattern++ = i;
}
else if (errorcode != 0) goto FAILED; /* Number too big */
else
{
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;

/* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
if (*ptr == CHAR_LESS_THAN_SIGN)
terminator = CHAR_GREATER_THAN_SIGN;
else if (*ptr == CHAR_APOSTROPHE)
terminator = CHAR_APOSTROPHE;
else if (errorcode != 0) goto FAILED; /* Number too big */
else
{
errorcode = ERR15;
goto FAILED;
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;

/* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
if (*ptr == CHAR_LESS_THAN_SIGN)
terminator = CHAR_GREATER_THAN_SIGN;
else if (*ptr == CHAR_APOSTROPHE)
terminator = CHAR_APOSTROPHE;
else
{
errorcode = ERR15;
goto FAILED;
}

if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name,
&namelen, &errorcode, cb)) goto FAILED;

*parsed_pattern++ = meta ? META_SCS_NEXT_NAME : META_SCS_NAME;
*parsed_pattern++ = namelen;
PUTOFFSET(offset, parsed_pattern);
}

if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name,
&namelen, &errorcode, cb)) goto FAILED;
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;

*parsed_pattern++ = META_SCS_NAME;
*parsed_pattern++ = namelen;
PUTOFFSET(offset, parsed_pattern);
}
if (*ptr == CHAR_RIGHT_PARENTHESIS) break;

if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
if (*ptr != CHAR_COMMA)
{
errorcode = ERR24;
goto FAILED;
}

if (*ptr != CHAR_RIGHT_PARENTHESIS)
{
errorcode = ERR24;
break;
ptr++;
meta = 1;
}
ptr++;
goto POST_ASSERTION;
Expand Down Expand Up @@ -6723,6 +6750,7 @@ for (;; pptr++)
case META_COND_NAME: /* (?(name) or (?'name') or ?(<name>) */
case META_COND_RNAME: /* (?(R&name) - test for recursion */
case META_SCS_NAME: /* (*scan_substring:'name') or (*scan_substring:(<name>)) */
case META_SCS_NEXT_NAME: /* More names for scan substring. */
bravalue = meta == META_SCS_NAME ? OP_ASSERT_SCS : OP_COND;
{
int count, index;
Expand All @@ -6741,28 +6769,15 @@ for (;; pptr++)
numerical group. */

for (i = 0; i < cb->names_found; i++, ng++)
{
if (length == ng->length &&
PRIV(strncmp)(name, ng->name, length) == 0)
{
if (!ng->isdup)
{
code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
PUT2(code, 2+LINK_SIZE, ng->number);
if (ng->number > cb->top_backref) cb->top_backref = ng->number;
skipunits = 1+IMM2_SIZE;
goto GROUP_PROCESS_NOTE_EMPTY;
}
break; /* Found a duplicated name */
}
}

/* If the name was not found we have a bad reference, unless we are
dealing with R<digits>, which is treated as a recursion test by number.
*/
PRIV(strncmp)(name, ng->name, length) == 0) break;

if (i >= cb->names_found)
{
/* If the name was not found we have a bad reference, unless we are
dealing with R<digits>, which is treated as a recursion test by number.
*/

groupnumber = 0;
if (meta == META_COND_RNUMBER)
{
Expand Down Expand Up @@ -6795,11 +6810,24 @@ for (;; pptr++)
skipunits = 1+IMM2_SIZE;
goto GROUP_PROCESS_NOTE_EMPTY;
}
else if (!ng->isdup)
{
/* Otherwise found a duplicated name */
if (ng->number > cb->top_backref) cb->top_backref = ng->number;

/* A duplicated name was found. Note that if an R<digits> name is found
(META_COND_RNUMBER), it is a reference test, not a recursion test. */
if (meta == META_SCS_NEXT_NAME)
{
code[0] = OP_CREF;
PUT2(code, 1, ng->number);
code += 1+IMM2_SIZE;
break;
}

code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
PUT2(code, 2+LINK_SIZE, ng->number);
skipunits = 1+IMM2_SIZE;
goto GROUP_PROCESS_NOTE_EMPTY;
}

/* We have a duplicated name. In the compile pass we have to search the
main table in order to get the index and count values. */
Expand All @@ -6809,14 +6837,26 @@ for (;; pptr++)
if (lengthptr == NULL && !find_dupname_details(name, length, &index,
&count, errorcodeptr, cb)) return 0;

/* Add one to the opcode to change CREF/RREF into DNCREF/DNRREF and
insert appropriate data values. */
if (meta == META_SCS_NEXT_NAME)
{
code[0] = OP_DNCREF;
PUT2(code, 1, index);
PUT2(code, 1+IMM2_SIZE, count);
code += 1+2*IMM2_SIZE;
break;
}

/* A duplicated name was found. Note that if an R<digits> name is found
(META_COND_RNUMBER), it is a reference test, not a recursion test. */

code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;

code[1+LINK_SIZE]++;
/* Insert appropriate data values. */
skipunits = 1+2*IMM2_SIZE;
PUT2(code, 2+LINK_SIZE, index);
PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
}

if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
cb->assert_depth += 1;
goto GROUP_PROCESS;
Expand All @@ -6836,6 +6876,7 @@ for (;; pptr++)

case META_COND_NUMBER:
case META_SCS_NUMBER:
case META_SCS_NEXT_NUMBER:
bravalue = meta == META_SCS_NUMBER ? OP_ASSERT_SCS : OP_COND;
GETPLUSOFFSET(offset, pptr);
groupnumber = *(++pptr);
Expand All @@ -6846,6 +6887,15 @@ for (;; pptr++)
return 0;
}
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;

if (meta == META_SCS_NEXT_NUMBER)
{
code[0] = OP_CREF;
PUT2(code, 1, groupnumber);
code += 1+IMM2_SIZE;
break;
}

/* Point at initial ( for too many branches error */
if (meta != META_SCS_NUMBER) offset -= 2;
code[1+LINK_SIZE] = OP_CREF;
Expand Down
Loading

0 comments on commit 6e1f45b

Please sign in to comment.