Skip to content

Commit

Permalink
Re-factor handling of whole-pattern recursion in the interpreter
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipHazel committed Nov 28, 2023
1 parent 198379c commit 86919c9
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 17 deletions.
5 changes: 5 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ undefined behaviour.

46. Fix backref iterators when PCRE2_MATCH_UNSET_BACKREF is set in JIT.

47. Refactor the handling of whole-pattern recursion (?0) in pcre2_match() so
that its end is handled similarly to other recursions. This has altered the
behaviour of /|(?0)./endanchored, which was previously incorrect. However, the
interpreter's behaviour for this pattern still differs from that of JIT.


Version 10.42 11-December-2022
------------------------------
Expand Down
66 changes: 49 additions & 17 deletions src/pcre2_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -838,17 +838,15 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
assert_accept_frame = F;
RRETURN(MATCH_ACCEPT);

/* If recursing, we have to find the most recent recursion. */
/* For ACCEPT within a recursion, we have to find the most recent
recursion. If not in a recursion, fall through to code that is common with
OP_END. */

case OP_ACCEPT:
case OP_END:

/* Handle end of a recursion. */

if (Fcurrent_recurse != RECURSE_UNSET)
{
#ifdef DEBUG_SHOW_OPS
fprintf(stderr, "++ End within recursion\n");
fprintf(stderr, "++ Accept within recursion\n");
#endif
offset = Flast_group_offset;
for(;;)
Expand All @@ -857,7 +855,6 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
if (GF_IDMASK(N->group_frame_type) == GF_RECURSE) break;

offset = P->last_group_offset;
}

Expand All @@ -873,11 +870,17 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
Fecode += 1 + LINK_SIZE;
continue;
}
/* Fall through */

/* Not a recursion. Fail for an empty string match if either PCRE2_NOTEMPTY
is set, or if PCRE2_NOTEMPTY_ATSTART is set and we have matched at the
start of the subject. In both cases, backtracking will then try other
alternatives, if any. */
/* OP_END itself can never be reached within a recursion because that is
picked up when the OP_KET that always precedes OP_END is reached. */

case OP_END:

/* Fail for an empty string match if either PCRE2_NOTEMPTY is set, or if
PCRE2_NOTEMPTY_ATSTART is set and we have matched at the start of the
subject. In both cases, backtracking will then try other alternatives, if
any. */

if (Feptr == Fstart_match &&
((mb->moptions & PCRE2_NOTEMPTY) != 0 ||
Expand Down Expand Up @@ -5856,7 +5859,8 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
/* ===================================================================== */
/* The end of a parenthesized group. For all but OP_BRA and OP_COND, the
starting frame was added to the chained frames in order to remember the
starting subject position for the group. */
starting subject position for the group. (Not true for OP_BRA when it's a
whole-pattern recursion, but that is handled separately below.) */

case OP_KET:
case OP_KETRMIN:
Expand Down Expand Up @@ -5908,8 +5912,37 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,

switch (*bracode)
{
case OP_BRA: /* No need to do anything for these */
case OP_COND:
/* Whole-pattern recursion is handled as a recursion into group 0, but
the entire pattern is wrapped in OP_BRA/OP_KET rather than a capturing
group - a design mistake: it should perhaps have been capture group 0.
Anyway, that means the end of such recursion must be handled here. It is
detected by checking for an immediately following OP_END when we are
recursing in group 0. If this is not the end of a whole-pattern
recursion, there is nothing to be done. */

case OP_BRA:
if (Fcurrent_recurse != 0 || Fecode[1+LINK_SIZE] != OP_END) break;

/* It is the end of whole-pattern recursion. */

offset = Flast_group_offset;
if (offset == PCRE2_UNSET) return PCRE2_ERROR_INTERNAL;
N = (heapframe *)((char *)match_data->heapframes + offset);
P = (heapframe *)((char *)N - frame_size);
Flast_group_offset = P->last_group_offset;

/* Reinstate the previous set of captures and then carry on after the
recursion call. */

memcpy((char *)F + offsetof(heapframe, ovector), P->ovector,
Foffset_top * sizeof(PCRE2_SIZE));
Foffset_top = P->offset_top;
Fcapture_last = P->capture_last;
Fcurrent_recurse = P->current_recurse;
Fecode = P->ecode + 1 + LINK_SIZE;
continue; /* With next opcode */

case OP_COND: /* No need to do anything for these */
case OP_SCOND:
break;

Expand Down Expand Up @@ -5976,9 +6009,8 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode,
if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH);
break;

/* Whole-pattern recursion is coded as a recurse into group 0, so it
won't be picked up here. Instead, we catch it when the OP_END is reached.
Other recursion is handled here. */
/* Whole-pattern recursion is coded as a recurse into group 0, and is
handled with OP_BRA above. Other recursion is handled here. */

case OP_CBRA:
case OP_CBRAPOS:
Expand Down
15 changes: 15 additions & 0 deletions testdata/testinput2
Original file line number Diff line number Diff line change
Expand Up @@ -6066,4 +6066,19 @@ a)"xI
/\G(?:(?=(\1.|)(.))){1,13}?(?!.*\2.*\2)\1\K\2/g
aaabcccdeee

# This currently doesn't match JIT

/|(?0)./endanchored,aftertext
\= Expect error
abcd\=no_jit

/|a(?0)/endanchored
aaaa

# This currently doesn't match JIT

/(?:|(?0).)(?(R)|\z)/
\= Expect error
abcd\=no_jit

# End of testinput2
18 changes: 18 additions & 0 deletions testdata/testoutput2
Original file line number Diff line number Diff line change
Expand Up @@ -17965,6 +17965,24 @@ No match
1: ccc
2: d

# This currently doesn't match JIT

/|(?0)./endanchored,aftertext
\= Expect error
abcd\=no_jit
Failed: error -52: nested recursion at the same subject position

/|a(?0)/endanchored
aaaa
0: aaaa

# This currently doesn't match JIT

/(?:|(?0).)(?(R)|\z)/
\= Expect error
abcd\=no_jit
Failed: error -52: nested recursion at the same subject position

# End of testinput2
Error -70: PCRE2_ERROR_BADDATA (unknown error number)
Error -62: bad serialized data
Expand Down

0 comments on commit 86919c9

Please sign in to comment.