From ddebab2733b94bb1f3cd826bdbb3b75c2c3ff41c Mon Sep 17 00:00:00 2001 From: Zoltan Herczeg Date: Wed, 21 Aug 2024 15:56:07 +0000 Subject: [PATCH] Support a new experimental feature called scan substring --- ChangeLog | 4 + src/pcre2_compile.c | 185 +++++++++++++++++++++++++++++----------- src/pcre2_dfa_match.c | 2 + src/pcre2_internal.h | 83 +++++++++--------- src/pcre2_jit_compile.c | 1 + src/pcre2_match.c | 83 +++++++++++++++++- src/pcre2_printint.c | 1 + src/pcre2_study.c | 1 + testdata/testinput2 | 44 ++++++++++ testdata/testinput6 | 3 + testdata/testoutput2 | 134 +++++++++++++++++++++++++++++ testdata/testoutput6 | 4 + 12 files changed, 457 insertions(+), 88 deletions(-) diff --git a/ChangeLog b/ChangeLog index bb3bb06ad..ddc3f6959 100644 --- a/ChangeLog +++ b/ChangeLog @@ -75,6 +75,10 @@ with JIT was correct. 12. Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded for unsupported jit features. +13. Add a new experimental feature called scan substring. This feature +is a new type of assertion which matches the content of a capturing block +to a subpattern. + Version 10.44 07-June-2024 -------------------------- diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 79e98f83d..512f40b57 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -231,63 +231,65 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */ #define META_COND_RNAME 0x80130000u /* (?(R&name)... */ #define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */ #define META_COND_VERSION 0x80150000u /* (?(VERSIONx.y)... */ -#define META_DOLLAR 0x80160000u /* $ metacharacter */ -#define META_DOT 0x80170000u /* . 
metacharacter */ -#define META_ESCAPE 0x80180000u /* \d and friends */ -#define META_KET 0x80190000u /* closing parenthesis */ -#define META_NOCAPTURE 0x801a0000u /* no capture parens */ -#define META_OPTIONS 0x801b0000u /* (?i) and friends */ -#define META_POSIX 0x801c0000u /* POSIX class item */ -#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */ -#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */ -#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */ -#define META_RECURSE 0x80200000u /* Recursion */ -#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */ -#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */ +#define META_SCS_NAME 0x80160000u /* (*scs:()... */ +#define META_SCS_NUMBER 0x80170000u /* (*scs:(digits)... */ +#define META_DOLLAR 0x80180000u /* $ metacharacter */ +#define META_DOT 0x80190000u /* . metacharacter */ +#define META_ESCAPE 0x801a0000u /* \d and friends */ +#define META_KET 0x801b0000u /* closing parenthesis */ +#define META_NOCAPTURE 0x801c0000u /* no capture parens */ +#define META_OPTIONS 0x801d0000u /* (?i) and friends */ +#define META_POSIX 0x801e0000u /* POSIX class item */ +#define META_POSIX_NEG 0x801f0000u /* negative POSIX class item */ +#define META_RANGE_ESCAPED 0x80200000u /* range with at least one escape */ +#define META_RANGE_LITERAL 0x80210000u /* range defined literally */ +#define META_RECURSE 0x80220000u /* Recursion */ +#define META_RECURSE_BYNAME 0x80230000u /* (?&name) */ +#define META_SCRIPT_RUN 0x80240000u /* (*script_run:...) */ /* These must be kept together to make it easy to check that an assertion is present where expected in a conditional group. */ -#define META_LOOKAHEAD 0x80230000u /* (?= */ -#define META_LOOKAHEADNOT 0x80240000u /* (?! */ -#define META_LOOKBEHIND 0x80250000u /* (?<= */ -#define META_LOOKBEHINDNOT 0x80260000u /* (?) 
length=%d offset=", *pptr++); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + break; + + case META_SCS_NUMBER: + fprintf(stderr, "META_SCS_NUMBER %d offset=", pptr[SIZEOFFSET]); + GETOFFSET(offset, pptr); + fprintf(stderr, "%zd", offset); + pptr++; + break; + case META_MARK: fprintf(stderr, "META (*MARK:"); goto SHOWARG; @@ -4053,6 +4074,67 @@ while (ptr < ptrend) case META_LOOKAHEADNOT: goto NEGATIVE_LOOK_AHEAD; + case META_SCS_NUMBER: + nest_depth++; + + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + if (*ptr != CHAR_LEFT_PARENTHESIS) + { + errorcode = ERR15; + goto FAILED; + } + + ptr++; + + /* Handle (scan_substring:([+-]number)... */ + if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, + &i, &errorcode)) + { + if (i <= 0) + { + errorcode = ERR15; + goto FAILED; + } + *parsed_pattern++ = META_SCS_NUMBER; + offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2); + PUTOFFSET(offset, parsed_pattern); + *parsed_pattern++ = i; + } + else if (errorcode != 0) goto FAILED; /* Number too big */ + else + { + if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + /* Handle (*scan_substring:('name') or (*scan_substring:() */ + if (*ptr == CHAR_LESS_THAN_SIGN) + terminator = CHAR_GREATER_THAN_SIGN; + else if (*ptr == CHAR_APOSTROPHE) + terminator = CHAR_APOSTROPHE; + else + { + errorcode = ERR15; + goto FAILED; + } + + if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, + &namelen, &errorcode, cb)) goto FAILED; + + *parsed_pattern++ = META_SCS_NAME; + *parsed_pattern++ = namelen; + PUTOFFSET(offset, parsed_pattern); + } + + if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS; + + if (*ptr != CHAR_RIGHT_PARENTHESIS) + { + errorcode = ERR24; + break; + } + ptr++; + break; + case META_LOOKBEHIND: case META_LOOKBEHINDNOT: case META_LOOKBEHIND_NA: @@ -6642,7 +6724,8 @@ for (;; pptr++) case META_COND_RNUMBER: /* (?(Rdigits) */ case META_COND_NAME: /* (?(name) or (?'name') or ?() */ case META_COND_RNAME: /* (?(R&name) - test for recursion */ - 
bravalue = OP_COND; + case META_SCS_NAME: /* (*scan_substring:'name') or (*scan_substring:()) */ + bravalue = meta == META_SCS_NAME ? OP_ASSERT_SCS : OP_COND; { int count, index; unsigned int i; @@ -6736,7 +6819,9 @@ for (;; pptr++) PUT2(code, 2+LINK_SIZE, index); PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count); } - goto GROUP_PROCESS_NOTE_EMPTY; + if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY; + cb->assert_depth += 1; + goto GROUP_PROCESS; /* The DEFINE condition is always false. Its internal groups may never be called, so matched_char must remain false, hence the jump to @@ -6752,7 +6837,8 @@ for (;; pptr++) /* Conditional test of a group's being set. */ case META_COND_NUMBER: - bravalue = OP_COND; + case META_SCS_NUMBER: + bravalue = meta == META_SCS_NUMBER ? OP_ASSERT_SCS : OP_COND; GETPLUSOFFSET(offset, pptr); groupnumber = *(++pptr); if (groupnumber > cb->bracount) @@ -6762,11 +6848,14 @@ for (;; pptr++) return 0; } if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; - offset -= 2; /* Point at initial ( for too many branches error */ + /* Point at initial ( for too many branches error */ + if (meta != META_SCS_NUMBER) offset -= 2; code[1+LINK_SIZE] = OP_CREF; skipunits = 1+IMM2_SIZE; PUT2(code, 2+LINK_SIZE, groupnumber); - goto GROUP_PROCESS_NOTE_EMPTY; + if (meta != META_SCS_NUMBER) goto GROUP_PROCESS_NOTE_EMPTY; + cb->assert_depth += 1; + goto GROUP_PROCESS; /* Test for the PCRE2 version. */ @@ -6900,7 +6989,7 @@ for (;; pptr++) /* If we've just compiled an assertion, pop the assert depth. 
*/ - if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA) + if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS) cb->assert_depth -= 1; /* At the end of compiling, code is still pointing to the start of the diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 348e51182..3e34c7ca5 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -175,6 +175,7 @@ static const uint8_t coptable[] = { 0, /* Assert behind not */ 0, /* NA assert */ 0, /* NA assert behind */ + 0, /* Assert scan substring */ 0, /* ONCE */ 0, /* SCRIPT_RUN */ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ @@ -253,6 +254,7 @@ static const uint8_t poptable[] = { 0, /* Assert behind not */ 0, /* NA assert */ 0, /* NA assert behind */ + 0, /* Assert scan substring */ 0, /* ONCE */ 0, /* SCRIPT_RUN */ 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 148e21019..4ada47920 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -915,6 +915,7 @@ a positive value. */ #define STRING_naplb0 "naplb\0" #define STRING_nla0 "nla\0" #define STRING_nlb0 "nlb\0" +#define STRING_scs0 "scs\0" #define STRING_sr0 "sr\0" #define STRING_asr0 "asr\0" #define STRING_positive_lookahead0 "positive_lookahead\0" @@ -925,6 +926,7 @@ a positive value. */ #define STRING_negative_lookbehind0 "negative_lookbehind\0" #define STRING_script_run0 "script_run\0" #define STRING_atomic_script_run "atomic_script_run" +#define STRING_scan_substring0 "scan_substring\0" #define STRING_alpha0 "alpha\0" #define STRING_lower0 "lower\0" @@ -1216,6 +1218,7 @@ only. 
*/ #define STRING_naplb0 STR_n STR_a STR_p STR_l STR_b "\0" #define STRING_nla0 STR_n STR_l STR_a "\0" #define STRING_nlb0 STR_n STR_l STR_b "\0" +#define STRING_scs0 STR_s STR_c STR_s "\0" #define STRING_sr0 STR_s STR_r "\0" #define STRING_asr0 STR_a STR_s STR_r "\0" #define STRING_positive_lookahead0 STR_p STR_o STR_s STR_i STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_a STR_h STR_e STR_a STR_d "\0" @@ -1226,6 +1229,7 @@ only. */ #define STRING_negative_lookbehind0 STR_n STR_e STR_g STR_a STR_t STR_i STR_v STR_e STR_UNDERSCORE STR_l STR_o STR_o STR_k STR_b STR_e STR_h STR_i STR_n STR_d "\0" #define STRING_script_run0 STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n "\0" #define STRING_atomic_script_run STR_a STR_t STR_o STR_m STR_i STR_c STR_UNDERSCORE STR_s STR_c STR_r STR_i STR_p STR_t STR_UNDERSCORE STR_r STR_u STR_n +#define STRING_scan_substring0 STR_s STR_c STR_a STR_n STR_UNDERSCORE STR_s STR_u STR_b STR_s STR_t STR_r STR_i STR_n STR_g "\0" #define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0" #define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0" @@ -1579,78 +1583,79 @@ enum { OP_ASSERTBACK_NOT, /* 130 Negative lookbehind */ OP_ASSERT_NA, /* 131 Positive non-atomic lookahead */ OP_ASSERTBACK_NA, /* 132 Positive non-atomic lookbehind */ + OP_ASSERT_SCS, /* 133 Scan substring */ /* ONCE, SCRIPT_RUN, BRA, BRAPOS, CBRA, CBRAPOS, and COND must come immediately after the assertions, with ONCE first, as there's a test for >= ONCE for a subpattern that isn't an assertion. The POS versions must immediately follow the non-POS versions in each case. 
*/ - OP_ONCE, /* 133 Atomic group, contains captures */ - OP_SCRIPT_RUN, /* 134 Non-capture, but check characters' scripts */ - OP_BRA, /* 135 Start of non-capturing bracket */ - OP_BRAPOS, /* 136 Ditto, with unlimited, possessive repeat */ - OP_CBRA, /* 137 Start of capturing bracket */ - OP_CBRAPOS, /* 138 Ditto, with unlimited, possessive repeat */ - OP_COND, /* 139 Conditional group */ + OP_ONCE, /* 134 Atomic group, contains captures */ + OP_SCRIPT_RUN, /* 135 Non-capture, but check characters' scripts */ + OP_BRA, /* 136 Start of non-capturing bracket */ + OP_BRAPOS, /* 137 Ditto, with unlimited, possessive repeat */ + OP_CBRA, /* 138 Start of capturing bracket */ + OP_CBRAPOS, /* 139 Ditto, with unlimited, possessive repeat */ + OP_COND, /* 140 Conditional group */ /* These five must follow the previous five, in the same order. There's a check for >= SBRA to distinguish the two sets. */ - OP_SBRA, /* 140 Start of non-capturing bracket, check empty */ - OP_SBRAPOS, /* 141 Ditto, with unlimited, possessive repeat */ - OP_SCBRA, /* 142 Start of capturing bracket, check empty */ - OP_SCBRAPOS, /* 143 Ditto, with unlimited, possessive repeat */ - OP_SCOND, /* 144 Conditional group, check empty */ + OP_SBRA, /* 141 Start of non-capturing bracket, check empty */ + OP_SBRAPOS, /* 142 Ditto, with unlimited, possessive repeat */ + OP_SCBRA, /* 143 Start of capturing bracket, check empty */ + OP_SCBRAPOS, /* 144 Ditto, with unlimited, possessive repeat */ + OP_SCOND, /* 145 Conditional group, check empty */ /* The next two pairs must (respectively) be kept together. 
*/ - OP_CREF, /* 145 Used to hold a capture number as condition */ - OP_DNCREF, /* 146 Used to point to duplicate names as a condition */ - OP_RREF, /* 147 Used to hold a recursion number as condition */ - OP_DNRREF, /* 148 Used to point to duplicate names as a condition */ - OP_FALSE, /* 149 Always false (used by DEFINE and VERSION) */ - OP_TRUE, /* 150 Always true (used by VERSION) */ + OP_CREF, /* 146 Used to hold a capture number as condition */ + OP_DNCREF, /* 147 Used to point to duplicate names as a condition */ + OP_RREF, /* 148 Used to hold a recursion number as condition */ + OP_DNRREF, /* 149 Used to point to duplicate names as a condition */ + OP_FALSE, /* 150 Always false (used by DEFINE and VERSION) */ + OP_TRUE, /* 151 Always true (used by VERSION) */ - OP_BRAZERO, /* 151 These two must remain together and in this */ - OP_BRAMINZERO, /* 152 order. */ - OP_BRAPOSZERO, /* 153 */ + OP_BRAZERO, /* 152 These two must remain together and in this */ + OP_BRAMINZERO, /* 153 order. */ + OP_BRAPOSZERO, /* 154 */ /* These are backtracking control verbs */ - OP_MARK, /* 154 always has an argument */ - OP_PRUNE, /* 155 */ - OP_PRUNE_ARG, /* 156 same, but with argument */ - OP_SKIP, /* 157 */ - OP_SKIP_ARG, /* 158 same, but with argument */ - OP_THEN, /* 159 */ - OP_THEN_ARG, /* 160 same, but with argument */ - OP_COMMIT, /* 161 */ - OP_COMMIT_ARG, /* 162 same, but with argument */ + OP_MARK, /* 155 always has an argument */ + OP_PRUNE, /* 156 */ + OP_PRUNE_ARG, /* 157 same, but with argument */ + OP_SKIP, /* 158 */ + OP_SKIP_ARG, /* 159 same, but with argument */ + OP_THEN, /* 160 */ + OP_THEN_ARG, /* 161 same, but with argument */ + OP_COMMIT, /* 162 */ + OP_COMMIT_ARG, /* 163 same, but with argument */ /* These are forced failure and success verbs. FAIL and ACCEPT do accept an argument, but these cases can be compiled as, for example, (*MARK:X)(*FAIL) without the need for a special opcode. 
*/ - OP_FAIL, /* 163 */ - OP_ACCEPT, /* 164 */ - OP_ASSERT_ACCEPT, /* 165 Used inside assertions */ - OP_CLOSE, /* 166 Used before OP_ACCEPT to close open captures */ + OP_FAIL, /* 164 */ + OP_ACCEPT, /* 165 */ + OP_ASSERT_ACCEPT, /* 166 Used inside assertions */ + OP_CLOSE, /* 167 Used before OP_ACCEPT to close open captures */ /* This is used to skip a subpattern with a {0} quantifier */ - OP_SKIPZERO, /* 167 */ + OP_SKIPZERO, /* 168 */ /* This is used to identify a DEFINE group during compilation so that it can be checked for having only one branch. It is changed to OP_FALSE before compilation finishes. */ - OP_DEFINE, /* 168 */ + OP_DEFINE, /* 169 */ /* These opcodes replace their normal counterparts in UCP mode when PCRE2_EXTRA_ASCII_BSW is not set. */ - OP_NOT_UCP_WORD_BOUNDARY, /* 169 */ - OP_UCP_WORD_BOUNDARY, /* 170 */ + OP_NOT_UCP_WORD_BOUNDARY, /* 170 */ + OP_UCP_WORD_BOUNDARY, /* 171 */ /* This is not an opcode, but is used to check that tables indexed by opcode are the correct length, in order to catch updating errors - there have been @@ -1699,6 +1704,7 @@ some cases doesn't actually use these names at all). */ "Reverse", "VReverse", "Assert", "Assert not", \ "Assert back", "Assert back not", \ "Non-atomic assert", "Non-atomic assert back", \ + "Scan substring", \ "Once", \ "Script run", \ "Bra", "BraPos", "CBra", "CBraPos", \ @@ -1786,6 +1792,7 @@ in UTF-8 mode. The code that uses this table must know about such things. 
*/ 1+LINK_SIZE, /* Assert behind not */ \ 1+LINK_SIZE, /* NA Assert */ \ 1+LINK_SIZE, /* NA Assert behind */ \ + 1+LINK_SIZE, /* Scan substring */ \ 1+LINK_SIZE, /* ONCE */ \ 1+LINK_SIZE, /* SCRIPT_RUN */ \ 1+LINK_SIZE, /* BRA */ \ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index b684980f0..19846119d 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -1110,6 +1110,7 @@ switch(*cc) default: SLJIT_UNREACHABLE(); + case OP_ASSERT_SCS: return NULL; } } diff --git a/src/pcre2_match.c b/src/pcre2_match.c index 3424e6d9f..1a302fe0e 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -155,7 +155,7 @@ changed, the code at RETURN_SWITCH below must be updated in sync. */ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, - RM31, RM32, RM33, RM34, RM35, RM36, RM37 }; + RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38 }; #ifdef SUPPORT_WIDE_CHARS enum { RM100=100, RM101 }; @@ -5585,6 +5585,80 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, #undef Lframe_type + /* ===================================================================== */ + /* Handle scan substring operation. */ + +#define Lframe_type F->temp_32[0] +#define Lextra_size F->temp_32[1] +#define Lsaved_end_subject F->temp_sptr[0] +#define Lsaved_eptr F->temp_sptr[1] + + case OP_ASSERT_SCS: + Lextra_size = PRIV(OP_lengths)[Fecode[1 + LINK_SIZE]]; + + if (Fecode[1 + LINK_SIZE] == OP_CREF) + offset = (GET2(Fecode, 1 + LINK_SIZE + 1) << 1) - 2; + else + { + /* The OP_DNCREF case. 
*/ + int count = GET2(Fecode, 1 + LINK_SIZE + 1 + IMM2_SIZE); + PCRE2_SPTR slot = mb->name_table + + GET2(Fecode, 1 + LINK_SIZE + 1) * mb->name_entry_size; + while (count > 0) + { + offset = (GET2(slot, 0) << 1) - 2; + if (offset < Foffset_top && Fovector[offset] != PCRE2_UNSET) break; + slot += mb->name_entry_size; + count--; + } + + /* Not found any valid capturing brackets. */ + if (count == 0) + offset = Foffset_top; + } + + if (offset >= Foffset_top || Fovector[offset] == PCRE2_UNSET) + RRETURN(MATCH_NOMATCH); + + Lsaved_end_subject = mb->end_subject; + Lsaved_eptr = Feptr; + + Feptr = mb->start_subject + Fovector[offset]; + mb->end_subject = mb->start_subject + Fovector[offset + 1]; + + Lframe_type = GF_NOCAPTURE | Fop; + for (;;) + { + group_frame_type = Lframe_type; + RMATCH(Fecode + 1 + LINK_SIZE + Lextra_size, RM38); + if (rrc == MATCH_ACCEPT) + { + memcpy(Fovector, + (char *)assert_accept_frame + offsetof(heapframe, ovector), + assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); + Foffset_top = assert_accept_frame->offset_top; + Fmark = assert_accept_frame->mark; + break; + } + if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); + Fecode += GET(Fecode, 1); + if (*Fecode != OP_ALT) + { + mb->end_subject = Lsaved_end_subject; + RRETURN(MATCH_NOMATCH); + } + Lextra_size = 0; + } + + do Fecode += GET(Fecode, 1); while (*Fecode == OP_ALT); + Fecode += 1 + LINK_SIZE; + Feptr = Lsaved_eptr; + break; + +#undef Lframe_type +#undef Lextra_size +#undef Lsaved_end_subject +#undef Lsaved_eptr /* ===================================================================== */ /* The callout item calls an external function, if one is provided, passing @@ -6052,6 +6126,11 @@ fprintf(stderr, "++ %2ld op=%3d %s\n", Fecode - mb->start_code, *Fecode, will never by exercised if Unicode support it not compiled, because in that environment script runs cause an error at compile time. 
*/ + case OP_ASSERT_SCS: + mb->end_subject = P->temp_sptr[0]; + Feptr = P->temp_sptr[1]; + break; + case OP_SCRIPT_RUN: if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); break; @@ -6488,7 +6567,7 @@ switch (Freturn_id) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) - LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) + LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) #ifdef SUPPORT_WIDE_CHARS LBL(100) LBL(101) diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 62afcc656..61c78f638 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -424,6 +424,7 @@ for(;;) case OP_ASSERTBACK_NOT: case OP_ASSERT_NA: case OP_ASSERTBACK_NA: + case OP_ASSERT_SCS: case OP_ONCE: case OP_SCRIPT_RUN: case OP_COND: diff --git a/src/pcre2_study.c b/src/pcre2_study.c index 7c065e694..58c534f04 100644 --- a/src/pcre2_study.c +++ b/src/pcre2_study.c @@ -250,6 +250,7 @@ for (;;) case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: case OP_ASSERT_NA: + case OP_ASSERT_SCS: case OP_ASSERTBACK_NA: do cc += GET(cc, 1); while (*cc == OP_ALT); /* Fall through */ diff --git a/testdata/testinput2 b/testdata/testinput2 index 5ebd78723..358ae4748 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -6139,4 +6139,48 @@ a)"xI /(?<=xy|a.b?|cd)/B +# Tests for scan substring, a non Perl feature of PCRE2 + +# Parse errors first + +/(*scs:/ + +/(*scan_substring:(/ + +/(*scs:('name'/ + +/(*scs:(1)a|b)/ + +/(*scan_substring:(1)a|b)/ + +/(*scs:()a|b)/ + +/(*scan_substring:()a|b)/ + +# Tests for the feature + +/([a-z]++)(*scs:(1)(stx)|(ne))(.)/B + ##string##next!## + __aastxaa:__ + __abababab:__ + +/(?[a-z]++)##(*scan_substring:('XX').*(..)$)\2/B + ##abcd##abcd##cd## + ##abcd##abcd##abcd## + +/([a-z])([a-z]++)(#+)(*scs:(2)(ab.))/ + xab## + yabc### + zababc#### + +/(?:(?[a-z]++)|(?[0-9]++)|$)(*scan_substring:('YYY')((?.).*\k$))/dupnames + 
$$abacd$$112345$$abca$$ + $$abcdeaf$$1234567819$$123456781$$ + +/([a-zA-Z]+)(*scs:(1).*?(?[A-Z]+)(*scan_substring:('ABC').*(.)\3))/ + ##abABCtuTUVXz##abCDEFGxyCDEEFGhi## + +/([a-zA-Z]+)(*scs:(1)(xy|ab(*ACCEPT)cd))/ + ##cdefgh##cdeabxy## + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index 7f5241ba2..a6b0038dc 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -5051,4 +5051,7 @@ /|a(?0)/endanchored aaaa +/([a-z]++)(*scs:(1).)/ + aa + # End of testinput6 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index b39664783..577878a2f 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -18140,6 +18140,140 @@ No match End ------------------------------------------------------------------ +# Tests for scan substring, a non Perl feature of PCRE2 + +# Parse errors first + +/(*scs:/ +Failed: error 114 at offset 6: missing closing parenthesis + +/(*scan_substring:(/ +Failed: error 114 at offset 18: missing closing parenthesis + +/(*scs:('name'/ +Failed: error 114 at offset 14: missing closing parenthesis + +/(*scs:(1)a|b)/ +Failed: error 115 at offset 6: reference to non-existent subpattern + +/(*scan_substring:(1)a|b)/ +Failed: error 115 at offset 17: reference to non-existent subpattern + +/(*scs:()a|b)/ +Failed: error 115 at offset 8: reference to non-existent subpattern + +/(*scan_substring:()a|b)/ +Failed: error 115 at offset 19: reference to non-existent subpattern + +# Tests for the feature + +/([a-z]++)(*scs:(1)(stx)|(ne))(.)/B +------------------------------------------------------------------ + Bra + CBra 1 + [a-z]++ + Ket + Scan substring + 1 Cond ref + CBra 2 + stx + Ket + Alt + CBra 3 + ne + Ket + Ket + CBra 4 + Any + Ket + Ket + End +------------------------------------------------------------------ + ##string##next!## + 0: next! + 1: next + 2: + 3: ne + 4: ! 
+ __aastxaa:__ + 0: stxaa: + 1: stxaa + 2: stx + 3: + 4: : + __abababab:__ +No match + +/(?[a-z]++)##(*scan_substring:('XX').*(..)$)\2/B +------------------------------------------------------------------ + Bra + CBra 1 + [a-z]++ + Ket + ## + Scan substring + 1 Cond ref + Any* + CBra 2 + Any + Any + Ket + $ + Ket + \2 + Ket + End +------------------------------------------------------------------ + ##abcd##abcd##cd## + 0: abcd##cd + 1: abcd + 2: cd + ##abcd##abcd##abcd## +No match + +/([a-z])([a-z]++)(#+)(*scs:(2)(ab.))/ + xab## +No match + yabc### + 0: yabc### + 1: y + 2: abc + 3: ### + 4: abc + zababc#### + 0: zababc#### + 1: z + 2: ababc + 3: #### + 4: aba + +/(?:(?[a-z]++)|(?[0-9]++)|$)(*scan_substring:('YYY')((?.).*\k$))/dupnames + $$abacd$$112345$$abca$$ + 0: abca + 1: abca + 2: + 3: abca + 4: a + $$abcdeaf$$1234567819$$123456781$$ + 0: 123456781 + 1: + 2: 123456781 + 3: 123456781 + 4: 1 + +/([a-zA-Z]+)(*scs:(1).*?(?[A-Z]+)(*scan_substring:('ABC').*(.)\3))/ + ##abABCtuTUVXz##abCDEFGxyCDEEFGhi## + 0: abCDEFGxyCDEEFGhi + 1: abCDEFGxyCDEEFGhi + 2: CDEEFG + 3: E + +/([a-zA-Z]+)(*scs:(1)(xy|ab(*ACCEPT)cd))/ + ##cdefgh##cdeabxy## + 0: abxy + 1: abxy + 2: ab + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data diff --git a/testdata/testoutput6 b/testdata/testoutput6 index b307eeeba..283b00da0 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7933,4 +7933,8 @@ Partial match: 3: a 4: +/([a-z]++)(*scs:(1).)/ + aa +Failed: error -42: pattern contains an item that is not supported for DFA matching + # End of testinput6