From b3dbe8d981b1963d16335233555c447f6829ccb9 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Wed, 15 Nov 2023 12:50:44 +0000 Subject: [PATCH] Make data for OP_REVERSE use IMM2_SIZE instead of LINK_SIZE, for consistency with OP_VREVERSE --- ChangeLog | 3 +++ HACKING | 9 +++++---- src/pcre2_compile.c | 4 ++-- src/pcre2_dfa_match.c | 4 ++-- src/pcre2_internal.h | 2 +- src/pcre2_match.c | 4 ++-- src/pcre2_printint.c | 2 +- 7 files changed, 16 insertions(+), 12 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0dfb0e19b..003d3ad25 100644 --- a/ChangeLog +++ b/ChangeLog @@ -153,6 +153,9 @@ the pattern /|a(?0)/ matched against "aaaa". 39. Add a test for ridiculous ovector offset values to the substring extraction functions. +40. Make OP_REVERSE use IMM2_SIZE for its data instead of LINK_SIZE, for +consistency with OP_VREVERSE. + Version 10.42 11-December-2022 ------------------------------ diff --git a/HACKING b/HACKING index 3c450c93a..b806a6f3b 100644 --- a/HACKING +++ b/HACKING @@ -735,15 +735,16 @@ OP_ASSERT_NOT. Backward assertions use the opcodes OP_ASSERTBACK, OP_ASSERTBACK_NA, and OP_ASSERTBACK_NOT. If all the branches of a backward assertion are of fixed length (not necessarily the same), the first opcode inside each branch is -OP_REVERSE, followed by a LINK_SIZE count of the number of characters to move +OP_REVERSE, followed by an IMM2_SIZE count of the number of characters to move back the pointer in the subject string, thus allowing each branch to have a different (but fixed) length. Variable-length backward assertions whose maximum matching length is limited are also supported. For such assertions, the first opcode inside each branch is OP_VREVERSE, followed by the minimum and maximum lengths for that branch, -unless these happen to be equal, in which case OP_REVERSE is used. These values -occupy two code units each in 8-bit mode, and 1 code unit in 16/32 bit modes. +unless these happen to be equal, in which case OP_REVERSE is used. These +IMM2_SIZE values occupy two code units each in 8-bit mode, and 1 code unit in +16/32 bit modes. In ASCII or UTF-32 mode, the character counts in OP_REVERSE and OP_VREVERSE are also the number of code units, but in UTF-8/16 mode each character may occupy @@ -849,4 +850,4 @@ not a real opcode, but is used to check at compile time that tables indexed by opcode are the correct length, in order to catch updating errors. Philip Hazel -July 2023 +November 2023 diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index a7abcc733..feb5bcda9 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -8450,8 +8450,8 @@ for (;;) lookbehindminlength == lookbehindlength) { *code++ = OP_REVERSE; - PUTINC(code, 0, lookbehindlength); - length += 1 + LINK_SIZE; + PUT2INC(code, 0, lookbehindlength); + length += 1 + IMM2_SIZE; } else { diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 9eb4fca11..e90c984fd 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -635,8 +635,8 @@ if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT) end_code = this_start_code; do { - uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + LINK_SIZE : 0; - size_t back = (revlen == 0)? 0 : (size_t)GET(end_code, 2+LINK_SIZE); + uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0; + size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE); if (back <= gone_back) { int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen); diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index b10dcad04..e5808182e 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1778,7 +1778,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */ 1+LINK_SIZE, /* KetRmax */ \ 1+LINK_SIZE, /* KetRmin */ \ 1+LINK_SIZE, /* KetRpos */ \ - 1+LINK_SIZE, /* Reverse */ \ + 1+IMM2_SIZE, /* Reverse */ \ 1+2*IMM2_SIZE, /* VReverse */ \ 1+LINK_SIZE, /* Assert */ \ 1+LINK_SIZE, /* Assert not */ \ diff --git a/src/pcre2_match.c b/src/pcre2_match.c index c5fbbc060..7fac32bc5 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -5718,7 +5718,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); we move back a number of characters, not bytes. */ case OP_REVERSE: - number = GET(Fecode, 1); + number = GET2(Fecode, 1); #ifdef SUPPORT_UNICODE if (utf) { @@ -5742,7 +5742,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* Save the earliest consulted character, then skip to next opcode */ if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; - Fecode += 1 + LINK_SIZE; + Fecode += 1 + IMM2_SIZE; break; diff --git a/src/pcre2_printint.c b/src/pcre2_printint.c index 4eb04dba5..c2d8b3e33 100644 --- a/src/pcre2_printint.c +++ b/src/pcre2_printint.c @@ -429,7 +429,7 @@ for(;;) case OP_COND: case OP_SCOND: case OP_REVERSE: - if (print_lengths) fprintf(f, "%3d ", GET(code, 1)); + if (print_lengths) fprintf(f, "%3d ", GET2(code, 1)); else fprintf(f, " "); fprintf(f, "%s", OP_names[*code]); break;