From 4b66716fd067dcabd946bca16a6f43b5cfb1df8d Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 24 Aug 2024 09:19:21 +0200 Subject: [PATCH] Fix bug in 'first code unit' and 'last code unit' optimization combined with lookahead assertion --- src/pcre2_compile.c | 15 +++++++++++++-- testdata/testinput1 | 6 ++++++ testdata/testoutput1 | 8 ++++++++ testdata/testoutput2 | 4 ++-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 38b76f697..79e98f83d 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -10895,8 +10895,19 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) (these are not saved during the compile because they can cause conflicts with actual literals that follow). */ - if (firstcuflags >= REQ_NONE) - firstcu = find_firstassertedcu(codestart, &firstcuflags, 0); + if (firstcuflags >= REQ_NONE) { + uint32_t assertedcuflags = 0; + uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0); + /* It would be wrong to use the asserted first code unit as `firstcu` for + * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/). + * For that example, if we set both firstcu and reqcu to 'a', it would mean + * the subject string needs to be at least 2 characters long, which is wrong. + * With more analysis, we would be able to set firstcu in more cases. */ + if (assertedcuflags < REQ_NONE && assertedcu != reqcu) { + firstcu = assertedcu; + firstcuflags = assertedcuflags; + } + } /* Save the data for a first code unit. The existence of one means the minimum length must be at least 1. 
*/ diff --git a/testdata/testinput1 b/testdata/testinput1 index 75c3b40aa..9c502f7cc 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -6677,4 +6677,10 @@ $/x /(?(?<=a(?:a.b|b))b).b/ aaab +/(?=a)b?a/ + a + +/(?=a)b?a./ + ab + # End of testinput1 diff --git a/testdata/testoutput1 b/testdata/testoutput1 index 9c2d78c4e..585258f6a 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -10536,4 +10536,12 @@ No match aaab 0: ab +/(?=a)b?a/ + a + 0: a + +/(?=a)b?a./ + ab + 0: ab + # End of testinput1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index e8bb3ed7f..b39664783 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -10747,9 +10747,9 @@ Subject length lower bound = 1 /(?=a{3})[bcd]/Ii Capture group count = 0 Options: caseless -First code unit = 'a' (caseless) +Starting code units: A a Last code unit = 'a' (caseless) -Subject length lower bound = 2 +Subject length lower bound = 1 /(abc)\1+/