From 52041d82e9a0b95f510f6f300b7f3ca43c14b035 Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Sat, 11 Nov 2023 15:56:07 +0000 Subject: [PATCH] Fix misbehaviour of pcre2_match() and pcre2_dfa_match() when PCRE2_FIRSTLINE was set for an anchored pattern. --- ChangeLog | 4 ++++ doc/html/pcre2api.html | 6 +++--- doc/pcre2.txt | 7 ++++--- doc/pcre2api.3 | 8 ++++---- doc/pcre2demo.3 | 2 +- src/pcre2_dfa_match.c | 2 +- src/pcre2_match.c | 2 +- testdata/testinput2 | 13 +++++++++++++ testdata/testinput6 | 13 +++++++++++++ testdata/testoutput2 | 17 +++++++++++++++++ testdata/testoutput6 | 17 +++++++++++++++++ 11 files changed, 78 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index ea2e07daf..20a6a49b2 100644 --- a/ChangeLog +++ b/ChangeLog @@ -142,6 +142,10 @@ above because \b and \B are defined in terms of \w. option, and (?aP) also sets (?aT) so that (?-aP) disables all ASCII restrictions on POSIX classes. +37. If PCRE2_FIRSTLINE was set on an anchored pattern, pcre2_match() and +pcre2_dfa_match() misbehaved. PCRE2_FIRSTLINE is now ignored for anchored +patterns. + Version 10.42 11-December-2022 ------------------------------ diff --git a/doc/html/pcre2api.html b/doc/html/pcre2api.html index f99beb166..22cf4b0d4 100644 --- a/doc/html/pcre2api.html +++ b/doc/html/pcre2api.html @@ -1686,7 +1686,7 @@

pcre2api man page

PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes -first is used. +first is used. This option has no effect for anchored patterns.
   PCRE2_LITERAL
 
@@ -2021,7 +2021,7 @@

pcre2api man page

This option forces all the POSIX character classes, including [:digit:] and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can -be changed within a pattern by means of the (?aP) option setting, but note that +be changed within a pattern by means of the (?aP) option setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes.
@@ -4140,7 +4140,7 @@ 

pcre2api man page


REVISION

-Last updated: 12 October 2023 +Last updated: 11 November 2023
Copyright © 1997-2023 University of Cambridge.
diff --git a/doc/pcre2.txt b/doc/pcre2.txt index 8c82b1c29..b0cad0cb3 100644 --- a/doc/pcre2.txt +++ b/doc/pcre2.txt @@ -1653,7 +1653,8 @@ COMPILING A PATTERN greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset - limit. In other words, whichever limit comes first is used. + limit. In other words, whichever limit comes first is used. This option + has no effect for anchored patterns. PCRE2_LITERAL @@ -3975,11 +3976,11 @@ AUTHOR REVISION - Last updated: 12 October 2023 + Last updated: 11 November 2023 Copyright (c) 1997-2023 University of Cambridge. -PCRE2 10.43 12 October 2023 PCRE2API(3) +PCRE2 10.43 11 November 2023 PCRE2API(3) ------------------------------------------------------------------------------ diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index c183aeeb3..0edddbdc8 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1,4 +1,4 @@ -.TH PCRE2API 3 "12 October 2023" "PCRE2 10.43" +.TH PCRE2API 3 "11 November 2023" "PCRE2 10.43" .SH NAME PCRE2 - Perl-compatible regular expressions (revised API) .sp @@ -1628,7 +1628,7 @@ PCRE2_FIRSTLINE if \fIstartoffset\fP is greater than 3. See also PCRE2_USE_OFFSET_LIMIT, which provides a more general limiting facility. If PCRE2_FIRSTLINE is set with an offset limit, a match must occur in the first line and also within the offset limit. In other words, whichever limit comes -first is used. +first is used. This option has no effect for anchored patterns. .sp PCRE2_LITERAL .sp @@ -1979,7 +1979,7 @@ a pattern by means of the (?aT) option setting. .sp This option forces all the POSIX character classes, including [:digit:] and [:xdigit:], to match only ASCII characters, even when PCRE2_UCP is set. It can -be changed within a pattern by means of the (?aP) option setting, but note that +be changed within a pattern by means of the (?aP) option setting, but note that this also sets PCRE2_EXTRA_ASCII_DIGIT in order to ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. .sp @@ -4148,6 +4148,6 @@ Cambridge, England. .rs .sp .nf -Last updated: 12 October 2023 +Last updated: 11 November 2023 Copyright (c) 1997-2023 University of Cambridge. .fi diff --git a/doc/pcre2demo.3 b/doc/pcre2demo.3 index c40096eae..e5283605a 100644 --- a/doc/pcre2demo.3 +++ b/doc/pcre2demo.3 @@ -1,4 +1,4 @@ -.TH PCRE2DEMO 3 "12 October 2023" "PCRE2 10.43-DEV" +.TH PCRE2DEMO 3 "11 November 2023" "PCRE2 10.43-DEV" .\"AUTOMATICALLY GENERATED BY PrepareRelease - do not EDIT! .SH NAME // - A demonstration C program for PCRE2 - // diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 27b6ef042..acab75c42 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3443,7 +3443,7 @@ anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 || where to start. */ startline = (re->flags & PCRE2_STARTLINE) != 0; -firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; +firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0; bumpalong_limit = end_subject; /* Initialize and set up the fixed fields in the callout block, with a pointer diff --git a/src/pcre2_match.c b/src/pcre2_match.c index d4d5121c4..8ae9c4999 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6836,7 +6836,7 @@ if (mcontext == NULL) else mb->memctl = mcontext->memctl; anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; -firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; +firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0; startline = (re->flags & PCRE2_STARTLINE) != 0; bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? true_end_subject : subject + mcontext->offset_limit; diff --git a/testdata/testinput2 b/testdata/testinput2 index 39e12d48a..0bb658542 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -6028,4 +6028,17 @@ a)"xI # -------- +/ +/anchored, firstline + \x0a + +/ +/anchored,firstline,no_start_optimize + \x0a + +/ +/firstline + \x0a + abc\x0adef + # End of testinput2 diff --git a/testdata/testinput6 b/testdata/testinput6 index 0ca0d23c4..b15fe0631 100644 --- a/testdata/testinput6 +++ b/testdata/testinput6 @@ -5026,4 +5026,17 @@ /c*+/ ab\=ph,offset=2 +/ +/anchored, firstline + \x0a + +/ +/anchored,firstline,no_start_optimize + \x0a + +/ +/firstline + \x0a + abc\x0adef + # End of testinput6 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 5411a401c..675d99c81 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -17901,6 +17901,23 @@ No match # -------- +/ +/anchored, firstline + \x0a + 0: \x0a + +/ +/anchored,firstline,no_start_optimize + \x0a + 0: \x0a + +/ +/firstline + \x0a + 0: \x0a + abc\x0adef + 0: \x0a + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 9d02ea85e..bd65130a2 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -7895,4 +7895,21 @@ Partial match: ab\=ph,offset=2 Partial match: +/ +/anchored, firstline + \x0a + 0: \x0a + +/ +/anchored,firstline,no_start_optimize + \x0a + 0: \x0a + +/ +/firstline + \x0a + 0: \x0a + abc\x0adef + 0: \x0a + # End of testinput6