From cacd570967668cbb157cbf4fab508c9b597ceb1b Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Sat, 14 Sep 2024 21:15:06 +0900 Subject: [PATCH] Add new API function pcre2_set_optimize() for controlling enabled optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is anticipated that over time, more and more optimizations will be added to PCRE2, and we want to be able to switch optimizations off/on, both for testing purposes and to be able to work around bugs in a released library version. The number of free bits left in the compile options word is very small. Hence, we will start putting all optimization enable/disable flags in a separate word. To switch these off/on, the new API function pcre2_set_optimize() will be used. The values which can be passed to pcre2_set_optimize() are different from the internal flag bit values. The values accepted by pcre2_set_optimize() are contiguous integers, so there is no danger of ever running out of them. This means in the future, the internal representation can be changed at any time without breaking backwards compatibility. Further, the 'directives' passed to pcre2_set_optimize() are not restricted to control a single, specific optimization. As an example, passing PCRE2_OPTIMIZATION_FULL will turn on all optimizations supported by whatever version of PCRE2 the client program happens to be linked with. 
Co-Authored-By: Carlo Marcelo Arenas Belón Co-Authored-by: Zoltan Herczeg --- doc/html/pcre2pattern.html | 2 +- doc/pcre2pattern.3 | 2 +- src/pcre2.h.generic | 17 ++++++- src/pcre2.h.in | 17 ++++++- src/pcre2_compile.c | 98 ++++++++++++++++++++++++++++---------- src/pcre2_context.c | 26 +++++++++- src/pcre2_dfa_match.c | 4 +- src/pcre2_internal.h | 7 +++ src/pcre2_intmodedep.h | 2 + src/pcre2_jit_compile.c | 8 ++-- src/pcre2_match.c | 4 +- src/pcre2test.c | 58 +++++++++++++++++++++- testdata/testinput2 | 18 +++++++ testdata/testoutput15 | 2 + testdata/testoutput2 | 60 +++++++++++++++++++++++ testdata/testoutput5 | 1 + testdata/testoutput6 | 1 + 17 files changed, 286 insertions(+), 41 deletions(-) diff --git a/doc/html/pcre2pattern.html b/doc/html/pcre2pattern.html index 1902f1030..fe7c42eef 100644 --- a/doc/html/pcre2pattern.html +++ b/doc/html/pcre2pattern.html @@ -2243,7 +2243,7 @@

pcre2pattern man page

PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS).

diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 84e4aff47..b0936c91a 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -2242,7 +2242,7 @@ package, and PCRE1 copied it from there. It found its way into Perl at release PCRE2 has an optimization that automatically "possessifies" certain simple pattern constructs. For example, the sequence A+B is treated as A++B because there is no point in backtracking into a sequence of A's when B must follow. -This feature can be disabled by the PCRE2_NO_AUTOPOSSESS option, or starting +This feature can be disabled by the PCRE2_NO_AUTO_POSSESS option, or starting the pattern with (*NO_AUTO_POSSESS). .P When a pattern contains an unlimited repeat inside a group that can itself be diff --git a/src/pcre2.h.generic b/src/pcre2.h.generic index a3341e6f5..0896b72ca 100644 --- a/src/pcre2.h.generic +++ b/src/pcre2.h.generic @@ -464,6 +464,18 @@ released, the numbers must not be changed. */ #define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_TABLES_LENGTH 15 +/* Optimization directives for pcre2_set_optimize(). +For binary compatibility, only add to this list; do not renumber. */ + +#define PCRE2_OPTIMIZATION_NONE 0 +#define PCRE2_OPTIMIZATION_FULL 1 + +#define PCRE2_AUTO_POSSESS 64 +#define PCRE2_AUTO_POSSESS_OFF 65 +#define PCRE2_DOTSTAR_ANCHOR 66 +#define PCRE2_DOTSTAR_ANCHOR_OFF 67 +#define PCRE2_START_OPTIMIZE 68 +#define PCRE2_START_OPTIMIZE_OFF 69 /* Types for code units in patterns and subject strings. 
*/ @@ -617,7 +629,9 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); + int (*)(uint32_t, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_optimize(pcre2_compile_context *, uint32_t); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ @@ -912,6 +926,7 @@ pcre2_compile are called by application code. */ #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) #define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) +#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_) #define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) #define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) diff --git a/src/pcre2.h.in b/src/pcre2.h.in index a19313c9e..9595a8540 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -464,6 +464,18 @@ released, the numbers must not be changed. */ #define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_TABLES_LENGTH 15 +/* Optimization directives for pcre2_set_optimize(). +For binary compatibility, only add to this list; do not renumber. */ + +#define PCRE2_OPTIMIZATION_NONE 0 +#define PCRE2_OPTIMIZATION_FULL 1 + +#define PCRE2_AUTO_POSSESS 64 +#define PCRE2_AUTO_POSSESS_OFF 65 +#define PCRE2_DOTSTAR_ANCHOR 66 +#define PCRE2_DOTSTAR_ANCHOR_OFF 67 +#define PCRE2_START_OPTIMIZE 68 +#define PCRE2_START_OPTIMIZE_OFF 69 /* Types for code units in patterns and subject strings. 
*/ @@ -617,7 +629,9 @@ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_parens_nest_limit(pcre2_compile_context *, uint32_t); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_set_compile_recursion_guard(pcre2_compile_context *, \ - int (*)(uint32_t, void *), void *); + int (*)(uint32_t, void *), void *); \ +PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ + pcre2_set_optimize(pcre2_compile_context *, uint32_t); #define PCRE2_MATCH_CONTEXT_FUNCTIONS \ PCRE2_EXP_DECL pcre2_match_context *PCRE2_CALL_CONVENTION \ @@ -912,6 +926,7 @@ pcre2_compile are called by application code. */ #define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_) #define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_) #define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_) +#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_) #define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_) #define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_) #define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 48dae18fa..1e787e952 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -834,7 +834,8 @@ enum { PSO_OPT, /* Value is an option bit */ PSO_BSR, /* Value is a \R type */ PSO_LIMH, /* Read integer value for heap limit */ PSO_LIMM, /* Read integer value for match limit */ - PSO_LIMD /* Read integer value for depth limit */ + PSO_LIMD, /* Read integer value for depth limit */ + PSO_OPTMZ /* Value is an optimization bit */ }; typedef struct pso { @@ -852,10 +853,10 @@ static const pso pso_list[] = { { STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP }, { STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET }, { STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET }, - { STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS }, - { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR }, + { 
STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS }, + { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR }, { STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT }, - { STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE }, + { STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE }, { STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 }, { STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 }, { STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 }, @@ -8883,13 +8884,14 @@ this prevents the number of characters it matches from being adjusted. cb points to the compile data block atomcount atomic group level inassert TRUE if in an assertion + dotstar_anchor TRUE if automatic anchoring optimization is enabled Returns: TRUE or FALSE */ static BOOL is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb, - int atomcount, BOOL inassert) + int atomcount, BOOL inassert, BOOL dotstar_anchor) { do { PCRE2_SPTR scode = first_significant_code( @@ -8901,7 +8903,7 @@ do { if (op == OP_BRA || op == OP_BRAPOS || op == OP_SBRA || op == OP_SBRAPOS) { - if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE; } @@ -8912,14 +8914,14 @@ do { { int n = GET2(scode, 1+LINK_SIZE); uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE; + if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE; } /* Positive forward assertion */ else if (op == OP_ASSERT || op == OP_ASSERT_NA) { - if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE; } /* Condition. If there is no second branch, it can't be anchored. 
*/ @@ -8927,7 +8929,7 @@ do { else if (op == OP_COND || op == OP_SCOND) { if (scode[GET(scode,1)] != OP_ALT) return FALSE; - if (!is_anchored(scode, bracket_map, cb, atomcount, inassert)) + if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE; } @@ -8935,7 +8937,7 @@ do { else if (op == OP_ONCE) { - if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert)) + if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor)) return FALSE; } @@ -8950,8 +8952,7 @@ do { op == OP_TYPEPOSSTAR)) { if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || inassert || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) + atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor) return FALSE; } @@ -8988,13 +8989,14 @@ or *SKIP does not count, because once again the assumption no longer holds. cb points to the compile data atomcount atomic group level inassert TRUE if in an assertion + dotstar_anchor TRUE if automatic anchoring optimization is enabled Returns: TRUE or FALSE */ static BOOL is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb, - int atomcount, BOOL inassert) + int atomcount, BOOL inassert, BOOL dotstar_anchor) { do { PCRE2_SPTR scode = first_significant_code( @@ -9025,7 +9027,8 @@ do { return FALSE; default: /* Assertion */ - if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE; + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) + return FALSE; do scode += GET(scode, 1); while (*scode == OP_ALT); scode += 1 + LINK_SIZE; break; @@ -9039,7 +9042,7 @@ do { if (op == OP_BRA || op == OP_BRAPOS || op == OP_SBRA || op == OP_SBRAPOS) { - if (!is_startline(scode, bracket_map, cb, atomcount, inassert)) + if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE; } @@ -9050,14 +9053,15 @@ do { { int n = GET2(scode, 1+LINK_SIZE); unsigned 
int new_map = bracket_map | ((n < 32)? (1u << n) : 1); - if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE; + if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) + return FALSE; } /* Positive forward assertions */ else if (op == OP_ASSERT || op == OP_ASSERT_NA) { - if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) + if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE; } @@ -9065,7 +9069,7 @@ do { else if (op == OP_ONCE) { - if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert)) + if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor)) return FALSE; } @@ -9079,8 +9083,7 @@ do { else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR) { if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 || - atomcount > 0 || cb->had_pruneorskip || inassert || - (cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0) + atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor) return FALSE; } @@ -10362,6 +10365,10 @@ int regexrc; /* Return from compile */ uint32_t i; /* Local loop counter */ +/* Enable all optimizations by default. */ +uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags : + PCRE2_OPTIMIZATION_ALL; + /* Comments at the head of this file explain about these variables. */ uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE]; @@ -10432,6 +10439,18 @@ if (patlen > ccontext->max_pattern_length) return NULL; } +/* Optimization flags in 'options' can override those in the compile context. +This is because some options to disable optimizations were added before the +optimization flags word existed, and we need to continue supporting them +for backwards compatibility. 
*/ + +if (options & PCRE2_NO_AUTO_POSSESS) + optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS; +if (options & PCRE2_NO_DOTSTAR_ANCHOR) + optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR; +if (options & PCRE2_NO_START_OPTIMIZE) + optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE; + /* From here on, all returns from this function should end up going via the EXIT label. */ @@ -10568,6 +10587,32 @@ if ((options & PCRE2_LITERAL) == 0) else limit_depth = c; skipatstart = ++pp; break; + + case PSO_OPTMZ: + optim_flags &= ~(p->value); + + /* For backward compatibility the three original VERBs to disable + optimizations need to also update the corresponding external option. */ + + switch(p->value) + { + case PCRE2_OPTIM_AUTO_POSSESS: + cb.external_options |= PCRE2_NO_AUTO_POSSESS; + break; + + case PCRE2_OPTIM_DOTSTAR_ANCHOR: + cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR; + break; + + case PCRE2_OPTIM_START_OPTIMIZE: + cb.external_options |= PCRE2_NO_START_OPTIMIZE; + break; + } + + break; + + default: + PCRE2_UNREACHABLE(); } break; /* Out of the table scan loop */ } @@ -10863,6 +10908,7 @@ re->top_bracket = 0; re->top_backref = 0; re->name_entry_size = cb.name_entry_size; re->name_count = cb.names_found; +re->optimization_flags = optim_flags; /* The basic block is immediately followed by the name table, and the compiled code follows after that. */ @@ -11005,7 +11051,7 @@ used in this code because at least one compiler gives a warning about loss of "const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the function call. */ -if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0) +if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS)) { PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80; @@ -11022,17 +11068,17 @@ there are no occurrences of *PRUNE or *SKIP (though there is an option to disable this case). 
*/ if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_anchored(codestart, 0, &cb, 0, FALSE)) + is_anchored(codestart, 0, &cb, 0, FALSE, (optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0)) re->overall_options |= PCRE2_ANCHORED; /* Set up the first code unit or startline flag, the required code unit, and -then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE -is set, as the data it would create will not be used. Note that a first code +then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE +is disabled, as the data it would create will not be used. Note that a first code unit (but not the startline flag) is useful for anchored patterns because it can still give a quick "no match" and also avoid searching for a last code unit. */ -if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) +if (optim_flags & PCRE2_OPTIM_START_OPTIMIZE) { int minminlength = 0; /* For minimal minlength from first/required CU */ @@ -11096,7 +11142,7 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) that disables this case.) */ else if ((re->overall_options & PCRE2_ANCHORED) == 0 && - is_startline(codestart, 0, &cb, 0, FALSE)) + is_startline(codestart, 0, &cb, 0, FALSE, (optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0)) re->flags |= PCRE2_STARTLINE; /* Handle the "required code unit", if one is set. 
In the UTF case we can diff --git a/src/pcre2_context.c b/src/pcre2_context.c index 84a967d7a..382489c8b 100644 --- a/src/pcre2_context.c +++ b/src/pcre2_context.c @@ -141,7 +141,8 @@ pcre2_compile_context PRIV(default_compile_context) = { NEWLINE_DEFAULT, /* Newline convention */ PARENS_NEST_LIMIT, /* As it says */ 0, /* Extra options */ - MAX_VARLOOKBEHIND /* As it says */ + MAX_VARLOOKBEHIND, /* As it says */ + PCRE2_OPTIMIZATION_ALL /* All optimizations enabled */ }; /* The create function copies the default into the new memory, but must @@ -409,6 +410,29 @@ ccontext->stack_guard_data = user_data; return 0; } +PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION +pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive) +{ +if (directive == PCRE2_OPTIMIZATION_NONE) + { + ccontext->optimization_flags = 0; + } +else if (directive == PCRE2_OPTIMIZATION_FULL) + { + ccontext->optimization_flags = PCRE2_OPTIMIZATION_ALL; + } +else if (directive >= PCRE2_AUTO_POSSESS && directive <= PCRE2_START_OPTIMIZE_OFF) + { + /* Even directive numbers switch a bit on, odd numbers switch a bit off. + * 64-65 affect the LSB, 66-67 the 2 bit, 68-69 the 4 bit, and so on. */ + if (directive & 0x1) + ccontext->optimization_flags &= ~(1 << ((directive >> 1) - 32)); + else + ccontext->optimization_flags |= 1 << ((directive >> 1) - 32); + } + +return 0; +} /* ------------ Match context ------------ */ diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index 3e34c7ca5..9f44b0d58 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3432,7 +3432,7 @@ if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the options variable for this function. 
Users of PCRE2 who are not calling the function directly would like to have a way of setting these flags, in the same -way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which can now be transferred to the options for this function. The bits are guaranteed to be @@ -3699,7 +3699,7 @@ for (;;) these, for testing and for ensuring that all callouts do actually occur. The optimizations must also be avoided when restarting a DFA match. */ - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && + if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) && (options & PCRE2_DFA_RESTART) == 0) { /* If firstline is TRUE, the start of the match is constrained to the first diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 043d2c563..1b9bdc6a1 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -609,6 +609,13 @@ total length of the tables. */ #define ctypes_offset (cbits_offset + cbit_length) /* Character types */ #define TABLES_LENGTH (ctypes_offset + 256) +/* Private flags used in compile_context.optimization_flags */ + +#define PCRE2_OPTIM_AUTO_POSSESS 0x00000001u +#define PCRE2_OPTIM_DOTSTAR_ANCHOR 0x00000002u +#define PCRE2_OPTIM_START_OPTIMIZE 0x00000004u + +#define PCRE2_OPTIMIZATION_ALL 0x00000007u /* -------------------- Character and string names ------------------------ */ diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index a798cdd4f..6c14be8dc 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -579,6 +579,7 @@ typedef struct pcre2_real_compile_context { uint32_t parens_nest_limit; uint32_t extra_options; uint32_t max_varlookbehind; + uint32_t optimization_flags; } pcre2_real_compile_context; /* The real match context structure. 
*/ @@ -646,6 +647,7 @@ typedef struct pcre2_real_code { uint16_t top_backref; /* Highest numbered back reference */ uint16_t name_entry_size; /* Size (code units) of table entries */ uint16_t name_count; /* Number of name entries in the table */ + uint32_t optimization_flags; /* Optimizations enabled at compile time */ } pcre2_real_code; /* The real match data structure. Define ovector as large as it can ever diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 5de4666d1..78ba5067c 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -14474,7 +14474,7 @@ if (!check_opcode_types(common, common->start, ccend)) } /* Checking flags and updating ovector_start. */ -if (mode == PCRE2_JIT_COMPLETE && (re->flags & PCRE2_LASTSET) != 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) +if (mode == PCRE2_JIT_COMPLETE && (re->flags & PCRE2_LASTSET) != 0 && (re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE)) { common->req_char_ptr = common->ovector_start; common->ovector_start += sizeof(sljit_sw); @@ -14534,7 +14534,7 @@ memset(common->private_data_ptrs, 0, total_length * sizeof(sljit_s32)); private_data_size = common->cbra_ptr + (re->top_bracket + 1) * sizeof(sljit_sw); -if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 && !common->has_skip_in_assert_back) +if ((re->overall_options & PCRE2_ANCHORED) == 0 && (re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) && !common->has_skip_in_assert_back) detect_early_fail(common, common->start, &private_data_size, 0, 0); set_private_data_ptrs(common, &private_data_size, ccend); @@ -14600,7 +14600,7 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) mainloop_label = mainloop_entry(common); continue_match_label = LABEL(); /* Forward search if possible. 
*/ - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) + if (re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) { if (mode == PCRE2_JIT_COMPLETE && fast_forward_first_n_chars(common)) ; @@ -14615,7 +14615,7 @@ if ((re->overall_options & PCRE2_ANCHORED) == 0) else continue_match_label = LABEL(); -if (mode == PCRE2_JIT_COMPLETE && re->minlength > 0 && (re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) +if (mode == PCRE2_JIT_COMPLETE && re->minlength > 0 && (re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE)) { OP1(SLJIT_MOV, SLJIT_RETURN_REG, 0, SLJIT_IMM, PCRE2_ERROR_NOMATCH); OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(re->minlength)); diff --git a/src/pcre2_match.c b/src/pcre2_match.c index f55410394..54b7232f3 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6788,7 +6788,7 @@ if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8) /* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the options variable for this function. Users of PCRE2 who are not calling the function directly would like to have a way of setting these flags, in the same -way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with +way that they can set pcre2_compile() flags like PCRE2_NO_AUTO_POSSESS with constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and (*NOTEMPTY_ATSTART) set bits in the pattern's "flag" function which we now transfer to the options for this function. The bits are guaranteed to be @@ -7326,7 +7326,7 @@ for(;;) However, there is an option (settable at compile time) that disables these, for testing and for ensuring that all callouts do actually occur. */ - if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0) + if (re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) { /* If firstline is TRUE, the start of the match is constrained to the first line of a multiline string. 
That is, the match must be before or at the diff --git a/src/pcre2test.c b/src/pcre2test.c index d8f5d6483..1379aec0e 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -468,6 +468,7 @@ enum { MOD_CTC, /* Applies to a compile context */ MOD_NL, /* Is a newline value */ MOD_NN, /* Is a number or a name; more than one may occur */ MOD_OPT, /* Is an option bit */ + MOD_OPTMZ, /* Is an optimization directive */ MOD_SIZ, /* Is a PCRE2_SIZE value */ MOD_STR }; /* Is a string */ @@ -661,6 +662,8 @@ static modstruct modlist[] = { { "ascii_digit", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_DIGIT, CO(extra_options) }, { "ascii_posix", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ASCII_POSIX, CO(extra_options) }, { "auto_callout", MOD_PAT, MOD_OPT, PCRE2_AUTO_CALLOUT, PO(options) }, + { "auto_possess", MOD_CTC, MOD_OPTMZ, PCRE2_AUTO_POSSESS, 0 }, + { "auto_possess_off", MOD_CTC, MOD_OPTMZ, PCRE2_AUTO_POSSESS_OFF, 0 }, { "bad_escape_is_literal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL, CO(extra_options) }, { "bincode", MOD_PAT, MOD_CTL, CTL_BINCODE, PO(control) }, { "bsr", MOD_CTC, MOD_BSR, 0, CO(bsr_convention) }, @@ -688,6 +691,8 @@ static modstruct modlist[] = { { "disable_recurseloop_check", MOD_DAT, MOD_OPT, PCRE2_DISABLE_RECURSELOOP_CHECK, DO(options) }, { "dollar_endonly", MOD_PAT, MOD_OPT, PCRE2_DOLLAR_ENDONLY, PO(options) }, { "dotall", MOD_PATP, MOD_OPT, PCRE2_DOTALL, PO(options) }, + { "dotstar_anchor", MOD_CTC, MOD_OPTMZ, PCRE2_DOTSTAR_ANCHOR, 0 }, + { "dotstar_anchor_off", MOD_CTC, MOD_OPTMZ, PCRE2_DOTSTAR_ANCHOR_OFF, 0 }, { "dupnames", MOD_PATP, MOD_OPT, PCRE2_DUPNAMES, PO(options) }, { "endanchored", MOD_PD, MOD_OPT, PCRE2_ENDANCHORED, PD(options) }, { "escaped_cr_is_lf", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ESCAPED_CR_IS_LF, CO(extra_options) }, @@ -744,6 +749,8 @@ static modstruct modlist[] = { { "null_subject", MOD_DAT, MOD_CTL, CTL2_NULL_SUBJECT, DO(control2) }, { "offset", MOD_DAT, MOD_INT, 0, DO(offset) }, { "offset_limit", MOD_CTM, MOD_SIZ, 0, MO(offset_limit)}, + { 
"optimization_full", MOD_CTC, MOD_OPTMZ, PCRE2_OPTIMIZATION_FULL, 0 }, + { "optimization_none", MOD_CTC, MOD_OPTMZ, PCRE2_OPTIMIZATION_NONE, 0 }, { "ovector", MOD_DAT, MOD_INT, 0, DO(oveccount) }, { "parens_nest_limit", MOD_CTC, MOD_INT, 0, CO(parens_nest_limit) }, { "partial_hard", MOD_DAT, MOD_OPT, PCRE2_PARTIAL_HARD, DO(options) }, @@ -760,6 +767,8 @@ static modstruct modlist[] = { { "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) }, { "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) }, { "stackguard", MOD_PAT, MOD_INT, 0, PO(stackguard_test) }, + { "start_optimize", MOD_CTC, MOD_OPTMZ, PCRE2_START_OPTIMIZE, 0 }, + { "start_optimize_off", MOD_CTC, MOD_OPTMZ, PCRE2_START_OPTIMIZE_OFF, 0 }, { "startchar", MOD_PND, MOD_CTL, CTL_STARTCHAR, PO(control) }, { "startoffset", MOD_DAT, MOD_INT, 0, DO(offset) }, { "subject_literal", MOD_PATP, MOD_CTL, CTL2_SUBJECT_LITERAL, PO(control2) }, @@ -3884,7 +3893,7 @@ for (;;) when needed. */ m = modlist + index; /* Save typing */ - if (m->type != MOD_CTL && m->type != MOD_OPT && + if (m->type != MOD_CTL && m->type != MOD_OPT && m->type != MOD_OPTMZ && (m->type != MOD_IND || *pp == '=')) { if (*pp++ != '=') @@ -3925,6 +3934,21 @@ for (;;) else *((uint32_t *)field) |= m->value; break; + case MOD_OPTMZ: +#ifdef SUPPORT_PCRE2_8 + if (test_mode == PCRE8_MODE) + pcre2_set_optimize_8((pcre2_compile_context_8*)field, m->value); +#endif +#ifdef SUPPORT_PCRE2_16 + if (test_mode == PCRE16_MODE) + pcre2_set_optimize_16((pcre2_compile_context_16*)field, m->value); +#endif +#ifdef SUPPORT_PCRE2_32 + if (test_mode == PCRE32_MODE) + pcre2_set_optimize_32((pcre2_compile_context_32*)field, m->value); +#endif + break; + case MOD_BSR: if (len == 7 && strncmpic(pp, (const uint8_t *)"default", 7) == 0) { @@ -4361,6 +4385,33 @@ else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", } +/************************************************* +* Show optimization flags * +*************************************************/ + +/* 
+Arguments: + flags an options word + before text to print before + after text to print after + +Returns: nothing +*/ + +static void +show_optimize_flags(uint32_t flags, const char *before, const char *after) +{ +if (flags == 0) fprintf(outfile, "%s%s", before, after); +else fprintf(outfile, "%s%s%s%s%s%s%s", + before, + ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0) ? "auto_possess" : "", + ((flags & PCRE2_OPTIM_AUTO_POSSESS) != 0 && (flags >> 1) != 0) ? "," : "", + ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0) ? "dotstar_anchor" : "", + ((flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0 && (flags >> 2) != 0) ? "," : "", + ((flags & PCRE2_OPTIM_START_OPTIMIZE) != 0) ? "start_optimize" : "", + after); +} + #ifdef SUPPORT_PCRE2_8 /************************************************* @@ -4777,6 +4828,9 @@ if ((pat_patctl.control & CTL_INFO) != 0) if (extra_options != 0) show_compile_extra_options(extra_options, "Extra options:", "\n"); + if (FLD(compiled_code, optimization_flags) != PCRE2_OPTIMIZATION_ALL) + show_optimize_flags(FLD(compiled_code, optimization_flags), "Optimizations: ", "\n"); + if (jchanged) fprintf(outfile, "Duplicate name status changes\n"); if ((pat_patctl.control2 & CTL2_BSR_SET) != 0 || @@ -4879,7 +4933,7 @@ if ((pat_patctl.control & CTL_INFO) != 0) } } - if ((FLD(compiled_code, overall_options) & PCRE2_NO_START_OPTIMIZE) == 0) + if (FLD(compiled_code, optimization_flags) & PCRE2_OPTIM_START_OPTIMIZE) fprintf(outfile, "Subject length lower bound = %d\n", minlength); if (pat_patctl.jit != 0 && (pat_patctl.control & CTL_JITVERIFY) != 0) diff --git a/testdata/testinput2 b/testdata/testinput2 index 51e2095c8..a00c1a9ba 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -839,6 +839,8 @@ /[^x]{1,3}+/Bi,no_auto_possess +/x{1,3}+/IB,auto_possess_off + /(x)*+/IB /^(\w++|\s++)*$/I @@ -4060,6 +4062,8 @@ /abcd/I,no_start_optimize +/abcd/I,start_optimize_off + /(|ab)*?d/I abd xyd @@ -4224,6 +4228,8 @@ /^abc/info,no_dotstar_anchor +/^abc/info,dotstar_anchor_off + 
/.*\d/info,auto_callout \= Expect no match aaa @@ -6390,6 +6396,18 @@ a)"xI ab ac +# Tests for pcre2_set_optimize() + +/abc/I,optimization_none + +/abc/I,optimization_none,auto_possess + +/abc/I,optimization_none,dotstar_anchor,auto_possess + +/abc/I,optimization_none,start_optimize + +/abc/I,dotstar_anchor_off,optimization_full + # -------------- # End of testinput2 diff --git a/testdata/testoutput15 b/testdata/testoutput15 index f36faeeaf..892473bc9 100644 --- a/testdata/testoutput15 +++ b/testdata/testoutput15 @@ -477,6 +477,7 @@ Failed: error -52: nested recursion at the same subject position ------------------------------------------------------------------ Capture group count = 0 Options: no_auto_possess +Optimizations: dotstar_anchor,start_optimize Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 @@ -501,6 +502,7 @@ No match Capture group count = 0 Compile options: Overall options: no_auto_possess +Optimizations: dotstar_anchor,start_optimize Starting code units: 0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z Subject length lower bound = 1 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index eeb635d6d..edca460fa 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -2978,6 +2978,19 @@ Subject length lower bound = 1 End ------------------------------------------------------------------ +/x{1,3}+/IB,auto_possess_off +------------------------------------------------------------------ + Bra + x + x{0,2}+ + Ket + End +------------------------------------------------------------------ +Capture group count = 0 +Optimizations: dotstar_anchor,start_optimize +First code unit = 'x' +Subject length lower bound = 1 + /(x)*+/IB ------------------------------------------------------------------ Bra @@ -13601,6 +13614,11 @@ Subject length lower bound 
= 4 /abcd/I,no_start_optimize Capture group count = 0 Options: no_start_optimize +Optimizations: auto_possess,dotstar_anchor + +/abcd/I,start_optimize_off +Capture group count = 0 +Optimizations: auto_possess,dotstar_anchor /(|ab)*?d/I Capture group count = 1 @@ -13616,6 +13634,7 @@ Subject length lower bound = 1 /(|ab)*?d/I,no_start_optimize Capture group count = 1 Options: no_start_optimize +Optimizations: auto_possess,dotstar_anchor abd 0: abd 1: ab @@ -13887,6 +13906,15 @@ Subject length lower bound = 3 Capture group count = 0 Compile options: no_dotstar_anchor Overall options: anchored no_dotstar_anchor +Optimizations: auto_possess,start_optimize +First code unit = 'a' +Subject length lower bound = 3 + +/^abc/info,dotstar_anchor_off +Capture group count = 0 +Compile options: +Overall options: anchored +Optimizations: auto_possess,start_optimize First code unit = 'a' Subject length lower bound = 3 @@ -13908,6 +13936,7 @@ No match /.*\d/info,no_dotstar_anchor,auto_callout Capture group count = 0 Options: auto_callout no_dotstar_anchor +Optimizations: auto_possess,start_optimize Subject length lower bound = 1 \= Expect no match aaa @@ -13935,12 +13964,14 @@ Subject length lower bound = 1 /.*\d/dotall,no_dotstar_anchor,info Capture group count = 0 Options: dotall no_dotstar_anchor +Optimizations: auto_possess,start_optimize Subject length lower bound = 1 /(*NO_DOTSTAR_ANCHOR)(?s).*\d/info Capture group count = 0 Compile options: Overall options: no_dotstar_anchor +Optimizations: auto_possess,start_optimize Subject length lower bound = 1 '^(?:(a)|b)(?(1)A|B)' @@ -18049,12 +18080,14 @@ Subject length lower bound = 1 /a?(?=b(*COMMIT)c|)d/I,no_start_optimize Capture group count = 0 Options: no_start_optimize +Optimizations: auto_possess,dotstar_anchor bd No match /(?=b(*COMMIT)c|)d/I,no_start_optimize Capture group count = 0 Options: no_start_optimize +Optimizations: auto_possess,dotstar_anchor bd No match @@ -19060,6 +19093,33 @@ No match ac No match +# Tests for 
pcre2_set_optimize() + +/abc/I,optimization_none +Capture group count = 0 +Optimizations: + +/abc/I,optimization_none,auto_possess +Capture group count = 0 +Optimizations: auto_possess + +/abc/I,optimization_none,dotstar_anchor,auto_possess +Capture group count = 0 +Optimizations: auto_possess,dotstar_anchor + +/abc/I,optimization_none,start_optimize +Capture group count = 0 +Optimizations: start_optimize +First code unit = 'a' +Last code unit = 'c' +Subject length lower bound = 3 + +/abc/I,dotstar_anchor_off,optimization_full +Capture group count = 0 +First code unit = 'a' +Last code unit = 'c' +Subject length lower bound = 3 + # -------------- # End of testinput2 diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 1b658f99e..befccd419 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -474,6 +474,7 @@ Subject length lower bound = 0 Capture group count = 0 Compile options: no_start_optimize utf Overall options: anchored no_start_optimize utf +Optimizations: auto_possess,dotstar_anchor /()()()()()()()()()() ()()()()()()()()()() diff --git a/testdata/testoutput6 b/testdata/testoutput6 index 283b00da0..63ec1ee29 100644 --- a/testdata/testoutput6 +++ b/testdata/testoutput6 @@ -6860,6 +6860,7 @@ No match /(abc|def|xyz)/I,no_start_optimize Capture group count = 1 Options: no_start_optimize +Optimizations: auto_possess,dotstar_anchor terhjk;abcdaadsfe 0: abc the quick xyz brown fox