Skip to content

Commit

Permalink
Add new API function pcre2_set_optimize() for controlling enabled…
Browse files Browse the repository at this point in the history
… optimizations

It is anticipated that over time, more and more optimizations will be
added to PCRE2, and we want to be able to switch optimizations off/on,
both for testing purposes and to be able to work around bugs in a
released library version.

The number of free bits left in the compile options word is very small.
Hence, we will start putting all optimization enable/disable flags in
a separate word. To switch these off/on, the new API function
pcre2_set_optimize() will be used.

The values which can be passed to pcre2_set_optimize() are
different from the internal flag bit values. The values accepted by
pcre2_set_optimize() are contiguous integers, so there is no
danger of ever running out of them. This means that, in the future, the
internal representation can be changed at any time without breaking
backwards compatibility. Further, the 'directives' passed to
pcre2_set_optimize() are not restricted to controlling a single,
specific optimization. As an example, passing PCRE2_FULL_OPTIMIZATION
will turn on all optimizations supported by whatever version of
PCRE2 the client program happens to be linked with.
  • Loading branch information
alexdowad committed Sep 14, 2024
1 parent 5e75d9b commit 3bac482
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 35 deletions.
12 changes: 12 additions & 0 deletions src/pcre2.h.generic
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,17 @@ released, the numbers must not be changed. */
#define PCRE2_CONFIG_COMPILED_WIDTHS 14
#define PCRE2_CONFIG_TABLES_LENGTH 15

/* Optimization directives for pcre2_set_optimize().
For binary compatibility, only add to this list; do not renumber.

Each directive is a small integer, not a bit mask: pcre2_set_optimize() uses
it as an index into its tables of internal optimization flags to switch on or
off. PCRE2_NO_OPTIMIZATION and PCRE2_FULL_OPTIMIZATION act on the whole flags
word; the remaining directives come in DO/NO pairs, each pair switching one
individual optimization on or off.

NOTE(review): PCRE2_NO_AUTO_POSSESS, PCRE2_NO_DOTSTAR_ANCHOR and
PCRE2_NO_START_OPTIMIZE are also tested as bits of a compile "options" word in
pcre2_compile.c. Confirm that defining them here with these values does not
clash with an existing definition of the same names. */

#define PCRE2_NO_OPTIMIZATION 0
#define PCRE2_FULL_OPTIMIZATION 1
#define PCRE2_DO_AUTO_POSSESS 2
#define PCRE2_NO_AUTO_POSSESS 3
#define PCRE2_DO_DOTSTAR_ANCHOR 4
#define PCRE2_NO_DOTSTAR_ANCHOR 5
#define PCRE2_DO_START_OPTIMIZE 6
#define PCRE2_NO_START_OPTIMIZE 7

/* Types for code units in patterns and subject strings. */

Expand Down Expand Up @@ -912,6 +923,7 @@ pcre2_compile are called by application code. */
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_)
#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_)
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
Expand Down
12 changes: 12 additions & 0 deletions src/pcre2.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,17 @@ released, the numbers must not be changed. */
#define PCRE2_CONFIG_COMPILED_WIDTHS 14
#define PCRE2_CONFIG_TABLES_LENGTH 15

/* Optimization directives for pcre2_set_optimize().
For binary compatibility, only add to this list; do not renumber.

Each directive is a small integer, not a bit mask: pcre2_set_optimize() uses
it as an index into its tables of internal optimization flags to switch on or
off. PCRE2_NO_OPTIMIZATION and PCRE2_FULL_OPTIMIZATION act on the whole flags
word; the remaining directives come in DO/NO pairs, each pair switching one
individual optimization on or off.

NOTE(review): PCRE2_NO_AUTO_POSSESS, PCRE2_NO_DOTSTAR_ANCHOR and
PCRE2_NO_START_OPTIMIZE are also tested as bits of a compile "options" word in
pcre2_compile.c. Confirm that defining them here with these values does not
clash with an existing definition of the same names. */

#define PCRE2_NO_OPTIMIZATION 0
#define PCRE2_FULL_OPTIMIZATION 1
#define PCRE2_DO_AUTO_POSSESS 2
#define PCRE2_NO_AUTO_POSSESS 3
#define PCRE2_DO_DOTSTAR_ANCHOR 4
#define PCRE2_NO_DOTSTAR_ANCHOR 5
#define PCRE2_DO_START_OPTIMIZE 6
#define PCRE2_NO_START_OPTIMIZE 7

/* Types for code units in patterns and subject strings. */

Expand Down Expand Up @@ -912,6 +923,7 @@ pcre2_compile are called by application code. */
#define pcre2_set_newline PCRE2_SUFFIX(pcre2_set_newline_)
#define pcre2_set_parens_nest_limit PCRE2_SUFFIX(pcre2_set_parens_nest_limit_)
#define pcre2_set_offset_limit PCRE2_SUFFIX(pcre2_set_offset_limit_)
#define pcre2_set_optimize PCRE2_SUFFIX(pcre2_set_optimize_)
#define pcre2_set_substitute_callout PCRE2_SUFFIX(pcre2_set_substitute_callout_)
#define pcre2_substitute PCRE2_SUFFIX(pcre2_substitute_)
#define pcre2_substring_copy_byname PCRE2_SUFFIX(pcre2_substring_copy_byname_)
Expand Down
77 changes: 51 additions & 26 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,8 @@ enum { PSO_OPT, /* Value is an option bit */
PSO_BSR, /* Value is a \R type */
PSO_LIMH, /* Read integer value for heap limit */
PSO_LIMM, /* Read integer value for match limit */
PSO_LIMD /* Read integer value for depth limit */
PSO_LIMD, /* Read integer value for depth limit */
PSO_OPTMZ /* Value is an optimization bit */
};

typedef struct pso {
Expand All @@ -852,10 +853,10 @@ static const pso pso_list[] = {
{ STRING_UCP_RIGHTPAR, 4, PSO_OPT, PCRE2_UCP },
{ STRING_NOTEMPTY_RIGHTPAR, 9, PSO_FLG, PCRE2_NOTEMPTY_SET },
{ STRING_NOTEMPTY_ATSTART_RIGHTPAR, 17, PSO_FLG, PCRE2_NE_ATST_SET },
{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPT, PCRE2_NO_AUTO_POSSESS },
{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPT, PCRE2_NO_DOTSTAR_ANCHOR },
{ STRING_NO_AUTO_POSSESS_RIGHTPAR, 16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
{ STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
{ STRING_NO_JIT_RIGHTPAR, 7, PSO_FLG, PCRE2_NOJIT },
{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPT, PCRE2_NO_START_OPTIMIZE },
{ STRING_NO_START_OPT_RIGHTPAR, 13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
{ STRING_LIMIT_HEAP_EQ, 11, PSO_LIMH, 0 },
{ STRING_LIMIT_MATCH_EQ, 12, PSO_LIMM, 0 },
{ STRING_LIMIT_DEPTH_EQ, 12, PSO_LIMD, 0 },
Expand Down Expand Up @@ -8883,13 +8884,14 @@ this prevents the number of characters it matches from being adjusted.
cb points to the compile data block
atomcount atomic group level
inassert TRUE if in an assertion
dotstar_anchor TRUE if automatic anchoring optimization is enabled
Returns: TRUE or FALSE
*/

static BOOL
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
int atomcount, BOOL inassert)
int atomcount, BOOL inassert, BOOL dotstar_anchor)
{
do {
PCRE2_SPTR scode = first_significant_code(
Expand All @@ -8901,7 +8903,7 @@ do {
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
return FALSE;
}

Expand All @@ -8912,30 +8914,30 @@ do {
{
int n = GET2(scode, 1+LINK_SIZE);
uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
if (!is_anchored(scode, new_map, cb, atomcount, inassert)) return FALSE;
if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
}

/* Positive forward assertion */

else if (op == OP_ASSERT || op == OP_ASSERT_NA)
{
if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
}

/* Condition. If there is no second branch, it can't be anchored. */

else if (op == OP_COND || op == OP_SCOND)
{
if (scode[GET(scode,1)] != OP_ALT) return FALSE;
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert))
if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
return FALSE;
}

/* Atomic groups */

else if (op == OP_ONCE)
{
if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert))
if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
return FALSE;
}

Expand All @@ -8950,8 +8952,7 @@ do {
op == OP_TYPEPOSSTAR))
{
if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
return FALSE;
}

Expand Down Expand Up @@ -8988,13 +8989,14 @@ or *SKIP does not count, because once again the assumption no longer holds.
cb points to the compile data
atomcount atomic group level
inassert TRUE if in an assertion
dotstar_anchor TRUE if automatic anchoring optimization is enabled
Returns: TRUE or FALSE
*/

static BOOL
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
int atomcount, BOOL inassert)
int atomcount, BOOL inassert, BOOL dotstar_anchor)
{
do {
PCRE2_SPTR scode = first_significant_code(
Expand Down Expand Up @@ -9025,7 +9027,8 @@ do {
return FALSE;

default: /* Assertion */
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE)) return FALSE;
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
return FALSE;
do scode += GET(scode, 1); while (*scode == OP_ALT);
scode += 1 + LINK_SIZE;
break;
Expand All @@ -9039,7 +9042,7 @@ do {
if (op == OP_BRA || op == OP_BRAPOS ||
op == OP_SBRA || op == OP_SBRAPOS)
{
if (!is_startline(scode, bracket_map, cb, atomcount, inassert))
if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
return FALSE;
}

Expand All @@ -9050,22 +9053,23 @@ do {
{
int n = GET2(scode, 1+LINK_SIZE);
unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
if (!is_startline(scode, new_map, cb, atomcount, inassert)) return FALSE;
if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
return FALSE;
}

/* Positive forward assertions */

else if (op == OP_ASSERT || op == OP_ASSERT_NA)
{
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE))
if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
return FALSE;
}

/* Atomic brackets */

else if (op == OP_ONCE)
{
if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert))
if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
return FALSE;
}

Expand All @@ -9079,8 +9083,7 @@ do {
else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
{
if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
atomcount > 0 || cb->had_pruneorskip || inassert ||
(cb->external_options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
return FALSE;
}

Expand Down Expand Up @@ -10362,6 +10365,8 @@ int regexrc; /* Return from compile */

uint32_t i; /* Local loop counter */

uint32_t optim_flags = ccontext->optimization_flags;

/* Comments at the head of this file explain about these variables. */

uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
Expand Down Expand Up @@ -10432,6 +10437,18 @@ if (patlen > ccontext->max_pattern_length)
return NULL;
}

/* Optimization flags in 'options' can override those in the compile context.
This is because some options to disable optimizations were added before the
optimization flags word existed, and we need to continue supporting them
for backwards compatibility. */

if (options & PCRE2_NO_AUTO_POSSESS)
optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
if (options & PCRE2_NO_DOTSTAR_ANCHOR)
optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
if (options & PCRE2_NO_START_OPTIMIZE)
optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;

/* From here on, all returns from this function should end up going via the
EXIT label. */

Expand Down Expand Up @@ -10568,6 +10585,13 @@ if ((options & PCRE2_LITERAL) == 0)
else limit_depth = c;
skipatstart = ++pp;
break;

case PSO_OPTMZ:
optim_flags &= ~(p->value);
break;

default:
PCRE2_UNREACHABLE();
}
break; /* Out of the table scan loop */
}
Expand Down Expand Up @@ -10863,6 +10887,7 @@ re->top_bracket = 0;
re->top_backref = 0;
re->name_entry_size = cb.name_entry_size;
re->name_count = cb.names_found;
re->optimization_flags = optim_flags;

/* The basic block is immediately followed by the name table, and the compiled
code follows after that. */
Expand Down Expand Up @@ -11005,7 +11030,7 @@ used in this code because at least one compiler gives a warning about loss of
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
function call. */

if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS))
{
PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;
Expand All @@ -11022,17 +11047,17 @@ there are no occurrences of *PRUNE or *SKIP (though there is an option to
disable this case). */

if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_anchored(codestart, 0, &cb, 0, FALSE))
is_anchored(codestart, 0, &cb, 0, FALSE, optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR))
re->overall_options |= PCRE2_ANCHORED;

/* Set up the first code unit or startline flag, the required code unit, and
then study the pattern. This code need not be obeyed if PCRE2_NO_START_OPTIMIZE
is set, as the data it would create will not be used. Note that a first code
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
is disabled, as the data it would create will not be used. Note that a first code
unit (but not the startline flag) is useful for anchored patterns because it
can still give a quick "no match" and also avoid searching for a last code
unit. */

if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
if (optim_flags & PCRE2_OPTIM_START_OPTIMIZE)
{
int minminlength = 0; /* For minimal minlength from first/required CU */

Expand Down Expand Up @@ -11096,7 +11121,7 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)
that disables this case.) */

else if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
is_startline(codestart, 0, &cb, 0, FALSE))
is_startline(codestart, 0, &cb, 0, FALSE, optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR))
re->flags |= PCRE2_STARTLINE;

/* Handle the "required code unit", if one is set. In the UTF case we can
Expand Down
23 changes: 22 additions & 1 deletion src/pcre2_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ pcre2_compile_context PRIV(default_compile_context) = {
NEWLINE_DEFAULT, /* Newline convention */
PARENS_NEST_LIMIT, /* As it says */
0, /* Extra options */
MAX_VARLOOKBEHIND /* As it says */
MAX_VARLOOKBEHIND, /* As it says */
0x7 /* All optimizations enabled */
};

/* The create function copies the default into the new memory, but must
Expand Down Expand Up @@ -409,6 +410,26 @@ ccontext->stack_guard_data = user_data;
return 0;
}

static uint16_t optimize_setbits = {
OPTIM_SETBITS
};
static uint16_t optimize_clearbits = {
OPTIM_CLEARBITS
}

PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION
pcre2_set_optimize(pcre2_compile_context *ccontext, uint32_t directive)
{
if (directive < (sizeof(optimize_setbits)/sizeof(uint16_t)))
{
/* Both of the public directive → private flag conversion tables should
* be the same size, but let's make sure */
PCRE2_ASSERT(directive < (sizeof(optimize_clearbits)/sizeof(uint16_t)));
cb->optimization_flags |= optimize_setbits[directive];
cb->optimization_flags &= ~optimize_clearbits[directive];
}
return 0;
}

/* ------------ Match context ------------ */

Expand Down
2 changes: 1 addition & 1 deletion src/pcre2_dfa_match.c
Original file line number Diff line number Diff line change
Expand Up @@ -3699,7 +3699,7 @@ for (;;)
these, for testing and for ensuring that all callouts do actually occur.
The optimizations must also be avoided when restarting a DFA match. */

if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
if ((re->optimization_flags & PCRE2_OPTIM_START_OPTIMIZE) &&
(options & PCRE2_DFA_RESTART) == 0)
{
/* If firstline is TRUE, the start of the match is constrained to the first
Expand Down
30 changes: 30 additions & 0 deletions src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,36 @@ total length of the tables. */
#define ctypes_offset (cbits_offset + cbit_length) /* Character types */
#define TABLES_LENGTH (ctypes_offset + 256)

/* Private flags used in compile_context.optimization_flags */

#define PCRE2_OPTIM_AUTO_POSSESS 0x00000001u
#define PCRE2_OPTIM_DOTSTAR_ANCHOR 0x00000002u
#define PCRE2_OPTIM_START_OPTIMIZE 0x00000004u

/* Tables for converting public optimization directives listed in
pcre2.h to private optimization flags. Each macro expands to a comma-separated
initializer list with one entry per public directive, in directive-number
order: "no optimization", "full optimization", then a do/no pair for each
individual optimization. OPTIM_SETBITS lists the flag bits each directive
switches on and OPTIM_CLEARBITS the bits it switches off. The two lists must
be kept the same length. */

#define OPTIM_SETBITS \
  0,                            /* no optimization */ \
  0xff,                         /* full optimization */ \
  PCRE2_OPTIM_AUTO_POSSESS,     /* do auto-possess */ \
  0,                            /* no auto-possess */ \
  PCRE2_OPTIM_DOTSTAR_ANCHOR,   /* do dotstar anchor */ \
  0,                            /* no dotstar anchor */ \
  PCRE2_OPTIM_START_OPTIMIZE,   /* do start optimize */ \
  0                             /* no start optimize */

#define OPTIM_CLEARBITS \
  0xff,                         /* no optimization */ \
  0,                            /* full optimization */ \
  0,                            /* do auto-possess */ \
  PCRE2_OPTIM_AUTO_POSSESS,     /* no auto-possess */ \
  0,                            /* do dotstar anchor */ \
  PCRE2_OPTIM_DOTSTAR_ANCHOR,   /* no dotstar anchor */ \
  0,                            /* do start optimize */ \
  PCRE2_OPTIM_START_OPTIMIZE    /* no start optimize */

/* -------------------- Character and string names ------------------------ */

Expand Down
Loading

0 comments on commit 3bac482

Please sign in to comment.