Skip to content

Commit

Permalink
Add implementation of \g<...> and $<...> in substitute
Browse files Browse the repository at this point in the history
  • Loading branch information
NWilson committed Sep 18, 2024
1 parent d8b7f31 commit 6ef4ea8
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 11 deletions.
38 changes: 37 additions & 1 deletion src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -1700,7 +1700,7 @@ else
if (cb == NULL)
{
if (c < CHAR_0 ||
(c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x)))
(c > CHAR_9 && (c != CHAR_c && c != CHAR_o && c != CHAR_x && c != CHAR_g)))
{
*errorcodeptr = ERR3;
return 0;
Expand Down Expand Up @@ -1815,6 +1815,11 @@ else
Summary: Return a negative number for a numerical back reference, ESC_k for
a named back reference, and ESC_g for a named or numbered subroutine call.
The above describes the \g behaviour inside patterns. Inside replacement
strings (pcre2_substitute) we support only \g<nameornum> for Python
compatibility. Return ESC_g for the named case, and -num for the
numbered case.
*/

case CHAR_g:
Expand All @@ -1826,6 +1831,36 @@ else
break;
}

if (cb == NULL)
{
/* Substitution strings */
if (*ptr != CHAR_LESS_THAN_SIGN)
{
*errorcodeptr = ERR57;
break;
}

PCRE2_SPTR p = ptr + 1;

if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
errorcodeptr))
{
if (*errorcodeptr == 0) escape = ESC_g; /* No number found */
break;
}

if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
{
/* not advancing ptr; report error at the \g character */
*errorcodeptr = ERR57;
break;
}

ptr = p + 1;
escape = -s;
break;
}

if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
{
escape = ESC_g;
Expand All @@ -1850,6 +1885,7 @@ else

if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
{
/* not advancing ptr; report error at the \g character */
*errorcodeptr = ERR57;
break;
}
Expand Down
72 changes: 62 additions & 10 deletions src/pcre2_substitute.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,7 @@ for (; ptr < ptrend; ptr++)
code->overall_options, code->extra_options, code->top_bracket, FALSE, NULL);
ptr -= 1; /* Back to last code unit of escape */
if (errorcode != 0)
{
rc = errorcode;
goto EXIT;
}
goto BADESCAPE;

switch(erc)
{
Expand All @@ -148,16 +145,28 @@ for (; ptr < ptrend; ptr++)
literal = TRUE;
break;

case ESC_g:
/* The \g<name> form (\g<number> is already handled by check_escape).
Don't worry about finding the matching ">". We are super, super lenient
about validating ${} replacements inside find_text_end(), so we certainly
don't need to worry about other syntax. Importantly, a \g<..> or $<...>
sequence can't contain a '}' character. */
break;

default:
if (erc < 0)
break; /* capture group reference */
rc = PCRE2_ERROR_BADREPESCAPE;
goto EXIT;
goto BADESCAPE;
}
}
}

rc = PCRE2_ERROR_REPMISSINGBRACE; /* Terminator not found */
goto EXIT;

BADESCAPE:
rc = PCRE2_ERROR_BADREPESCAPE;

EXIT:
*ptrptr = ptr;
Expand Down Expand Up @@ -522,6 +531,7 @@ do
PCRE2_SPTR text1_end = NULL;
PCRE2_SPTR text2_start = NULL;
PCRE2_SPTR text2_end = NULL;
PCRE2_UCHAR name[33];

/* If at the end of a nested substring, pop the stack. */

Expand Down Expand Up @@ -551,10 +561,10 @@ do
if (*ptr == CHAR_DOLLAR_SIGN)
{
BOOL inparens;
BOOL inangle;
BOOL star;
PCRE2_SIZE sublength;
PCRE2_UCHAR next;
PCRE2_UCHAR name[33];

if (++ptr >= repend) goto BAD;
if ((next = *ptr) == CHAR_DOLLAR_SIGN) goto LOADLITERAL;
Expand All @@ -567,6 +577,7 @@ do
group = -1;
n = 0;
inparens = FALSE;
inangle = FALSE;
star = FALSE;

if (next == CHAR_LEFT_CURLY_BRACKET)
Expand All @@ -575,15 +586,24 @@ do
next = *ptr;
inparens = TRUE;
}
else if (next == CHAR_LESS_THAN_SIGN)
{
/* JavaScript compatibility syntax, $<name>. Processes only named
groups (not numbered) and does not support extensions such as star
(you can do ${name} and ${*name}, but not $<*name>). */
if (++ptr >= repend) goto BAD;
next = *ptr;
inangle = TRUE;
}

if (next == CHAR_ASTERISK)
if (!inangle && next == CHAR_ASTERISK)
{
if (++ptr >= repend) goto BAD;
next = *ptr;
star = TRUE;
}

if (!star && next >= CHAR_0 && next <= CHAR_9)
if (!star && !inangle && next >= CHAR_0 && next <= CHAR_9)
{
group = next - CHAR_0;
while (++ptr < repend)
Expand Down Expand Up @@ -618,7 +638,7 @@ do
while (MAX_255(next) && (ctypes[next] & ctype_word) != 0)
{
name[n++] = next;
if (n > 32) goto BAD;
if (n > (sizeof(name)/sizeof(*name)) - 1) goto BAD;
if (++ptr >= repend) break;
next = *ptr;
}
Expand Down Expand Up @@ -667,6 +687,13 @@ do
ptr++;
}

if (inangle)
{
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
goto BAD;
ptr++;
}

/* Have found a syntactically correct group number or name, or *name.
Only *MARK is currently recognized. */

Expand Down Expand Up @@ -893,6 +920,31 @@ do
case 0: /* Data character */
goto LITERAL;

case ESC_g:
{
/* Parse the \g<name> form (\g<number> already handled by check_escape) */
if (ptr >= repend || *ptr != CHAR_LESS_THAN_SIGN)
goto BADESCAPE;
++ptr;
PCRE2_SPTR name_start = ptr;
const uint8_t* ctypes = code->tables + ctypes_offset;
while (ptr < repend && *ptr != CHAR_GREATER_THAN_SIGN &&
MAX_255(*ptr) && (ctypes[*ptr] & ctype_word) != 0)
++ptr;
if (ptr >= repend || *ptr != CHAR_GREATER_THAN_SIGN)
goto BADESCAPE;
PCRE2_SIZE name_len = ptr - name_start;
if (name_len == 0 || name_len > (sizeof(name)/sizeof(*name)) - 1)
goto BADESCAPE;
++ptr;

special = 0;
group = -1;
memcpy(name, name_start, name_len * sizeof(*name));
name[name_len] = 0;
goto GROUP_SUBSTITUTE;
}

default:
if (rc < 0)
{
Expand Down
67 changes: 67 additions & 0 deletions testdata/testinput2
Original file line number Diff line number Diff line change
Expand Up @@ -4201,6 +4201,73 @@
123abc123\=substitute_overflow_length,replace=[1]x$1z
123abc123\=substitute_overflow_length,replace=[0]x$1z

/a(b)c/substitute_extended
ZabcZ\=replace=>\1<
ZabcZ\=replace=>\2<
ZabcZ\=replace=>\8<
ZabcZ\=replace=>${1}<
ZabcZ\=replace=>${ 1 }<
ZabcZ\=replace=>${2}<
ZabcZ\=replace=>${8}<
ZabcZ\=replace=>$<1><
ZabcZ\=replace=>$< 1 ><
ZabcZ\=replace=>$<2><
ZabcZ\=replace=>$<8><
ZabcZ\=replace=>\g<1><
ZabcZ\=replace=>\g< 1 ><
ZabcZ\=replace=>\g<2><
ZabcZ\=replace=>\g<8><

/(*:pear)apple/substitute_extended
ZappleZ\=replace=>${*MARK}<
ZappleZ\=replace=>$<*MARK><
ZappleZ\=replace=>\g<*MARK><

/a(?<named>b)c/substitute_extended
ZabcZ\=replace=>${named}<
ZabcZ\=replace=>${noexist}<
ZabcZ\=replace=>${}<
ZabcZ\=replace=>${ }<
ZabcZ\=replace=>${ named }<
ZabcZ\=replace=>$<named><
ZabcZ\=replace=>$<noexist><
ZabcZ\=replace=>$<><
ZabcZ\=replace=>$< ><
ZabcZ\=replace=>$< named ><
ZabcZ\=replace=>\g<named><
ZabcZ\=replace=>\g<noexist><
ZabcZ\=replace=>\g<><
ZabcZ\=replace=>\g< ><
ZabcZ\=replace=>\g< named ><

/a(b)c/substitute_extended
ZabcZ\=replace=>${1:+ yes : no }
ZabcZ\=replace=>${1:+ \o{100} : \o{100} }
ZabcZ\=replace=>${1:+ \o{Z} : no }
ZabcZ\=replace=>${1:+ yes : \o{Z} }
ZabcZ\=replace=>${1:+ \g<1> : no }
ZabcZ\=replace=>${1:+ yes : \g<1> }
ZabcZ\=replace=>${1:+ \g<1 : no }
ZabcZ\=replace=>${1:+ yes : \g<1 }
ZabcZ\=replace=>${1:+ $<1> : no }
ZabcZ\=replace=>${1:+ yes : $<1> }
ZabcZ\=replace=>${1:+ $<1 : no }
ZabcZ\=replace=>${1:+ yes : $<1 }

/a(b)c/substitute_extended
ZabcZ\=replace=>${
ZabcZ\=replace=>${1
ZabcZ\=replace=>${1Z
ZabcZ\=replace=>${1;
ZabcZ\=replace=>$<
ZabcZ\=replace=>$<1
ZabcZ\=replace=>$<1Z
ZabcZ\=replace=>$<1;
ZabcZ\=replace=>\g<
ZabcZ\=replace=>\g<1
ZabcZ\=replace=>\g<1Z
ZabcZ\=replace=>\g<1;

"((?=(?(?=(?(?=(?(?=()))))))))"
a

Expand Down
Loading

0 comments on commit 6ef4ea8

Please sign in to comment.