diff --git a/ChangeLog b/ChangeLog index d080b2949..c8a246882 100644 --- a/ChangeLog +++ b/ChangeLog @@ -166,6 +166,8 @@ undefined behaviour. 44. Implement --group-separator and --no-group-separator for pcre2grep. +45. Fix \X matching in 32 bit mode without UTF in JIT. + Version 10.42 11-December-2022 ------------------------------ diff --git a/src/pcre2_jit_compile.c b/src/pcre2_jit_compile.c index 510c39213..8d64e1cfc 100644 --- a/src/pcre2_jit_compile.c +++ b/src/pcre2_jit_compile.c @@ -8718,7 +8718,7 @@ c = *cc++; #if PCRE2_CODE_UNIT_WIDTH == 32 if (c >= 0x110000) - return NULL; + return cc; #endif /* PCRE2_CODE_UNIT_WIDTH == 32 */ lgb = UCD_GRAPHBREAK(c); @@ -8958,7 +8958,7 @@ switch(type) #else sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS2(W, W, W), SLJIT_IMM, common->invalid_utf ? SLJIT_FUNC_ADDR(do_extuni_utf_invalid) : SLJIT_FUNC_ADDR(do_extuni_no_utf)); - if (!common->utf || common->invalid_utf) + if (common->invalid_utf) add_jump(compiler, backtracks, CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0)); #endif @@ -12044,7 +12044,7 @@ switch(opcode) } #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 - if (common->utf) + if (type == OP_EXTUNI || common->utf) { OP1(SLJIT_MOV, tmp_base, tmp_offset, STR_PTR, 0); detect_partial_match(common, &no_match); diff --git a/testdata/testinput12 b/testdata/testinput12 index 5a2d8d2c5..a6678bb1e 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -569,4 +569,8 @@ /\x{802a0000}*/ \x{802a0000}\x{802a0000} +# UTF matching without UTF, check invalid UTF characters +/\X++/ + a\x{110000}\x{ffffffff} + # End of testinput12 diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 9ac403e50..f3b40a35f 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1814,4 +1814,13 @@ No match Failed: error 134 at offset 11: character code point value in \x{} or \o{} is too large \x{802a0000}\x{802a0000} +# UTF matching without UTF, check invalid UTF characters +/\X++/ + a\x{110000}\x{ffffffff} +** Character \x{110000} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. +** Character \x{ffffffff} is greater than 0xffff and UTF-16 mode is not enabled. +** Truncation will probably give the wrong result. + 0: a\x00\x{ffff} + # End of testinput12 diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 9396305df..dd42f8685 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1812,4 +1812,9 @@ No match \x{802a0000}\x{802a0000} 0: \x{802a0000}\x{802a0000} +# UTF matching without UTF, check invalid UTF characters +/\X++/ + a\x{110000}\x{ffffffff} + 0: a\x{110000}\x{ffffffff} + # End of testinput12