From af1e83833b080f9b8102301951b316f9f157587f Mon Sep 17 00:00:00 2001 From: dimop Date: Mon, 3 Nov 2014 15:20:06 +0100 Subject: [PATCH 1/4] Correct usage of unicode binaries in test_combinators.erl. --- test/test_combinators.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_combinators.erl b/test/test_combinators.erl index 47fdb6b..ad93f6a 100644 --- a/test/test_combinators.erl +++ b/test/test_combinators.erl @@ -82,7 +82,7 @@ anything_test_() -> charclass_test_() -> [ ?_assertEqual({<<"+">>,<<"----">>,{{line,1},{column,2}}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"+----">>,?STARTINDEX)), - ?_assertEqual({fail,{expected, {character_class, "[+]"}, ?STARTINDEX}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"----">>,?STARTINDEX)) + ?_assertEqual({fail,{expected, {character_class, "[+]"}, ?STARTINDEX}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"----">>,?STARTINDEX)), ]. regexp_test_() -> @@ -101,6 +101,6 @@ column_test() -> utf8_string_test_() -> [ - ?_assertEqual({<<"世界">>, <<"def">>,{{line,1},{column,3}}}, (neotoma_peg:p_string(<<"世界">>))(<<"世界def">>,?STARTINDEX)), - ?_assertEqual({fail,{expected, {string, <<"世界">>}, ?STARTINDEX}}, (neotoma_peg:p_string(<<"世界">>))(<<"界世abc">>,?STARTINDEX)) + ?_assertEqual({<<"世界"/utf8>>, <<"def">>,{{line,1},{column,3}}}, (neotoma_peg:p_string(<<"世界"/utf8>>))(<<"世界def"/utf8>>,?STARTINDEX)), + ?_assertEqual({fail,{expected, {string, <<"世界"/utf8>>}, ?STARTINDEX}}, (neotoma_peg:p_string(<<"世界"/utf8>>))(<<"界世abc"/utf8>>,?STARTINDEX)) ]. From 3ecb836f4dfe69b0433b834389c215b8ff25d4ae Mon Sep 17 00:00:00 2001 From: dimop Date: Mon, 3 Nov 2014 15:20:40 +0100 Subject: [PATCH 2/4] Added test showing that neotoma cannot handle unicode symbols in regexps. --- test/test_parse.erl | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/test_parse.erl b/test/test_parse.erl index 77181c2..f914e13 100644 --- a/test/test_parse.erl +++ b/test/test_parse.erl @@ -7,7 +7,7 @@ parser_test() -> file:write_file("test_parser.peg", io_lib:fwrite("~s\n", [Data])), neotoma:file("test_parser.peg"), compile:file("test_parser.erl", []), - try + try TestString = [19990,30028,32,102,111,111], Result = test_parser:parse(TestString), ?assertEqual(6, length(Result)), @@ -17,3 +17,17 @@ parser_test() -> _:_ -> ?assert(false) end. +unicode_parser_test() -> + Data = "rule <- [\\x{1}-\\x{D7FF}]+;", + file:write_file("test_unicode_parser.peg", io_lib:fwrite("~s\n", [Data])), + neotoma:file("test_unicode_parser.peg"), + compile:file("test_unicode_parser.erl", []), + try + TestString = [19990,30028,32,102,111,111], + Result = test_unicode_parser:parse(TestString), + ?assertEqual(6, length(Result)), + StringResult = lists:flatten(io_lib:format("~ts", [Result])), + ?assertEqual(TestString, StringResult) + catch + _:_ -> ?assert(false) + end. From 299a3c4c739a94f96a3714aaf3c8243811a1ee24 Mon Sep 17 00:00:00 2001 From: dimop Date: Mon, 3 Nov 2014 15:21:22 +0100 Subject: [PATCH 3/4] Handle unicode symbols in regexps correctly. --- priv/neotoma_parse.peg | 4 ++-- src/neotoma_parse.erl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/priv/neotoma_parse.peg b/priv/neotoma_parse.peg index 49d9fda..d1763ea 100644 --- a/priv/neotoma_parse.peg +++ b/priv/neotoma_parse.peg @@ -156,7 +156,7 @@ regexp_string <- '#' string:(!'#' ('\\#' / .))+ '#' % \ -> \\ % " -> \" re:replace(proplists:get_value(string, Node), "\"|\\\\", "\\\\&", [{return, binary}, global]), - "\">>)"] + "\"/utf8>>)"] `; quoted_string <- single_quoted_string / double_quoted_string @@ -176,7 +176,7 @@ character_class <- '[' characters:(!']' ('\\\\' . / !'\\\\' .))+ ']' used_combinator(p_charclass), ["p_charclass(<<\"[", escape_string(unicode:characters_to_list(proplists:get_value(characters, Node))), - "]\">>)"] + "]\"/utf8>>)"] `; anything_symbol <- '.' ` used_combinator(p_anything), <<"p_anything()">> `; diff --git a/src/neotoma_parse.erl b/src/neotoma_parse.erl index 1bffaf8..2d61d88 100644 --- a/src/neotoma_parse.erl +++ b/src/neotoma_parse.erl @@ -306,7 +306,7 @@ end % \ -> \\ % " -> \" re:replace(proplists:get_value(string, Node), "\"|\\\\", "\\\\&", [{return, binary}, global]), - "\">>)"] + "\"/utf8>>)"] end). -spec 'quoted_string'(input(), index()) -> parse_result(). @@ -332,7 +332,7 @@ end used_combinator(p_charclass), ["p_charclass(<<\"[", escape_string(unicode:characters_to_list(proplists:get_value(characters, Node))), - "]\">>)"] + "]\"/utf8>>)"] end). -spec 'anything_symbol'(input(), index()) -> parse_result(). From 1f5ead26725ff90afa6b91d3d1070c745318fb7e Mon Sep 17 00:00:00 2001 From: dimitarp Date: Sun, 9 Nov 2014 16:48:59 +0100 Subject: [PATCH 4/4] Fixed syntax error. --- test/test_combinators.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_combinators.erl b/test/test_combinators.erl index ad93f6a..1f6a189 100644 --- a/test/test_combinators.erl +++ b/test/test_combinators.erl @@ -82,7 +82,7 @@ anything_test_() -> charclass_test_() -> [ ?_assertEqual({<<"+">>,<<"----">>,{{line,1},{column,2}}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"+----">>,?STARTINDEX)), - ?_assertEqual({fail,{expected, {character_class, "[+]"}, ?STARTINDEX}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"----">>,?STARTINDEX)), + ?_assertEqual({fail,{expected, {character_class, "[+]"}, ?STARTINDEX}}, (neotoma_peg:p_charclass(<<"[+]">>))(<<"----">>,?STARTINDEX)) ]. regexp_test_() ->