From c6d35e87915e78f4d95177b5107cd8fb68c70643 Mon Sep 17 00:00:00 2001 From: Zach Laine Date: Sun, 8 Dec 2024 17:16:15 -0600 Subject: [PATCH] Address TODOs about resolving the value used in detail::token_with_value, and doing transcoding for the match in detail::token_with_string_view. See #202. --- include/boost/parser/lexer.hpp | 3 +- include/boost/parser/parser.hpp | 12 +- include/boost/parser/token_parser.hpp | 86 +++++++++--- test/lexer_and_parser_terminals.cpp | 187 +++++++++++++++++++++++++- 4 files changed, 260 insertions(+), 28 deletions(-) diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp index e6f7ee25..d653546b 100644 --- a/include/boost/parser/lexer.hpp +++ b/include/boost/parser/lexer.hpp @@ -264,7 +264,8 @@ namespace boost { namespace parser { { os << "[value: "; if (token.has_string_view()) { - os << '"' << token.get_string_view() << '"'; + os << '"' << (token.get_string_view() | detail::text::as_utf8) + << '"'; } else if (token.has_long_long()) { if (token.id() == character_id) { os << "'" << (char)token.get_long_long() << "'"; diff --git a/include/boost/parser/parser.hpp b/include/boost/parser/parser.hpp index a0441e29..685fba15 100644 --- a/include/boost/parser/parser.hpp +++ b/include/boost/parser/parser.hpp @@ -7206,7 +7206,9 @@ namespace boost { namespace parser { return; } - auto const cps = make_subrange(expected_first_, expected_last_); + using char_type = detail::char_type_from_iter; + auto const cps = + make_subrange(expected_first_, expected_last_); if constexpr (detail::is_token_iter_v) { if (!(*first).has_string_view()) { @@ -7215,7 +7217,7 @@ namespace boost { namespace parser { } auto const sv = (*first).get_string_view(); - auto token_cps = make_subrange(sv.begin(), sv.end()); + auto token_cps = make_subrange(sv.begin(), sv.end()); auto const mismatch = detail::no_case_aware_string_mismatch( token_cps.begin(), token_cps.end(), @@ -7254,13 +7256,11 @@ namespace boost { namespace parser { } } - template + template static auto make_subrange(I f, S l) { auto subrange = BOOST_PARSER_SUBRANGE(f, l); - if constexpr (std::is_same_v< - detail::remove_cv_ref_t, - char>) { + if constexpr (std::is_same_v) { return subrange; } else { return subrange | detail::text::as_utf32; diff --git a/include/boost/parser/token_parser.hpp b/include/boost/parser/token_parser.hpp index f426c3b8..8330c4d5 100644 --- a/include/boost/parser/token_parser.hpp +++ b/include/boost/parser/token_parser.hpp @@ -38,33 +38,65 @@ namespace boost { namespace parser { } } - // TODO: This needs to use resolve(value_), and we need a test for - // that. - template + template struct token_with_value { - static_assert(std::integral || std::floating_point); - explicit token_with_value(T value) : value_(value) {} - bool matches(T value) const { return value == value_; } - T value_; + explicit constexpr token_with_value(Expected value) : + expected_(value) + {} + + template + bool matches(T value, Context const & context) const + { + return value == detail::resolve(context, expected_); + } + + Expected expected_; }; template struct token_with_string_view { - explicit token_with_string_view(Subrange value) : value_(value) {} + explicit constexpr token_with_string_view(Subrange subrange) : + subrange_(subrange) + {} - template - bool matches(std::basic_string_view value) const + template + bool matches( + std::basic_string_view value, Context const &) const { - // TODO: this is wrong. We need to transcode both sides to - // UTF-32, when !same_as. (Need to write some - // tests, and evaluate whether this is a good idea. If not, - // go change the docs on token_parser). - return std::ranges::equal(value, value_); + auto const value_cps = + make_subrange(value.begin(), value.end()); + auto const subrange_cps = + make_subrange(subrange_.begin(), subrange_.end()); + return std::ranges::equal( + value_cps, subrange_cps, [](auto a, auto b) { + return cast_char(a) == cast_char(b); + }); } - Subrange value_; + template + static auto cast_char(T c) + { + if constexpr (std::same_as) { + return (unsigned char)c; + } else { + return c; + } + } + + template + static auto make_subrange(I f, S l) + { + auto subrange = BOOST_PARSER_SUBRANGE(f, l); + if constexpr (std::is_same_v) { + return subrange; + } else { + return subrange | detail::text::as_utf32; + } + } + + Subrange subrange_; }; } @@ -142,7 +174,7 @@ namespace boost { namespace parser { if (use_expected || detail::gen_attrs(flags)) { auto opt_attr = detail::token_as>(x); if constexpr (use_expected) { - if (!opt_attr || !expected_.matches(*opt_attr)) { + if (!opt_attr || !expected_.matches(*opt_attr, context)) { success = false; return; } @@ -157,7 +189,7 @@ namespace boost { namespace parser { /** Returns a `parser_interface` containing a `token_parser` that matches `value`. */ template - requires std::is_integral_v || std::is_floating_point_v + requires(!parsable_range_like) constexpr auto operator()(T value) const noexcept { BOOST_PARSER_ASSERT( @@ -191,12 +223,26 @@ namespace boost { namespace parser { "token_spec, like 'token_spec(char-set)(char-set)'. Quit " "it!'")); auto expected = - detail::token_with_string_view{BOOST_PARSER_SUBRANGE( - std::ranges::begin(r), std::ranges::end(r))}; + detail::token_with_string_view{make_expected_range((R &&)r)}; return parser_interface( token_parser(expected)); } + template + static constexpr auto make_expected_range(R && r) + { + using T = detail::remove_cv_ref_t; + if constexpr (std::is_bounded_array_v) { + constexpr auto n = std::extent_v; + auto const offset = n && !r[n - 1] ? 1 : 0; + return BOOST_PARSER_SUBRANGE( + std::ranges::begin(r), std::ranges::end(r) - offset); + } else { + return BOOST_PARSER_SUBRANGE( + std::ranges::begin(r), std::ranges::end(r)); + } + } + // TODO: Consider adding a special string_view-like type that can be // passed to the range overload above. It would be based on // adobe::name_t. When comparing it to a tokens' string_view, if it diff --git a/test/lexer_and_parser_terminals.cpp b/test/lexer_and_parser_terminals.cpp index 84c2e7bd..d29398f3 100644 --- a/test/lexer_and_parser_terminals.cpp +++ b/test/lexer_and_parser_terminals.cpp @@ -13,10 +13,195 @@ namespace bp = boost::parser; constexpr auto true_false = bp::token_spec<"true|false", 0, bool>; -constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>; +constexpr auto identifier = bp::token_spec<"\\p{L}+", 1>; int main() { + // token_parser + { + { + constexpr auto lexer = bp::lexer | true_false | + identifier | + bp::token_chars<'=', ';', '#', '$', '%'>; + { + constexpr auto parser = true_false; + BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser)); + BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + { + constexpr auto parser = true_false(false); + BOOST_TEST(!bp::parse("true" | bp::to_tokens(lexer), parser)); + BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + { + bool b = true; + auto get_bool = [](auto & ctx) { return _globals(ctx); }; + auto parser = bp::with_globals(true_false(get_bool), b); + BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("false" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + } + { + constexpr auto lexer = bp::lexer | true_false | + identifier | + bp::token_chars<'=', ';', '#', '$', '%'>; + { + constexpr auto parser = identifier; + BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser)); + BOOST_TEST(bp::parse("foo" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + { + constexpr auto parser = identifier("func"); + BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + { + constexpr auto parser = identifier("f\xC3\xBCnc"); + constexpr auto parser_u8 = identifier(u8"fünc"); + constexpr auto parser_u16 = identifier(u"fünc"); + constexpr auto parser_u32 = identifier(U"fünc"); + + constexpr auto lexer_u8 = + bp::lexer | true_false | identifier | + bp::token_chars<'=', ';', '#', '$', '%'>; + constexpr auto lexer_u16 = + bp::lexer | true_false | identifier | + bp::token_chars<'=', ';', '#', '$', '%'>; + constexpr auto lexer_u32 = + bp::lexer | true_false | identifier | + bp::token_chars<'=', ';', '#', '$', '%'>; + + // There appears to be a bug in CTRE, related to the use of + // char8_t. It produces bad tokens, including appearances of + // the replacement character. The exact same input here + // results in good tokens for all cases except char8_t input. +#if 0 + std::cout << "char tokens:\n"; + for (auto tok : "fünc" | bp::to_tokens(lexer)) { + std::cout << "tok=" << tok << "\n"; + } + std::cout << "\n\n"; + + std::cout << "char tokens:\n"; + for (auto tok : "f\xC3\xBCnc" | bp::to_tokens(lexer)) { + std::cout << "tok=" << tok << "\n"; + } + std::cout << "\n\n"; + + // BAD! + std::cout << "char8_t tokens:\n"; + for (auto tok : u8"fünc" | bp::to_tokens(lexer_u8)) { + std::cout << "tok=" << tok << "\n"; + } + std::cout << "\n\n"; + + std::cout << "char16_t tokens:\n"; + for (auto tok : u"fünc" | bp::to_tokens(lexer_u16)) { + std::cout << "tok=" << tok << "\n"; + } + std::cout << "\n\n"; + + std::cout << "char32_t tokens:\n"; + for (auto tok : U"fünc" | bp::to_tokens(lexer_u32)) { + std::cout << "tok=" << tok << "\n"; + } + std::cout << "\n\n"; +#endif + + // Range to match is sequence of char; no transcoding will be + // done. + BOOST_TEST( + bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser)); +#if 0 + BOOST_TEST( + bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser)); +#endif + BOOST_TEST( + bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser)); + BOOST_TEST( + bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser)); + + BOOST_TEST( // char input; no transcoding on this one. + bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u8)); +#if 0 + BOOST_TEST( + bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u8)); +#endif + BOOST_TEST( + bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u8)); + BOOST_TEST( + bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u8)); + + BOOST_TEST( // char input; no transcoding on this one. + !bp::parse( + "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u16)); +#if 0 + BOOST_TEST( + bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u16)); +#endif + BOOST_TEST( + bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u16)); + BOOST_TEST( + bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u16)); + + BOOST_TEST( // char input; no transcoding on this one. + !bp::parse( + "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u32)); +#if 0 + BOOST_TEST( + bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u32)); +#endif + BOOST_TEST( + bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u32)); + BOOST_TEST( + bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u32)); + + BOOST_TEST(!bp::parse("func" | bp::to_tokens(lexer), parser)); + BOOST_TEST( + !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser)); + BOOST_TEST( + !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser)); + BOOST_TEST( + !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser)); + + BOOST_TEST( + !bp::parse("func" | bp::to_tokens(lexer), parser_u8)); + BOOST_TEST( + !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u8)); + BOOST_TEST( + !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u8)); + BOOST_TEST( + !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u8)); + + BOOST_TEST( + !bp::parse("func" | bp::to_tokens(lexer), parser_u16)); + BOOST_TEST( + !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u16)); + BOOST_TEST( + !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u16)); + BOOST_TEST( + !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u16)); + + BOOST_TEST( + !bp::parse("func" | bp::to_tokens(lexer), parser_u32)); + BOOST_TEST( + !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u32)); + BOOST_TEST( + !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u32)); + BOOST_TEST( + !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u32)); + + BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser)); + BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser)); + } + } + } + // basic { constexpr auto lexer = bp::lexer | true_false | identifier |