Address the TODOs about resolving the value used in detail::token_with_value and transcoding the match in detail::token_with_string_view. See #202.
2026-01-19 04:22:13 +00:00 · 2024-12-08 17:16:15 -06:00
parent ff1059695d
commit c6d35e8791
4 changed files with 260 additions and 28 deletions
--- a/include/boost/parser/lexer.hpp
+++ b/include/boost/parser/lexer.hpp
@@ -264,7 +264,8 @@ namespace boost { namespace parser {
    {
        os << "[value: ";
        if (token.has_string_view()) {
-            os << '"' << token.get_string_view() << '"';
+            os << '"' << (token.get_string_view() | detail::text::as_utf8)
+               << '"';
        } else if (token.has_long_long()) {
            if (token.id() == character_id) {
                os << "'" << (char)token.get_long_long() << "'";
--- a/include/boost/parser/parser.hpp
+++ b/include/boost/parser/parser.hpp
@@ -7206,7 +7206,9 @@ namespace boost { namespace parser {
                return;
            }

-            auto const cps = make_subrange(expected_first_, expected_last_);
+            using char_type = detail::char_type_from_iter<Iter, Sentinel>;
+            auto const cps =
+                make_subrange<char_type>(expected_first_, expected_last_);

            if constexpr (detail::is_token_iter_v<Iter>) {
                if (!(*first).has_string_view()) {
@@ -7215,7 +7217,7 @@ namespace boost { namespace parser {
                }

                auto const sv = (*first).get_string_view();
-                auto token_cps = make_subrange(sv.begin(), sv.end());
+                auto token_cps = make_subrange<char_type>(sv.begin(), sv.end());
                auto const mismatch = detail::no_case_aware_string_mismatch(
                    token_cps.begin(),
                    token_cps.end(),
@@ -7254,13 +7256,11 @@ namespace boost { namespace parser {
            }
        }

-        template<typename I, typename S>
+        template<typename CharType, typename I, typename S>
        static auto make_subrange(I f, S l)
        {
            auto subrange = BOOST_PARSER_SUBRANGE(f, l);
-            if constexpr (std::is_same_v<
-                              detail::remove_cv_ref_t<decltype(*f)>,
-                              char>) {
+            if constexpr (std::is_same_v<CharType, char>) {
                return subrange;
            } else {
                return subrange | detail::text::as_utf32;
--- a/include/boost/parser/token_parser.hpp
+++ b/include/boost/parser/token_parser.hpp
@@ -38,33 +38,65 @@ namespace boost { namespace parser {
            }
        }

-        // TODO: This needs to use resolve(value_), and we need a test for
-        // that.
-        template<typename T>
+        template<typename Expected>
        struct token_with_value
        {
-            static_assert(std::integral<T> || std::floating_point<T>);
-            explicit token_with_value(T value) : value_(value) {}
-            bool matches(T value) const { return value == value_; }
-            T value_;
+            explicit constexpr token_with_value(Expected value) :
+                expected_(value)
+            {}
+
+            template<typename T, typename Context>
+            bool matches(T value, Context const & context) const
+            {
+                return value == detail::resolve(context, expected_);
+            }
+
+            Expected expected_;
        };

        template<typename Subrange>
        struct token_with_string_view
        {
-            explicit token_with_string_view(Subrange value) : value_(value) {}
+            explicit constexpr token_with_string_view(Subrange subrange) :
+                subrange_(subrange)
+            {}

-            template<typename CharType>
-            bool matches(std::basic_string_view<CharType> value) const
+            template<typename CharType, typename Context>
+            bool matches(
+                std::basic_string_view<CharType> value, Context const &) const
            {
-                // TODO: this is wrong.  We need to transcode both sides to
-                // UTF-32, when !same_as<CharType, char>.  (Need to write some
-                // tests, and evaluate whether this is a good idea.  If not,
-                // go change the docs on token_parser).
-                return std::ranges::equal(value, value_);
+                auto const value_cps =
+                    make_subrange<CharType>(value.begin(), value.end());
+                auto const subrange_cps =
+                    make_subrange<CharType>(subrange_.begin(), subrange_.end());
+                return std::ranges::equal(
+                    value_cps, subrange_cps, [](auto a, auto b) {
+                        return cast_char(a) == cast_char(b);
+                    });
            }

-            Subrange value_;
+            template<typename T>
+            static auto cast_char(T c)
+            {
+                if constexpr (std::same_as<T, char>) {
+                    return (unsigned char)c;
+                } else {
+                    return c;
+                }
+            }
+
+            template<typename CharType, typename I, typename S>
+            static auto make_subrange(I f, S l)
+            {
+                auto subrange = BOOST_PARSER_SUBRANGE(f, l);
+                if constexpr (std::is_same_v<CharType, char>) {
+                    return subrange;
+                } else {
+                    return subrange | detail::text::as_utf32;
+                }
+            }
+
+            Subrange subrange_;
        };
    }

@@ -142,7 +174,7 @@ namespace boost { namespace parser {
            if (use_expected || detail::gen_attrs(flags)) {
                auto opt_attr = detail::token_as<attribute_type<Iter>>(x);
                if constexpr (use_expected) {
-                    if (!opt_attr || !expected_.matches(*opt_attr)) {
+                    if (!opt_attr || !expected_.matches(*opt_attr, context)) {
                        success = false;
                        return;
                    }
@@ -157,7 +189,7 @@ namespace boost { namespace parser {
        /** Returns a `parser_interface` containing a `token_parser` that
            matches `value`. */
        template<typename T>
-            requires std::is_integral_v<T> || std::is_floating_point_v<T>
+            requires(!parsable_range_like<T>)
        constexpr auto operator()(T value) const noexcept
        {
            BOOST_PARSER_ASSERT(
@@ -191,12 +223,26 @@ namespace boost { namespace parser {
                 "token_spec, like 'token_spec(char-set)(char-set)'.  Quit "
                 "it!'"));
            auto expected =
-                detail::token_with_string_view{BOOST_PARSER_SUBRANGE(
-                    std::ranges::begin(r), std::ranges::end(r))};
+                detail::token_with_string_view{make_expected_range((R &&)r)};
            return parser_interface(
                token_parser<token_spec, decltype(expected)>(expected));
        }

+        template<typename R>
+        static constexpr auto make_expected_range(R && r)
+        {
+            using T = detail::remove_cv_ref_t<R>;
+            if constexpr (std::is_bounded_array_v<T>) {
+                constexpr auto n = std::extent_v<T>;
+                auto const offset = n && !r[n - 1] ? 1 : 0;
+                return BOOST_PARSER_SUBRANGE(
+                    std::ranges::begin(r), std::ranges::end(r) - offset);
+            } else {
+                return BOOST_PARSER_SUBRANGE(
+                    std::ranges::begin(r), std::ranges::end(r));
+            }
+        }
+
        // TODO: Consider adding a special string_view-like type that can be
        // passed to the range overload above.  It would be based on
        // adobe::name_t.  When comparing it to a tokens' string_view, if it
--- a/test/lexer_and_parser_terminals.cpp
+++ b/test/lexer_and_parser_terminals.cpp
@@ -13,10 +13,195 @@
 namespace bp = boost::parser;

 constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
-constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+constexpr auto identifier = bp::token_spec<"\\p{L}+", 1>;

 int main()
 {
+    // token_parser
+    {
+        {
+            constexpr auto lexer = bp::lexer<char, int> | true_false |
+                                   identifier |
+                                   bp::token_chars<'=', ';', '#', '$', '%'>;
+            {
+                constexpr auto parser = true_false;
+                BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = true_false(false);
+                BOOST_TEST(!bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                bool b = true;
+                auto get_bool = [](auto & ctx) { return _globals(ctx); };
+                auto parser = bp::with_globals(true_false(get_bool), b);
+                BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+        }
+        {
+            constexpr auto lexer = bp::lexer<char, int> | true_false |
+                                   identifier |
+                                   bp::token_chars<'=', ';', '#', '$', '%'>;
+            {
+                constexpr auto parser = identifier;
+                BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = identifier("func");
+                BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = identifier("f\xC3\xBCnc");
+                constexpr auto parser_u8 = identifier(u8"fünc");
+                constexpr auto parser_u16 = identifier(u"fünc");
+                constexpr auto parser_u32 = identifier(U"fünc");
+
+                constexpr auto lexer_u8 =
+                    bp::lexer<char8_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+                constexpr auto lexer_u16 =
+                    bp::lexer<char16_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+                constexpr auto lexer_u32 =
+                    bp::lexer<char32_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+
+                // There appears to be a bug in CTRE, related to the use of
+                // char8_t.  It produces bad tokens, including appearances of
+                // the replacement character.  The exact same input here
+                // results in good tokens for all cases except char8_t input.
+#if 0
+                std::cout << "char tokens:\n";
+                for (auto tok : "fünc" | bp::to_tokens(lexer)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char tokens:\n";
+                for (auto tok : "f\xC3\xBCnc" | bp::to_tokens(lexer)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                // BAD!
+                std::cout << "char8_t tokens:\n";
+                for (auto tok : u8"fünc" | bp::to_tokens(lexer_u8)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char16_t tokens:\n";
+                for (auto tok : u"fünc" | bp::to_tokens(lexer_u16)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char32_t tokens:\n";
+                for (auto tok : U"fünc" | bp::to_tokens(lexer_u32)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+#endif
+
+                // Range to match is sequence of char; no transcoding will be
+                // done.
+                BOOST_TEST(
+                    bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u8));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u8));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u8));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u8));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    !bp::parse(
+                        "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u16));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u16));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u16));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u16));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    !bp::parse(
+                        "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u32));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u32));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u32));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u32));
+
+                BOOST_TEST(!bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u8));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u16));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u32));
+
+                BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+        }
+    }
+
    // basic
    {
        constexpr auto lexer = bp::lexer<char, int> | true_false | identifier |