From c6d35e87915e78f4d95177b5107cd8fb68c70643 Mon Sep 17 00:00:00 2001
From: Zach Laine <whatwasthataddress@gmail.com>
Date: Sun, 8 Dec 2024 17:16:15 -0600
Subject: [PATCH] Address TODOs about resolving the value used in
 detail::token_with_value, and doing transcoding for the match in
 detail::token_with_string_view.

See #202.
---
 include/boost/parser/lexer.hpp        |   3 +-
 include/boost/parser/parser.hpp       |  12 +-
 include/boost/parser/token_parser.hpp |  86 +++++++++---
 test/lexer_and_parser_terminals.cpp   | 187 +++++++++++++++++++++++++-
 4 files changed, 260 insertions(+), 28 deletions(-)
diff --git a/include/boost/parser/lexer.hpp b/include/boost/parser/lexer.hpp
index e6f7ee25..d653546b 100644
--- a/include/boost/parser/lexer.hpp
+++ b/include/boost/parser/lexer.hpp
@@ -264,7 +264,8 @@ namespace boost { namespace parser {
     {
         os << "[value: ";
         if (token.has_string_view()) {
-            os << '"' << token.get_string_view() << '"';
+            os << '"' << (token.get_string_view() | detail::text::as_utf8)
+               << '"';
         } else if (token.has_long_long()) {
             if (token.id() == character_id) {
                 os << "'" << (char)token.get_long_long() << "'";
diff --git a/include/boost/parser/parser.hpp b/include/boost/parser/parser.hpp
index a0441e29..685fba15 100644
--- a/include/boost/parser/parser.hpp
+++ b/include/boost/parser/parser.hpp
@@ -7206,7 +7206,9 @@ namespace boost { namespace parser {
                 return;
             }
 
-            auto const cps = make_subrange(expected_first_, expected_last_);
+            using char_type = detail::char_type_from_iter<Iter, Sentinel>;
+            auto const cps =
+                make_subrange<char_type>(expected_first_, expected_last_);
 
             if constexpr (detail::is_token_iter_v<Iter>) {
                 if (!(*first).has_string_view()) {
@@ -7215,7 +7217,7 @@ namespace boost { namespace parser {
                 }
 
                 auto const sv = (*first).get_string_view();
-                auto token_cps = make_subrange(sv.begin(), sv.end());
+                auto token_cps = make_subrange<char_type>(sv.begin(), sv.end());
                 auto const mismatch = detail::no_case_aware_string_mismatch(
                     token_cps.begin(),
                     token_cps.end(),
@@ -7254,13 +7256,11 @@ namespace boost { namespace parser {
             }
         }
 
-        template<typename I, typename S>
+        template<typename CharType, typename I, typename S>
         static auto make_subrange(I f, S l)
         {
             auto subrange = BOOST_PARSER_SUBRANGE(f, l);
-            if constexpr (std::is_same_v<
-                              detail::remove_cv_ref_t<decltype(*f)>,
-                              char>) {
+            if constexpr (std::is_same_v<CharType, char>) {
                 return subrange;
             } else {
                 return subrange | detail::text::as_utf32;
diff --git a/include/boost/parser/token_parser.hpp b/include/boost/parser/token_parser.hpp
index f426c3b8..8330c4d5 100644
--- a/include/boost/parser/token_parser.hpp
+++ b/include/boost/parser/token_parser.hpp
@@ -38,33 +38,65 @@ namespace boost { namespace parser {
             }
         }
 
-        // TODO: This needs to use resolve(value_), and we need a test for
-        // that.
-        template<typename T>
+        template<typename Expected>
         struct token_with_value
         {
-            static_assert(std::integral<T> || std::floating_point<T>);
-            explicit token_with_value(T value) : value_(value) {}
-            bool matches(T value) const { return value == value_; }
-            T value_;
+            explicit constexpr token_with_value(Expected value) :
+                expected_(value)
+            {}
+            // True iff `value` equals `expected_` as resolved with `context`.
+            template<typename T, typename Context>
+            bool matches(T value, Context const & context) const
+            {
+                return value == detail::resolve(context, expected_);
+            }
+            // Stored expectation; read here only through detail::resolve().
+            Expected expected_;
         };
 
         template<typename Subrange>
         struct token_with_string_view
         {
-            explicit token_with_string_view(Subrange value) : value_(value) {}
+            explicit constexpr token_with_string_view(Subrange subrange) :
+                subrange_(subrange)
+            {}
 
-            template<typename CharType>
-            bool matches(std::basic_string_view<CharType> value) const
+            // True iff `value` (a token's text) equals the expected range.
+            template<typename CharType, typename Context>
+            bool matches(
+                std::basic_string_view<CharType> value, Context const &) const
             {
-                // TODO: this is wrong.  We need to transcode both sides to
-                // UTF-32, when !same_as<CharType, char>.  (Need to write some
-                // tests, and evaluate whether this is a good idea.  If not,
-                // go change the docs on token_parser).
-                return std::ranges::equal(value, value_);
+                auto const token_text =
+                    make_subrange<CharType>(value.begin(), value.end());
+                auto const expected_text =
+                    make_subrange<CharType>(subrange_.begin(), subrange_.end());
+                auto const same_char = [](auto lhs, auto rhs) {
+                    return cast_char(lhs) == cast_char(rhs);
+                };
+                return std::ranges::equal(token_text, expected_text, same_char);
             }
 
-            Subrange value_;
+            template<typename T>
+            static auto cast_char(T ch)
+            {
+                if constexpr (std::same_as<T, char>)
+                    return (unsigned char)ch; // keeps high bytes non-negative
+                else
+                    return ch;
+            }
+
+            // Wraps [first, last); applies as_utf32 unless CharType is char.
+            template<typename CharType, typename I, typename S>
+            static auto make_subrange(I first, S last)
+            {
+                auto units = BOOST_PARSER_SUBRANGE(first, last);
+                if constexpr (std::is_same_v<CharType, char>)
+                    return units;
+                else
+                    return units | detail::text::as_utf32;
+            }
+
+            Subrange subrange_;
         };
     }
 
@@ -142,7 +174,7 @@ namespace boost { namespace parser {
             if (use_expected || detail::gen_attrs(flags)) {
                 auto opt_attr = detail::token_as<attribute_type<Iter>>(x);
                 if constexpr (use_expected) {
-                    if (!opt_attr || !expected_.matches(*opt_attr)) {
+                    if (!opt_attr || !expected_.matches(*opt_attr, context)) {
                         success = false;
                         return;
                     }
@@ -157,7 +189,7 @@ namespace boost { namespace parser {
         /** Returns a `parser_interface` containing a `token_parser` that
             matches `value`. */
         template<typename T>
-            requires std::is_integral_v<T> || std::is_floating_point_v<T>
+            requires(!parsable_range_like<T>)
         constexpr auto operator()(T value) const noexcept
         {
             BOOST_PARSER_ASSERT(
@@ -191,12 +223,26 @@ namespace boost { namespace parser {
                  "token_spec, like 'token_spec(char-set)(char-set)'.  Quit "
                  "it!'"));
             auto expected =
-                detail::token_with_string_view{BOOST_PARSER_SUBRANGE(
-                    std::ranges::begin(r), std::ranges::end(r))};
+                detail::token_with_string_view{make_expected_range((R &&)r)};
             return parser_interface(
                 token_parser<token_spec, decltype(expected)>(expected));
         }
 
+        // Returns the subrange of `r` that a token's text must equal.  When
+        // `r` is a bounded array whose last element is null (the shape of a
+        // string literal), that trailing null is excluded.
+        template<typename R>
+        static constexpr auto make_expected_range(R && r)
+        {
+            auto first = std::ranges::begin(r);
+            auto last = std::ranges::end(r);
+            if constexpr (std::is_bounded_array_v<detail::remove_cv_ref_t<R>>) {
+                if (first != last && !*(last - 1))
+                    --last;
+            }
+            return BOOST_PARSER_SUBRANGE(first, last);
+        }
+
         // TODO: Consider adding a special string_view-like type that can be
         // passed to the range overload above.  It would be based on
         // adobe::name_t.  When comparing it to a tokens' string_view, if it
diff --git a/test/lexer_and_parser_terminals.cpp b/test/lexer_and_parser_terminals.cpp
index 84c2e7bd..d29398f3 100644
--- a/test/lexer_and_parser_terminals.cpp
+++ b/test/lexer_and_parser_terminals.cpp
@@ -13,10 +13,195 @@
 namespace bp = boost::parser;
 
 constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
-constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+constexpr auto identifier = bp::token_spec<"\\p{L}+", 1>;
 
 int main()
 {
+    // token_parser
+    {
+        {
+            constexpr auto lexer = bp::lexer<char, int> | true_false |
+                                   identifier |
+                                   bp::token_chars<'=', ';', '#', '$', '%'>;
+            {
+                constexpr auto parser = true_false;
+                BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = true_false(false);
+                BOOST_TEST(!bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                bool b = true;
+                auto get_bool = [](auto & ctx) { return _globals(ctx); };
+                auto parser = bp::with_globals(true_false(get_bool), b);
+                BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("false" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+        }
+        {
+            constexpr auto lexer = bp::lexer<char, int> | true_false |
+                                   identifier |
+                                   bp::token_chars<'=', ';', '#', '$', '%'>;
+            {
+                constexpr auto parser = identifier;
+                BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = identifier("func");
+                BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+            {
+                constexpr auto parser = identifier("f\xC3\xBCnc");
+                constexpr auto parser_u8 = identifier(u8"fünc");
+                constexpr auto parser_u16 = identifier(u"fünc");
+                constexpr auto parser_u32 = identifier(U"fünc");
+
+                constexpr auto lexer_u8 =
+                    bp::lexer<char8_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+                constexpr auto lexer_u16 =
+                    bp::lexer<char16_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+                constexpr auto lexer_u32 =
+                    bp::lexer<char32_t, int> | true_false | identifier |
+                    bp::token_chars<'=', ';', '#', '$', '%'>;
+
+                // There appears to be a bug in CTRE, related to the use of
+                // char8_t.  It produces bad tokens, including appearances of
+                // the replacement character.  The exact same input here
+                // results in good tokens for all cases except char8_t input.
+#if 0
+                std::cout << "char tokens:\n";
+                for (auto tok : "fünc" | bp::to_tokens(lexer)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char tokens:\n";
+                for (auto tok : "f\xC3\xBCnc" | bp::to_tokens(lexer)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                // BAD!
+                std::cout << "char8_t tokens:\n";
+                for (auto tok : u8"fünc" | bp::to_tokens(lexer_u8)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char16_t tokens:\n";
+                for (auto tok : u"fünc" | bp::to_tokens(lexer_u16)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+
+                std::cout << "char32_t tokens:\n";
+                for (auto tok : U"fünc" | bp::to_tokens(lexer_u32)) {
+                    std::cout << "tok=" << tok << "\n";
+                }
+                std::cout << "\n\n";
+#endif
+
+                // Input being tokenized is a sequence of char; no transcoding
+                // will be done for this first check.
+                BOOST_TEST(
+                    bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u8));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u8));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u8));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u8));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    !bp::parse(
+                        "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u16));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u16));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u16));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u16));
+
+                BOOST_TEST( // char input; no transcoding on this one.
+                    !bp::parse(
+                        "f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u32));
+#if 0
+                BOOST_TEST(
+                    bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u32));
+#endif
+                BOOST_TEST(
+                    bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u32));
+                BOOST_TEST(
+                    bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u32));
+
+                BOOST_TEST(!bp::parse("func" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u8));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u8));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u16));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u16));
+
+                BOOST_TEST(
+                    !bp::parse("func" | bp::to_tokens(lexer), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u32));
+                BOOST_TEST(
+                    !bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u32));
+
+                BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
+                BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
+            }
+        }
+    }
+
     // basic
     {
         constexpr auto lexer = bp::lexer<char, int> | true_false | identifier |