Add unicode symbols parser (#213)

* Add symb parser to handle Unicode symbols
* Add documentation for symb
* Add tests for symb
* Fix typo in the documentation

---------

Contributed by: Antoine Fontaine <antoinefontaine@posteo.net>
2026-02-03 09:22:14 +00:00 · 2025-02-21 06:51:17 +01:00
parent 0a34acc42a
commit b253d9ca53
9 changed files with 1203 additions and 1 deletions
--- a/doc/parser.qbk
+++ b/doc/parser.qbk
@@ -218,6 +218,7 @@
 [def _control_             [globalref boost::parser::control `control`]]
 [def _digit_               [globalref boost::parser::digit `digit`]]
 [def _punct_               [globalref boost::parser::punct `punct`]]
+[def _symb_                [globalref boost::parser::symb `symb`]]
 [def _hex_digit_           [globalref boost::parser::hex_digit `hex_digit`]]
 [def _lower_               [globalref boost::parser::lower `lower`]]
 [def _upper_               [globalref boost::parser::upper `upper`]]
--- a/doc/tables.qbk
+++ b/doc/tables.qbk
@@ -132,6 +132,11 @@ the input they match unless otherwise stated in the table below.]
     [ The code point type in Unicode parsing, or `char` in non-Unicode parsing.  See the entry for _ch_. ]
     []]

+    [[ `_symb_` ]
+     [ Matches a single symbol code point. ]
+     [ The code point type in Unicode parsing, or `char` in non-Unicode parsing.  See the entry for _ch_. ]
+     []]
+
    [[ `_hex_digit_` ]
     [ Matches a single hexidecimal digit code point. ]
     [ The code point type in Unicode parsing, or `char` in non-Unicode parsing.  See the entry for _ch_. ]
--- a/include/boost/parser/detail/printing.hpp
+++ b/include/boost/parser/detail/printing.hpp
@@ -245,6 +245,13 @@ namespace boost { namespace parser { namespace detail {
        std::ostream & os,
        int components = 0);

+    template<typename Context>
+    void print_parser(
+        Context const & context,
+        char_set_parser<symb_chars> const & parser,
+        std::ostream & os,
+        int components = 0); // declaration only; defined in printing_impl.hpp
+
    template<typename Context>
    void print_parser(
        Context const & context,
--- a/include/boost/parser/detail/printing_impl.hpp
+++ b/include/boost/parser/detail/printing_impl.hpp
@@ -636,6 +636,16 @@ namespace boost { namespace parser { namespace detail {
        os << "punct";
    }

+    template<typename Context>
+    void print_parser(
+        Context const & context, // not used by this overload
+        char_set_parser<symb_chars> const & parser, // not read; only its type selects this overload
+        std::ostream & os,
+        int components) // not used by this overload
+    {
+        os << "symb"; // same spelling as the public `symb` parser object
+    }
+
    template<typename Context>
    void print_parser(
        Context const & context,
--- a/include/boost/parser/detail/unicode_char_sets.hpp
+++ b/include/boost/parser/detail/unicode_char_sets.hpp
--- a/include/boost/parser/parser.hpp
+++ b/include/boost/parser/parser.hpp
@@ -7811,12 +7811,18 @@ namespace boost { namespace parser {
        control;

    /** The punctuation character parser.  Matches the full set of Unicode
-        punctuation clases (specifically, "Pc", "Pd", "Pe", "Pf", "Pi", "Ps",
+        punctuation classes (specifically, "Pc", "Pd", "Pe", "Pf", "Pi", "Ps",
        and "Po"). */
    inline BOOST_PARSER_ALGO_CONSTEXPR
        parser_interface<char_set_parser<detail::punct_chars>>
            punct;

+    /** The symbol character parser.  Matches the full set of Unicode
+        symbol classes (specifically, "Sc", "Sk", "Sm", and "So"). */
+    inline BOOST_PARSER_ALGO_CONSTEXPR
+        parser_interface<char_set_parser<detail::symb_chars>>
+            symb;
+
    /** The lower case character parser.  Matches the full set of Unicode
        lower case code points (class "Ll"). */
    inline BOOST_PARSER_ALGO_CONSTEXPR
--- a/include/boost/parser/parser_fwd.hpp
+++ b/include/boost/parser/parser_fwd.hpp
@@ -143,6 +143,8 @@ namespace boost { namespace parser {

        struct punct_chars
        {};
+        struct symb_chars // empty tag type; used as the template argument of char_set_parser<> and char_set<>
+        {};
        struct lower_case_chars
        {};
        struct upper_case_chars
--- a/test/github_issues.cpp
+++ b/test/github_issues.cpp
@@ -245,6 +245,10 @@ void github_issue_209()
        std::begin(bp::detail::char_set<detail::punct_chars>::chars),
        std::end(bp::detail::char_set<detail::punct_chars>::chars)));

+    BOOST_TEST(std::is_sorted(
+        std::begin(bp::detail::char_set<detail::symb_chars>::chars),
+        std::end(bp::detail::char_set<detail::symb_chars>::chars)));
+
    BOOST_TEST(std::is_sorted(
        std::begin(bp::detail::char_set<detail::lower_case_chars>::chars),
        std::end(bp::detail::char_set<detail::lower_case_chars>::chars)));
--- a/test/parser.cpp
+++ b/test/parser.cpp
@@ -2753,6 +2753,16 @@ int main()
        BOOST_TEST(result == std::vector<uint32_t>({0x21, 0xfda}));
    }

+    // symb_ -- one-or-more `symb`, with `char_ - symb` passed as the skip parser
+    {
+        auto parser = +symb;
+
+        std::u32string str = U"$^\u20AC!\u2194\u220F\U0001D7C6b\u2280\U0001FACE\U0001039F";
+        std::vector<uint32_t> result;
+        BOOST_TEST(parse(str, parser, char_ - symb, result)); // the skipper consumes every code point that `symb` rejects
+        BOOST_TEST(result == std::vector<uint32_t>({U'$', U'^', 0x20AC, 0x2194, 0x220F, 0x2280, 0x1FACE})); // '!', U+1D7C6, 'b' and U+1039F from `str` are expected to be absent
+    }
+
    // lower_
    {
        auto parser = +lower;