
Complete initial pass on token parsing documentation.

See #202.
Zach Laine
2024-12-07 23:50:16 -06:00
parent 9a958224e4
commit ff1059695d
6 changed files with 123 additions and 61 deletions


@@ -694,97 +694,97 @@ the input they match unless otherwise stated in the table below.]
[[ _ch_ ]
[ Matches any single code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0)` ]
[ Matches exactly the code point `_RES_np_(arg0)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0, arg1)` ]
[ Matches the next code point `n` in the input, if `_RES_np_(arg0) <= n && n <= _RES_np_(arg1)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(r)` ]
[ Matches the next code point `n` in the input, if `n` is one of the code points in `r`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. ]]
+[ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ _cp_ ]
[ Matches a single code point. ]
[ `char32_t` ]
-[ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. ]]
+[ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Only matches tokens with the ID _ch_id_. ]]
[[ _cu_ ]
[ Matches a single code point. ]
[ `char` ]
-[ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser match at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. ]]
+[ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser match at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. Only matches tokens with the ID _ch_id_. ]]
[[ `_blank_` ]
[ Equivalent to `_ws_ - _eol_`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_control_` ]
[ Matches a single control-character code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_digit_` ]
[ Matches a single decimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_punct_` ]
[ Matches a single punctuation code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_hex_digit_` ]
[ Matches a single hexadecimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_lower_` ]
[ Matches a single lower-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_upper_` ]
[ Matches a single upper-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(c)`]
[ Matches exactly the given code point `c`. ]
[ None. ]
-[_lit_ does *not* take parse arguments. ]]
+[_lit_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ `c_l` ]
[ Matches exactly the given code point `c`. ]
[ None. ]
-[ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. ]]
+[ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(r)`]
[ Matches exactly the given string `r`. ]
[ None. ]
-[ _lit_ does *not* take parse arguments. ]]
+[ _lit_ does *not* take parse arguments. _lit_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_l` ]
[ Matches exactly the given string `str`. ]
[ None. ]
-[ This is a _udl_ that represents `_lit_np_(s)`, for example `"a string"_l`. ]]
+[ This is a _udl_ that represents `_lit_np_(s)`, for example `"a string"_l`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_str_np_(r)`]
[ Matches exactly `r`, and generates the match as an attribute. ]
[ _std_str_ ]
-[ _str_ does *not* take parse arguments. ]]
+[ _str_ does *not* take parse arguments. _str_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_p`]
[ Matches exactly `str`, and generates the match as an attribute. ]
[ _std_str_ ]
-[ This is a _udl_ that represents `_str_np_(s)`, for example `"a string"_p`. ]]
+[ This is a _udl_ that represents `_str_np_(s)`, for example `"a string"_p`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_rpt_np_(arg0)[p]` ]
[ Matches iff `p` matches exactly `_RES_np_(arg0)` times. ]


@@ -3665,7 +3665,7 @@ Some things to be aware of when looking at _Parser_ trace output:
[endsect]
-[section Using a Lexer / Token Parsing]
+[section Token parsing / Using a Lexer]
_Parser_ has optional support for lexing before parsing. The optional support
is based on an external dependency, _ctre_. _ctre_ produces a sequence of
@@ -3702,7 +3702,7 @@ an ID of `0`, etc. _lex_ takes two template parameters. The first parameter
indicates that the value type of the parsed input sequence is `char`. The
second one indicates that the ID-type of all subsequent _tok_specs_ will be
`int`. We create a full lexer by starting with the `lexer<...>` expression,
-follwed by a piped-together sequence of _tok_specs_.
+followed by a piped-together sequence of _tok_specs_.
The final lexer `lexer` has a combined regex string, `"(foo)|(b.*r)|(b.+z)"`.
This string is built up at compile time, and is represented by an _nttp_. It
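For illustration, a lexer along those lines could be written as below. This is only a sketch inferred from the combined regex shown above; the spec names `foo`, `bar`, and `baz` are assumptions, not names taken from the example code.

    constexpr auto foo = bp::token_spec<"foo", 0>;
    constexpr auto bar = bp::token_spec<"b.*r", 1>;
    constexpr auto baz = bp::token_spec<"b.+z", 2>;
    // Piping the specs onto bp::lexer<...> builds the combined regex
    // "(foo)|(b.*r)|(b.+z)" at compile time.
    constexpr auto lexer = bp::lexer<char, int> | foo | bar | baz;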
@@ -3734,7 +3734,7 @@ Next, you define a parser.
[tokens_basics_parser]
-This has the same semantics as the characater parsers you've seen in the rest
+This has the same semantics as the character parsers you've seen in the rest
of the documentation. Each _tok_spec_ has the same interface as a parser, so
it can be used with all the parser combining operations, like `operator>>`.
However, unlike when doing character parsing, when token parsing all the
@@ -3772,7 +3772,7 @@ _lex_ has one built in; it uses `"\\s+"` by default. Whitespace is matched,
but produces no tokens. If you want to change the whitespace/skipper regex,
you can provide it when specifying the lexer. For example, here is how you
would specify the whitespace/skipped tokens to be any sequence of whitespace
-charaters, or any C++-style trailing comment (`// ...`).
+characters, or any C++-style trailing comment (`// ...`).
bp::lexer<char, int, "\\s+|\\/\\/.*$">
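Hypothetically, combining that with the sketch specs from above would look like this; any run of whitespace or any trailing comment is then consumed between tokens, and never appears in the token sequence.

    // The third template parameter replaces the default "\\s+" skip regex.
    constexpr auto lexer =
        bp::lexer<char, int, "\\s+|\\/\\/.*$"> | foo | bar | baz;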
@@ -3797,7 +3797,7 @@ The attribute types for these tokens are `bool`, `std::string_view`, and
because that is the default if you do not specify a type.
A _tok_ is essentially a variant of `std::basic_string_view<CharType>`, `long
-long`, and `long double`. The latter two types were seleced because they can
+long`, and `long double`. The latter two types were selected because they can
fit any value of an integral or floating-point type, respectively. Even
though _tok_ effectively erases the exact type when it is integral or
floating-point, the token parser retains the information of what the exact
@@ -3841,10 +3841,69 @@ literals are turned into _ch_ parsers. _ch_ parsers that you explicitly write
may be used as well. They will only match single-character tokens, though
(that is, tokens with the ID _ch_id_).
+[heading The differences between parsing characters and parsing tokens]
+Even though _ch_ and _str_ (and lots of other character parsers _emdash_ see
+the table below) are available when doing token parsing, their semantics are
+subtly different when used for token parsing. This is because token parsing
+involves parsing chunks of input as tokens, rather than individual characters.
+This may sound obvious, but the implications are not. Consider this example.
+[tokens_string_in_character_vs_token_parsing]
+Why doesn't the token parsing case work? In the character parsing case,
+_str_np_ tries to match characters from the input, one at a time; it sees
+`'='` followed by `';'`, so it matches. In the token parsing case, this does
+not happen. Instead, the input is broken up into two tokens (one for `'='`
+and one for `';'`). `_str_np_("=;")` tries to match the first token in its
+entirety, but that token is a character token, not a token with a
+`std::basic_string_view` attribute. Even if that token did have a
+`std::basic_string_view` attribute, it would be `"="`, not `"=;"`, and so the
+match would still fail.
+So, even though string matching is available using _str_, make sure you
+understand that _str_ is looking for 1) a token with a string view attribute,
+and 2) a full match of the token's string view against the range provided to
+_str_.
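Here is a minimal sketch of those two requirements in action; the `word` spec name and its ID are assumptions.

    constexpr auto word = bp::token_spec<"[a-zA-Z]\\w*", 0>;
    constexpr auto lexer = bp::lexer<char, int> | word;
    // Matches: one token with a string_view attribute whose whole text is "while".
    assert(bp::parse("while" | bp::to_tokens(lexer), bp::string("while")));
    // Fails: the single token is "whilst"; _str_ must match the entire token,
    // not just a prefix of it.
    assert(!bp::parse("whilst" | bp::to_tokens(lexer), bp::string("while")));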
+_ch_ is also a bit different, since it only matches character tokens that you
+make with _tok_chs_. Such tokens have the token ID _ch_id_. _ch_ will
+*never* match any other kind of token. This goes for all the character
+parsers (_blank_, _punct_, _upper_, etc.).
+The character class parsers (e.g. _punct_) are also limited in token parsing
+vs. their use in character parsing. _tok_chs_ limits characters to the ASCII
+range for simplicity, and to discourage parsing of sequences of tokens to find
+things that are detectable using _pcre_ directly. In other words, if you need
+the full set of punctuation characters, use `"\p{P}"` in one of your token
+regexes, rather than trying to parse punctuation characters out of the input
+using _punct_. Because _tok_chs_ limits characters to the ASCII range, all
+the matching for any character class parser (like _punct_) above the ASCII
+range will fail.
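For instance, a token spec along these lines (the name and the ID are assumptions) matches the full Unicode set of punctuation during lexing, with no _punct_ parser involved.

    // "\p{P}" is a PCRE Unicode property class; the lexer does the matching.
    constexpr auto punctuation = bp::token_spec<"\\p{P}", 3>;
    constexpr auto lexer = bp::lexer<char, int> | punctuation;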
+[important Though the string and character parsers are available, they're a
+bit clunky and should be avoided in most cases. Instead, use the character
+handling from the _pcre_ regex language to make the tokens you want. The best
+use of string and character parsers in your _Parser_ token parsers is as
+literals like `"function"`, `'='`, etc.]
+One more important difference between token and character parsing is the
+effect that using _lexeme_ and/or _skip_ has. If you use _lexeme_ or _skip_,
+you are changing the sequence of tokens that must be in the token cache. As
+such, whenever you *enter* or *leave* a _lexeme_ *or* _skip_ directive, the
+token cache is flushed. The flushed tokens are everything from the current
+token position to the end of the cache. If you write `bp::lexeme[p]`
+frequently enough in your parsers, you could be in for some very uneven
+performance.
+[important Though you may be used to using _lexeme_ and _skip_ in character
+parsing, prefer to write explicit token regexes that have equivalent
+semantics, but operate during lexing rather than during parsing.]
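As a sketch of that advice (the names here are assumptions): instead of assembling an identifier out of character parsers under _lexeme_ at parse time, match it with a single token regex at lex time.

    // Character parsing: assemble the identifier during the parse.
    constexpr auto ident_char_parse = bp::lexeme[
        (bp::lower | bp::upper) >>
        *(bp::lower | bp::upper | bp::digit | bp::char_('_'))];
    // Token parsing: the lexer produces the whole identifier as one token.
    constexpr auto ident_token = bp::token_spec<"[a-zA-Z]\\w*", 1>;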
[heading Parsing tokens with a specific value]
So far, we've only seen examples of parsing for a particular token. Sometimes
-we want to match only occurrances of a given token with a particular value,
+we want to match only occurrences of a given token with a particular value,
just like when we write something like `_ch_('a', 'z')` in a character parser.
Just as with _ch_ and most other _Parser_ parsers, you can just add the value
@@ -3953,7 +4012,7 @@ Note the size of the cache after the parse; it still contains some tokens.
This is a special case of a more general phenomenon: the token cache grows
without bound when there are no expectation points. This is because, without
expectation points, backtracking is unbounded (refer to the _expect_pts_
-section to see why). If you can go back arbitarily far in order to backtrack,
+section to see why). If you can go back arbitrarily far in order to backtrack,
you need to be sure that there will be a token at the place you backtrack to.
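A hedged sketch of the difference, using _Parser_'s expectation operator (the `field` spec name is an assumption):

    // Backtracking only: the token cache can grow without bound, since the
    // parse may return to the start of any earlier repetition.
    constexpr auto unbounded = *(field >> '=' >> field >> ';');
    // With expectation points: once '=' is matched, the parse is committed
    // and cannot backtrack past it, so earlier tokens can be dropped.
    constexpr auto bounded = *(field >> '=' > field > ';');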
However, if you use expectation points, the cache is trimmed. The prefix of
@@ -3985,10 +4044,6 @@ guaranteed to fail.
The takeaway here is that a lexing failure might be due to bad input, but it
can also be the sign of a bug in one or more of your _tok_specs_.
-[heading Tradeoffs of token- vs. character-parsing]
-TODO
[heading The token parsers]
Many of the parsers that work in character parsing do not work in token
@@ -4009,10 +4064,10 @@ by _lexeme_.
[heading The token parsing API]
-Not all the _p_ and _cbp_ overloads can do token parsing, because some of them
-cannot accept a _tok_v_ as input. In particular, the overloads that take a
-skipper are precluded, since the skipper must be built into the lexer itself
-(see the section above about whitespace handling for details).
+Not all the _p_ and _cbp_ overloads can do token parsing. In particular, the
+overloads that take a skipper are precluded, since the skipper must be built
+into the lexer itself (see the section above about whitespace handling for
+details).
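Concretely, a token parse call passes the token view and no skipper, along these lines (reusing the assumed names from the earlier sketches):

    auto result = bp::parse("foo bar baz" | bp::to_tokens(lexer), parser);
    // There is no skipper-taking overload for token parsing; skipping is
    // done by the lexer itself.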
[heading _ctre_ particulars]
@@ -4036,18 +4091,23 @@ array of `char` will be interpreted as UTF-8, and will be transcoded to UTF-32
before being stored in the array of `char32_t`. All the `charN_t` character
types will be interpreted as UTF-N encoded, and will be transcoded to UTF-32
if needed. `wchar_t` is taken to mean UTF-32 *even on Windows*. Again, all
-of this trancoding happens at compile time.
+of this transcoding happens at compile time.
[heading Error handling details]
-TODO: Describe how it mostly just works, but that if you use the error
-reporting API you need to know which functions require token iterators and
-which do not, and how to get from token iterators down to the underlying input
-iterators.
+Error handling during token parsing mostly Just Works. That is, you don't
+need to know or do anything special just because you are parsing tokens.
-TODO: Note on the error handling-specific page that some error handling
-functions require normalize_iterators, and some apply it themselves. Also
-note that all the error handlers appply it.
+However, the error reporting functions all operate at the level of character
+input, not tokens. The higher-level functions provided in _err_fwd_hpp_ and
+_err_hpp_ (like `write_formatted_message()`) simply get the iterators to the
+underlying range of input before doing their work. The lower-level functions
+provided in _err_fwd_hpp_ and _err_hpp_ (like `find_line_position()`) do not.
+Each function's API documentation specifies whether or not it does this
+"normalization" to underlying iterators. If you use the lower-level API
+directly in your code, you can call one of the overloads of
+`normalize_iterators()` to get the underlying iterators in the token parsing
+case.
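For example, a custom error handler that uses the lower-level API might first normalize, roughly as below. This is a sketch; the parameter list of `normalize_iterators()` is assumed from its description as returning a tuple of `first`/`curr`/`last` iterators.

    // Token iterators in, underlying character iterators out.
    auto [first, curr, last] =
        bp::normalize_iterators(tok_first, tok_curr, tok_last);
    auto line = bp::find_line_position(first, curr);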
[endsect]


@@ -117,9 +117,6 @@ namespace boost { namespace parser {
int64_t max_after_caret = 40);
#endif
-// TODO: Document that users may need to use this if they make their own
-// error handlers and do token parsing.
/** Returns a tuple of three iterators (corresponding to `first`, `curr`,
and `last`) that are suitable for use in the other error handling
functions, many of which require iterators into the underlying sequence


@@ -857,10 +857,6 @@ namespace boost { namespace parser {
template<typename I, typename Context>
friend struct detail::scoped_lexeme;
-// TODO: Document that the token cache will grow without bound if the
-// parser contains no sequence points. Document this in the doc
-// section that talks about the importance of sequence points.
V base_ = V();
Lexer lexer_;
mutable std::ranges::iterator_t<V> latest_;
@@ -981,9 +977,6 @@ namespace boost { namespace parser {
token_offset_ = other.token_offset_;
}
-// TODO: Document that lexeme/skip cause re-tokenization;
-// recommend using a token instead.
iterator & operator++()
{
if (parent_->tokens_.size() <=


@@ -210,6 +210,27 @@ int main()
assert(cache.size() == 2u);
//]
}
+{
+//[ tokens_string_in_character_vs_token_parsing
+constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+constexpr auto lexer =
+    bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;
+auto parser = bp::string("=;");
+// NOTE: Character parsing here.
+auto character_parse_result = bp::parse("=;", parser);
+assert(character_parse_result);
+assert(*character_parse_result == "=;");
+// NOTE: Token parsing here.
+auto token_parse_result = bp::parse("=;" | bp::to_tokens(lexer), parser);
+assert(!token_parse_result);
+//]
+}
// clang-format on
return boost::report_errors();


@@ -532,11 +532,6 @@ int main()
}
{
-// TODO: Document that bp::string(";=") will *not* match ";=" in
-// the input, because the token boundaries form the tokens ";",
-// "=", neither of which matches ";=". Also, document that char
-// parsers *only* match with char tokens, and bp::string *never*
-// do.
constexpr auto parser = bp::omit[*bp::string("ab")];
{
@@ -2020,10 +2015,6 @@ int main()
BOOST_TEST(result == std::vector<uint32_t>({'a'}));
}
-// TODO: Document that ranges of UTF-16 input will never match the unicode
-// cases of the character parsers, because they are examined CU-by-CU, and
-// surrogate pairs are therefore never combined.
// upper_
{
constexpr auto lexer = bp::lexer<char, int> | bp::token_chars<'A', 'a'>;