
Complete initial pass on token parsing documentation.

See #202.
Zach Laine
2024-12-07 23:50:16 -06:00
parent 9a958224e4
commit ff1059695d
6 changed files with 123 additions and 61 deletions


@@ -694,97 +694,97 @@ the input they match unless otherwise stated in the table below.]
[[ _ch_ ]
[ Matches any single code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0)` ]
[ Matches exactly the code point `_RES_np_(arg0)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0, arg1)` ]
[ Matches the next code point `n` in the input, if `_RES_np_(arg0) <= n && n <= _RES_np_(arg1)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(r)` ]
[ Matches the next code point `n` in the input, if `n` is one of the code points in `r`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
-[ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. ]]
+[ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ _cp_ ]
[ Matches a single code point. ]
[ `char32_t` ]
-[ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. ]]
+[ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Only matches tokens with the ID _ch_id_. ]]
[[ _cu_ ]
[ Matches a single code point. ]
[ `char` ]
-[ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser match at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. ]]
+[ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser match at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. Only matches tokens with the ID _ch_id_. ]]
[[ `_blank_` ]
[ Equivalent to `_ws_ - _eol_`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_control_` ]
[ Matches a single control-character code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_digit_` ]
[ Matches a single decimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_punct_` ]
[ Matches a single punctuation code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_hex_digit_` ]
[ Matches a single hexadecimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_lower_` ]
[ Matches a single lower-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ `_upper_` ]
[ Matches a single upper-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
-[]]
+[ Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(c)`]
[ Matches exactly the given code point `c`. ]
[ None. ]
-[_lit_ does *not* take parse arguments. ]]
+[_lit_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ `c_l` ]
[ Matches exactly the given code point `c`. ]
[ None. ]
-[ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. ]]
+[ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(r)`]
[ Matches exactly the given string `r`. ]
[ None. ]
-[ _lit_ does *not* take parse arguments. ]]
+[ _lit_ does *not* take parse arguments. _lit_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_l` ]
[ Matches exactly the given string `str`. ]
[ None. ]
-[ This is a _udl_ that represents `_lit_np_(s)`, for example `"a string"_l`. ]]
+[ This is a _udl_ that represents `_lit_np_(s)`, for example `"a string"_l`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_str_np_(r)`]
[ Matches exactly `r`, and generates the match as an attribute. ]
[ _std_str_ ]
-[ _str_ does *not* take parse arguments. ]]
+[ _str_ does *not* take parse arguments. _str_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_p`]
[ Matches exactly `str`, and generates the match as an attribute. ]
[ _std_str_ ]
-[ This is a _udl_ that represents `_str_np_(s)`, for example `"a string"_p`. ]]
+[ This is a _udl_ that represents `_str_np_(s)`, for example `"a string"_p`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_rpt_np_(arg0)[p]` ]
[ Matches iff `p` matches exactly `_RES_np_(arg0)` times. ]


@@ -3665,7 +3665,7 @@ Some things to be aware of when looking at _Parser_ trace output:
[endsect]
-[section Using a Lexer / Token Parsing]
+[section Token parsing / Using a Lexer]
_Parser_ has optional support for lexing before parsing. The optional support
is based on an external dependency, _ctre_. _ctre_ produces a sequence of
@@ -3702,7 +3702,7 @@ an ID of `0`, etc. _lex_ takes two template parameters. The first parameter
indicates that the value type of the parsed input sequence is `char`. The
second one indicates that the ID-type of all subsequent _tok_specs_ will be
`int`. We create a full lexer by starting with the `lexer<...>` expression,
-follwed by a piped-together sequence of _tok_specs_.
+followed by a piped-together sequence of _tok_specs_.
The final lexer `lexer` has a combined regex string, `"(foo)|(b.*r)|(b.+z)"`.
This string is built up at compile time, and is represented by an _nttp_. It
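For illustration, a lexer along those lines could be written as below. This is only a sketch inferred from the combined regex shown above; the spec names `foo`, `bar`, and `baz` are assumptions, not names taken from the example code.

    constexpr auto foo = bp::token_spec<"foo", 0>;
    constexpr auto bar = bp::token_spec<"b.*r", 1>;
    constexpr auto baz = bp::token_spec<"b.+z", 2>;
    // Piping the specs onto bp::lexer<...> builds the combined regex
    // "(foo)|(b.*r)|(b.+z)" at compile time.
    constexpr auto lexer = bp::lexer<char, int> | foo | bar | baz;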
@@ -3734,7 +3734,7 @@ Next, you define a parser.
[tokens_basics_parser]
-This has the same semantics as the characater parsers you've seen in the rest
+This has the same semantics as the character parsers you've seen in the rest
of the documentation. Each _tok_spec_ has the same interface as a parser, so
it can be used with all the parser combining operations, like `operator>>`.
However, unlike when doing character parsing, when token parsing all the
@@ -3772,7 +3772,7 @@ _lex_ has one built in; it uses `"\\s+"` by default. Whitespace is matched,
but produces no tokens. If you want to change the whitespace/skipper regex,
you can provide it when specifying the lexer. For example, here is how you
would specify the whitespace/skipped tokens to be any sequence of whitespace
-charaters, or any C++-style trailing comment (`// ...`).
+characters, or any C++-style trailing comment (`// ...`).
bp::lexer<char, int, "\\s+|\\/\\/.*$">
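Hypothetically, combining that with the sketch specs from above would look like this; any run of whitespace or any trailing comment is then consumed between tokens, and never appears in the token sequence.

    // The third template parameter replaces the default "\\s+" skip regex.
    constexpr auto lexer =
        bp::lexer<char, int, "\\s+|\\/\\/.*$"> | foo | bar | baz;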
@@ -3797,7 +3797,7 @@ The attribute types for these tokens are `bool`, `std::string_view`, and
because that is the default if you do not specify a type.
A _tok_ is essentially a variant of `std::basic_string_view<CharType>`, `long
-long`, and `long double`. The latter two types were seleced because they can
+long`, and `long double`. The latter two types were selected because they can
fit any value of an integral or floating-point type, respectively. Even
though _tok_ effectively erases the exact type when it is integral or
floating-point, the token parser retains the information of what the exact
@@ -3841,10 +3841,69 @@ literals are turned into _ch_ parsers. _ch_ parsers that you explicitly write
may be used as well. They will only match single-character tokens, though
(that is, tokens with the ID _ch_id_).
+[heading The differences between parsing characters and parsing tokens]
+Even though _ch_ and _str_ (and lots of other character parsers _emdash_ see
+the table below) are available when doing token parsing, their semantics are
+subtly different when used for token parsing. This is because token parsing
+involves parsing chunks of input as tokens, rather than individual characters.
+This may sound obvious, but the implications are not. Consider this example.
+[tokens_string_in_character_vs_token_parsing]
+Why doesn't the token parsing case work? In the character parsing case,
+_str_np_ tries to match characters from the input, one at a time; it sees
+`'='` followed by `';'`, so it matches. In the token parsing case, this does
+not happen. Instead, the input is broken up into two tokens (one for `'='`
+and one for `';'`). `_str_np_("=;")` tries to match the first token in its
+entirety, but that token is a character token, not a token with a
+`std::basic_string_view` attribute. Even if that token did have a
+`std::basic_string_view` attribute, it would be `"="`, not `"=;"`, and so the
+match would still fail.
+So, even though string matching is available using _str_, make sure you
+understand that _str_ is looking for 1) a token with a string view attribute,
+and 2) a full match of the token's string view against the range provided to
+_str_.
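Here is a minimal sketch of those two requirements in action; the `word` spec name and its ID are assumptions.

    constexpr auto word = bp::token_spec<"[a-zA-Z]\\w*", 0>;
    constexpr auto lexer = bp::lexer<char, int> | word;
    // Matches: one token with a string_view attribute whose whole text is "while".
    assert(bp::parse("while" | bp::to_tokens(lexer), bp::string("while")));
    // Fails: the single token is "whilst"; _str_ must match the entire token,
    // not just a prefix of it.
    assert(!bp::parse("whilst" | bp::to_tokens(lexer), bp::string("while")));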
+_ch_ is also a bit different, since it only matches character tokens that you
+make with _tok_chs_. Such tokens have the token ID _ch_id_. _ch_ will
+*never* match any other kind of token. This goes for all the character
+parsers (_blank_, _punct_, _upper_, etc.).
+The character class parsers (e.g. _punct_) are also limited in token parsing
+vs. their use in character parsing. _tok_chs_ limits characters to the ASCII
+range for simplicity, and to discourage parsing of sequences of tokens to find
+things that are detectable using _pcre_ directly. In other words, if you need
+the full set of punctuation characters, use `"\p{P}"` in one of your token
+regexes, rather than trying to parse punctuation characters out of the input
+using _punct_. Because _tok_chs_ limits characters to the ASCII range, all
+the matching for any character class parser (like _punct_) above the ASCII
+range will fail.
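For instance, a token spec along these lines (the name and the ID are assumptions) matches the full Unicode set of punctuation during lexing, with no _punct_ parser involved.

    // "\p{P}" is a PCRE Unicode property class; the lexer does the matching.
    constexpr auto punctuation = bp::token_spec<"\\p{P}", 3>;
    constexpr auto lexer = bp::lexer<char, int> | punctuation;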
+[important Though the string and character parsers are available, they're a
+bit clunky and should be avoided in most cases. Instead, use the character
+handling from the _pcre_ regex language to make the tokens you want. The best
+use of string and character parsers in your _Parser_ token parsers is as
+literals like `"function"`, `'='`, etc.]
+One more important difference between token and character parsing is the
+effect that using _lexeme_ and/or _skip_ has. If you use _lexeme_ or _skip_,
+you are changing the sequence of tokens that must be in the token cache. As
+such, whenever you *enter* or *leave* a _lexeme_ *or* _skip_ directive, the
+token cache is flushed. The flushed tokens are everything from the current
+token position to the end of the cache. If you write `bp::lexeme[p]`
+frequently enough in your parsers, you could be in for some very uneven
+performance.
+[important Though you may be used to using _lexeme_ and _skip_ in character
+parsing, prefer to write explicit token regexes that have equivalent
+semantics, but operate during lexing rather than during parsing.]
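As a sketch of that advice (the names here are assumptions): instead of assembling an identifier out of character parsers under _lexeme_ at parse time, match it with a single token regex at lex time.

    // Character parsing: assemble the identifier during the parse.
    constexpr auto ident_char_parse = bp::lexeme[
        (bp::lower | bp::upper) >>
        *(bp::lower | bp::upper | bp::digit | bp::char_('_'))];
    // Token parsing: the lexer produces the whole identifier as one token.
    constexpr auto ident_token = bp::token_spec<"[a-zA-Z]\\w*", 1>;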
[heading Parsing tokens with a specific value]
So far, we've only seen examples of parsing for a particular token. Sometimes
-we want to match only occurrances of a given token with a particular value,
+we want to match only occurrences of a given token with a particular value,
just like when we write something like `_ch_('a', 'z')` in a character parser.
Just as with _ch_ and most other _Parser_ parsers, you can just add the value
@@ -3953,7 +4012,7 @@ Note the size of the cache after the parse; it still contains some tokens.
This is a special case of a more general phenomenon: the token cache grows
without bound when there are no expectation points. This is because, without
expectation points, backtracking is unbounded (refer to the _expect_pts_
-section to see why). If you can go back arbitarily far in order to backtrack,
+section to see why). If you can go back arbitrarily far in order to backtrack,
you need to be sure that there will be a token at the place you backtrack to.
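A hedged sketch of the difference, using _Parser_'s expectation operator (the `field` spec name is an assumption):

    // Backtracking only: the token cache can grow without bound, since the
    // parse may return to the start of any earlier repetition.
    constexpr auto unbounded = *(field >> '=' >> field >> ';');
    // With expectation points: once '=' is matched, the parse is committed
    // and cannot backtrack past it, so earlier tokens can be dropped.
    constexpr auto bounded = *(field >> '=' > field > ';');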
However, if you use expectation points, the cache is trimmed. The prefix of
@@ -3985,10 +4044,6 @@ guaranteed to fail.
The takeaway here is that a lexing failure might be due to bad input, but it
can also be the sign of a bug in one or more of your _tok_specs_.
-[heading Tradeoffs of token- vs. character-parsing]
-TODO
[heading The token parsers]
Many of the parsers that work in character parsing do not work in token
@@ -4009,10 +4064,10 @@ by _lexeme_.
[heading The token parsing API]
-Not all the _p_ and _cbp_ overloads can do token parsing, because some of them
-cannot accept a _tok_v_ as input. In particular, the overloads that take a
-skipper are precluded, since the skipper must be built into the lexer itself
-(see the section above about whitespace handling for details).
+Not all the _p_ and _cbp_ overloads can do token parsing. In particular, the
+overloads that take a skipper are precluded, since the skipper must be built
+into the lexer itself (see the section above about whitespace handling for
+details).
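Concretely, a token parse call passes the token view and no skipper, along these lines (reusing the assumed names from the earlier sketches):

    auto result = bp::parse("foo bar baz" | bp::to_tokens(lexer), parser);
    // There is no skipper-taking overload for token parsing; skipping is
    // done by the lexer itself.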
[heading _ctre_ particulars]
@@ -4036,18 +4091,23 @@ array of `char` will be interpreted as UTF-8, and will be transcoded to UTF-32
before being stored in the array of `char32_t`. All the `charN_t` character
types will be interpreted as UTF-N encoded, and will be transcoded to UTF-32
if needed. `wchar_t` is taken to mean UTF-32 *even on Windows*. Again, all
-of this trancoding happens at compile time.
+of this transcoding happens at compile time.
[heading Error handling details]
-TODO: Describe how it mostly just works, but that if you use the error
-reporting API you need to know which functions require token iterators and
-which do not, and how to get from token iterators down to the underlying input
-iterators.
+Error handling during token parsing mostly Just Works. That is, you don't
+need to know or do anything special just because you are parsing tokens.
-TODO: Note on the error handling-specific page that some error handling
-functions require normalize_iterators, and some apply it themselves. Also
-note that all the error handlers appply it.
+However, the error reporting functions all operate at the level of character
+input, not tokens. The higher-level functions provided in _err_fwd_hpp_ and
+_err_hpp_ (like `write_formatted_message()`) simply get the iterators to the
+underlying range of input before doing their work. The lower-level functions
+provided in _err_fwd_hpp_ and _err_hpp_ (like `find_line_position()`) do not.
+Each function's API documentation specifies whether or not it does this
+"normalization" to underlying iterators. If you use the lower-level API
+directly in your code, you can call one of the overloads of
+`normalize_iterators()` to get the underlying iterators in the token parsing
+case.
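For example, a custom error handler that uses the lower-level API might first normalize, roughly as below. This is a sketch; the parameter list of `normalize_iterators()` is assumed from its description as returning a tuple of `first`/`curr`/`last` iterators.

    // Token iterators in, underlying character iterators out.
    auto [first, curr, last] =
        bp::normalize_iterators(tok_first, tok_curr, tok_last);
    auto line = bp::find_line_position(first, curr);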
[endsect]


@@ -117,9 +117,6 @@ namespace boost { namespace parser {
int64_t max_after_caret = 40);
#endif
-// TODO: Document that users may need to use this if they make their own
-// error handlers and do token parsing.
/** Returns a tuple of three iterators (corresponding to `first`, `curr`,
and `last`) that are suitable for use in the other error handling
functions, many of which require iterators into the underlying sequence


@@ -857,10 +857,6 @@ namespace boost { namespace parser {
template<typename I, typename Context>
friend struct detail::scoped_lexeme;
-// TODO: Document that the token cache will grow without bound if the
-// parser contains no sequence points. Document this in the doc
-// section that talks about the importance of sequence points.
V base_ = V();
Lexer lexer_;
mutable std::ranges::iterator_t<V> latest_;
@@ -981,9 +977,6 @@ namespace boost { namespace parser {
token_offset_ = other.token_offset_;
}
-// TODO: Document that lexeme/skip cause re-tokenization;
-// recommend using a token instead.
iterator & operator++()
{
if (parent_->tokens_.size() <=


@@ -210,6 +210,27 @@ int main()
assert(cache.size() == 2u);
//]
}
+{
+//[ tokens_string_in_character_vs_token_parsing
+constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
+constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
+constexpr auto lexer =
+    bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;
+auto parser = bp::string("=;");
+// NOTE: Character parsing here.
+auto character_parse_result = bp::parse("=;", parser);
+assert(character_parse_result);
+assert(*character_parse_result == "=;");
+// NOTE: Token parsing here.
+auto token_parse_result = bp::parse("=;" | bp::to_tokens(lexer), parser);
+assert(!token_parse_result);
+//]
+}
// clang-format on
return boost::report_errors();


@@ -532,11 +532,6 @@ int main()
}
{
-// TODO: Document that bp::string(";=") will *not* match ";=" in
-// the input, because the token boundaries form the tokens ";",
-// "=", neither of which matches ";=". Also, document that char
-// parsers *only* match with char tokens, and bp::string *never*
-// do.
constexpr auto parser = bp::omit[*bp::string("ab")];
{
@@ -2020,10 +2015,6 @@ int main()
BOOST_TEST(result == std::vector<uint32_t>({'a'}));
}
-// TODO: Document that ranges of UTF-16 input will never match the unicode
-// cases of the character parsers, because they are examined CU-by-CU, and
-// surrogate pairs are therefore never combined.
// upper_
{
constexpr auto lexer = bp::lexer<char, int> | bp::token_chars<'A', 'a'>;