@@ -694,97 +694,97 @@ the input they match unless otherwise stated in the table below.]
[[ _ch_ ]
 [ Matches any single code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0)` ]
 [ Matches exactly the code point `_RES_np_(arg0)`. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0, arg1)` ]
 [ Matches the next code point `n` in the input, if `_RES_np_(arg0) <= n && n <= _RES_np_(arg1)`. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(r)` ]
 [ Matches the next code point `n` in the input, if `n` is one of the code points in `r`. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
 [ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF-encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ _cp_ ]
 [ Matches a single code point. ]
 [ `char32_t` ]
 [ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Only matches tokens with the ID _ch_id_. ]]
[[ _cu_ ]
 [ Matches a single code point. ]
 [ `char` ]
 [ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser matches at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. Only matches tokens with the ID _ch_id_. ]]
[[ `_blank_` ]
 [ Equivalent to `_ws_ - _eol_`. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_control_` ]
 [ Matches a single control-character code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_digit_` ]
 [ Matches a single decimal digit code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_punct_` ]
 [ Matches a single punctuation code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_hex_digit_` ]
 [ Matches a single hexadecimal digit code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_lower_` ]
 [ Matches a single lower-case code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ `_upper_` ]
 [ Matches a single upper-case code point. ]
 [ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
 [ Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(c)` ]
 [ Matches exactly the given code point `c`. ]
 [ None. ]
 [ _lit_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ `c_l` ]
 [ Matches exactly the given code point `c`. ]
 [ None. ]
 [ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(r)` ]
 [ Matches exactly the given string `r`. ]
 [ None. ]
 [ _lit_ does *not* take parse arguments. _lit_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_l` ]
 [ Matches exactly the given string `str`. ]
 [ None. ]
 [ This is a _udl_ that represents `_lit_np_(s)`, for example `"a string"_l`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_str_np_(r)` ]
 [ Matches exactly `r`, and generates the match as an attribute. ]
 [ _std_str_ ]
 [ _str_ does *not* take parse arguments. _str_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_p` ]
 [ Matches exactly `str`, and generates the match as an attribute. ]
 [ _std_str_ ]
 [ This is a _udl_ that represents `_str_np_(s)`, for example `"a string"_p`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_rpt_np_(arg0)[p]` ]
 [ Matches iff `p` matches exactly `_RES_np_(arg0)` times. ]

doc/tutorial.qbk
@@ -3665,7 +3665,7 @@ Some things to be aware of when looking at _Parser_ trace output:
[endsect]

[section Token parsing / Using a Lexer]

_Parser_ has optional support for lexing before parsing. The optional support
is based on an external dependency, _ctre_. _ctre_ produces a sequence of
@@ -3702,7 +3702,7 @@ an ID of `0`, etc. _lex_ takes two template parameters. The first parameter
indicates that the value type of the parsed input sequence is `char`. The
second one indicates that the ID-type of all subsequent _tok_specs_ will be
`int`. We create a full lexer by starting with the `lexer<...>` expression,
followed by a piped-together sequence of _tok_specs_.
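For concreteness, here is a minimal sketch of such a lexer definition (the
_tok_spec_ variable names `foo`, `bar`, and `baz` are illustrative
assumptions, not the names used in the actual example code):

    constexpr auto foo = bp::token_spec<"foo", 0>;
    constexpr auto bar = bp::token_spec<"b.*r", 1>;
    constexpr auto baz = bp::token_spec<"b.+z", 2>;

    constexpr auto lexer = bp::lexer<char, int> | foo | bar | baz;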
The final lexer `lexer` has a combined regex string, `"(foo)|(b.*r)|(b.+z)"`.
This string is built up at compile time, and is represented by an _nttp_. It
@@ -3734,7 +3734,7 @@ Next, you define a parser.
[tokens_basics_parser]

This has the same semantics as the character parsers you've seen in the rest
of the documentation. Each _tok_spec_ has the same interface as a parser, so
it can be used with all the parser combining operations, like `operator>>`.
However, unlike when doing character parsing, when token parsing all the
@@ -3772,7 +3772,7 @@ _lex_ has one built in; it uses `"\\s+"` by default. Whitespace is matched,
but produces no tokens. If you want to change the whitespace/skipper regex,
you can provide it when specifying the lexer. For example, here is how you
would specify the whitespace/skipped tokens to be any sequence of whitespace
characters, or any C++-style trailing comment (`// ...`).

    bp::lexer<char, int, "\\s+|\\/\\/.*$">
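For example, a complete lexer using this whitespace/comment regex might look
like the following sketch (reusing the `true_false` and `identifier`
_tok_specs_ from the basics example):

    constexpr auto lexer =
        bp::lexer<char, int, "\\s+|\\/\\/.*$"> | true_false | identifier;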
@@ -3797,7 +3797,7 @@ The attribute types for these tokens are `bool`, `std::string_view`, and
because that is the default if you do not specify a type.
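As a sketch, a _tok_spec_ with an explicitly specified attribute type looks
like this (the `number` spec is hypothetical, not part of the example):

    // The third template parameter specifies the attribute type. Without
    // it, the attribute would be a string view of the matched characters,
    // as with the identifier spec above.
    constexpr auto number = bp::token_spec<"\\d+", 2, int>;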
A _tok_ is essentially a variant of `std::basic_string_view<CharType>`, `long
long`, and `long double`. The latter two types were selected because they can
fit any value of an integral or floating-point type, respectively. Even
though _tok_ effectively erases the exact type when it is integral or
floating-point, the token parser retains the information of what the exact
@@ -3841,10 +3841,69 @@ literals are turned into _ch_ parsers. _ch_ parsers that you explicitly write
may be used as well. They will only match single-character tokens, though
(that is, tokens with the ID _ch_id_).

[heading The differences between parsing characters and parsing tokens]

Even though _ch_ and _str_ (and lots of other character parsers _emdash_ see
the table below) are available when doing token parsing, their semantics are
subtly different when used for token parsing. This is because token parsing
involves parsing chunks of input as tokens, rather than individual characters.
This may sound obvious, but the implications are not. Consider this example.

[tokens_string_in_character_vs_token_parsing]
Why doesn't the token parsing case work? In the character parsing case,
_str_np_ tries to match characters from the input, one at a time; it sees
`'='` followed by `';'`, so it matches. In the token parsing case, this does
not happen. Instead, the input is broken up into two tokens (one for `'='`
and one for `';'`). `_str_np_("=;")` tries to match the first token in its
entirety, but that token is a character token, not a token with a
`std::basic_string_view` attribute. Even if that token did have a
`std::basic_string_view` attribute, it would be `"="`, not `"=;"`, and so the
match would still fail.

So, even though string matching is available using _str_, make sure you
understand that _str_ is looking for 1) a token with a string view attribute,
and 2) a full match of the token's string view against the range provided to
_str_.
_ch_ is also a bit different, since it only matches character tokens that you
make with _tok_chs_. Such tokens have the token ID _ch_id_. _ch_ will
*never* match any other kind of token. This goes for all the character
parsers (_blank_, _punct_, _upper_, etc.).
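Here is a minimal sketch of that behavior, assuming a lexer that makes
character tokens for `'='` and `';'`:

    constexpr auto lexer = bp::lexer<char, int> | bp::token_chars<'=', ';'>;

    // '=' and ';' each become a single-character token, which the
    // character parsers can match; they never match any other token kind.
    auto result = bp::parse("=;" | bp::to_tokens(lexer),
                            bp::char_('=') >> bp::char_(';'));
    assert(result);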
The character class parsers (e.g. _punct_) are also limited in token parsing
vs. their use in character parsing. _tok_chs_ limits characters to the ASCII
range for simplicity, and to discourage parsing of sequences of tokens to find
things that are detectable using _pcre_ directly. In other words, if you need
the full set of punctuation characters, use `"\p{P}"` in one of your token
regexes, rather than trying to parse punctuation characters out of the input
using _punct_. Because _tok_chs_ limits characters to the ASCII range, all
the matching for any character class parser (like _punct_) above the ASCII
range will fail.
[important Though the string and character parsers are available, they're a
bit clunky and should be avoided in most cases. Instead, use the character
handling from the _pcre_ regex language to make the tokens you want. The best
use of string and character parsers in your _Parser_ token parsers is as
literals like `"function"`, `'='`, etc.]
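For instance, rather than parsing punctuation out of the token stream with
_punct_, a sketch of the token-level alternative (the name `punct_tok` is
illustrative):

    // Let the lexer match Unicode punctuation at the regex level, instead
    // of parsing punctuation out of the input with a character parser.
    constexpr auto punct_tok = bp::token_spec<"\\p{P}", 3>;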
One more important difference between token and character parsing is the
effect that using _lexeme_ and/or _skip_ has. If you use _lexeme_ or _skip_,
you are changing the sequence of tokens that must be in the token cache. As
such, whenever you *enter* or *leave* a _lexeme_ *or* _skip_ directive, the
token cache is flushed. The flushed tokens are everything from the current
token position to the end of the cache. If you write `bp::lexeme[p]`
frequently enough in your parsers, you could be in for some very uneven
performance.
[important Though you may be used to using _lexeme_ and _skip_ in character
parsing, prefer to write explicit token regexes that have equivalent
semantics, but operating during lexing rather than during parsing.]
[heading Parsing tokens with a specific value]

So far, we've only seen examples of parsing for a particular token. Sometimes
we want to match only occurrences of a given token with a particular value,
just like when we write something like `_ch_('a', 'z')` in a character parser.
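The sketch below assumes, by analogy with the character parsers, that you
pass the desired value as a parse argument to the _tok_spec_ itself (using
the `identifier` spec from the earlier example):

    // Matches only identifier tokens whose value is exactly "foo".
    constexpr auto foo_ident = identifier("foo");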
Just as with _ch_ and most other _Parser_ parsers, you can just add the value
@@ -3953,7 +4012,7 @@ Note the size of the cache after the parse; it still contains some tokens.
This is a special case of a more general phenomenon: the token cache grows
without bound when there are no expectation points. This is because, without
expectation points, backtracking is unbounded (refer to the _expect_pts_
section to see why). If you can go back arbitrarily far in order to backtrack,
you need to be sure that there will be a token at the place you backtrack to.
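As an illustrative sketch, expectation points are introduced with
`operator>`; everything before a matched expectation point is committed, so
it can be dropped from the cache (the grammar and names here are assumptions,
not taken from the examples):

    // Once '=' matches, each expectation point commits the parse; tokens
    // before it can never be backtracked to, so the cache can be trimmed.
    constexpr auto assignment = identifier >> '=' > identifier > ';';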
However, if you use expectation points, the cache is trimmed. The prefix of
@@ -3985,10 +4044,6 @@ guaranteed to fail.
The takeaway here is that a lexing failure might be due to bad input, but it
can also be the sign of a bug in one or more of your _tok_specs_.

[heading The token parsers]

Many of the parsers that work in character parsing do not work in token
@@ -4009,10 +4064,10 @@ by _lexeme_.
[heading The token parsing API]

Not all the _p_ and _cbp_ overloads can do token parsing. In particular, the
overloads that take a skipper are precluded, since the skipper must be built
into the lexer itself (see the section above about whitespace handling for
details).
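In other words, a token parse takes roughly the shape sketched below;
`input`, `lexer`, and `parser` stand in for your own names, and there is no
skipper argument:

    // The skipper is baked into the lexer; whitespace was already consumed
    // during lexing, so no skipper is passed to parse().
    auto result = bp::parse(input | bp::to_tokens(lexer), parser);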
[heading _ctre_ particulars]
@@ -4036,18 +4091,23 @@ array of `char` will be interpreted as UTF-8, and will be transcoded to UTF-32
before being stored in the array of `char32_t`. All the `charN_t` character
types will be interpreted as UTF-N encoded, and will be transcoded to UTF-32
if needed. `wchar_t` is taken to mean UTF-32 *even on Windows*. Again, all
of this transcoding happens at compile time.

[heading Error handling details]

Error handling during token parsing mostly Just Works. That is, you don't
need to know or do anything special just because you are parsing tokens.

However, the error reporting functions all operate at the level of character
input, not tokens. The higher-level functions provided in _err_fwd_hpp_ and
_err_hpp_ (like `write_formatted_message()`) simply get the iterators to the
underlying range of input before doing their work. The lower-level functions
provided in _err_fwd_hpp_ and _err_hpp_ (like `find_line_position()`) do not.
Each function's API documentation specifies whether or not it does this
"normalization" to underlying iterators. If you use the lower-level API
directly in your code, you can call one of the overloads of
`normalize_iterators()` to get the underlying iterators in the token parsing
case.
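As a hedged sketch of that last point, inside a custom error handler you
might write something like the following (the exact overload shown is an
assumption based on the reference text, which describes a tuple of iterators
corresponding to `first`, `curr`, and `last`):

    // Normalize token iterators down to iterators into the underlying
    // character sequence before calling the lower-level error functions.
    auto [first, curr, last] =
        bp::normalize_iterators(first_, curr_, last_);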
[endsect]
@@ -117,9 +117,6 @@ namespace boost { namespace parser {
        int64_t max_after_caret = 40);
#endif

    /** Returns a tuple of three iterators (corresponding to `first`, `curr`,
        and `last`) that are suitable for use in the other error handling
        functions, many of which require iterators into the underlying sequence
@@ -857,10 +857,6 @@ namespace boost { namespace parser {
        template<typename I, typename Context>
        friend struct detail::scoped_lexeme;

        V base_ = V();
        Lexer lexer_;
        mutable std::ranges::iterator_t<V> latest_;
@@ -981,9 +977,6 @@ namespace boost { namespace parser {
            token_offset_ = other.token_offset_;
        }

        iterator & operator++()
        {
            if (parent_->tokens_.size() <=
@@ -210,6 +210,27 @@ int main()
        assert(cache.size() == 2u);
        //]
    }

    {
        //[ tokens_string_in_character_vs_token_parsing
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;

        constexpr auto lexer =
            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;

        auto parser = bp::string("=;");

        // NOTE: Character parsing here.
        auto character_parse_result = bp::parse("=;", parser);
        assert(character_parse_result);
        assert(*character_parse_result == "=;");

        // NOTE: Token parsing here.
        auto token_parse_result = bp::parse("=;" | bp::to_tokens(lexer), parser);
        assert(!token_parse_result);
        //]
    }
    // clang-format on

    return boost::report_errors();
@@ -532,11 +532,6 @@ int main()
    }

    {
        constexpr auto parser = bp::omit[*bp::string("ab")];

        {
@@ -2020,10 +2015,6 @@ int main()
        BOOST_TEST(result == std::vector<uint32_t>({'a'}));
    }

    // upper_
    {
        constexpr auto lexer = bp::lexer<char, int> | bp::token_chars<'A', 'a'>;