2
0
mirror of https://github.com/boostorg/parser.git synced 2026-01-19 16:32:13 +00:00

74 Commits

Author SHA1 Message Date
Zach Laine
c6d35e8791 Address TODOs about resolving the value used in detail::token_with_value, and
doing transcoding for the match in detail::token_with_string_view.

See #202.
2024-12-08 17:16:15 -06:00
Zach Laine
ff1059695d Complete initial pass on token parsing documentation.
See #202.
2024-12-07 23:50:27 -06:00
Zach Laine
9a958224e4 Remove mooted TODOs. 2024-12-07 21:57:53 -06:00
Zach Laine
0463ecb4f6 Add Doxygen comments to all normalize_iterators() overloads, and indicate
which error handling function apply this function to their inputs and which do
not.

See #202.
2024-12-07 19:02:08 -06:00
Zach Laine
5c036a778a Add lexer_and_parser_terminals.cpp, based on parser.cpp, to more fully
exercise the token parsing code; fix errors.

See #202.
2024-12-07 18:40:08 -06:00
Zach Laine
c3667f5265 Add proper const/non-const interop to tokens_view::iterator.
See #202.
2024-12-06 17:29:38 -06:00
Zach Laine
e2f015991c Pull the single-character-fetching logic of the token parsing modes of the
character parsers out into a standalone detail:: function; use it in all the
character parsers.

See #202.
2024-12-05 18:27:52 -06:00
Zach Laine
2b48b8656e Fix behavior of tokens_view::iterator::base() in the non-common_range case.
See #202.
2024-12-04 23:44:18 -06:00
Zach Laine
df966c78f6 Use a move instead of a copy in detail::make_parse_result(). 2024-12-02 21:46:51 -06:00
Zach Laine
9733445118 Trim trailing null from array ranges adapted by the to_tokens range adaptor.
See #202.
2024-12-02 21:46:51 -06:00
Zach Laine
b1c5c4b487 Add char to the set of types that can be used as token attributes.
See #202.
2024-12-02 21:46:51 -06:00
Zach Laine
96824b2013 Add a TODO. 2024-12-02 21:46:51 -06:00
Zach Laine
19952581f0 Extend support for token parsing to the prefix*parse() API. Add tests for the
full set of *parse() functions that support token parsing; fix errors.

See #202.
2024-12-02 21:46:36 -06:00
Zach Laine
06816abc62 Initial sketch of adapting lexeme and skip directives for use in token
parsing.

See #202.
2024-12-02 21:46:15 -06:00
Zach Laine
8f8791244a Adapt symbol_parser for use in token_parsing.
See #202.
2024-12-02 21:46:07 -06:00
Zach Laine
05a110c54d Initial sketch of adapting string_view_parser for use in token parsing.
See #202.
2024-12-01 20:36:42 -06:00
Zach Laine
cafd04c391 Fix return statement on the wrong side of macro guard.
See #202.
2024-12-01 20:27:27 -06:00
Zach Laine
3fd285c014 Correct memory safety issue in converting token iteratos to underlying
iterators.

See #202.
2024-12-01 20:26:22 -06:00
Zach Laine
bc6e9e3447 Grooming. 2024-11-30 17:35:06 -06:00
Zach Laine
eab7e82988 Slight simplification of code trying to detect token iterators.
See #202.
2024-11-30 17:24:05 -06:00
Zach Laine
655870000b Remove a mootetd TODO. 2024-11-29 20:17:49 -06:00
Zach Laine
039453079e Cruft removal. 2024-11-29 20:11:50 -06:00
Zach Laine
a908e950d5 Address TODO about removing detail::defulat_flags(). 2024-11-29 20:10:46 -06:00
Zach Laine
87e00a173d Add static_assert(std::integral<T> || std::floating_point<T>) to
detail::token_with_value, just in case.

See #202.
2024-11-29 20:03:39 -06:00
Zach Laine
3176c6f823 Fix error detected in tests of token caching introduced in doc examples. Make
sure not to copy tokens_view within the *parse() call graph.

See #202.
2024-11-29 20:00:28 -06:00
Zach Laine
92c4993b87 Documentation pass on the changes made so far to support token parsing.
See #202.
2024-11-29 20:00:28 -06:00
Zach Laine
9d67b0df7f Cruft removal. 2024-11-29 17:17:49 -06:00
Zach Laine
d4f4589ead Add TODOs. 2024-11-29 16:29:55 -06:00
Zach Laine
178d62a250 Preclude skipper *parse() overloads from token parsing with a static_assert.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
e654c9fda7 Macro guard private members of tokens_view from inclusion in Doxygen reference
entry.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
9dbf30241d Plumb a pointer to the tokens_view through to the parse context; change
seq_parser to use the tokens_view pointer from the contet to trim the cache at
an expectation point.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
f298bfe59b Remove the support for token_view::iterator -> underlying iterator translation
in *parse_impl(); add support to the error handlers and their support
functions directly instead.  There are simply too many APIs there that need
the translation to leave it to other code.

Add lex_error exception type, and add support for all the APIs that used to
take a parse_error param to now take either a parse_error or a lex_error.

Throw lex_error from failed parsing of lexed tokens in detail::make_token().

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
07cd667a91 Add TODOs. 2024-11-29 16:29:55 -06:00
Zach Laine
00510ed962 Add cases to tracing.cpp to cover the printing of token parsers; fix errors.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
1a5b7467ca Add a TODO. 2024-11-29 16:29:55 -06:00
Zach Laine
13a52e10f9 Cruft removal. 2024-11-29 16:29:55 -06:00
Zach Laine
67c3ec180c Add a smoke test for using token views as input to parse(); fix errors.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
8658f8cd6b Unify the implementations of the callback parse() API. 2024-11-29 16:29:55 -06:00
Zach Laine
f69d7acdd9 Unify the implementations of the non-callback parse() API, to make it posible
for the new token parsingcode path to use that one unified implementation too.
The previous implementation dispatched like parse() -> prefix_parse(), the
latter of which is incompatible with token parsing.
2024-11-29 16:29:55 -06:00
Zach Laine
542bdb0e0e Unify the detail::{skip_,}parse_impl overloads. 2024-11-29 16:29:55 -06:00
Zach Laine
f3e326e344 Unify the two detail::skip_parse_impl overloads. 2024-11-29 16:29:55 -06:00
Zach Laine
027d861b08 Unify the two detail::callback_*parse_impl overloads. 2024-11-29 16:29:55 -06:00
Zach Laine
6ab8f96e19 Unify the two detail::parse_impl overloads. 2024-11-29 16:29:55 -06:00
Zach Laine
2b518bc74d Fix multiple thinkos in tokens_view.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
1421592876 Add a std::string data member, and an override of what() to parser_error, to
ensure that the messge is preserved exactly when translating exceptions in
failed lexer parsing.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
092a76173b Change token<> to include the position of the start of the token in the
underlying sequence, and change the way that the error handler is invoked, so
that it detects token iterators, and passes iterators into the underlying
range to the error handler, instead of the token iterators.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
8830dfed02 Alter detail::make_input_subrange() to return its input unchanged when the
range's value type is a specializtion of token.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
935322649b Add an iterator/sentinel overload of detail::make_input_subrange(), so that an
overload of this function is called in every *parse() overload.
2024-11-29 16:29:55 -06:00
Zach Laine
7566dbdde1 Remove the pointer case from detail::make_input_range(). 2024-11-29 16:29:55 -06:00
Zach Laine
298bae0058 Sketch in the printing implemetnation for token_parser.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
b6c1229c54 Reorganize the lexer-related header inclusion scheme slightly. lexer.hpp is
now required to come before parser.hpp.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
793e519eb8 Put detail::make_input_subrange back into parer.hpp.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
416607c954 Add token_parser::operator()(range) to support matchign against tokens that
producestring_views.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
e419ef2a60 Completely rethink the relationship among token_parser, token_spec_t, and
parser_interface.  token_spec is now a variable template that generates a
parser_interface wrapping a token_parser, which parameterized on the
token_spec_t.  This way, a single token_spec use can be used to specify how to
lex, and how to parse.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
1a405f8133 Sketch in operator() on token_spec_t.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
fbc21ef2fd Sketch in support for matching tokens against expected values in token_parser.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
f00f4dfa75 Initial, partial sketch of token_parser.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
358adf247a Address the remaining non-documentation TODOs in the lexer header and tests.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
ee8ab13779 Extend the lexer tests, addressing some of the testing TODOs; fix errors.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
391bb2b5b0 Factor out a lot of the lexer.cpp tests into a separate file.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
ad64fb6973 Use bitfields to reduce the size of token by a few bytes.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
6ed1152390 Switch from double to long double as token's floating point representation.
Therer's room for it, since it's in a union with a string_view anyway.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
51cc855dd7 Grooming. 2024-11-29 16:29:55 -06:00
Zach Laine
bf336fb096 Add much longer lexing test; fix errors.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
63483cb284 Specify the true/false and number token_specs in the adobe lexer test code to
use a bool and a double as its value, repsectively, instead of a string_view.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
8b24206aee Add a TODO. 2024-11-29 16:29:55 -06:00
Zach Laine
49213c428e Change all the existing lex runs to compare their output against an expected
sequence of tokens.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
3df6626e58 Rewrite a TODO. 2024-11-29 16:29:55 -06:00
Zach Laine
79f34ef252 Add support for an externally-supplied token cache.
See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
f465a75069 Add a token cache to tokens_view; grab a bunch of tokens at a time when
advancing past the end of the cache.  Filter out whitespace tokens entirely.
Make tokens_view noncopyable+nonmovable.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
6159a481db Make tokens_view regular by removing tokens_view::tokens_, which creates
internal references.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
cd9c7492dd Add a range adaptor for token_view; token_view -> tokens_view. Break
detail::make_input_subrange() out into its own header for reuse.

See #202.
2024-11-29 16:29:55 -06:00
Zach Laine
9a1fadbe82 Remove/update some TODOs. 2024-11-29 16:29:55 -06:00
Zach Laine
e5c101378b Initial sketch of a lexer for Boost.Parser, based on using CTRE as an external
dependency.  Lots of TODOs, somewhat thin testing, but the basics work.

See #202.
2024-11-29 16:29:55 -06:00
29 changed files with 7727 additions and 692 deletions

View File

@@ -42,6 +42,7 @@
[import ../test/parser.cpp]
[import ../test/parser_rule.cpp]
[import ../test/parser_quoted_string.cpp]
[import ../test/lexer_and_parser.cpp]
[import ../include/boost/parser/concepts.hpp]
[import ../include/boost/parser/error_handling_fwd.hpp]
@@ -109,6 +110,16 @@
[def _trans_replace_vs_ [classref boost::parser::transform_replace_view `boost::parser::transform_replace_view`s]]
[def _lex_ [classref boost::parser::lexer_t `boost::parser::lexer_t`]]
[def _tok_ [classref boost::parser::token `boost::parser::token`]]
[def _toks_ [classref boost::parser::token `boost::parser::token`s]]
[def _tok_spec_ [classref boost::parser::token_spec_t `boost::parser::token_spec_t`]]
[def _tok_specs_ [classref boost::parser::token_spec_t `boost::parser::token_spec_t`s]]
[def _tok_chs_ [globalref boost::parser::token_chars `boost::parser::token_chars`]]
[def _to_tok_ [globalref boost::parser::to_tokens `boost::parser::to_tokens`]]
[def _tok_v_ [classref boost::parser::tokens_view `boost::parser::tokens_view`]]
[def _ch_id_ [globalref boost::parser::character_id `boost::parser::character_id`]]
[def _std_str_ `std::string`]
[def _std_vec_char_ `std::vector<char>`]
[def _std_vec_char32_ `std::vector<char32_t>`]
@@ -253,6 +264,12 @@
[def _udls_ [@https://en.cppreference.com/w/cpp/language/user_literal UDLs]]
[def _yaml_ [@https://yaml.org/spec/1.2/spec.html YAML 1.2]]
[def _nttp_ [@https://en.cppreference.com/w/cpp/language/template_parameters NTTP]]
[def _nttps_ [@https://en.cppreference.com/w/cpp/language/template_parameters NTTPs]]
[def _ctre_ [@https://github.com/hanickadot/compile-time-regular-expressions CTRE]]
[def _pcre_ [@https://www.pcre.org PCRE]]
[def _Spirit_ [@https://www.boost.org/doc/libs/release/libs/spirit Boost.Spirit]]
[def _spirit_reals_ [@https://www.boost.org/doc/libs/release/libs/spirit/doc/html/spirit/qi/reference/numeric/real.html real number parsers]]

View File

@@ -595,3 +595,220 @@ same attribute generation rules.
[[`p1 | p2[a] | p3`] [`std::optional<std::variant<_ATTR_np_(p1), _ATTR_np_(p3)>>`]]
]
]
[template table_token_parsers_and_their_semantics
This table lists all the _Parser_ parsers usable during token parsing. For
the callable parsers, a separate entry exists for each possible arity of
arguments. For a parser `p`, if there is no entry for `p` without arguments,
`p` is a function, and cannot itself be used as a parser; it must be called.
In the table below:
* each entry is a global object usable directly in your parsers, unless
otherwise noted;
* "code point" is used to refer to the elements of the input range, which
assumes that the parse is being done in the Unicode-aware code path (if the
parse is being done in the non-Unicode code path, read "code point" as
"`char`");
* _RES_ is a notional macro that expands to the resolution of parse argument
or evaluation of a parse predicate (see _parsers_uses_);
* "`_RES_np_(pred) == true`" is a shorthand notation for "`_RES_np_(pred)` is
contextually convertible to `bool` and `true`"; likewise for `false`;
* `c` is a character of some character type;
* `str` is a string literal of type `CharType const[]`, for some character
type `CharType`;
* `pred` is a parse predicate;
* `arg0`, `arg1`, `arg2`, ... are parse arguments;
* `a` is a semantic action;
* `r` is an object whose type models `parsable_range`;
* `tok` is a token parser created using _tok_spec_; and
* `p`, `p1`, `p2`, ... are parsers.
[note The definition of `parsable_range` is:
[parsable_range_concept]
]
[note Some of the parsers in this table consume no input. All parsers consume
the input they match unless otherwise stated in the table below.]
[table Token Parsers and Their Semantics
[[Parser] [Semantics] [Attribute Type] [Notes]]
[[ `tok` ]
[ Matches any token with the same ID as `tok`. ]
[ The attribute type given when specifying `tok`, or a string view if unspecified. The attribute type must be a specialization of `std::basic_string_view`, an integral type, or a floating point type. ]
[]]
[[ `tok(arg0)` ]
[ Matches exactly the value `_RES_np_(arg0)`. ]
[ The attribute type given when specifying `tok`. The attribute type must be an integral type or a floating point type. ]
[ This case applies only when `arg0` is *not* a range. ]]
[[ `tok(r)` ]
[ Matches exactly the value `r`. ]
[ The attribute type given when specifying `tok`. The attribute type must be a specialization of `std::basic_string_view`. ]
[ This overload does *not* take parse arguments. ]]
[[ _e_ ]
[ Matches /epsilon/, the empty string. Always matches, and consumes no input. ]
[ None. ]
[ Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `*_e_`, `+_e_`, etc (this applies to unconditional _e_ only). ]]
[[ `_e_(pred)` ]
[ Fails to match the input if `_RES_np_(pred) == false`. Otherwise, the semantics are those of _e_. ]
[ None. ]
[]]
[[ _ws_ ]
[ Matches a single whitespace code point (see note), according to the Unicode White_Space property. ]
[ None. ]
[ For more info, see the [@https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt Unicode properties]. _ws_ may consume one code point or two. It only consumes two code points when it matches `"\r\n"`. ]]
[[ _eol_ ]
[ Matches a single newline (see note), following the "hard" line breaks in the Unicode line breaking algorithm. ]
[ None. ]
[ For more info, see the [@https://unicode.org/reports/tr14 Unicode Line Breaking Algorithm]. _eol_ may consume one code point or two. It only consumes two code points when it matches `"\r\n"`. ]]
[[ _eoi_ ]
[ Matches only at the end of input, and consumes no input. ]
[ None. ]
[]]
[[ _attr_np_`(arg0)` ]
[ Always matches, and consumes no input. Generates the attribute `_RES_np_(arg0)`. ]
[ `decltype(_RES_np_(arg0))`. ]
[ An important use case for `_attr_` is to provide a default attribute value as a trailing alternative. For instance, an *optional* comma-delimited list is: `int_ % ',' | attr(std::vector<int>)`. Without the "`| attr(...)`", at least one `int_` match would be required. ]]
[[ _ch_ ]
[ Matches any single code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0)` ]
[ Matches exactly the code point `_RES_np_(arg0)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(arg0, arg1)` ]
[ Matches the next code point `n` in the input, if `_RES_np_(arg0) <= n && n <= _RES_np_(arg1)`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_ch_(r)` ]
[ Matches the next code point `n` in the input, if `n` is one of the code points in `r`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See _attr_gen_. ]
[ `r` is taken to be in a UTF encoding. The exact UTF used depends on `r`'s element type. If you do not pass UTF encoded ranges for `r`, the behavior of _ch_ is undefined. Note that ASCII is a subset of UTF-8, so ASCII is fine. EBCDIC is not. `r` is not copied; a reference to it is taken. The lifetime of `_ch_(r)` must be within the lifetime of `r`. This overload of _ch_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ _cp_ ]
[ Matches a single code point. ]
[ `char32_t` ]
[ Similar to _ch_, but with a fixed `char32_t` attribute type; _cp_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Only matches tokens with the ID _ch_id_. ]]
[[ _cu_ ]
[ Matches a single code point. ]
[ `char` ]
[ Similar to _ch_, but with a fixed `char` attribute type; _cu_ has all the same call operator overloads as _ch_, though they are not repeated here, for brevity. Even though the name "`cu`" suggests that this parser matches at the code unit level, it does not. The name refers to the attribute type generated, much like the names _i_ versus _ui_. Only matches tokens with the ID _ch_id_. ]]
[[ `_blank_` ]
[ Equivalent to `_ws_ - _eol_`. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_control_` ]
[ Matches a single control-character code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_digit_` ]
[ Matches a single decimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_punct_` ]
[ Matches a single punctuation code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_hex_digit_` ]
[ Matches a single hexadecimal digit code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_lower_` ]
[ Matches a single lower-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ `_upper_` ]
[ Matches a single upper-case code point. ]
[ The code point type in Unicode parsing, or `char` in non-Unicode parsing. See the entry for _ch_. ]
[ Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(c)`]
[ Matches exactly the given code point `c`. ]
[ None. ]
[_lit_ does *not* take parse arguments. Only matches tokens with the ID _ch_id_. ]]
[[ `c_l` ]
[ Matches exactly the given code point `c`. ]
[ None. ]
[ This is a _udl_ that represents `_lit_np_(c)`, for example `'F'_l`. Only matches tokens with the ID _ch_id_. ]]
[[ _lit_np_`(r)`]
[ Matches exactly the given string `r`. ]
[ None. ]
[ _lit_ does *not* take parse arguments. _lit_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_l` ]
[ Matches exactly the given string `str`. ]
[ None. ]
[ This is a _udl_ that represents `_lit_np_(str)`, for example `"a string"_l`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_str_np_(r)`]
[ Matches exactly `r`, and generates the match as an attribute. ]
[ _std_str_ ]
[ _str_ does *not* take parse arguments. _str_ matches the entire token or not at all. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `str_p`]
[ Matches exactly `str`, and generates the match as an attribute. ]
[ _std_str_ ]
[ This is a _udl_ that represents `_str_np_(str)`, for example `"a string"_p`. Only matches tokens with an attribute type that is a specialization of `std::basic_string_view`. ]]
[[ `_rpt_np_(arg0)[p]` ]
[ Matches iff `p` matches exactly `_RES_np_(arg0)` times. ]
[ `std::string` if `_ATTR_np_(p)` is `char` or `char32_t`, otherwise `std::vector<_ATTR_np_(p)>` ]
[ The special value _inf_ may be used; it indicates unlimited repetition. `decltype(_RES_np_(arg0))` must be implicitly convertible to `int64_t`. Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `_rpt_np_(_inf_)[_e_]` (this applies to unconditional _e_ only). ]]
[[ `_rpt_np_(arg0, arg1)[p]` ]
[ Matches iff `p` matches between `_RES_np_(arg0)` and `_RES_np_(arg1)` times, inclusively. ]
[ `std::string` if `_ATTR_np_(p)` is `char` or `char32_t`, otherwise `std::vector<_ATTR_np_(p)>` ]
[ The special value _inf_ may be used for the upper bound; it indicates unlimited repetition. `decltype(_RES_np_(arg0))` and `decltype(_RES_np_(arg1))` each must be implicitly convertible to `int64_t`. Matching _e_ an unlimited number of times creates an infinite loop, which is undefined behavior in C++. _Parser_ will assert in debug mode when it encounters `_rpt_np_(n, _inf_)[_e_]` (this applies to unconditional _e_ only). ]]
[[ `_if_np_(pred)[p]` ]
[ Equivalent to `_e_(pred) >> p`. ]
[ `std::optional<_ATTR_np_(p)>` ]
[ It is an error to write `_if_np_(pred)`. That is, it is an error to omit the conditionally matched parser `p`. ]]
[[ `_sw_np_(arg0)(arg1, p1)(arg2, p2) ...` ]
[ Equivalent to `p1` when `_RES_np_(arg0) == _RES_np_(arg1)`, `p2` when `_RES_np_(arg0) == _RES_np_(arg2)`, etc. If there is no such `argN`, the behavior of _sw_ is undefined. ]
[ `std::variant<_ATTR_np_(p1), _ATTR_np_(p2), ...>` ]
[ It is an error to write `_sw_np_(arg0)`. That is, it is an error to omit the conditionally matched parsers `p1`, `p2`, .... ]]
[[ _symbols_t_ ]
[ _symbols_ is an associative container of key, value pairs. Each key is a _std_str_ and each value has type `T`. In the Unicode parsing path, the strings are considered to be UTF-8 encoded; in the non-Unicode path, no encoding is assumed. _symbols_ matches the longest prefix `pre` of the input that is equal to one of the keys `k`. If the length `len` of `pre` is zero, and there is no zero-length key, it does not match the input. If `len` is positive, the generated attribute is the value associated with `k`.]
[ `T` ]
[ Unlike the other entries in this table, _symbols_ is a type, not an object. ]]
]
]

View File

@@ -75,6 +75,10 @@ matches the input. _ATTR_ is a notional macro that expands to the attribute
type of the parser passed to it; `_ATTR_np_(_d_)` is `double`. This is
similar to the _attr_ type trait.
/Token parsing/ is parsing using _Parser_'s optional support for
lexing/tokenizing first, and parsing the resulting tokens, as opposed to the
normal operation of _Parser_, in which input characters are parsed.
Next, we'll look at some simple programs that parse using _Parser_. We'll
start small and build up from there.
@@ -1163,7 +1167,7 @@ without the context is for use outside of any parse.]
_Parser_ comes with all the parsers most parsing tasks will ever need. Each
one is a `constexpr` object, or a `constexpr` function. Some of the
non-functions are also callable, such as _ch_, which may be used directly, or
with arguments, as in _ch_`('a', 'z')`. Any parser that can be called,
with arguments, as in `_ch_('a', 'z')`. Any parser that can be called,
whether a function or callable object, will be called a /callable parser/ from
now on. Note that there are no nullary callable parsers; they each take one
or more arguments.
@@ -3661,6 +3665,452 @@ Some things to be aware of when looking at _Parser_ trace output:
[endsect]
[section Token parsing / Using a Lexer]
_Parser_ has optional support for lexing before parsing. The optional support
is based on an external dependency, _ctre_. _ctre_ produces a sequence of
tokens by matching a set of regexes that you provide. Each regex is used to
match against the input to produce one token with an ID associated with that
regex. When you call _p_, you pass it a lazy range of tokens that adapts the
input, and _p_ parses the tokens, not the underlying characters. When you
backtrack, you just move back to an earlier token, not an earlier place in the
underlying sequence of characters.
[heading A basic example]
Let's look at an example of how to do token parsing. First, you must include
the lexer header before the parser header.
[tokens_basics_headers]
The inclusion of this optional header is what enables token parsing.
Character parsing ("normal" parsing) is unaffected by this header inclusion
_emdash_ you can always do character parsing.
[important _ctre_ is a header-only library, and it can be included as a single
header. It requires C++20 or later, so _Parser_'s support for token parsing
does as well. _Parser_ uses the single-header version with Unicode support,
`ctre-unicode.hpp`.]
Then, you define a lexer and its tokens.
[tokens_basics_lexer]
Here, we first see three _tok_specs_. Each one consists of an _nttp_ regex
string literal and an _nttp_ token ID; the first one matches `"foo"`, and has
an ID of `0`, etc. _lex_ takes two template parameters. The first parameter
indicates that the value type of the parsed input sequence is `char`. The
second one indicates that the ID-type of all subsequent _tok_specs_ will be
`int`. We create a full lexer by starting with the `lexer<...>` expression,
followed by a piped-together sequence of _tok_specs_.
The final lexer `lexer` has a combined regex string, `"(foo)|(b.*r)|(b.+z)"`.
This string is built up at compile time, and is represented by an _nttp_. It
is the single regex given to _ctre_, which _ctre_ uses to produce a sequence
of matches from it.
`lexer` and `token_spec` are variable templates; they make variables from the
templates _lex_ and _tok_spec_, respectively. They are provided as a
notational convenience, just so you don't have to put `{}` after every lexer
and token spec you write. _lex_ and _tok_spec_ are empty classes. Their
configury is stored in _nttps_.
Next, you create a range of _toks_ from your input. This range of tokens is
what _p_ will parse.
[tokens_basics_input_range]
The input must model `std::ranges::contiguous_range`. This is due to the way
_ctre_ works; it produces a sequence of matches that are convertible to
`std::basic_string_view<CharType>`. In our case, since we are lexing a
sequence of `char`, _ctre_ will produce a sequence of `std::string_view`
matches. Note that the value type/character type we specified for _lex_ above
must match the input sequence's value type/character type, or the program is
ill-formed. Also note that because we are lexing a contiguous range of
characters, you cannot use any of the `boost::parser::as_utf*` range adaptors
when doing token parsing.
Next, you define a parser.
[tokens_basics_parser]
This has the same semantics as the character parsers you've seen in the rest
of the documentation. Each _tok_spec_ has the same interface as a parser, so
it can be used with all the parser combining operations, like `operator>>`.
However, unlike when doing character parsing, when token parsing all the
terminal parsers are restricted to a subset of the terminal parsers that are
available in character parsing (see the full list in the table below). This
is because most of the parsers in _Parser_ parse sequences of characters. For
example, if you used `_i_(42)` above instead of `foo`, the _i_ parser would
try to match two consecutive values from the input sequence, and would expect
them to equal `'4'` and `'2'`, respectively. It would instead see two tokens,
and the comparisons would not even compile.
Finally, you can put everything together in a call to _p_.
[tokens_basics_parse]
As you can see, the parse succeeded, and we got three attributes out of it.
Each attribute has the type `std::string_view`.
[heading Capture groups]
Capture groups are valid regex syntax, but you cannot use them in your
_tok_spec_ regexes. For instance, `bp::token_spec<"(foo)+", 0>` (to match one
or more consecutive `"foo"`s) will compile and run, and you will get garbage
results. _Parser_ relies on the exact number and order of capture groups to
do its token generation. If you want to group a part of your regex, use a
non-capture group, like `"(?:foo)+"`.
[heading Whitespace in token parsing]
Using the parser above, what if we tried to parse the token range `"foo baz
bar" | bp::to_tokens(lexer)` instead? Turns out, we get the same answer. You
cannot use an explicit skipper when parsing tokens. However, parsers are much
simpler when you have a notion of a skipper, especially for whitespace. So,
_lex_ has one built in; it uses `"\\s+"` by default. Whitespace is matched,
but produces no tokens. If you want to change the whitespace/skipper regex,
you can provide it when specifying the lexer. For example, here is how you
would specify the whitespace/skipped tokens to be any sequence of whitespace
characters, or any C++-style trailing comment (`// ...`).
bp::lexer<char, int, "\\s+|\\/\\/.*$">
If whitespace information is important in your parse, simply provide `""` or
the more readable convenience constant `bp::no_ws` to `lexer<>` as the
whitespace regex, and make a regular token that matches whitespace. That way,
you'll see all the whitespace in the sequence of tokens that you parse.
[heading Token attribute types]
The parser we looked at in the initial simple example produced three
`std::string_view`s, one for each token we parsed. However, we may know that
a particular token is meant to match numbers. If this is the case, we can let
_Parser_ know that we expect the token to be interpretable as a particular
type of numeric value. I'm using "numeric" for brevity, but this includes
`bool` as well. For example:
[tokens_attrs]
The attribute types for these tokens are `bool`, `std::string_view`, and
`double`, respectively. `identifier` has attribute type `std::string_view`
because that is the default if you do not specify a type.
A _tok_ is essentially a variant of `std::basic_string_view<CharType>`, `long
long`, and `long double`. The latter two types were selected because they can
fit any value of an integral or floating-point type, respectively. Even
though _tok_ effectively erases the exact type when it is integral or
floating-point, the token parser retains the information of what the exact
type is. This is why `true_false` above has an attribute type of `bool` and
not `long long`.
_ctre_ produces a sequence of substrings. Each token produced by _Parser_
gets its numeric value (if it should have one) by parsing the substring from
_ctre_ with _emdash_ you guessed it _emdash_ a _Parser_ parser. The parser
for `bool` is just _b_; the one for `int` is _i_, etc. The integral-type
parsers all support a radix/base. If you specify an integral value type for
one of your tokens, you can also specify a base, like `bp::token_spec<"\\d+",
int, 16>` to parse hex-encoded `int`s.
Part of the advantage of doing lexing before parsing is that you don't have to
reparse everything over and over again. If the subsequence `"1.23456789"` is
found in the input, you only lex it once. After that, it's already in the
right form as a floating-point number; backtracking will not provoke reparsing
of those ten characters.
[heading Single-character tokens]
Just about any parser above a certain size will have punctuation of some sort
_emdash_ elements of the input, usually a single character, that delimit other
parts of the input, like commas and braces. To make it easier to specify such
tokens, _Parser_ provides _tok_chs_. You can give _tok_chs_ a list of
individual characters, and it will create a separate, single-character regex
for each one, and add it to your lexer. Each such token will have the special
ID _ch_id_.
Note that the single character you provide must be a `char` in the ASCII range
(that is, less than `128`). If you want to use a single character that is
outside the ASCII range, just make a normal _tok_spec_ for it. Here is an
example using _tok_chs_.
[tokens_token_char]
Just like in a character parser, we can use character literals to match the
single-character tokens (`'='` and `';'` in the example above). The character
literals are turned into _ch_ parsers. _ch_ parsers that you explicitly write
may be used as well. They will only match single-character tokens, though
(that is, tokens with the ID _ch_id_).
[heading The differences between parsing characters and parsing tokens]
Even though _ch_ and _str_ (and lots of other character parsers _emdash_ see
the table below) are available when doing token parsing, their semantics are
subtly different when used for token parsing. This is because token parsing
involves parsing chunks of input as tokens, rather than individual characters.
This may sound obvious, but the implications are not. Consider this example.
[tokens_string_in_character_vs_token_parsing]
Why doesn't the token parsing case work? In the character parsing case,
_str_np_ tries to match characters from the input, one at a time; it sees
`'='` followed by `';'`, so it matches. In the token parsing case, this does
not happen. Instead, the input is broken up into two tokens (one for `'='`
and one for `';'`). `_str_np_("=;")` tries to match the first token in its
entirety, but that token is a character token, not a token with a
`std::basic_string_view` attribute. Even if that token did have
a `std::basic_string_view` attribute, it would be `"="`, not `"=;"`, and so the
match would still fail.
So, even though string matching is available using _str_, make sure you
understand that _str_ is looking for 1) a token with a string view attribute,
and 2) a full match of the token's string view against the range provided to
_str_.
_ch_ is also a bit different, since it only matches character tokens that you
make with _tok_chs_. Such tokens have the token ID _ch_id_. _ch_ will
*never* match any other kind of token. This goes for all the character
parsers (_blank_, _punct_, _upper_, etc).
The character class parsers (e.g. _punct_) are also limited in token parsing
vs. their use in character parsing. _tok_chs_ limits characters to the ASCII
range for simplicity, and to discourage parsing of sequences of tokens to find
things that are detectable using _pcre_ directly. In other words, if you need
the full set of punctuation characters, use `"\p{P}"` in one of your token
regexes, rather than trying to parse punctuation characters out of the input
using _punct_. Because _tok_chs_ limits characters to the ASCII range, all
the matching for any character class parser (like _punct_) above the ASCII
range will fail.
[important Though the string and character parsers are available, they're a
bit clunky and should be avoided in most cases. Instead, use the character
handling from the _pcre_ regex language to make the tokens you want. The best
use of string and character parsers in your _Parser_ token parsers is as
literals like `"function"`, `'='`, etc.]
One more important difference between token and character parsing is the
effect that using _lexeme_ and/or _skip_ has. If you use _lexeme_ or _skip_,
you are changing the sequence of tokens that must be in the token cache. As
such, whenever you *enter* or *leave* a _lexeme_ *or* _skip_ directive, the
token cache is flushed. The flushed tokens are everything from the current
token position to the end of the cache. If you write `bp::lexeme[p]`
frequently enough in your parsers, you could be in for some very uneven
performance.
[important Though you may be used to using _lexeme_ and _skip_ in character
parsing, prefer to write explicit token regexes that have equivalent
semantics, but operating during lexing rather than during parsing.]
[heading Parsing tokens with a specific value]
So far, we've only seen examples of parsing for a particular token. Sometimes
we want to match only occurrences of a given token with a particular value,
just like when we write something like `_ch_('a', 'z')` in a character parser.
Just as with _ch_ and most other _Parser_ parsers, you can just add the value
to match in parens after the token, like `true_false(true)` or
`identifier("exact string")`.
[heading Token IDs and diagnostics]
So far, we've only seen `int` used as the token ID type. Any integral type or
enum can be used, though. There are limitations on the values you can provide
for IDs. First, the values must all be nonnegative; negative values are
reserved for use by _Parser_. Second, the values must not exceed `2^23-1`; no
one is likely to have very many unique IDs, and token storage can be reduced a
bit by using 3 bytes for the ID instead of 4.
Using an enum has the advantage of making the code a lot clearer. For
instance:
enum class token_names { foo, bar };
auto const foo = bp::token_spec<"foo", token_names::foo>;
auto const bar = bp::token_spec<"b.r", token_names::bar>;
... reads a lot better than just using IDs like `0` and `1`.
There is another important advantage related to diagnostic messages. Consider
this parse.
constexpr auto lexer = bp::lexer<char, token_names> | foo;
bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo);
Here is what the diagnostic looks like.
[pre
1:0: error: Expected tok<0> here:
bar
^
]
If we added a specific string value we expect, that would be included.
bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo("foo"));
[pre
1:0: error: Expected tok<0>("foo") here:
bar
^
]
Instead of `"tok<N>"`, it might be nice to give the failed expectation a
user-friendly name. In character parsers we usually do this by giving _rs_
user-facing diagnostic text. This makes your parse failures much easier to
understand and correct. However, many _tok_specs_ may already have a nice
name, so why not use it? If you use enumerators for your token IDs, and make
their enumeration streamable, _Parser_ will detect this, and use the streamed
enumerator instead of `"tok<N>"`. Here is what we could have written instead.
enum class printable_tokens { foo, bar };
std::ostream & operator<<(std::ostream & os, printable_tokens tok)
{
switch (tok) {
case printable_tokens::foo: os << "foo"; break;
case printable_tokens::bar: os << "bar"; break;
}
return os;
}
auto const foo = bp::token_spec<"foo", printable_tokens::foo>;
auto const bar = bp::token_spec<"b.*r", printable_tokens::bar>;
constexpr auto lexer = bp::lexer<char, printable_tokens> | foo;
bp::parse("bar" | bp::to_tokens(lexer), bp::eps > foo);
That results in the enumerator being printed instead.
[pre
1:0: error: Expected foo here:
bar
^
]
[important If you provide a streamable enumeration as the token ID type, this
enables the alternate printing behavior described above. If you specify a
particular value for the token parser, that value is printed as the expected
value. So the diagnostic name for `bp::token_spec<"\\d+", 3>(42)` is
`tok<3>(42)` but the name for `bp::token_spec<"\\d+",
printable_tokens::foo>(42)` is just `42` (not `foo`).]
The takeaway here is that you should use a streamable enumeration for your ID
type. It makes your code easier to read, and produces better diagnostics.
[heading Token caching]
Given that I told you earlier that we will make a sequence of tokens and
backtrack within those tokens, you may be wondering where the tokens are
stored. The _tok_v_ (the type created by the range adaptor _to_tok_) uses
internal storage or user-provided external storage to store the tokens as they
are generated. Here is an example of using external storage.
[tokens_caching_simple]
The cache could have been a `boost::container::small_vector<bp::token, N>`, or
even a `static_vector` of appropriate size, to reduce or eliminate memory
allocations.
Note the size of the cache after the parse; it still contains some tokens.
This is a special case of a more general phenomenon: the token cache grows
without bound when there are no expectation points. This is because, without
expectation points, backtracking is unbounded (refer to the _expect_pts_
section to see why). If you can go back arbitrarily far in order to backtrack,
you need to be sure that there will be a token at the place you backtrack to.
However, if you use expectation points, the cache is trimmed. The prefix of
tokens before the expectation point is erased from the token cache.
[tokens_caching_expectation_point]
Note the use of `std::ref()` to pass a reference to `cache`. This is
necessary because _to_tok_ uses `std::bind_back()` (or a workalike in C++17
mode). As with the other binders in `std`, it does not gracefully propagate
bare lvalue references, so you have to use `std::ref()`.
[heading Lexing failures]
Parse failures that fail the top-level parse happen only at expectation
points. Lexing failures that fail the top-level parse can happen at any point
in the input. If there is no token regex that matches the current point of
the input, we cannot continue to lex. Lexing failures are usually caused by
bad input, or failure to specify the correct set of _tok_specs_ to cover all
valid input. However, it may also be that you have written an impossible
_tok_spec_. Consider this one.
constexpr auto bad_token = bp::token_spec<"foo", 0, int>;
This _tok_spec_ can never generate a valid token. It will match `"foo"` in
the input, but then it will try to parse `"foo"` as an `int`, which is
guaranteed to fail.
The takeaway here is that a lexing failure might be due to bad input, but it
can also be the sign of a bug in one or more of your _tok_specs_.
[heading The token parsers]
Many of the parsers that work in character parsing do not work in token
parsing, because they try to parse individual characters from the input.
Token parsing only provides tokens, not characters. This table describes all
the parsers compatible with token parsing.
[table_token_parsers_and_their_semantics]
[heading Directives and token parsing]
One directive that works in character parsing does not work in token parsing
_emdash_ the argument form of _skip_. The argument to _skip_ is a new
skipper, and this cannot be changed in the middle of tokenization. The set of
tokens and their regexes are fixed at compile time. The nullary form of
_skip_ works fine; all it does is re-enable skipping that has been turned off
by _lexeme_.
[heading The token parsing API]
Not all the _p_ and _cbp_ overloads can do token parsing. In particular, the
overloads that take a skipper are precluded, since the skipper must be built
into the lexer itself (see the section above about whitespace handling for
details).
[heading _ctre_ particulars]
There are a few details you might want to know about how _ctre_ works.
_ctre_ uses _pcre_ as its regex grammar.
"Maximum munch" appears not to be the way _ctre_ tokenizes input. For
instance, if you have _tok_spec_ A that matches `"<=="` and _tok_spec_ B that
matches `"<|>|<=|>=|==|!="`, the input characters `"<=="` will be tokenized as
`"<=="` if the lexer includes `A | B`, but will be parsed as `"<"` followed by
`"=="` if the lexer includes `B | A`.
_ctre_ uses `char32_t` for all its compile time strings. If you give it a
regex string literal like `bp::token_spec<"foo", 0>` (that is, an array of
`char`), it will be interpreted in one of two ways. By default, the `char`s
are copied into an array of `char32_t`, unmodified. This is fine if you
provide an ASCII regex, or a regex in a non-Unicode encoding. However, if you
define `CTRE_STRING_IS_UTF8` before including `<boost/parser/lexer.hpp>`, the
array of `char` will be interpreted as UTF-8, and will be transcoded to UTF-32
before being stored in the array of `char32_t`. All the `charN_t` character
types will be interpreted as UTF-N encoded, and will be transcoded to UTF-32
if needed. `wchar_t` is taken to mean UTF-32 *even on Windows*. Again, all
of this transcoding happens at compile time.
[heading Error handling details]
Error handling during token parsing mostly Just Works. That is, you don't
need to know or do anything special just because you are parsing tokens.
However, the error reporting functions all operate at the level of character
input, not tokens. The higher level functions provided in _err_fwd_hpp_ and
_err_hpp_ (like `write_formatted_message()`) simply get the iterators to the
underlying range of input before doing their work. The lower-level functions
provided in _err_fwd_hpp_ and _err_hpp_ (like `find_line_position()`) do not.
Each function's API documentation specifies whether or not it does this
"normalization" to underlying iterators. If you use the lower-level API
directly in your code, you can call one of the overloads of
`normalize_iterators()` to get the underlying iterators in the token parsing
case.
[endsect]
[section Memory Allocation]
_Parser_ seldom allocates memory. The exceptions to this are:

View File

@@ -31,9 +31,9 @@ struct logging_error_handler
// and rethrow. Returning fail fails the top-level parse; returning
// rethrow just re-throws the parse_error exception that got us here in
// the first place.
template<typename Iter, typename Sentinel>
template<typename Iter, typename Sentinel, template<class> class Exception>
bp::error_handler_result
operator()(Iter first, Sentinel last, bp::parse_error<Iter> const & e) const
operator()(Iter first, Sentinel last, Exception<Iter> const & e) const
{
bp::write_formatted_expectation_failure_error_message(
ofs_, filename_, first, last, e);

View File

@@ -12,6 +12,8 @@
#if defined(BOOST_PARSER_DOXYGEN) || BOOST_PARSER_USE_CONCEPTS
#include <boost/parser/lexer_fwd.hpp>
#include <ranges>
@@ -27,14 +29,18 @@ namespace boost { namespace parser {
std::same_as<std::remove_cv_t<T>, char32_t>;
template<typename T>
concept parsable_iter =
std::forward_iterator<T> && code_unit<std::iter_value_t<T>>;
concept token_iter = is_token_v<std::iter_value_t<T>>;
template<typename T>
concept parsable_iter =
(std::forward_iterator<T> && code_unit<std::iter_value_t<T>>) ||
token_iter<T>;
//[ parsable_range_like_concept
//[ parsable_range_concept
template<typename T>
concept parsable_range = std::ranges::forward_range<T> &&
code_unit<std::ranges::range_value_t<T>>;
concept parsable_range = (std::ranges::forward_range<T> &&
code_unit<std::ranges::range_value_t<T>>) ||
detail::is_tokens_view_v<T>;
//]
template<typename T>
@@ -43,7 +49,6 @@ namespace boost { namespace parser {
template<typename T>
concept parsable_range_like = parsable_range<T> || parsable_pointer<T>;
//]
template<typename T>
concept range_like = std::ranges::range<T> || parsable_pointer<T>;

View File

@@ -59,6 +59,12 @@
also defined. */
# define BOOST_PARSER_TRACE_TO_VS_OUTPUT
/** When lexing is enabled, each token contains its position within the
underlying range. To save a bit of space, an `unsigned int` is used for
this. If you parse input sequences longer than 2^32-1 characters, define
`BOOST_PARSER_TOKEN_POSITION_TYPE` to be a larger integral type. */
# define BOOST_PARSER_TOKEN_POSITION_TYPE unsigned int
#else
# ifdef BOOST_PARSER_NO_RUNTIME_ASSERTIONS
@@ -103,6 +109,10 @@
# define BOOST_PARSER_MAX_AGGREGATE_SIZE 25
#endif
// Default the type used to store a token's position to `unsigned int`,
// unless the user has already defined BOOST_PARSER_TOKEN_POSITION_TYPE.
#if !defined(BOOST_PARSER_TOKEN_POSITION_TYPE)
# define BOOST_PARSER_TOKEN_POSITION_TYPE unsigned int
#endif
// VS2019 and VS2017 need conditional constexpr in some places, even in C++17 mode.
#if !defined(_MSC_VER) || 1930 <= _MSC_VER
# define BOOST_PARSER_CONSTEXPR constexpr
@@ -116,4 +126,18 @@
# define BOOST_PARSER_TRACE_OSTREAM std::cout
#endif
// BOOST_PARSER_DIAGNOSTIC_PUSH / BOOST_PARSER_DIAGNOSTIC_POP expand to the
// compiler-specific pragma that pushes / pops the warning (diagnostic) state
// on MSVC, Clang and GCC.  On any other compiler both expand to nothing.
#if defined(_MSC_VER)
# define BOOST_PARSER_DIAGNOSTIC_PUSH __pragma(warning(push))
# define BOOST_PARSER_DIAGNOSTIC_POP __pragma(warning(pop))
#elif defined(__clang_major__)
# define BOOST_PARSER_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
# define BOOST_PARSER_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
#elif defined(__GNUC__)
# define BOOST_PARSER_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
# define BOOST_PARSER_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
#else
// Unknown compiler: no diagnostic control is emitted.
# define BOOST_PARSER_DIAGNOSTIC_PUSH
# define BOOST_PARSER_DIAGNOSTIC_POP
#endif
#endif

View File

@@ -197,6 +197,31 @@ namespace boost { namespace parser { namespace detail::hl {
}
// fold_n
/** Recursive helper for fold_n().  Step `I` evaluates
    `f(state, llong<I>{})`; unless `I` is the last index (`I + 1 == N`),
    the value produced becomes the state handed to step `I + 1`. */
template<std::size_t I, std::size_t N>
struct fold_n_dispatch
{
    template<typename F, typename State>
    constexpr static auto call(State && s, F const & f)
    {
        constexpr bool is_last_step = (I + 1 == N);
        if constexpr (!is_last_step) {
            // Fold in index I, then continue with the remaining indices.
            using next_step = fold_n_dispatch<I + 1, N>;
            return next_step::call(
                f(static_cast<State &&>(s), llong<I>{}), f);
        } else {
            // Last index: its result is the result of the whole fold.
            return f(static_cast<State &&>(s), llong<I>{});
        }
    }
};
/** Left-folds `f` over the indices `0 .. N-1`, starting from the state `s`.
    Step `I` evaluates `f(state, llong<I>{})`, and the value it returns is
    the state passed to step `I + 1`; the value produced by the final step
    is returned.  `N` must be at least 1 (enforced at compile time). */
template<std::size_t N, typename F, typename State>
constexpr auto fold_n(State && s, F const & f)
{
    static_assert(0 < N, "fold_n must operate on sequences of length >= 1");
    // `f` is a const lvalue reference here and the dispatcher takes it as
    // `F const &`, so pass it through unchanged.  Casting it to `F &&` would
    // only strip its constness without changing which overload is called.
    return hl::fold_n_dispatch<0, N>::call((State &&)s, f);
}
// size
template<typename... Args>

View File

@@ -355,6 +355,13 @@ namespace boost { namespace parser { namespace detail {
std::ostream & os,
int components = 0);
template<typename Context, typename TokenSpec, typename Expected>
void print_parser(
Context const & context,
token_parser<TokenSpec, Expected> const & parser,
std::ostream & os,
int components = 0);
enum { trace_indent_factor = 2 };
inline void trace_indent(std::ostream & os, int indent)
@@ -602,29 +609,19 @@ namespace boost { namespace parser { namespace detail {
Context const & context,
flags f,
Attribute const & attr,
std::string name) :
os_(os),
initial_first_(first),
first_(first),
last_(last),
context_(context),
flags_(f),
attr_(attr),
name_(std::move(name))
{
if (!detail::do_trace(flags_))
return;
detail::trace_prefix(os, first_, last_, context_, name_);
}
std::string name);
~scoped_trace_t();
// implemented in printing_impl.hpp
~scoped_trace_t()
template<typename I, typename S>
void impl(I initial_first, I first, S last)
{
if (!detail::do_trace(flags_))
return;
detail::trace_indent(os_, detail::_indent(context_));
if (*context_.pass_) {
os_ << "matched ";
detail::trace_input(os_, initial_first_, first_);
detail::trace_input(os_, initial_first, first);
os_ << "\n";
detail::print_attribute(
os_,
@@ -633,7 +630,7 @@ namespace boost { namespace parser { namespace detail {
} else {
os_ << "no match\n";
}
detail::trace_suffix(os_, first_, last_, context_, name_);
detail::trace_suffix(os_, first, last, context_, name_);
}
std::ostream & os_;

View File

@@ -942,6 +942,100 @@ namespace boost { namespace parser { namespace detail {
context, parser.or_parser_, os, components);
}
#if defined(BOOST_PARSER_TOKEN_PARSER_HPP)
// Writes a description of a token_parser to `os`.
//
// - If `TokenSpec::id` is streamable and `TokenSpec::id_type` is an enum,
//   the output is the expected value when the parser has one, otherwise the
//   streamed id.
// - Otherwise the output is `tok<N>`, where N is the id cast to `int`,
//   followed by the expected value in parentheses when the parser has one.
//
// `context` and `components` are not used in this body.
template<typename Context, typename TokenSpec, typename Expected>
void print_parser(
Context const & context,
token_parser<TokenSpec, Expected> const & parser,
std::ostream & os,
int components)
{
// True only when the parser's `expected_` member has a `value_` to print.
constexpr bool do_print_value = requires { parser.expected_.value_; };
// Prints the expected value: ranges are written inside double quotes, one
// element at a time via text::as_utf8 and detail::print_char(); anything
// else goes through detail::print().  Does nothing if there is no value.
auto print_value = [&] {
if constexpr (do_print_value) {
if constexpr (std::ranges::range<
decltype(parser.expected_.value_)>) {
os << '"';
for (auto c : parser.expected_.value_ | text::as_utf8) {
detail::print_char(os, c);
}
os << '"';
} else {
detail::print(os, parser.expected_.value_);
}
}
};
// Streamable enum id: the value (if any) replaces the id in the output.
if constexpr (requires {
os << TokenSpec::id;
} && std::is_enum_v<typename TokenSpec::id_type>) {
if constexpr (do_print_value) {
print_value();
} else {
os << TokenSpec::id;
}
} else {
// Fallback naming: tok<N>, optionally followed by "(value)".
os << "tok<" << (int)TokenSpec::id << '>';
if constexpr (do_print_value) {
os << '(';
print_value();
os << ')';
}
}
}
#endif
// Stores the tracing state and, when tracing is enabled for `f`, writes the
// trace prefix for this parser.  With a token iterator the prefix is written
// from first_.base() and first_.range_end() rather than from the token
// iterator and sentinel themselves.
template<
    bool DoTrace,
    typename Iter,
    typename Sentinel,
    typename Context,
    typename Attribute>
scoped_trace_t<DoTrace, Iter, Sentinel, Context, Attribute>::scoped_trace_t(
    std::ostream & os,
    Iter & first,
    Sentinel last,
    Context const & context,
    flags f,
    Attribute const & attr,
    std::string name) :
    os_(os),
    initial_first_(first),
    first_(first),
    last_(last),
    context_(context),
    flags_(f),
    attr_(attr),
    name_(std::move(name))
{
    if (detail::do_trace(flags_)) {
        if constexpr (!is_token_iter_v<Iter>) {
            detail::trace_prefix(os, first_, last_, context_, name_);
        } else {
            detail::trace_prefix(
                os, first_.base(), first_.range_end(), context_, name_);
        }
    }
}
// Hands the closing half of the trace to impl().  With a token iterator,
// impl() receives first_.range_begin(), first_.base() and
// first_.range_end(); otherwise it receives the stored iterators and
// sentinel unchanged.
template<
    bool DoTrace,
    typename Iter,
    typename Sentinel,
    typename Context,
    typename Attribute>
scoped_trace_t<DoTrace, Iter, Sentinel, Context, Attribute>::
    ~scoped_trace_t()
{
    if constexpr (!is_token_iter_v<Iter>) {
        impl(initial_first_, first_, last_);
    } else {
        impl(first_.range_begin(), first_.base(), first_.range_end());
    }
}
}}}
#endif

View File

@@ -693,12 +693,14 @@ namespace boost::parser::detail { namespace text {
using T = detail::remove_cv_ref_t<R>;
if constexpr (forward_range_v<T>) {
auto unpacked =
boost::parser::detail::text::unpack_iterator_and_sentinel(detail::begin(r), detail::end(r));
boost::parser::detail::text::unpack_iterator_and_sentinel(
detail::begin(r), detail::end(r));
if constexpr (is_bounded_array_v<T>) {
constexpr auto n = std::extent_v<T>;
if (n && !r[n - 1])
--unpacked.last;
return BOOST_PARSER_DETAIL_TEXT_SUBRANGE(unpacked.first, unpacked.last);
return BOOST_PARSER_DETAIL_TEXT_SUBRANGE(
unpacked.first, unpacked.last);
} else if constexpr (
!std::is_same_v<decltype(unpacked.first), iterator_t<R>> ||
!std::is_same_v<decltype(unpacked.last), sentinel_t<R>>) {

View File

@@ -31,7 +31,7 @@ namespace boost { namespace parser {
}
/** Returns the `line_position` for `it`, counting lines from the
beginning of the input `first`. */
beginning of the input `first`. Requires non-token iterators. */
template<typename Iter>
line_position<Iter> find_line_position(Iter first, Iter it)
{
@@ -57,7 +57,7 @@ namespace boost { namespace parser {
}
/** Returns the iterator to the end of the line in which `it` is
found. */
found. Requires non-token iterators. */
template<typename Iter, typename Sentinel>
Iter find_line_end(Iter it, Sentinel last)
{
@@ -73,13 +73,16 @@ namespace boost { namespace parser {
std::ostream & write_formatted_message(
std::ostream & os,
std::string_view filename,
Iter first,
Iter it,
Sentinel last,
Iter first_,
Iter it_,
Sentinel last_,
std::string_view message,
int64_t preferred_max_line_length,
int64_t max_after_caret)
{
auto [first, it, last] =
parser::normalize_iterators(first_, it_, last_);
if (!filename.empty())
os << filename << ':';
auto const position = parser::find_line_position(first, it);
@@ -118,13 +121,15 @@ namespace boost { namespace parser {
std::ostream & write_formatted_message(
std::ostream & os,
std::wstring_view filename,
Iter first,
Iter it,
Sentinel last,
Iter first_,
Iter it_,
Sentinel last_,
std::string_view message,
int64_t preferred_max_line_length,
int64_t max_after_caret)
{
auto [first, it, last] =
parser::normalize_iterators(first_, it_, last_);
auto const r = filename | parser::detail::text::as_utf8;
std::string s(r.begin(), r.end());
return parser::write_formatted_message(
@@ -139,23 +144,24 @@ namespace boost { namespace parser {
}
#endif
template<typename Iter, typename Sentinel>
template<typename Iter, typename Sentinel, template<class> class Exception>
std::ostream & write_formatted_expectation_failure_error_message(
std::ostream & os,
std::string_view filename,
Iter first,
Sentinel last,
parse_error<Iter> const & e,
Iter first_,
Sentinel last_,
Exception<Iter> const & e,
int64_t preferred_max_line_length,
int64_t max_after_caret)
{
std::string message = "error: Expected ";
message += e.what();
auto [first, it, last] = parser::normalize_iterators(first_, e, last_);
return parser::write_formatted_message(
os,
filename,
first,
e.iter,
it,
last,
message,
preferred_max_line_length,
@@ -163,13 +169,13 @@ namespace boost { namespace parser {
}
#if defined(_MSC_VER)
template<typename Iter, typename Sentinel>
template<typename Iter, typename Sentinel, template<class> class Exception>
std::ostream & write_formatted_expectation_failure_error_message(
std::ostream & os,
std::wstring_view filename,
Iter first,
Sentinel last,
parse_error<Iter> const & e,
Exception<Iter> const & e,
int64_t preferred_max_line_length,
int64_t max_after_caret)
{
@@ -180,6 +186,35 @@ namespace boost { namespace parser {
}
#endif
namespace detail {
    // Shared implementation of the normalize_iterators() overloads.  For a
    // token iterator, all three results are taken from `it`
    // (range_begin(), base(), range_end()); `first` and `last` are ignored.
    // For any other iterator the arguments are returned unchanged.
    template<typename I, typename S>
    auto normalize_iterators_impl(I first, I it, S last)
    {
        if constexpr (!detail::is_token_iter_v<I>) {
            return std::tuple(first, it, last);
        } else {
            return std::tuple(it.range_begin(), it.base(), it.range_end());
        }
    }
}
// Overload taking the current position directly.
template<typename I, typename S>
auto normalize_iterators(I first, I it, S last)
{
    auto result = detail::normalize_iterators_impl(first, it, last);
    return result;
}
// Overload taking the current position from the iterator stored in a
// parse_error.
template<typename I, typename S>
auto normalize_iterators(I first, parse_error<I> e, S last)
{
    auto const & where = e.iter;
    return detail::normalize_iterators_impl(first, where, last);
}
// Overload taking the current position from the iterator stored in a
// lex_error.
template<typename I, typename S>
auto normalize_iterators(I first, lex_error<I> e, S last)
{
    auto const & where = e.iter;
    return detail::normalize_iterators_impl(first, where, last);
}
/** An error handler that allows users to supply callbacks to handle the
reporting of warnings and errors. The reporting of errors and/or
warnings can be suppressed by supplying one or both
@@ -211,9 +246,13 @@ namespace boost { namespace parser {
filename_.assign(r.begin(), r.end());
}
#endif
template<typename Iter, typename Sentinel>
template<
typename Iter,
typename Sentinel,
template<class>
class Exception>
error_handler_result
operator()(Iter first, Sentinel last, parse_error<Iter> const & e) const
operator()(Iter first, Sentinel last, Exception<Iter> const & e) const
{
if (error_) {
std::stringstream ss;
@@ -260,13 +299,15 @@ namespace boost { namespace parser {
std::string filename_;
};
/** An error handler that just re-throws any exception generated by the
parse. */
struct rethrow_error_handler
{
template<typename Iter, typename Sentinel>
template<
typename Iter,
typename Sentinel,
template<class>
class Exception>
error_handler_result
operator()(Iter first, Sentinel last, parse_error<Iter> const & e) const
operator()(Iter first, Sentinel last, Exception<Iter> const & e) const
{
return error_handler_result::rethrow;
}
@@ -288,8 +329,6 @@ namespace boost { namespace parser {
};
#if defined(_MSC_VER) || defined(BOOST_PARSER_DOXYGEN)
/** An error handler that prints to the Visual Studio debugger via calls
to `OutputDebugString()`. */
struct vs_output_error_handler : stream_error_handler
{
vs_output_error_handler() :
@@ -309,9 +348,9 @@ namespace boost { namespace parser {
// implementations
template<typename Iter, typename Sentinel>
template<typename Iter, typename Sentinel, template<class> class Exception>
error_handler_result default_error_handler::operator()(
Iter first, Sentinel last, parse_error<Iter> const & e) const
Iter first, Sentinel last, Exception<Iter> const & e) const
{
parser::write_formatted_expectation_failure_error_message(
std::cerr, "", first, last, e);
@@ -343,9 +382,9 @@ namespace boost { namespace parser {
diagnose(kind, message, context, parser::_where(context).begin());
}
template<typename Iter, typename Sentinel>
template<typename Iter, typename Sentinel, template<class> class Exception>
error_handler_result stream_error_handler::operator()(
Iter first, Sentinel last, parse_error<Iter> const & e) const
Iter first, Sentinel last, Exception<Iter> const & e) const
{
std::ostream * os = err_os_;
if (!os)

View File

@@ -24,10 +24,29 @@ namespace boost { namespace parser {
template<typename Iter>
struct parse_error : std::runtime_error
{
parse_error(Iter it, std::string const & msg) :
runtime_error(msg), iter(it)
parse_error(Iter it, std::string msg) :
runtime_error(""), message(msg), iter(it)
{}
char const * what() const noexcept override { return message.c_str(); }
std::string message;
Iter iter;
};
/** Exception thrown when a lexing error is encountered.  `iter` is an
    iterator to the point of failure; `message` holds the description of the
    value expected at that point, and is what `what()` returns. */
template<typename Iter>
struct lex_error : std::runtime_error
{
    lex_error(Iter it, std::string msg) :
        std::runtime_error(""), message(msg), iter(it)
    {}

    // The base class is constructed with an empty string; the text lives in
    // `message`, so it is reported from there.
    char const * what() const noexcept override
    {
        return message.c_str();
    }

    std::string message;
    Iter iter;
};
@@ -42,7 +61,7 @@ namespace boost { namespace parser {
};
/** Writes a formatted message (meaning prefixed with the file name, line,
and column number) to `os`. */
and column number) to `os`. Normalizes token iterators as needed. */
template<typename Iter, typename Sentinel>
std::ostream & write_formatted_message(
std::ostream & os,
@@ -56,7 +75,8 @@ namespace boost { namespace parser {
#if defined(_MSC_VER) || defined(BOOST_PARSER_DOXYGEN)
/** Writes a formatted message (meaning prefixed with the file name, line,
and column number) to `os`. This overload is Windows-only. */
and column number) to `os`. Normalizes token iterators as needed.
This overload is Windows-only. */
template<typename Iter, typename Sentinel>
std::ostream & write_formatted_message(
std::ostream & os,
@@ -70,32 +90,59 @@ namespace boost { namespace parser {
#endif
/** Writes a formatted parse-expectation failure (meaning prefixed with
the file name, line, and column number) to `os`. */
template<typename Iter, typename Sentinel>
the file name, line, and column number) to `os`. Normalizes token
iterators as needed. */
template<typename Iter, typename Sentinel, template<class> class Exception>
std::ostream & write_formatted_expectation_failure_error_message(
std::ostream & os,
std::string_view filename,
Iter first,
Sentinel last,
parse_error<Iter> const & e,
Exception<Iter> const & e,
int64_t preferred_max_line_length = 80,
int64_t max_after_caret = 40);
#if defined(_MSC_VER) || defined(BOOST_PARSER_DOXYGEN)
/** Writes a formatted parse-expectation failure (meaning prefixed with
the file name, line, and column number) to `os`. This overload is
Windows-only. */
template<typename Iter, typename Sentinel>
the file name, line, and column number) to `os`. Normalizes token
iterators as needed. This overload is Windows-only. */
template<typename Iter, typename Sentinel, template<class> class Exception>
std::ostream & write_formatted_expectation_failure_error_message(
std::ostream & os,
std::wstring_view filename,
Iter first,
Sentinel last,
parse_error<Iter> const & e,
Exception<Iter> const & e,
int64_t preferred_max_line_length = 80,
int64_t max_after_caret = 40);
#endif
/** Returns a tuple of three iterators (corresponding to `first`, `curr`,
and `last`) that are suitable for use in the other error handling
functions, many of which require iterators into the underlying sequence
being parsed. For non-token parsing cases, this is effectively a
no-op; the given iterators are simply returned as-is. */
template<typename I, typename S>
auto normalize_iterators(I first, I curr, S last);
/** Returns a tuple of three iterators (corresponding to `first`, the
iterator captured in `e`, and `last`) that are suitable for use in the
other error handling functions, many of which require iterators into
the underlying sequence being parsed. For non-token parsing cases,
this is effectively a no-op; the given iterators are simply returned
as-is. */
template<typename I, typename S>
auto normalize_iterators(I first, parse_error<I> e, S last);
/** Returns a tuple of three iterators (corresponding to `first`, the
iterator captured in `e`, and `last`) that are suitable for use in the
other error handling functions, many of which require iterators into
the underlying sequence being parsed. For non-token parsing cases,
this is effectively a no-op; the given iterators are simply returned
as-is. */
template<typename I, typename S>
auto normalize_iterators(I first, lex_error<I> e, S last);
/** The kinds of diagnostics that can be handled by an error handler. */
enum class diagnostic_kind {
error, /// An error diagnostic.
@@ -109,12 +156,16 @@ namespace boost { namespace parser {
{
constexpr default_error_handler() = default;
/** Handles a `parse_error` exception thrown during parsing. A
formatted parse-expectation failure is printed to `std::cerr`.
Always returns `error_handler_result::fail`. */
template<typename Iter, typename Sentinel>
error_handler_result operator()(
Iter first, Sentinel last, parse_error<Iter> const & e) const;
/** Handles a `parse_error` or `lex_error` exception thrown during
parsing/lexing. A formatted parse-expectation failure is printed
to `std::cerr`. Always returns `error_handler_result::fail`. */
template<
typename Iter,
typename Sentinel,
template<class>
class Exception>
error_handler_result
operator()(Iter first, Sentinel last, Exception<Iter> const & e) const;
/** Prints `message` to `std::cerr`. The diagnostic is printed with
the given `kind`, indicating the location as being at `it`. This
@@ -191,9 +242,13 @@ namespace boost { namespace parser {
formatted parse-expectation failure is printed to `*err_os_` when
`err_os_` is non-null, or `std::cerr` otherwise. Always returns
`error_handler_result::fail`. */
template<typename Iter, typename Sentinel>
template<
typename Iter,
typename Sentinel,
template<class>
class Exception>
error_handler_result
operator()(Iter first, Sentinel last, parse_error<Iter> const & e) const;
operator()(Iter first, Sentinel last, Exception<Iter> const & e) const;
/** Let `std::ostream * s = kind == diagnostic_kind::error ? err_os_ :
warn_os_`; prints `message` to `*s` when `s` is non-null, or
@@ -225,6 +280,16 @@ namespace boost { namespace parser {
std::ostream * warn_os_;
};
/** An error handler that just re-throws any exception generated by the
parse. */
struct rethrow_error_handler;
#if defined(_MSC_VER) || defined(BOOST_PARSER_DOXYGEN)
/** An error handler that prints to the Visual Studio debugger via calls
to `OutputDebugString()`. */
struct vs_output_error_handler;
#endif
}}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,36 @@
// Copyright (C) 2024 T. Zachary Laine
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_PARSER_LEXER_FWD_HPP
#define BOOST_PARSER_LEXER_FWD_HPP
#include <ranges>
#include <vector>
namespace boost { namespace parser {
/** A `std::views`-compatible view that provides the tokens from the given
contiguous range, using the given lexer and optional token cache. You
should typically not need to use this type directly; use
`boost::parser::to_tokens` instead. */
template<
std::ranges::contiguous_range V,
typename Lexer,
typename TokenCache = std::vector<typename Lexer::token_type>>
requires std::ranges::view<V>
struct tokens_view;
namespace detail {
/** `true` iff `T` is a specialization of `tokens_view`; `false` for
    every other type. */
template<typename T>
constexpr bool is_tokens_view_v = false;
// Partial specialization selected for any
// `tokens_view<V, Lexer, TokenCache>`.
template<typename V, typename Lexer, typename TokenCache>
constexpr bool is_tokens_view_v<tokens_view<V, Lexer, TokenCache>> =
true;
}
}}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -68,6 +68,32 @@ namespace boost { namespace parser {
return BOOST_PARSER_SUBRANGE(ptr, detail::text::null_sentinel);
}
/** The token ID used for whitespace tokens. */
inline constexpr int ws_id = -1000000;
/** The token ID used for single-character tokens. */
inline constexpr int character_id = -2000000;
#ifdef BOOST_PARSER_DOXYGEN
/** A type trait that evaluates to `true` iff `T` is a specialization of
`boost::parser::token`. */
template<typename T>
constexpr bool is_token_v = detail::foo;
#else
template<typename CharType>
struct token;
template<typename T>
constexpr bool is_token_v = false;
template<typename CharType>
constexpr bool is_token_v<token<CharType>> = true;
#endif
namespace detail {
template<typename T>
constexpr bool is_optional_v = enable_optional<T>;
@@ -147,6 +173,18 @@ namespace boost { namespace parser {
{};
struct upper_case_chars
{};
/** A tag-like matcher whose `matches_value()` ignores its argument and
    always returns `true`, i.e. it accepts every value. */
struct any_token_value
{
template<typename T>
bool matches_value(T) const
{
return true;
}
};
template<typename I, typename Context>
struct scoped_lexeme;
}
/** Repeats the application of another parser `p` of type `Parser`,
@@ -428,6 +466,20 @@ namespace boost { namespace parser {
template<typename T>
struct float_parser;
/** A tag type used to represent a value type that is any specialization
of `std::basic_string_view`. Which specialization is used depends on
the input. */
struct string_view_tag
{};
/** Matches a token from the input with ID `TokenSpec::id`. Fails on any
other input. The parse will also fail if `Expected` is anything but
`detail::nope` (which it is by default), and
`expected_.matches(attr, context)` is not `true` for the produced
attribute `attr` and the current parse context `context`. Used in token
parsing only. */
template<typename TokenSpec, typename Expected>
struct token_parser;
/** Applies at most one of the parsers in `OrParser`. If `switch_value_`
matches one or more of the values in the parsers in `OrParser`, the
first such parser is applied, and the success or failure and attribute

View File

@@ -0,0 +1,305 @@
// Copyright (C) 2024 T. Zachary Laine
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_PARSER_TOKEN_PARSER_HPP
#define BOOST_PARSER_TOKEN_PARSER_HPP
#if !defined(BOOST_PARSER_PARSER_HPP) || !defined(BOOST_PARSER_LEXER_HPP)
#error "token_parser.hpp must be included after lexer.hpp and parser.hpp."
#endif
#include <boost/parser/parser_fwd.hpp>
#include <boost/parser/concepts.hpp>
#include <boost/parser/error_handling.hpp>
#include <algorithm>
namespace boost { namespace parser {
namespace detail {
/** Extracts the value stored in `tok` as an `AttributeType`.

    Which accessor is consulted depends on `AttributeType`:
    floating-point types read the token's `long double`, integral types
    read its `long long` (explicitly converted to `AttributeType`), and
    every other type reads its string view.  Returns `std::nullopt`
    when `tok` does not hold the kind of value being asked for. */
template<typename AttributeType, typename CharType>
std::optional<AttributeType> token_as(token<CharType> tok)
{
    if constexpr (std::is_floating_point_v<AttributeType>) {
        if (!tok.has_long_double())
            return std::nullopt;
        return tok.get_long_double();
    } else if constexpr (std::is_integral_v<AttributeType>) {
        if (!tok.has_long_long())
            return std::nullopt;
        return AttributeType(tok.get_long_long());
    } else {
        if (!tok.has_string_view())
            return std::nullopt;
        return tok.get_string_view();
    }
}
/** Stores an expected value for a token parser.  `matches()` reports
    whether a candidate value compares equal to the stored value after
    the stored value has been passed through `detail::resolve()` with
    the current parse context. */
template<typename Expected>
struct token_with_value
{
    explicit constexpr token_with_value(Expected expected) :
        expected_(expected)
    {}

    /** Returns `true` iff `candidate` equals the resolved expected
        value. */
    template<typename Value, typename Ctx>
    bool matches(Value candidate, Ctx const & ctx) const
    {
        return candidate == detail::resolve(ctx, expected_);
    }

    Expected expected_;
};
/** Stores an expected range of characters for a token parser.
    `matches()` compares a token's string view against that range,
    element by element. */
template<typename Subrange>
struct token_with_string_view
{
    explicit constexpr token_with_string_view(Subrange subrange) :
        subrange_(subrange)
    {}

    /** Returns `true` iff `value` and the stored range are equal.  When
        `CharType` is `char` the elements are compared directly;
        otherwise both sides are first adapted with
        `detail::text::as_utf32`. */
    template<typename CharType, typename Context>
    bool matches(
        std::basic_string_view<CharType> value, Context const &) const
    {
        auto const same_char = [](auto lhs, auto rhs) {
            return cast_char(lhs) == cast_char(rhs);
        };
        auto const actual =
            make_subrange<CharType>(value.begin(), value.end());
        auto const wanted =
            make_subrange<CharType>(subrange_.begin(), subrange_.end());
        return std::ranges::equal(actual, wanted, same_char);
    }

    /** Converts a plain `char` to `unsigned char` before comparison;
        every other type is returned unchanged. */
    template<typename T>
    static auto cast_char(T c)
    {
        if constexpr (!std::same_as<T, char>) {
            return c;
        } else {
            return (unsigned char)c;
        }
    }

    /** Builds the range `[f, l)`, adapted with
        `detail::text::as_utf32` unless `CharType` is `char`. */
    template<typename CharType, typename I, typename S>
    static auto make_subrange(I f, S l)
    {
        auto range = BOOST_PARSER_SUBRANGE(f, l);
        if constexpr (!std::is_same_v<CharType, char>) {
            return range | detail::text::as_utf32;
        } else {
            return range;
        }
    }

    Subrange subrange_;
};
}
#ifndef BOOST_PARSER_DOXYGEN
template<typename TokenSpec, typename Expected>
struct token_parser
{
using token_spec = TokenSpec;
template<typename Iter>
using attribute_type = std::conditional_t<
std::same_as<typename token_spec::value_type, string_view_tag>,
std::basic_string_view<
typename detail::iter_value_t<Iter>::char_type>,
typename token_spec::value_type>;
constexpr token_parser() = default;
constexpr token_parser(Expected expected) : expected_(expected) {}
template<
typename Iter,
typename Sentinel,
typename Context,
typename SkipParser>
auto call(
Iter & first,
Sentinel last,
Context const & context,
SkipParser const & skip,
detail::flags flags,
bool & success) const -> attribute_type<Iter>
{
attribute_type<Iter> retval;
call(first, last, context, skip, flags, success, retval);
return retval;
}
template<
typename Iter,
typename Sentinel,
typename Context,
typename SkipParser,
typename Attribute>
void call(
Iter & first,
Sentinel last,
Context const & context,
SkipParser const & skip,
detail::flags flags,
bool & success,
Attribute & retval) const
{
using value_type = std::remove_cvref_t<decltype(*first)>;
static_assert(
is_token_v<value_type>,
"token_parser can only be used when parsing sequences of "
"tokens.");
[[maybe_unused]] auto _ = detail::scoped_trace(
*this, first, last, context, flags, retval);
if (first == last) {
success = false;
return;
}
value_type const x = *first;
if (x.id() != (int)token_spec::id) {
success = false;
return;
}
constexpr bool use_expected = !std::same_as<Expected, detail::nope>;
if (use_expected || detail::gen_attrs(flags)) {
auto opt_attr = detail::token_as<attribute_type<Iter>>(x);
if constexpr (use_expected) {
if (!opt_attr || !expected_.matches(*opt_attr, context)) {
success = false;
return;
}
}
if (detail::gen_attrs(flags))
detail::assign(retval, *opt_attr);
}
++first;
}
/** Returns a `parser_interface` containing a `token_parser` that
matches `value`. */
template<typename T>
requires(!parsable_range_like<T>)
constexpr auto operator()(T value) const noexcept
{
BOOST_PARSER_ASSERT(
(detail::is_nope_v<Expected> &&
"If you're seeing this, you tried to chain calls on one of "
"your token_spec's, like 'my_token_spec(id1)(id2)'. Quit "
"it!'"));
return parser_interface(
token_parser<TokenSpec, detail::token_with_value<T>>(
detail::token_with_value(std::move(value))));
}
/** Returns a `parser_interface` containing a `token_parser` that
matches the range `r`. If the token being matched during the
parse has a `char_type` of `char8_t`, `char16_t`, or `char32_t`,
the elements of `r` are transcoded from their presumed encoding to
UTF-32 during the comparison. Otherwise, the character being
matched is directly compared to the elements of `r`. */
template<parsable_range_like R>
constexpr auto operator()(R && r) const noexcept
{
BOOST_PARSER_ASSERT(
((!std::is_rvalue_reference_v<R &&> ||
!detail::is_range<detail::remove_cv_ref_t<R>>) &&
"It looks like you tried to pass an rvalue range to "
"token_spec(). Don't do that, or you'll end up with dangling "
"references."));
BOOST_PARSER_ASSERT(
(detail::is_nope_v<Expected> &&
"If you're seeing this, you tried to chain calls on "
"token_spec, like 'token_spec(char-set)(char-set)'. Quit "
"it!'"));
auto expected =
detail::token_with_string_view{make_expected_range((R &&)r)};
return parser_interface(
token_parser<token_spec, decltype(expected)>(expected));
}
template<typename R>
static constexpr auto make_expected_range(R && r)
{
using T = detail::remove_cv_ref_t<R>;
if constexpr (std::is_bounded_array_v<T>) {
constexpr auto n = std::extent_v<T>;
auto const offset = n && !r[n - 1] ? 1 : 0;
return BOOST_PARSER_SUBRANGE(
std::ranges::begin(r), std::ranges::end(r) - offset);
} else {
return BOOST_PARSER_SUBRANGE(
std::ranges::begin(r), std::ranges::end(r));
}
}
// TODO: Consider adding a special string_view-like type that can be
// passed to the range overload above. It would be based on
// adobe::name_t. When comparing it to a tokens' string_view, if it
// matches, it would replace the token's string_view, so that
// subsequent comparisons are O(1) in the length of the string.
Expected expected_;
};
#endif
/** A variable template that defines a token parser associated with
`boost::parser::token_spec_t<Regex, ID, ValueType, Base>`. This token
parser can be used to specify a lexer, and may also be used in
parsers. `ValueType` defaults to `string_view_tag` and `Base` defaults
to `10`. The contained `token_parser` uses `detail::nope` as its
expected value, so it matches on token ID alone. */
template<
ctll::fixed_string Regex,
auto ID,
typename ValueType = string_view_tag,
int Base = 10>
constexpr parser_interface token_spec{
token_parser<token_spec_t<Regex, ID, ValueType, Base>, detail::nope>()};
#ifndef BOOST_PARSER_DOXYGEN
// Extends this lexer with one more token spec.  The result is a new
// `lexer_t` whose regex, ID list and spec list each have the incoming
// spec's entry appended.  The ID type of the incoming spec must be the
// same as this lexer's `ID`.
template<
    typename CharType,
    typename ID,
    ctll::fixed_string WsStr,
    ctll::fixed_string RegexStr,
    detail::nttp_array IDs,
    detail::nttp_array Specs>
template<
    ctll::fixed_string RegexStr2,
    auto ID2,
    typename ValueType,
    int Base>
constexpr auto
lexer_t<CharType, ID, WsStr, RegexStr, IDs, Specs>::operator|(
    parser_interface<token_parser<
        token_spec_t<RegexStr2, ID2, ValueType, Base>,
        detail::nope>> const &) const
{
    using appended_spec_t = token_spec_t<RegexStr2, ID2, ValueType, Base>;

    static_assert(
        std::same_as<ID, decltype(ID2)>,
        "All id_types must be the same for all token_specs.");

    constexpr auto combined_regex =
        detail::wrap_escape_concat<regex_str, RegexStr2>();
    constexpr auto combined_ids = IDs.template append<(int)ID2>();
    constexpr auto combined_specs =
        Specs.template append<detail::parse_spec_for<appended_spec_t>()>();

    return lexer_t<
        CharType,
        ID,
        WsStr,
        combined_regex,
        combined_ids,
        combined_specs>{};
}
#endif
}}
#endif

View File

@@ -22,7 +22,7 @@ namespace boost::parser {
std::declval<
parse_context<false, false, I, S, default_error_handler>>(),
ws,
detail::default_flags(),
flags(uint32_t(flags::gen_attrs) | uint32_t(flags::use_skip)),
std::declval<bool &>()));
template<typename R, typename Parser>
using range_attr_t = attr_type<iterator_t<R>, sentinel_t<R>, Parser>;

View File

@@ -6,6 +6,23 @@ enable_testing()
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -j4 -C ${CMAKE_CFG_INTDIR})
if (CXX_STD GREATER_EQUAL 20)
include(FetchContent)
FetchContent_Declare(
ctre
URL https://raw.githubusercontent.com/hanickadot/compile-time-regular-expressions/refs/heads/main/single-header/ctre-unicode.hpp
DOWNLOAD_NO_EXTRACT true
)
FetchContent_MakeAvailable(ctre)
set(ctre_include_dir ${CMAKE_BINARY_DIR}/_deps/ctre-src)
add_library(ctre_single_header INTERFACE)
target_include_directories(ctre_single_header INTERFACE ${ctre_include_dir})
else()
add_library(ctre_single_header INTERFACE)
endif()
##################################################
# Parser tests
##################################################
@@ -31,6 +48,7 @@ add_test(NAME parser_api COMMAND parser_api)
add_executable(
compile_tests
compile_include_lexer_parser.cpp
compile_tests_main.cpp
compile_attribute.cpp
compile_seq_attribute.cpp
@@ -39,12 +57,12 @@ add_executable(
compile_all_t.cpp
)
set_property(TARGET compile_tests PROPERTY CXX_STANDARD ${CXX_STD})
target_link_libraries(compile_tests parser boost)
target_link_libraries(compile_tests parser boost ctre_single_header)
macro(add_test_executable name)
add_executable(${name} ${name}.cpp)
set_property(TARGET ${name} PROPERTY CXX_STANDARD ${CXX_STD})
target_link_libraries(${name} parser boost ${link_flags})
target_link_libraries(${name} parser boost ctre_single_header ${link_flags})
if (MSVC)
target_compile_options(${name} PRIVATE /source-charset:utf-8 /bigobj)
elseif (USE_ASAN OR USE_UBSAN)
@@ -82,6 +100,14 @@ add_test_executable(parser_seq_permutations_1)
add_test_executable(parser_seq_permutations_2)
add_test_executable(parser_or_permutations_1)
add_test_executable(parser_or_permutations_2)
if (CXX_STD GREATER_EQUAL 20)
add_test_executable(lexer)
add_test_executable(lexer_adobe_files)
add_test_executable(lexer_and_parser)
add_test_executable(lexer_and_parser_api)
add_test_executable(lexer_and_parser_terminals)
add_test_executable(lexer_and_parser_symbol_table)
endif()
if (MSVC)
add_executable(vs_output_tracing tracing.cpp)

76
test/adobe_lexer.hpp Normal file
View File

@@ -0,0 +1,76 @@
/**
* Copyright (C) 2024 T. Zachary Laine
*
* Distributed under the Boost Software License, Version 1.0. (See
* accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
#ifndef BOOST_PARSER_TEST_ADOBE_LEXER
#define BOOST_PARSER_TEST_ADOBE_LEXER
#include <boost/parser/parser.hpp>
#include <boost/parser/lexer.hpp>
namespace bp = boost::parser;
enum class adobe_tokens {
keyword_true_false,
keyword_empty,
identifier,
lead_comment,
trail_comment,
quoted_string,
number,
eq_op,
rel_op,
mul_op,
define,
or_,
and_
};
constexpr auto true_false =
bp::token_spec<"true|false", adobe_tokens::keyword_true_false, bool>;
constexpr auto empty = bp::token_spec<"empty", adobe_tokens::keyword_empty>;
constexpr auto identifier =
bp::token_spec<"[a-zA-Z]\\w*", adobe_tokens::identifier>;
constexpr auto lead_comment = bp::token_spec<
"\\/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*\\/",
adobe_tokens::lead_comment>;
constexpr auto trail_comment =
bp::token_spec<"\\/\\/.*$", adobe_tokens::trail_comment>;
constexpr auto quoted_string =
bp::token_spec<"\\\"[^\\\"]*\\\"|'[^']*'", adobe_tokens::quoted_string>;
constexpr auto number =
bp::token_spec<"\\d+(?:\\.\\d*)?", adobe_tokens::number, double>;
constexpr auto eq_op = bp::token_spec<"==|!=", adobe_tokens::eq_op>;
constexpr auto define = bp::token_spec<"<==", adobe_tokens::define>;
// NOTE(review): the one-character alternatives `<` and `>` are listed
// before `<=` and `>=`.  If the regex engine tries alternatives left to
// right, `<=` and `>=` may never be lexed as a single rel_op token --
// confirm against the engine's alternation semantics and the tests.
constexpr auto rel_op = bp::token_spec<"<|>|<=|>=", adobe_tokens::rel_op>;
constexpr auto mul_op = bp::token_spec<"\\*|\\/|%", adobe_tokens::mul_op>;
constexpr auto or_ = bp::token_spec<"\\|\\|", adobe_tokens::or_>;
constexpr auto and_ = bp::token_spec<"&&", adobe_tokens::and_>;
constexpr auto adobe_lexer = bp::lexer<char, adobe_tokens> | true_false |
empty | identifier | lead_comment | trail_comment |
quoted_string | number | eq_op | define | rel_op |
mul_op | or_ | and_ |
bp::token_chars<
'=',
'+',
'-',
'!',
'?',
':',
'.',
',',
'(',
')',
'[',
']',
'{',
'}',
'@',
';'>;
#endif

View File

@@ -65,7 +65,7 @@ void compile_attribute_non_unicode()
using attr_t = decltype(parse(null_term(r), parser));
static_assert(std::is_same_v<attr_t, std::optional<char>>);
static_assert(std::is_same_v<
attribute_t<decltype(r), decltype(parser)>,
attribute_t<decltype(null_term(r)), decltype(parser)>,
char>);
}
{
@@ -73,7 +73,7 @@ void compile_attribute_non_unicode()
using attr_t = decltype(parse(null_term(r), parser));
static_assert(std::is_same_v<attr_t, std::optional<std::string>>);
static_assert(std::is_same_v<
attribute_t<decltype(r), decltype(parser)>,
attribute_t<decltype(null_term(r)), decltype(parser)>,
std::string>);
}
{
@@ -81,7 +81,7 @@ void compile_attribute_non_unicode()
using attr_t = decltype(parse(null_term(r), parser));
static_assert(std::is_same_v<attr_t, std::optional<std::string>>);
static_assert(std::is_same_v<
attribute_t<decltype(r), decltype(parser)>,
attribute_t<decltype(null_term(r)), decltype(parser)>,
std::string>);
}
{
@@ -89,7 +89,7 @@ void compile_attribute_non_unicode()
using attr_t = decltype(parse(null_term(r), parser));
static_assert(std::is_same_v<attr_t, std::optional<std::string>>);
static_assert(std::is_same_v<
attribute_t<decltype(r), decltype(parser)>,
attribute_t<decltype(null_term(r)), decltype(parser)>,
std::string>);
}
{
@@ -97,7 +97,7 @@ void compile_attribute_non_unicode()
using attr_t = decltype(parse(null_term(r), parser));
static_assert(std::is_same_v<attr_t, std::optional<std::string>>);
static_assert(std::is_same_v<
attribute_t<decltype(r), decltype(parser)>,
attribute_t<decltype(null_term(r)), decltype(parser)>,
std::string>);
}
}

View File

@@ -0,0 +1,10 @@
// Copyright (C) 2024 T. Zachary Laine
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#include <boost/parser/config.hpp>
#if BOOST_PARSER_USE_CONCEPTS
#include <boost/parser/lexer.hpp>
#endif
#include <boost/parser/parser.hpp>

569
test/lexer.cpp Normal file
View File

@@ -0,0 +1,569 @@
/**
* Copyright (C) 2024 T. Zachary Laine
*
* Distributed under the Boost Software License, Version 1.0. (See
* accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
#define BOOST_PARSER_TESTING
#include <boost/parser/lexer.hpp>
#include <boost/parser/parser.hpp>
#include <boost/parser/transcode_view.hpp>
#include "ill_formed.hpp"
#include <boost/core/lightweight_test.hpp>
#include <boost/container/small_vector.hpp>
#include <deque>
namespace bp = boost::parser;
enum class my_tokens { ws, foo, bar, baz };
int main()
{
// formation of token_specs
{
auto const token_spec = bp::token_spec<"foo", 12>;
bp::token_spec_t<"foo", 12, bp::string_view_tag, 10>
token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", my_tokens::foo>;
bp::token_spec_t<"foo", my_tokens::foo, bp::string_view_tag, 10>
token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"bar", my_tokens::bar>;
bp::token_spec_t<"bar", my_tokens::bar, bp::string_view_tag, 10>
token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12, int, 2>;
bp::token_spec_t<"foo", 12, int, 2> token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12>;
bp::token_spec_t<"foo", 12, bp::string_view_tag, 10>
token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12, unsigned int, 8>;
bp::token_spec_t<"foo", 12, unsigned int, 8> token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12, short>;
bp::token_spec_t<"foo", 12, short, 10> token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12, float>;
bp::token_spec_t<"foo", 12, float, 10> token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
{
auto const token_spec = bp::token_spec<"foo", 12, double>;
bp::token_spec_t<"foo", 12, double, 10> token_spec_explicit;
static_assert(std::same_as<
decltype(token_spec.parser_)::token_spec,
decltype(token_spec_explicit)>);
}
// making lexers
{
auto const lexer = bp::lexer<char, my_tokens> |
bp::token_spec<"foo", my_tokens::foo> |
bp::token_spec<"bar", my_tokens::bar> |
bp::token_spec<"baz", my_tokens::baz>;
// +1 because of the 0-group
static_assert(decltype(lexer)::size() == 3 + 1);
static_assert(std::same_as<decltype(lexer)::id_type, my_tokens>);
}
{
auto const lexer = bp::lexer<char, my_tokens> | bp::token_chars<'='>;
static_assert(decltype(lexer)::size() == 1 + 1);
static_assert(std::same_as<decltype(lexer)::id_type, my_tokens>);
}
{
auto const lexer = bp::lexer<char, my_tokens> | bp::token_chars<'='> |
bp::token_spec<"foo", my_tokens::foo> |
bp::token_spec<"bar", my_tokens::bar> |
bp::token_spec<"baz", my_tokens::baz>;
static_assert(decltype(lexer)::size() == 4 + 1);
static_assert(std::same_as<decltype(lexer)::id_type, my_tokens>);
}
{
auto const lexer =
bp::lexer<char, my_tokens> | bp::token_spec<"foo", my_tokens::foo> |
bp::token_spec<"bar", my_tokens::bar> |
bp::token_spec<"baz", my_tokens::baz> | bp::token_chars<'='>;
static_assert(decltype(lexer)::size() == 4 + 1);
static_assert(std::same_as<decltype(lexer)::id_type, my_tokens>);
}
{
auto const lexer = bp::lexer<char, my_tokens> | bp::token_chars<
'=',
'+',
'-',
'!',
'?',
':',
'.',
',',
'(',
')',
'[',
']',
'{',
'}',
'@',
';'>;
static_assert(decltype(lexer)::size() == 16 + 1);
static_assert(std::same_as<decltype(lexer)::id_type, my_tokens>);
}
#if 0 // This is a test of whether the escapes work for every possible char
// value accepted by detail::token_chars_spec. This takes a long time and
// really only needs to happen once.
{
auto const lexer = bp::lexer<char, my_tokens> | bp::token_chars<
char(0),
char(1),
char(2),
char(3),
char(4),
char(5),
char(6),
char(7),
char(8),
char(9),
char(10),
char(11),
char(12),
char(13),
char(14),
char(15),
char(16),
char(17),
char(18),
char(19),
char(20),
char(21),
char(22),
char(23),
char(24),
char(25),
char(26),
char(27),
char(28),
char(29),
char(30),
char(31),
char(32),
char(33),
char(34),
char(35),
char(36),
char(37),
char(38),
char(39),
char(40),
char(41),
char(42),
char(43),
char(44),
char(45),
char(46),
char(47),
char(48),
char(49),
char(50),
char(51),
char(52),
char(53),
char(54),
char(55),
char(56),
char(57),
char(58),
char(59),
char(60),
char(61),
char(62),
char(63),
char(64),
char(65),
char(66),
char(67),
char(68),
char(69),
char(70),
char(71),
char(72),
char(73),
char(74),
char(75),
char(76),
char(77),
char(78),
char(79),
char(80),
char(81),
char(82),
char(83),
char(84),
char(85),
char(86),
char(87),
char(88),
char(89),
char(90),
char(91),
char(92),
char(93),
char(94),
char(95),
char(96),
char(97),
char(98),
char(99),
char(100),
char(101),
char(103),
char(102),
char(104),
char(105),
char(106),
char(107),
char(108),
char(109),
char(110),
char(111),
char(112),
char(113),
char(114),
char(115),
char(116),
char(117),
char(118),
char(119),
char(120),
char(121),
char(122),
char(123),
char(124),
char(125),
char(126),
char(127)>;
}
#endif
{
    // Mixed UTFs.
    auto const lexer =
        bp::lexer<char, my_tokens> | bp::token_spec<"foo", my_tokens::foo> |
        bp::token_spec<u"bar", my_tokens::bar> |
        bp::token_spec<U"baz", my_tokens::baz> | bp::token_chars<'='>;

    // mutable vs. const token_views + mutable vs. const input views
    std::string input = "foo = bar";
    auto mr_mi = input | bp::to_tokens(lexer);
    auto const cr_mi = input | bp::to_tokens(lexer);

    // The "_ci" views must be built over the const copy of the input;
    // otherwise the const-input cases are never exercised.
    auto const const_input = input;
    auto mr_ci = const_input | bp::to_tokens(lexer);
    auto const cr_ci = const_input | bp::to_tokens(lexer);

    using tok_t = bp::token<char>;
    tok_t const expected[] = {
        tok_t((int)my_tokens::foo, 0, "foo"),
        tok_t(bp::character_id, 0, (long long)'='),
        tok_t((int)my_tokens::bar, 0, "bar")};

    int position = 0;

    position = 0;
    for (auto tok : mr_mi) {
        BOOST_TEST(tok == expected[position]);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected));

    position = 0;
    for (auto tok : cr_mi) {
        BOOST_TEST(tok == expected[position]);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected));

    position = 0;
    for (auto tok : mr_ci) {
        BOOST_TEST(tok == expected[position]);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected));

    position = 0;
    for (auto tok : cr_ci) {
        BOOST_TEST(tok == expected[position]);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected));
}
// Check basic plumbing of connecting UTF inputs to CTRE.
{
    auto const lexer =
        bp::lexer<char, my_tokens> | bp::token_spec<"foo", my_tokens::foo> |
        bp::token_spec<"bar", my_tokens::bar> |
        bp::token_spec<"baz", my_tokens::baz> | bp::token_chars<'='>;
    std::string s = "foo = bar";
    using tok_t = bp::token<char>;
    tok_t const expected[] = {
        tok_t((int)my_tokens::foo, 0, "foo"),
        tok_t(bp::character_id, 0, (long long)'='),
        tok_t((int)my_tokens::bar, 0, "bar")};

    auto const lexer8 = bp::lexer<char8_t, my_tokens> |
                        bp::token_spec<"foo", my_tokens::foo> |
                        bp::token_spec<"bar", my_tokens::bar> |
                        bp::token_spec<"baz", my_tokens::baz> |
                        bp::token_chars<'='>;
    std::u8string u8s = u8"foo = bar";
    using tok8_t = bp::token<char8_t>;
    tok8_t const expected8[] = {
        tok8_t((int)my_tokens::foo, 0, u8"foo"),
        tok8_t(bp::character_id, 0, (long long)'='),
        tok8_t((int)my_tokens::bar, 0, u8"bar")};

    auto const lexer16 = bp::lexer<char16_t, my_tokens> |
                         bp::token_spec<"foo", my_tokens::foo> |
                         bp::token_spec<"bar", my_tokens::bar> |
                         bp::token_spec<"baz", my_tokens::baz> |
                         bp::token_chars<'='>;
    std::u16string u16s = u"foo = bar";
    using tok16_t = bp::token<char16_t>;
    tok16_t const expected16[] = {
        tok16_t((int)my_tokens::foo, 0, u"foo"),
        tok16_t(bp::character_id, 0, (long long)'='),
        tok16_t((int)my_tokens::bar, 0, u"bar")};

    auto const lexer32 = bp::lexer<char32_t, my_tokens> |
                         bp::token_spec<"foo", my_tokens::foo> |
                         bp::token_spec<"bar", my_tokens::bar> |
                         bp::token_spec<"baz", my_tokens::baz> |
                         bp::token_chars<'='>;
    std::u32string u32s = U"foo = bar";
    using tok32_t = bp::token<char32_t>;
    tok32_t const expected32[] = {
        tok32_t((int)my_tokens::foo, 0, U"foo"),
        tok32_t(bp::character_id, 0, (long long)'='),
        tok32_t((int)my_tokens::bar, 0, U"bar")};

    int position = 0;

    position = 0;
    for (auto tok : s | bp::to_tokens(lexer)) {
        BOOST_TEST(tok == expected[position]);
        static_assert(
            std::
                same_as<decltype(tok.get_string_view()), std::string_view>);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected));

    // Each loop below checks its count against the expectation array for
    // its own character type.
    position = 0;
    for (auto tok : u8s | bp::to_tokens(lexer8)) {
        BOOST_TEST(tok == expected8[position]);
        static_assert(std::same_as<
                      decltype(tok.get_string_view()),
                      std::u8string_view>);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected8));

    position = 0;
    for (auto tok : u16s | bp::to_tokens(lexer16)) {
        BOOST_TEST(tok == expected16[position]);
        static_assert(std::same_as<
                      decltype(tok.get_string_view()),
                      std::u16string_view>);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected16));

    position = 0;
    for (auto tok : u32s | bp::to_tokens(lexer32)) {
        BOOST_TEST(tok == expected32[position]);
        static_assert(std::same_as<
                      decltype(tok.get_string_view()),
                      std::u32string_view>);
        ++position;
    }
    BOOST_TEST(position == (int)std::size(expected32));
}
// no-ws lexer
{
auto const lexer = bp::lexer<char, my_tokens, bp::no_ws> |
bp::token_spec<"foo", my_tokens::foo> |
bp::token_spec<"bar", my_tokens::bar> |
bp::token_spec<"baz", my_tokens::baz> |
bp::token_chars<'='>;
std::string s = "foo=bar";
using tok_t = bp::token<char>;
tok_t const expected[] = {
tok_t((int)my_tokens::foo, 0, "foo"),
tok_t(bp::character_id, 0, (long long)'='),
tok_t((int)my_tokens::bar, 0, "bar")};
int position = 0;
for (auto tok : s | bp::to_tokens(lexer)) {
BOOST_TEST(tok == expected[position]);
++position;
}
BOOST_TEST(position == (int)std::size(expected));
}
// ws-as-token lexers
{
auto const lexer = bp::lexer<char, my_tokens, bp::no_ws> |
bp::token_spec<"\\s+", my_tokens::ws> |
bp::token_spec<"foo", my_tokens::foo> |
bp::token_spec<"bar", my_tokens::bar> |
bp::token_spec<"baz", my_tokens::baz> |
bp::token_chars<'='>;
std::string s = "foo = bar";
using tok_t = bp::token<char>;
tok_t const expected[] = {
tok_t((int)my_tokens::foo, 0, "foo"),
tok_t((int)my_tokens::ws, 0, " "),
tok_t(bp::character_id, 0, (long long)'='),
tok_t((int)my_tokens::ws, 0, " "),
tok_t((int)my_tokens::bar, 0, "bar")};
int position = 0;
for (auto tok : s | bp::to_tokens(lexer)) {
BOOST_TEST(tok == expected[position]);
++position;
}
BOOST_TEST(position == (int)std::size(expected));
}
// lexing errors
{
using namespace std::literals;
auto const lexer = bp::lexer<char, int> |
bp::token_spec<"foo", 0, float> |
bp::token_spec<"bar", 1, int> |
bp::token_spec<"baz", 2, unsigned short> |
bp::token_spec<"quux", 3, int, 8> |
bp::token_spec<"next", 4, unsigned long long, 16>;
bool caught_exception = false;
caught_exception = false;
try {
for (auto tok : "foo" | bp::to_tokens(lexer)) {
(void)tok;
}
} catch (std::exception const & e) {
BOOST_TEST(e.what() == "32-bit floating-point number"sv);
caught_exception = true;
}
BOOST_TEST(caught_exception);
caught_exception = false;
try {
for (auto tok : "bar" | bp::to_tokens(lexer)) {
(void)tok;
}
} catch (std::exception const & e) {
BOOST_TEST(e.what() == "32-bit signed integer"sv);
caught_exception = true;
}
BOOST_TEST(caught_exception);
caught_exception = false;
try {
for (auto tok : "baz" | bp::to_tokens(lexer)) {
(void)tok;
}
} catch (std::exception const & e) {
BOOST_TEST(e.what() == "16-bit unsigned integer"sv);
caught_exception = true;
}
BOOST_TEST(caught_exception);
caught_exception = false;
try {
for (auto tok : "quux" | bp::to_tokens(lexer)) {
(void)tok;
}
} catch (std::exception const & e) {
BOOST_TEST(e.what() == "32-bit, base-8 signed integer"sv);
caught_exception = true;
}
BOOST_TEST(caught_exception);
caught_exception = false;
try {
for (auto tok : "next" | bp::to_tokens(lexer)) {
(void)tok;
}
} catch (std::exception const & e) {
BOOST_TEST(e.what() == "64-bit, base-16 unsigned integer"sv);
caught_exception = true;
}
BOOST_TEST(caught_exception);
}
return boost::report_errors();
}

828
test/lexer_adobe_files.cpp Normal file
View File

@@ -0,0 +1,828 @@
/**
* Copyright (C) 2024 T. Zachary Laine
*
* Distributed under the Boost Software License, Version 1.0. (See
* accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
#define BOOST_PARSER_TESTING
#include <boost/parser/lexer.hpp>
#include <boost/parser/transcode_view.hpp>
#include "ill_formed.hpp"
#include "adobe_lexer.hpp"
#include <boost/core/lightweight_test.hpp>
#include <boost/container/small_vector.hpp>
#include <deque>
namespace bp = boost::parser;
int main()
{
    // Test driver for adobe_lexer (declared in adobe_lexer.hpp).  Two inputs
    // are lexed through several equivalent entry points (tokens_view, the
    // to_tokens range adaptor in call and pipe form, and external token
    // caches), and each resulting token stream is compared element by element
    // against a hand-written table.
    //
    // Every comparison loop guards its index into the expected table.  If the
    // lexer ever yields more tokens than expected, the surplus is only
    // counted, and the BOOST_TEST on the final count reports the failure,
    // instead of the loop reading past the end of the array.
    //
    // Returns the value of boost::report_errors().
    {
        static_assert(decltype(adobe_lexer)::size() == 29 + 1);
        static_assert(
            std::same_as<decltype(adobe_lexer)::id_type, adobe_tokens>);
        // tokens_view from adobe_lexer
        {
            char const input[] = R"(/*
Copyright 2005-2007 Adobe Systems Incorporated
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html)
*/
sheet alert_dialog
{
output:
result <== { dummy_value: 42 };
})";
            // first, just make a ctre range
            // The raw regex matches listed here include the whitespace runs
            // between lexemes; the token table further down has no
            // whitespace entries.
            {
                std::string_view const expected[] = {
                    R"(/*
Copyright 2005-2007 Adobe Systems Incorporated
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html)
*/)",
                    R"(
)", R"(sheet)", R"( )", R"(alert_dialog)",
                    R"(
)", R"({)",
                    R"(
)", R"(output)", R"(:)",
                    R"(
)", R"(result)", R"( )", R"(<==)",
                    R"( )", R"({)", R"( )", R"(dummy_value)",
                    R"(:)", R"( )", R"(42)", R"( )",
                    R"(})", R"(;)",
                    R"(
)", R"(})"};
                auto r = adobe_lexer.regex_range(input);
                int position = 0;
                for (auto subrange : r) {
                    std::string_view sv = subrange;
                    // Only compare while position is a valid index; extra
                    // matches are caught by the count check below.
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(sv == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
                std::cout << "\n";
            }
            // Expected token stream for `input`, shared by all of the
            // token-producing variants below.
            using tok_t = bp::token<char>;
            tok_t const expected[] = {
                tok_t((int)adobe_tokens::lead_comment, 0, R"(/*
Copyright 2005-2007 Adobe Systems Incorporated
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html)
*/)"),
                tok_t((int)adobe_tokens::identifier, 0, "sheet"),
                tok_t((int)adobe_tokens::identifier, 0, "alert_dialog"),
                tok_t(bp::character_id, 0, (long long)'{'),
                tok_t((int)adobe_tokens::identifier, 0, "output"),
                tok_t(bp::character_id, 0, (long long)':'),
                tok_t((int)adobe_tokens::identifier, 0, "result"),
                tok_t((int)adobe_tokens::define, 0, "<=="),
                tok_t(bp::character_id, 0, (long long)'{'),
                tok_t((int)adobe_tokens::identifier, 0, "dummy_value"),
                tok_t(bp::character_id, 0, (long long)':'),
                tok_t((int)adobe_tokens::number, 0, (long double)42.0),
                tok_t(bp::character_id, 0, (long long)'}'),
                tok_t(bp::character_id, 0, (long long)';'),
                tok_t(bp::character_id, 0, (long long)'}')};
            // make a tokens_view
            {
                auto r = bp::tokens_view(input, adobe_lexer);
                int position = 0;
                for (auto tok : r) {
                    // Bounds guard; see the note at the top of main().
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // to_tokens range adaptor
            // Call form, applied directly to the char array.
            {
                int position = 0;
                for (auto tok : bp::to_tokens(input, adobe_lexer)) {
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // Call form, applied to a named std::string.
            {
                std::string const input_str = input;
                int position = 0;
                for (auto tok : bp::to_tokens(input_str, adobe_lexer)) {
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // Pipe form, applied to a temporary std::string.
            {
                int position = 0;
                for (auto tok :
                     std::string(input) | bp::to_tokens(adobe_lexer)) {
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // using external caches
            // Cache is a std::vector passed by std::ref, call form.
            {
                std::vector<bp::token<char>> cache;
                int position = 0;
                for (auto tok :
                     bp::to_tokens(input, adobe_lexer, std::ref(cache))) {
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // Cache is a boost::container::small_vector passed by std::ref,
            // pipe form.
            {
                boost::container::small_vector<bp::token<char>, 10> cache;
                int position = 0;
                for (auto tok :
                     input | bp::to_tokens(adobe_lexer, std::ref(cache))) {
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
            // Larger input.  The `expected` table declared in this scope
            // shadows the one above.
            {
                char const large_input[] = R"(/*
Copyright 2005-2007 Adobe Systems Incorporated
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html)
*/
sheet image_size
{
input:
original_width : 1600;
original_height : 1200;
original_resolution : 300;
constant:
original_doc_width : original_width / original_resolution;
original_doc_height : original_height / original_resolution;
interface:
resample : true;
unlink constrain : true <== resample ? constrain : true;
unlink scale_styles : true <== resample && constrain ? scale_styles : false;
resample_method : @bicubic;
dim_width_pixels : original_width <== resample ? round(dim_width_pixels) : original_width;
dim_width_percent : 100 <== resample ? dim_width_percent : 100;
dim_height_pixels : original_height <== resample ? round(dim_height_pixels) : original_height;
dim_height_percent : 100 <== resample ? dim_height_percent : 100;
doc_width_inches : original_doc_width;
doc_width_percent : 100;
/*
Resolution must be initialized before width and height inches to allow proportions
to be constrained.
*/
doc_resolution : original_resolution;
doc_height_inches : original_doc_height;
doc_height_percent : 100;
auto_quality : @draft;
screen_lpi; // initialized from doc_resolution
logic:
relate {
doc_width_inches <== doc_width_percent * original_doc_width / 100;
doc_width_percent <== doc_width_inches * 100 / original_doc_width;
}
relate {
doc_height_inches <== doc_height_percent * original_doc_height / 100;
doc_height_percent <== doc_height_inches * 100 / original_doc_height;
}
relate {
screen_lpi <== doc_resolution / (auto_quality == @draft ? 1 : (auto_quality == @good ? 1.5 : 2.0));
doc_resolution <== screen_lpi * (auto_quality == @draft ? 1 : (auto_quality == @good ? 1.5 : 2.0));
}
when (resample) relate {
dim_width_pixels <== dim_width_percent * original_width / 100;
dim_width_percent <== dim_width_pixels * 100 / original_width;
}
when (resample) relate {
dim_height_pixels <== dim_height_percent * original_height / 100;
dim_height_percent <== dim_height_pixels * 100 / original_height;
}
when (resample) relate {
doc_width_inches <== dim_width_pixels / doc_resolution;
dim_width_pixels <== doc_width_inches * doc_resolution;
doc_resolution <== dim_width_pixels / doc_width_inches;
}
when (resample) relate {
doc_height_inches <== dim_height_pixels / doc_resolution;
dim_height_pixels <== doc_height_inches * doc_resolution;
doc_resolution <== dim_height_pixels / doc_height_inches;
}
when (!resample) relate {
doc_resolution <== original_width / doc_width_inches;
doc_width_inches <== original_width / doc_resolution;
}
when (!resample) relate {
doc_resolution <== original_height / doc_height_inches;
doc_height_inches <== original_height / doc_resolution;
}
when (constrain && resample) relate {
dim_width_percent <== dim_height_percent;
dim_height_percent <== dim_width_percent;
}
output:
byte_count <== dim_width_pixels * dim_height_pixels * 32;
result <== resample ? {
command: @resize_image,
width: dim_width_pixels,
height: dim_height_pixels,
resolution: doc_resolution,
scale_styles: scale_styles,
resample_method: resample_method
} : {
command: @set_resolution,
resolution: doc_resolution
};
invariant:
width_max <== dim_width_pixels <= 300000;
height_max <== dim_height_pixels <= 300000;
}
)";
                tok_t const expected[] = {
                    // Leading comment and sheet header.
                    tok_t((int)adobe_tokens::lead_comment, 0, R"(/*
Copyright 2005-2007 Adobe Systems Incorporated
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html)
*/)"),
                    tok_t((int)adobe_tokens::identifier, 0, "sheet"),
                    tok_t((int)adobe_tokens::identifier, 0, "image_size"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    // "input:" section.
                    tok_t((int)adobe_tokens::identifier, 0, "input"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1600.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1200.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)300.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    // "constant:" section.
                    tok_t((int)adobe_tokens::identifier, 0, "constant"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "original_doc_width"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_doc_height"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    // "interface:" section.  The keywords true/false are
                    // expected as keyword_true_false tokens carrying 1ll/0ll.
                    tok_t((int)adobe_tokens::identifier, 0, "interface"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::keyword_true_false, 0, 1ll),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "unlink"),
                    tok_t((int)adobe_tokens::identifier, 0, "constrain"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::keyword_true_false, 0, 1ll),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::identifier, 0, "constrain"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::keyword_true_false, 0, 1ll),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "unlink"),
                    tok_t((int)adobe_tokens::identifier, 0, "scale_styles"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::keyword_true_false, 0, 1ll),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)adobe_tokens::and_, 0, "&&"),
                    tok_t((int)adobe_tokens::identifier, 0, "constrain"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::identifier, 0, "scale_styles"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::keyword_true_false, 0, 0ll),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "resample_method"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "bicubic"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::identifier, 0, "round"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::identifier, 0, "round"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "original_doc_width"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_width_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    // Block comment in the middle of the "interface:" section.
                    tok_t((int)adobe_tokens::lead_comment, 0, R"(/*
Resolution must be initialized before width and height inches to allow proportions
to be constrained.
*/)"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_doc_height"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_percent"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "auto_quality"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "draft"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "screen_lpi"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::trail_comment,
                        0,
                        "// initialized from doc_resolution"),
                    // "logic:" section.
                    tok_t((int)adobe_tokens::identifier, 0, "logic"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_width_percent"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "original_doc_width"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_width_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "original_doc_width"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_percent"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_doc_height"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier,
                        0,
                        "original_doc_height"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "screen_lpi"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "auto_quality"),
                    tok_t((int)adobe_tokens::eq_op, 0, "=="),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "draft"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1.0),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "auto_quality"),
                    tok_t((int)adobe_tokens::eq_op, 0, "=="),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "good"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1.5),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)2.0),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "screen_lpi"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "auto_quality"),
                    tok_t((int)adobe_tokens::eq_op, 0, "=="),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "draft"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1.0),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "auto_quality"),
                    tok_t((int)adobe_tokens::eq_op, 0, "=="),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "good"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)adobe_tokens::number, 0, (long double)1.5),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::number, 0, (long double)2.0),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::number, 0, (long double)100.0),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)bp::character_id, 0, (long long)'!'),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_width_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "original_width"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)bp::character_id, 0, (long long)'!'),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "doc_height_inches"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "original_height"),
                    tok_t((int)adobe_tokens::mul_op, 0, "/"),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)adobe_tokens::identifier, 0, "when"),
                    tok_t((int)bp::character_id, 0, (long long)'('),
                    tok_t((int)adobe_tokens::identifier, 0, "constrain"),
                    tok_t((int)adobe_tokens::and_, 0, "&&"),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)')'),
                    tok_t((int)adobe_tokens::identifier, 0, "relate"),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_percent"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_width_percent"),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    // "output:" section.
                    tok_t((int)adobe_tokens::identifier, 0, "output"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "byte_count"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::mul_op, 0, "*"),
                    tok_t((int)adobe_tokens::number, 0, (long double)32.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "result"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "resample"),
                    tok_t((int)bp::character_id, 0, (long long)'?'),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "command"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "resize_image"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "width"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "height"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "resolution"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "scale_styles"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "scale_styles"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "resample_method"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "resample_method"),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'{'),
                    tok_t((int)adobe_tokens::identifier, 0, "command"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)bp::character_id, 0, (long long)'@'),
                    tok_t((int)adobe_tokens::identifier, 0, "set_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)','),
                    tok_t((int)adobe_tokens::identifier, 0, "resolution"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "doc_resolution"),
                    tok_t((int)bp::character_id, 0, (long long)'}'),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    // "invariant:" section.  Each "<=" in the input is
                    // expected as rel_op "<" followed by the character '='.
                    tok_t((int)adobe_tokens::identifier, 0, "invariant"),
                    tok_t((int)bp::character_id, 0, (long long)':'),
                    tok_t((int)adobe_tokens::identifier, 0, "width_max"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t((int)adobe_tokens::identifier, 0, "dim_width_pixels"),
                    tok_t((int)adobe_tokens::rel_op, 0, "<"),
                    tok_t((int)bp::character_id, 0, (long long)'='),
                    tok_t((int)adobe_tokens::number, 0, (long double)300000.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)adobe_tokens::identifier, 0, "height_max"),
                    tok_t((int)adobe_tokens::define, 0, "<=="),
                    tok_t(
                        (int)adobe_tokens::identifier, 0, "dim_height_pixels"),
                    tok_t((int)adobe_tokens::rel_op, 0, "<"),
                    tok_t((int)bp::character_id, 0, (long long)'='),
                    tok_t((int)adobe_tokens::number, 0, (long double)300000.0),
                    tok_t((int)bp::character_id, 0, (long long)';'),
                    tok_t((int)bp::character_id, 0, (long long)'}')};
                int position = 0;
                for (auto tok :
                     std::string(large_input) | bp::to_tokens(adobe_lexer)) {
                    // Bounds guard as in the loops above.  On a mismatch,
                    // also print the position and both tokens.
                    if (position < (int)std::size(expected)) {
                        BOOST_TEST(tok == expected[position]);
                        if (tok != expected[position]) {
                            std::cout << "At pos=" << position << ": got "
                                      << tok << " expected "
                                      << expected[position] << "\n";
                        }
                    }
                    ++position;
                }
                BOOST_TEST(position == (int)std::size(expected));
            }
        }
    }
    return boost::report_errors();
}

237
test/lexer_and_parser.cpp Normal file
View File

@@ -0,0 +1,237 @@
/**
* Copyright (C) 2024 T. Zachary Laine
*
* Distributed under the Boost Software License, Version 1.0. (See
* accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
#define BOOST_PARSER_TESTING
//[ tokens_basics_headers
#include <boost/parser/lexer.hpp>
#include <boost/parser/parser.hpp>
//]
#include <boost/core/lightweight_test.hpp>
#include "adobe_lexer.hpp"
namespace bp = boost::parser;
// Exercises token parsing end to end: direct invocation of a token_spec
// parser, composition of token parsers, lexer + parse() integration, use of
// an external token cache, and the snippets quoted by the documentation
// (delimited by the "//[ name" ... "//]" markers, which must stay intact).
// Returns the number of failed BOOST_TEST checks via boost::report_errors().
int main()
{
    // Minimal test; just instantiate the member functions, without involving
    // the parse() API.
    {
        bp::token<char> tokens[1] = {};
        auto p = bp::token_spec<"12", 12, int>;
        auto first = std::begin(tokens);
        auto const last = std::end(tokens);

        bp::detail::nope globals;
        bp::default_error_handler error_handler;

        // From parse_impl().
        bool success = true;
        int trace_indent = 0;
        bp::detail::symbol_table_tries_t symbol_table_tries;
        bp::detail::pending_symbol_table_operations_t
            pending_symbol_table_operations;
        bp::detail::scoped_apply_pending_symbol_table_operations apply_pending(
            pending_symbol_table_operations);
        auto context = bp::detail::make_context<false, false>(
            first,
            last,
            success,
            trace_indent,
            error_handler,
            globals,
            symbol_table_tries,
            pending_symbol_table_operations);
        auto const flags = bp::detail::flags::gen_attrs;

        // Only instantiation matters here; the produced value is ignored.
        std::optional<int> result =
            p(first, last, context, bp::ws, flags, success);
        (void)result;
    }

    // Minimal tests of building parsers from token_parser and token_spec.
    {
        auto parser1 = true_false(true);
        auto parser2 = true_false(false);
        (void)parser1;
        (void)parser2;
    }
    {
        auto parser = identifier("foo") >> '=' >> true_false >> ';';
        (void)parser;
    }

    // Minimal tests of using a lexer and parser together.
    {
        auto parser = identifier("foo") >> '=' >> true_false >> ';';
        auto r = "some input" | bp::to_tokens(adobe_lexer);
        auto result = bp::parse(r, parser);
        BOOST_TEST(!result);

        // A tokens_view must be recognized as such whether it is accessed
        // through a non-const or a const lvalue.
        static_assert(!std::same_as<
                      std::remove_cvref_t<
                          decltype(bp::detail::tokens_view_or_nope(r))>,
                      bp::detail::nope>);
        auto const & cr = r;
        static_assert(!std::same_as<
                      std::remove_cvref_t<
                          decltype(bp::detail::tokens_view_or_nope(cr))>,
                      bp::detail::nope>);
    }
    {
        auto parser = identifier >> '=' >> true_false >> ';';
        auto r = "foo = false;" | bp::to_tokens(adobe_lexer);
        auto result = bp::parse(r, parser);
        BOOST_TEST(result);
        // Only inspect the attribute when the parse produced one.
        if (result) {
            BOOST_TEST(std::get<0>(*result) == "foo");
            BOOST_TEST(std::get<1>(*result) == false);
        }
    }

    // Test the use of an external token cache.
    {
        auto parser = identifier >> '=' >> true_false >> ';';
        std::vector<bp::token<char>> cache;
        auto r = "foo = false;" | bp::to_tokens(adobe_lexer, std::ref(cache));
        auto result = bp::parse(r, parser);
        // Report a failed parse instead of dereferencing an empty result,
        // which would be undefined behavior and mask the real failure.
        BOOST_TEST(result);
        if (result) {
            BOOST_TEST(std::get<0>(*result) == "foo");
            BOOST_TEST(std::get<1>(*result) == false);
        }
        BOOST_TEST(cache.size() == 4u);
    }

    // Test the clearing of the token cache at expectation points.
    {
        auto parser = identifier >> '=' > true_false >> ';';
        std::vector<bp::token<char>> cache;
        auto r = "foo = false;" | bp::to_tokens(adobe_lexer, std::ref(cache));
        auto result = bp::parse(r, parser);
        // Same guard as above: check success before touching *result.
        BOOST_TEST(result);
        if (result) {
            BOOST_TEST(std::get<0>(*result) == "foo");
            BOOST_TEST(std::get<1>(*result) == false);
        }
        BOOST_TEST(cache.size() == 2u);
    }

    // doc examples
    // clang-format off
    {
        //[ tokens_basics_lexer
        auto const foo = bp::token_spec<"foo", 0>;
        auto const bar = bp::token_spec<"b.r", 1>;
        auto const baz = bp::token_spec<"b.z", 2>;

        auto const lexer = bp::lexer<char, int> | foo | bar | baz;
        //]

        //[ tokens_basics_input_range
        auto r = "foobazbar" | bp::to_tokens(lexer);
        //]

        //[ tokens_basics_parser
        auto parser = foo >> baz >> bar;
        //]

        //[ tokens_basics_parse
        auto result = bp::parse(r, parser);
        assert(result);
        assert(std::get<0>(*result) == "foo");
        assert(std::get<1>(*result) == "baz");
        assert(std::get<2>(*result) == "bar");
        //]
    }

    {
        //[ tokens_attrs
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
        constexpr auto number = bp::token_spec<"\\d+(?:\\.\\d*)?", 2, double>;
        //]
        (void)true_false;
        (void)identifier;
        (void)number;
    }

    {
        //[ tokens_token_char
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;

        constexpr auto lexer =
            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;

        auto parser = identifier >> '=' >> true_false >> ';';
        auto r = "foo = false;" | bp::to_tokens(lexer);
        auto result = bp::parse(r, parser);
        assert(result);
        assert(std::get<0>(*result) == "foo");
        assert(std::get<1>(*result) == false);
        //]
    }

    {
        //[ tokens_caching_simple
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;

        constexpr auto lexer =
            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;

        auto parser = identifier >> '=' >> true_false >> ';';
        std::vector<bp::token<char>> cache;
        auto r = "foo = false;" | bp::to_tokens(lexer, std::ref(cache));
        auto result = bp::parse(r, parser);
        assert(result);
        assert(std::get<0>(*result) == "foo");
        assert(std::get<1>(*result) == false);
        assert(cache.size() == 4u);
        //]
    }

    {
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;

        constexpr auto lexer =
            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;

        //[ tokens_caching_expectation_point
        auto parser = identifier >> '=' > true_false >> ';';
        std::vector<bp::token<char>> cache;
        auto r = "foo = false;" | bp::to_tokens(lexer, std::ref(cache));
        auto result = bp::parse(r, parser);
        assert(result);
        assert(std::get<0>(*result) == "foo");
        assert(std::get<1>(*result) == false);
        assert(cache.size() == 2u);
        //]
    }

    {
        //[ tokens_string_in_character_vs_token_parsing
        constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
        constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;

        constexpr auto lexer =
            bp::lexer<char, int> | true_false | identifier | bp::token_chars<'=', ';'>;

        auto parser = bp::string("=;");

        // NOTE: Character parsing here.
        auto character_parse_result = bp::parse("=;", parser);
        assert(character_parse_result);
        assert(*character_parse_result == "=;");

        // NOTE: Token parsing here.
        auto token_parse_result = bp::parse("=;" | bp::to_tokens(lexer), parser);
        assert(!token_parse_result);
        //]
    }
    // clang-format on

    return boost::report_errors();
}

View File

@@ -0,0 +1,184 @@
/**
* Copyright (C) 2024 T. Zachary Laine
*
* Distributed under the Boost Software License, Version 1.0. (See
* accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*/
#define BOOST_PARSER_TESTING
#include <boost/parser/lexer.hpp>
#include <boost/parser/parser.hpp>
#include <boost/core/lightweight_test.hpp>
namespace bp = boost::parser;
// Token specs shared by every test in this file. The template arguments are
// the token's regex, its integer ID, and (for true_false) a bool attribute
// type; identifier names no attribute type, and the tests below read its
// attribute as a string view.
constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
// Tag types that identify the two callback rules; the same tags select the
// matching call operator of the callbacks struct.
struct tf_tag
{};
struct id_tag
{};
// Callback rules wrapping the token specs above; both are given an empty
// name string.
constexpr bp::callback_rule<tf_tag, bool> callback_true_false = "";
constexpr bp::callback_rule<id_tag, std::string_view> callback_identifier = "";
// Rule definitions. NOTE(review): the "<rule name>_def" naming appears to be
// what BOOST_PARSER_DEFINE_RULES looks up for each listed rule -- confirm
// against the Boost.Parser rule documentation before renaming anything here.
constexpr auto callback_true_false_def = true_false;
constexpr auto callback_identifier_def = identifier;
BOOST_PARSER_DEFINE_RULES(callback_true_false, callback_identifier);
// Callback object handed to callback_parse()/callback_prefix_parse(). Each
// call operator is selected by a rule tag type and copies the reported
// attribute into the caller-owned variable it refers to.
struct callbacks
{
    // Destinations written by the call operators. The relative order of
    // these two members matters: the struct is aggregate-initialized as
    // callbacks{sv, b}.
    std::string_view & sv_;
    bool & b_;

    // Invoked with tf_tag: records the parsed boolean.
    void operator()(tf_tag, bool value) const { b_ = value; }

    // Invoked with id_tag: records the parsed identifier text.
    void operator()(id_tag, std::string_view text) const { sv_ = text; }
};
int main()
{
auto assign_bool_parser = identifier >> '=' >> true_false >> ';';
auto assign_bool_no_semi_parser = identifier >> '=' >> true_false;
constexpr auto lexer = bp::lexer<char, int> | true_false | identifier |
bp::token_chars<'=', ';'>;
auto r = "foo = false;" | bp::to_tokens(lexer);
// prefix_parse() w/attr
{
auto f = r.begin();
auto const l = r.end();
std::tuple<std::string_view, bool> result;
auto success = bp::prefix_parse(f, l, assign_bool_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == "foo");
BOOST_TEST(std::get<1>(result) == false);
}
{
auto f = r.begin();
auto const l = r.end();
std::tuple<std::string_view, bool> result;
auto success = bp::prefix_parse(f, l, assign_bool_no_semi_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == "foo");
BOOST_TEST(std::get<1>(result) == false);
BOOST_TEST(f != l);
}
// parse() w/attr
{
std::tuple<std::string_view, bool> result;
auto success = bp::parse(r, assign_bool_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == "foo");
BOOST_TEST(std::get<1>(result) == false);
}
{
constexpr auto lexer = bp::lexer<char8_t, int> | true_false |
identifier | bp::token_chars<'=', ';'>;
auto r8 = u8"foo = false;" | bp::to_tokens(lexer);
std::tuple<std::u8string_view, bool> result;
auto success = bp::parse(r8, assign_bool_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == u8"foo");
BOOST_TEST(std::get<1>(result) == false);
}
{
constexpr auto lexer = bp::lexer<char16_t, int> | true_false |
identifier | bp::token_chars<'=', ';'>;
auto r16 = u"foo = false;" | bp::to_tokens(lexer);
std::tuple<std::u16string_view, bool> result;
auto success = bp::parse(r16, assign_bool_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == u"foo");
BOOST_TEST(std::get<1>(result) == false);
}
{
constexpr auto lexer = bp::lexer<char32_t, int> | true_false |
identifier | bp::token_chars<'=', ';'>;
auto r32 = U"foo = false;" | bp::to_tokens(lexer);
std::tuple<std::u32string_view, bool> result;
auto success = bp::parse(r32, assign_bool_parser, result);
BOOST_TEST(success);
BOOST_TEST(std::get<0>(result) == U"foo");
BOOST_TEST(std::get<1>(result) == false);
}
// prefix_parse() no attr
{
auto f = r.begin();
auto const l = r.end();
auto result = bp::prefix_parse(f, l, assign_bool_parser);
BOOST_TEST(result);
BOOST_TEST(std::get<0>(*result) == "foo");
BOOST_TEST(std::get<1>(*result) == false);
}
{
auto f = r.begin();
auto const l = r.end();
auto result = bp::prefix_parse(f, l, assign_bool_no_semi_parser);
BOOST_TEST(result);
BOOST_TEST(std::get<0>(*result) == "foo");
BOOST_TEST(std::get<1>(*result) == false);
BOOST_TEST(f != l);
}
// parse() no attr
{
auto result = bp::parse(r, assign_bool_parser);
BOOST_TEST(result);
BOOST_TEST(std::get<0>(*result) == "foo");
BOOST_TEST(std::get<1>(*result) == false);
}
// callback_prefix_parse()
{
auto assign_bool_parser =
callback_identifier >> '=' >> callback_true_false >> ';';
auto f = r.begin();
auto const l = r.end();
std::string_view sv;
bool b = false;
auto success = bp::callback_prefix_parse(
f, l, assign_bool_parser, callbacks{sv, b});
BOOST_TEST(success);
BOOST_TEST(sv == "foo");
BOOST_TEST(b == false);
}
{
auto assign_bool_no_semi_parser =
callback_identifier >> '=' >> callback_true_false;
auto f = r.begin();
auto const l = r.end();
std::string_view sv;
bool b = false;
auto success = bp::callback_prefix_parse(
f, l, assign_bool_no_semi_parser, callbacks{sv, b});
BOOST_TEST(success);
BOOST_TEST(sv == "foo");
BOOST_TEST(b == false);
BOOST_TEST(f != l);
}
// callback_parse()
{
auto assign_bool_parser =
callback_identifier >> '=' >> callback_true_false >> ';';
std::string_view sv;
bool b = false;
auto success =
bp::callback_parse(r, assign_bool_parser, callbacks{sv, b});
BOOST_TEST(success);
BOOST_TEST(sv == "foo");
BOOST_TEST(b == false);
}
return boost::report_errors();
}

View File

@@ -0,0 +1,112 @@
// Copyright (C) 2024 T. Zachary Laine
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
#define BOOST_PARSER_TESTING
#include <boost/parser/lexer.hpp>
#include <boost/parser/parser.hpp>
#include <boost/core/lightweight_test.hpp>
namespace bp = boost::parser;
// Rule whose definition is a symbol table; the semantic action forwards the
// matched symbol's attribute (_attr) to the rule's value (_val).
// NOTE(review): symrule and rule_symbols are not referenced by the main()
// that follows -- confirm whether they are still needed.
bp::rule<class symbol_rule, std::string_view> const symrule = "symbols";
bp::symbols<std::string_view> rule_symbols;
auto const fwd_attr = [](auto & ctx) { _val(ctx) = _attr(ctx); };
auto symrule_def = rule_symbols[fwd_attr];
BOOST_PARSER_DEFINE_RULES(symrule);
// One token per Roman-numeral letter (IDs 0-4), plus a run of decimal digits
// (ID 5) carrying an int attribute.
constexpr auto I = bp::token_spec<"I", 0>;
constexpr auto V = bp::token_spec<"V", 1>;
constexpr auto X = bp::token_spec<"X", 2>;
constexpr auto L = bp::token_spec<"L", 3>;
constexpr auto C = bp::token_spec<"C", 4>;
constexpr auto arabic_num = bp::token_spec<"\\d+", 5, int>;
// Lexer over char input with int token IDs, combining all of the specs above.
constexpr auto lexer = bp::lexer<char, int> | I | V | X | L | C | arabic_num;
// Tests bp::symbols when parsing token ranges: empty tables, pre-populated
// tables, and tables mutated from within a semantic action during a parse.
// Returns the number of failed BOOST_TEST checks.
int main()
{
    // symbols_empty
    {
        bp::symbols<int> roman_numerals;
        bp::symbols<std::string> named_strings;

        // With nothing registered, no token can match either table.
        auto r = "I" | bp::to_tokens(lexer);
        BOOST_TEST(!bp::parse(r, roman_numerals));
        BOOST_TEST(!bp::parse(r, named_strings));
    }

    // symbols_simple
    {
        bp::symbols<int> const roman_numerals = {
            {"I", 1}, {"V", 5}, {"X", 10}, {"L", 50}, {"C", 100}};
        bp::symbols<std::string> const named_strings = {
            {"I", "1"}, {"V", "5"}, {"X", "10"}, {"L", "50"}, {"C", "100"}};

        {
            auto const result =
                bp::parse("I" | bp::to_tokens(lexer), roman_numerals);
            BOOST_TEST(result);
            BOOST_TEST(*result == 1);
        }
        {
            auto const result =
                bp::parse("I" | bp::to_tokens(lexer), named_strings);
            BOOST_TEST(result);
            BOOST_TEST(*result == "1");
        }
        {
            auto const result =
                bp::parse("L" | bp::to_tokens(lexer), roman_numerals);
            BOOST_TEST(result);
            BOOST_TEST(*result == 50);
        }
        {
            auto const result =
                bp::parse("L" | bp::to_tokens(lexer), named_strings);
            BOOST_TEST(result);
            BOOST_TEST(*result == "50");
        }
    }

    // symbols_mutating
    {
        bp::symbols<int> roman_numerals;
        roman_numerals.insert_for_next_parse("I", 1);
        roman_numerals.insert_for_next_parse("V", 5);
        roman_numerals.insert_for_next_parse("X", 10);

        // Semantic action: registers the (letter, number) pair that was just
        // parsed as a new symbol, via the parse context.
        auto const add_numeral = [&roman_numerals](auto & ctx) {
            using namespace boost::parser::literals;
            std::string_view const numeral = bp::get(_attr(ctx), 0_c);
            roman_numerals.insert(ctx, numeral, bp::get(_attr(ctx), 1_c));
        };
        auto const numerals_parser =
            ((I | V | X | L | C) >> arabic_num)[add_numeral] >> roman_numerals;

        {
            auto const result =
                bp::parse("L50L" | bp::to_tokens(lexer), numerals_parser);
            BOOST_TEST(result);
            BOOST_TEST(*result == 50);
            // The symbol added during the parse above is expected to be gone
            // again for a fresh parse.
            BOOST_TEST(!bp::parse("L", roman_numerals));
        }
        {
            auto const result =
                bp::parse("C100C" | bp::to_tokens(lexer), numerals_parser);
            BOOST_TEST(result);
            BOOST_TEST(*result == 100);
            BOOST_TEST(!bp::parse("C", roman_numerals));
        }
        {
            // "C" was never registered in this parse, so the trailing symbol
            // lookup must fail.
            auto const result =
                bp::parse("L50C" | bp::to_tokens(lexer), numerals_parser);
            BOOST_TEST(!result);
        }
    }

    return boost::report_errors();
}

File diff suppressed because it is too large Load Diff

View File

@@ -6,6 +6,10 @@
* http://www.boost.org/LICENSE_1_0.txt)
*/
#include <boost/parser/config.hpp>
#if BOOST_PARSER_USE_CONCEPTS
#include <boost/parser/lexer.hpp>
#endif
#include <boost/parser/parser.hpp>
@@ -26,6 +30,17 @@ struct globals_t
globals_t const globals;
// Token ID enumeration with no stream insertion operator.
enum class unprintable_tokens { foo, bar };

// Token ID enumeration that can be streamed through the operator<< below.
enum class printable_tokens { foo, bar };

// Writes the enumerator's name ("foo" or "bar") to os. A value that matches
// neither enumerator writes nothing. Returns os to allow chaining.
std::ostream & operator<<(std::ostream & os, printable_tokens tok)
{
    if (tok == printable_tokens::foo) {
        os << "foo";
    } else if (tok == printable_tokens::bar) {
        os << "bar";
    }
    return os;
}
// Context accessors: each takes a parse context and returns one member
// (i, i2 or u) of the object obtained from _globals(ctx).
auto i = [](auto & ctx) { return _globals(ctx).i; };
auto i2 = [](auto & ctx) { return _globals(ctx).i2; };
auto u = [](auto & ctx) { return _globals(ctx).u; };
@@ -484,4 +499,64 @@ int main()
PARSE_CHAR32(float_);
PARSE_CHAR32(double_);
#if BOOST_PARSER_USE_CONCEPTS
{
std::cout << "\n\n"
<< "----------------------------------------\n"
<< "| unprintable_foo (token_spec) |\n"
<< "----------------------------------------\n";
constexpr auto unprintable_foo =
token_spec<"\\w\\w\\w", unprintable_tokens::foo>;
constexpr auto unprintable_lexer =
lexer<char, unprintable_tokens> | unprintable_foo;
std::cout << "token_spec<\"\\w\\w\\w\", unprintable_tokens::foo>:\n";
parse(str | to_tokens(unprintable_lexer), unprintable_foo, trace::on);
std::cout
<< "token_spec<\"\\w\\w\\w\", unprintable_tokens::foo>(\"foo\"):\n";
parse(
str | to_tokens(unprintable_lexer),
unprintable_foo("foo"),
trace::on);
}
{
std::cout << "\n\n"
<< "----------------------------------------\n"
<< "| printable_foo (token_spec) |\n"
<< "----------------------------------------\n";
constexpr auto printable_foo =
token_spec<"\\w\\w\\w", printable_tokens::foo>;
constexpr auto printable_lexer =
lexer<char, printable_tokens> | printable_foo;
std::cout << "token_spec<\"\\w\\w\\w\", printable_tokens::foo>:\n";
parse(str | to_tokens(printable_lexer), printable_foo, trace::on);
std::cout
<< "token_spec<\"\\w\\w\\w\", printable_tokens::foo>(\"bar\"):\n";
parse(
str | to_tokens(printable_lexer), printable_foo("bar"), trace::on);
}
{
std::cout << "\n\n"
<< "----------------------------------------\n"
<< "| int_foo (token_spec) |\n"
<< "----------------------------------------\n";
constexpr auto int_foo = token_spec<"\\w\\w\\w", 42, int>;
constexpr auto int_lexer = lexer<char, int> | int_foo;
std::cout << "token_spec<\"\\w\\w\\w\", 42, int>:\n";
parse(str | to_tokens(int_lexer), int_foo, trace::on);
std::cout << "token_spec<\"\\w\\w\\w\", 42, int>(13):\n";
parse(str | to_tokens(int_lexer), int_foo(13), trace::on);
}
#endif
}