2
0
mirror of https://github.com/boostorg/parser.git synced 2026-01-19 04:22:13 +00:00

Address TODOs about resolving the value used in detail::token_with_value, and

doing transcoding for the match in detail::token_with_string_view.

See #202.
This commit is contained in:
Zach Laine
2024-12-08 17:16:15 -06:00
parent ff1059695d
commit c6d35e8791
4 changed files with 260 additions and 28 deletions

View File

@@ -264,7 +264,8 @@ namespace boost { namespace parser {
{
os << "[value: ";
if (token.has_string_view()) {
os << '"' << token.get_string_view() << '"';
os << '"' << (token.get_string_view() | detail::text::as_utf8)
<< '"';
} else if (token.has_long_long()) {
if (token.id() == character_id) {
os << "'" << (char)token.get_long_long() << "'";

View File

@@ -7206,7 +7206,9 @@ namespace boost { namespace parser {
return;
}
auto const cps = make_subrange(expected_first_, expected_last_);
using char_type = detail::char_type_from_iter<Iter, Sentinel>;
auto const cps =
make_subrange<char_type>(expected_first_, expected_last_);
if constexpr (detail::is_token_iter_v<Iter>) {
if (!(*first).has_string_view()) {
@@ -7215,7 +7217,7 @@ namespace boost { namespace parser {
}
auto const sv = (*first).get_string_view();
auto token_cps = make_subrange(sv.begin(), sv.end());
auto token_cps = make_subrange<char_type>(sv.begin(), sv.end());
auto const mismatch = detail::no_case_aware_string_mismatch(
token_cps.begin(),
token_cps.end(),
@@ -7254,13 +7256,11 @@ namespace boost { namespace parser {
}
}
template<typename I, typename S>
template<typename CharType, typename I, typename S>
static auto make_subrange(I f, S l)
{
auto subrange = BOOST_PARSER_SUBRANGE(f, l);
if constexpr (std::is_same_v<
detail::remove_cv_ref_t<decltype(*f)>,
char>) {
if constexpr (std::is_same_v<CharType, char>) {
return subrange;
} else {
return subrange | detail::text::as_utf32;

View File

@@ -38,33 +38,65 @@ namespace boost { namespace parser {
}
}
// TODO: This needs to use resolve(value_), and we need a test for
// that.
template<typename T>
template<typename Expected>
struct token_with_value
{
static_assert(std::integral<T> || std::floating_point<T>);
explicit token_with_value(T value) : value_(value) {}
bool matches(T value) const { return value == value_; }
T value_;
explicit constexpr token_with_value(Expected value) :
expected_(value)
{}
template<typename T, typename Context>
bool matches(T value, Context const & context) const
{
return value == detail::resolve(context, expected_);
}
Expected expected_;
};
template<typename Subrange>
struct token_with_string_view
{
explicit token_with_string_view(Subrange value) : value_(value) {}
explicit constexpr token_with_string_view(Subrange subrange) :
subrange_(subrange)
{}
template<typename CharType>
bool matches(std::basic_string_view<CharType> value) const
template<typename CharType, typename Context>
bool matches(
std::basic_string_view<CharType> value, Context const &) const
{
// TODO: this is wrong. We need to transcode both sides to
// UTF-32, when !same_as<CharType, char>. (Need to write some
// tests, and evaluate whether this is a good idea. If not,
// go change the docs on token_parser).
return std::ranges::equal(value, value_);
auto const value_cps =
make_subrange<CharType>(value.begin(), value.end());
auto const subrange_cps =
make_subrange<CharType>(subrange_.begin(), subrange_.end());
return std::ranges::equal(
value_cps, subrange_cps, [](auto a, auto b) {
return cast_char(a) == cast_char(b);
});
}
Subrange value_;
/** Normalizes a single code unit `c` before it is compared against a
    code unit that may have a different character type.

    Plain `char` may be a signed type, so a byte such as `'\xC3'` can
    hold a negative value; converting it to `unsigned char` makes it
    compare equal to the same byte stored in an unsigned code-unit type.
    Values of every other type are returned unchanged.

    @param c The code unit (or code point) to normalize.
    @return `c` as `unsigned char` when `T` is `char`; otherwise `c`. */
template<typename T>
static auto cast_char(T c)
{
    if constexpr (std::is_same_v<T, char>) {
        // Named cast instead of a C-style cast; the conversion is a
        // plain value conversion to the unsigned byte.
        return static_cast<unsigned char>(c);
    } else {
        return c;
    }
}
/** Wraps the iterator/sentinel pair `[f, l)` in a subrange suitable for
    element-wise comparison.

    When `CharType` is `char` the subrange is returned as-is.  For any
    other `CharType` the subrange is piped through
    `detail::text::as_utf32` before being returned. */
template<typename CharType, typename I, typename S>
static auto make_subrange(I f, S l)
{
    auto chars = BOOST_PARSER_SUBRANGE(f, l);
    if constexpr (!std::is_same_v<CharType, char>) {
        return chars | detail::text::as_utf32;
    } else {
        return chars;
    }
}
Subrange subrange_;
};
}
@@ -142,7 +174,7 @@ namespace boost { namespace parser {
if (use_expected || detail::gen_attrs(flags)) {
auto opt_attr = detail::token_as<attribute_type<Iter>>(x);
if constexpr (use_expected) {
if (!opt_attr || !expected_.matches(*opt_attr)) {
if (!opt_attr || !expected_.matches(*opt_attr, context)) {
success = false;
return;
}
@@ -157,7 +189,7 @@ namespace boost { namespace parser {
/** Returns a `parser_interface` containing a `token_parser` that
matches `value`. */
template<typename T>
requires std::is_integral_v<T> || std::is_floating_point_v<T>
requires(!parsable_range_like<T>)
constexpr auto operator()(T value) const noexcept
{
BOOST_PARSER_ASSERT(
@@ -191,12 +223,26 @@ namespace boost { namespace parser {
"token_spec, like 'token_spec(char-set)(char-set)'. Quit "
"it!'"));
auto expected =
detail::token_with_string_view{BOOST_PARSER_SUBRANGE(
std::ranges::begin(r), std::ranges::end(r))};
detail::token_with_string_view{make_expected_range((R &&)r)};
return parser_interface(
token_parser<token_spec, decltype(expected)>(expected));
}
template<typename R>
static constexpr auto make_expected_range(R && r)
{
using T = detail::remove_cv_ref_t<R>;
if constexpr (std::is_bounded_array_v<T>) {
constexpr auto n = std::extent_v<T>;
auto const offset = n && !r[n - 1] ? 1 : 0;
return BOOST_PARSER_SUBRANGE(
std::ranges::begin(r), std::ranges::end(r) - offset);
} else {
return BOOST_PARSER_SUBRANGE(
std::ranges::begin(r), std::ranges::end(r));
}
}
// TODO: Consider adding a special string_view-like type that can be
// passed to the range overload above. It would be based on
// adobe::name_t. When comparing it to a tokens' string_view, if it

View File

@@ -13,10 +13,195 @@
namespace bp = boost::parser;
constexpr auto true_false = bp::token_spec<"true|false", 0, bool>;
constexpr auto identifier = bp::token_spec<"[a-zA-Z]\\w*", 1>;
constexpr auto identifier = bp::token_spec<"\\p{L}+", 1>;
int main()
{
// token_parser
{
{
constexpr auto lexer = bp::lexer<char, int> | true_false |
identifier |
bp::token_chars<'=', ';', '#', '$', '%'>;
{
constexpr auto parser = true_false;
BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
{
constexpr auto parser = true_false(false);
BOOST_TEST(!bp::parse("true" | bp::to_tokens(lexer), parser));
BOOST_TEST(bp::parse("false" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
{
bool b = true;
auto get_bool = [](auto & ctx) { return _globals(ctx); };
auto parser = bp::with_globals(true_false(get_bool), b);
BOOST_TEST(bp::parse("true" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("false" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
}
{
constexpr auto lexer = bp::lexer<char, int> | true_false |
identifier |
bp::token_chars<'=', ';', '#', '$', '%'>;
{
constexpr auto parser = identifier;
BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
BOOST_TEST(bp::parse("foo" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
{
constexpr auto parser = identifier("func");
BOOST_TEST(bp::parse("func" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
{
constexpr auto parser = identifier("f\xC3\xBCnc");
constexpr auto parser_u8 = identifier(u8"fünc");
constexpr auto parser_u16 = identifier(u"fünc");
constexpr auto parser_u32 = identifier(U"fünc");
constexpr auto lexer_u8 =
bp::lexer<char8_t, int> | true_false | identifier |
bp::token_chars<'=', ';', '#', '$', '%'>;
constexpr auto lexer_u16 =
bp::lexer<char16_t, int> | true_false | identifier |
bp::token_chars<'=', ';', '#', '$', '%'>;
constexpr auto lexer_u32 =
bp::lexer<char32_t, int> | true_false | identifier |
bp::token_chars<'=', ';', '#', '$', '%'>;
// There appears to be a bug in CTRE, related to the use of
// char8_t. It produces bad tokens, including appearances of
// the replacement character. The exact same input here
// results in good tokens for all cases except char8_t input.
#if 0
std::cout << "char tokens:\n";
for (auto tok : "fünc" | bp::to_tokens(lexer)) {
std::cout << "tok=" << tok << "\n";
}
std::cout << "\n\n";
std::cout << "char tokens:\n";
for (auto tok : "f\xC3\xBCnc" | bp::to_tokens(lexer)) {
std::cout << "tok=" << tok << "\n";
}
std::cout << "\n\n";
// BAD!
std::cout << "char8_t tokens:\n";
for (auto tok : u8"fünc" | bp::to_tokens(lexer_u8)) {
std::cout << "tok=" << tok << "\n";
}
std::cout << "\n\n";
std::cout << "char16_t tokens:\n";
for (auto tok : u"fünc" | bp::to_tokens(lexer_u16)) {
std::cout << "tok=" << tok << "\n";
}
std::cout << "\n\n";
std::cout << "char32_t tokens:\n";
for (auto tok : U"fünc" | bp::to_tokens(lexer_u32)) {
std::cout << "tok=" << tok << "\n";
}
std::cout << "\n\n";
#endif
// Range to match is sequence of char; no transcoding will be
// done.
BOOST_TEST(
bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser));
#if 0
BOOST_TEST(
bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser));
#endif
BOOST_TEST(
bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser));
BOOST_TEST(
bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser));
BOOST_TEST( // char input; no transcoding on this one.
bp::parse("f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u8));
#if 0
BOOST_TEST(
bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u8));
#endif
BOOST_TEST(
bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u8));
BOOST_TEST(
bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u8));
BOOST_TEST( // char input; no transcoding on this one.
!bp::parse(
"f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u16));
#if 0
BOOST_TEST(
bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u16));
#endif
BOOST_TEST(
bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u16));
BOOST_TEST(
bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u16));
BOOST_TEST( // char input; no transcoding on this one.
!bp::parse(
"f\xC3\xBCnc" | bp::to_tokens(lexer), parser_u32));
#if 0
BOOST_TEST(
bp::parse(u8"fünc" | bp::to_tokens(lexer_u8), parser_u32));
#endif
BOOST_TEST(
bp::parse(u"fünc" | bp::to_tokens(lexer_u16), parser_u32));
BOOST_TEST(
bp::parse(U"fünc" | bp::to_tokens(lexer_u32), parser_u32));
BOOST_TEST(!bp::parse("func" | bp::to_tokens(lexer), parser));
BOOST_TEST(
!bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser));
BOOST_TEST(
!bp::parse(u"func" | bp::to_tokens(lexer_u16), parser));
BOOST_TEST(
!bp::parse(U"func" | bp::to_tokens(lexer_u32), parser));
BOOST_TEST(
!bp::parse("func" | bp::to_tokens(lexer), parser_u8));
BOOST_TEST(
!bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u8));
BOOST_TEST(
!bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u8));
BOOST_TEST(
!bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u8));
BOOST_TEST(
!bp::parse("func" | bp::to_tokens(lexer), parser_u16));
BOOST_TEST(
!bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u16));
BOOST_TEST(
!bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u16));
BOOST_TEST(
!bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u16));
BOOST_TEST(
!bp::parse("func" | bp::to_tokens(lexer), parser_u32));
BOOST_TEST(
!bp::parse(u8"func" | bp::to_tokens(lexer_u8), parser_u32));
BOOST_TEST(
!bp::parse(u"func" | bp::to_tokens(lexer_u16), parser_u32));
BOOST_TEST(
!bp::parse(U"func" | bp::to_tokens(lexer_u32), parser_u32));
BOOST_TEST(!bp::parse("foo" | bp::to_tokens(lexer), parser));
BOOST_TEST(!bp::parse("$" | bp::to_tokens(lexer), parser));
}
}
}
// basic
{
constexpr auto lexer = bp::lexer<char, int> | true_false | identifier |