From b33992719ca8ea3895c7d47756f4ef70f7d86461 Mon Sep 17 00:00:00 2001 From: Zach Laine Date: Sun, 30 Aug 2020 16:48:30 -0500 Subject: [PATCH] Add symbol parser example. --- doc/parser.qbk | 7 +++ doc/tutorial.qbk | 74 ++++++++++++++++++++++++++++++++ example/CMakeLists.txt | 1 + example/roman_numerals.cpp | 73 ++++++++++++++++++++++++++++++++ test/parser_symbol_table.cpp | 82 ++++++++++++++++++++++++++++++++++++ 5 files changed, 237 insertions(+) create mode 100644 example/roman_numerals.cpp diff --git a/doc/parser.qbk b/doc/parser.qbk index f7fcbe04..b6c50691 100644 --- a/doc/parser.qbk +++ b/doc/parser.qbk @@ -29,6 +29,7 @@ [import ../example/trivial.cpp] [import ../example/trivial_skipper.cpp] [import ../example/semantic_actions.cpp] +[import ../example/roman_numerals.cpp] [import ../include/boost/parser/error_handling_fwd.hpp] @@ -46,9 +47,11 @@ [def _v_ [classref boost::parser::view `view`]] [def _n_ [classref boost::parser::none `none`]] +[def _symbols_ [classref boost::parser::symbols `symbols`]] [def _p_ [funcref boost::parser::parse `parse()`]] [def _cbp_ [funcref boost::parser::callback_parse `callback_parse()`]] + [def _attr_ [funcref boost::parser::_attr `_attr()`]] [def _val_ [funcref boost::parser::_val `_val()`]] [def _pass_ [funcref boost::parser::_pass `_pass()`]] @@ -62,12 +65,16 @@ [def _report_error_ [funcref boost::parser::_report_error `_report_error()`]] [def _report_warning_ [funcref boost::parser::_report_warning `_report_warning()`]] +[def _lit_ [funcref boost::parser::lit `lit()`]] + [def _ch_ [globalref boost::parser::char_ `char_`]] [def _i_ [globalref boost::parser::int_ `int_`]] [def _d_ [globalref boost::parser::double_ `double_`]] [def _kl_ [@https://en.wikipedia.org/wiki/Kleene_star Kleene star]] [def _comb_ [@https://en.wikipedia.org/wiki/Parser_combinator parser combinator]] +[def _udl_ [@https://en.cppreference.com/w/cpp/language/user_literal UDL]] +[def _udls_ [@https://en.cppreference.com/w/cpp/language/user_literal UDLs]] 
[def _Spirit_ [@https://www.boost.org/doc/libs/release/libs/spirit Boost.Spirit]] diff --git a/doc/tutorial.qbk b/doc/tutorial.qbk index 150b8bd1..b4027615 100644 --- a/doc/tutorial.qbk +++ b/doc/tutorial.qbk @@ -388,4 +388,78 @@ explanation.] [endsect] +[section Symbol Tables] + +When writing a parser, it often comes up that there is a set of strings that, +when parsed, are associated with a set of values 1-to-1. It is tedious to +write parsers that recognize all the possible input strings when you have to +associate each one with an attribute via a semantic action. Instead, we can +use a symbol table. + +Say we want to parse Roman numerals, one of the most common work-related +parsing problems. We want to recognize numbers that start with any number of +"M"s, representing thousands, followed by the hundreds, the tens, and the +ones. Any of these may be absent from the input, but not all. Here are three +_Parser_ symbol tables that we can use to recognize ones, tens, and hundreds +values, respectively: + +[roman_numeral_symbol_tables] + +A _symbols_ maps strings of `char` to their associated attributes. The type +of the attribute must be specified as a template parameter to _symbols_ +_emdash_ `int` in this case. + +Any "M"s we encounter should add 1000 to the result, and all other values come +from the symbol tables. Here are the semantic actions we'll need to do that: + +[roman_numeral_actions] + +`add_1000` just adds `1000` to `result`. `add` adds whatever attribute is +produced by its parser to `result`. + +Now we just need to put the pieces together to make a parser: + +[roman_numeral_parser] + +We've got a few new bits in play here, so let's break it down. `'M'_l` is a +/literal parser/. That is, it is a parser that parses a literal `char`, code +point, or string. In this case, a `char` "M" is being parsed. The `_l` bit +at the end is a _udl_ suffix that you can put after any `char`, `char32_t`, or +`char const *` to form a literal parser. 
You can also make a literal parser +by passing some `x` of one of the previously mentioned types to _lit_. + +Why do we need any of this, considering that we just used a literal `','` in +our previous example? The reason is that `'M'` is not used in an expression +with another _Parser_ parser. It is used within `*'M'_l[add_1000]`. If we'd +written `*'M'[add_1000]`, clearly that would be ill-formed; `char` has no +`operator*()`, nor an `operator[]()`, associated with it. + +[tip Any time you want to use a `char`, `char32_t`, or string literal in a +_Parser_ parser, write it as-is if it is combined with a preexisting _Parser_ +subparser `p`, as in `'x' >> p`. Otherwise, you need to wrap it in a call to +_lit_, or use the `_l` _udl_ suffix.] + +On to the next bit: `-hundreds[add]`. By now, the use of the index operator +should be pretty familiar; it associates the semantic action `add` with the +parser `hundreds`. The `operator-()` at the beginning is new. It means that +the parser it is applied to is optional. You can read it as "zero or one". +So, if `hundreds` is not successfully parsed after `*'M'_l[add_1000]`, nothing +happens, because `hundreds` is allowed to be missing _emdash_ it's optional. +If `hundreds` is parsed successfully, say by matching `"CC"`, the resulting +attribute, `200`, is added to `result` inside `add`. + +Here is the full listing of the program. Notice that it would have been +inappropriate to use a whitespace skipper here, since the entire parse is a +single number, so it was removed. 
+
+[roman_numeral_example]
+
+[endsect]
+
+[section Unicode Support]
+
+TODO
+
+[endsect]
+
 [endsect]
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index f87dd3f6..8a0b4069 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -25,3 +25,4 @@ add_sample(hello)
 add_sample(trivial)
 add_sample(trivial_skipper)
 add_sample(semantic_actions)
+add_sample(roman_numerals)
diff --git a/example/roman_numerals.cpp b/example/roman_numerals.cpp
new file mode 100644
index 00000000..00150d56
--- /dev/null
+++ b/example/roman_numerals.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2020 T. Zachary Laine
+//
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+//[ roman_numeral_example
+#include <boost/parser/parser.hpp>
+
+#include <iostream>
+#include <string>
+
+
+namespace bp = boost::parser;
+
+int main()
+{
+    std::cout << "Enter a number using Roman numerals. ";
+    std::string input;
+    std::getline(std::cin, input);
+
+    //[ roman_numeral_symbol_tables
+    bp::symbols<int> const ones = {
+        {"I", 1},
+        {"II", 2},
+        {"III", 3},
+        {"IV", 4},
+        {"V", 5},
+        {"VI", 6},
+        {"VII", 7},
+        {"VIII", 8},
+        {"IX", 9}};
+
+    bp::symbols<int> const tens = {
+        {"X", 10},
+        {"XX", 20},
+        {"XXX", 30},
+        {"XL", 40},
+        {"L", 50},
+        {"LX", 60},
+        {"LXX", 70},
+        {"LXXX", 80},
+        {"XC", 90}};
+
+    bp::symbols<int> const hundreds = {
+        {"C", 100},
+        {"CC", 200},
+        {"CCC", 300},
+        {"CD", 400},
+        {"D", 500},
+        {"DC", 600},
+        {"DCC", 700},
+        {"DCCC", 800},
+        {"CM", 900}};
+    //]
+
+    //[ roman_numeral_actions
+    int result = 0;
+    auto const add_1000 = [&result](auto & ctx) { result += 1000; };
+    auto const add = [&result](auto & ctx) { result += _attr(ctx); };
+    //]
+
+    //[ roman_numeral_parser
+    using namespace bp::literals;
+    auto const parser =
+        *'M'_l[add_1000] >> -hundreds[add] >> -tens[add] >> -ones[add];
+    //]
+
+    if (bp::parse(input, parser) && result != 0) // 0: nothing was matched
+        std::cout << "That's " << result << " in Arabic numerals.\n";
+    else
+        std::cout << "That's not a Roman number.\n";
+}
+//]
diff --git a/test/parser_symbol_table.cpp b/test/parser_symbol_table.cpp
index d222e771..c6bfb10b 100644
--- a/test/parser_symbol_table.cpp
+++ b/test/parser_symbol_table.cpp
@@ -53,6 +53,88 @@ TEST(parser, symbols_simple)
     }
 }
 
+TEST(parser, symbols_max_munch)
+{
+    symbols<int> const roman_numerals = {
+        {"I", 1},
+        {"II", 2},
+        {"III", 3},
+        {"IV", 4},
+        {"V", 5},
+        {"VI", 6},
+        {"VII", 7},
+        {"VIII", 8},
+        {"IX", 9},
+
+        {"X", 10},
+        {"XX", 20},
+        {"XXX", 30},
+        {"XL", 40},
+        {"L", 50},
+        {"LX", 60},
+        {"LXX", 70},
+        {"LXXX", 80},
+        {"XC", 90},
+
+        {"C", 100},
+        {"CC", 200},
+        {"CCC", 300},
+        {"CD", 400},
+        {"D", 500},
+        {"DC", 600},
+        {"DCC", 700},
+        {"DCCC", 800},
+        {"CM", 900},
+
+        {"M", 1000}};
+
+    {
+        auto const result = parse("I", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 1);
+    }
+    {
+        auto const result = parse("II", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 2);
+    }
+    {
+        auto const result = parse("III", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 3);
+    }
+    {
+        auto const result = parse("IV", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 4);
+    }
+    {
+        auto const result = parse("V", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 5);
+    }
+    {
+        auto const result = parse("VI", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 6);
+    }
+    {
+        auto const result = parse("VII", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 7);
+    }
+    {
+        auto const result = parse("VIII", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 8);
+    }
+    {
+        auto const result = parse("IX", roman_numerals);
+        EXPECT_TRUE(result);
+        EXPECT_EQ(*result, 9);
+    }
+}
+
 TEST(parser, symbols_mutating)
 {
     symbols roman_numerals;