//  Copyright (c) 2001-2008 Hartmut Kaiser
//  Copyright (c) 2001-2007 Joel de Guzman
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example shows how to create a simple lexer recognizing a couple of
//  different tokens aimed at a simple language and how to use this lexer with
//  a grammar. It shows how to associate values with tokens and how to access
//  the token values from inside the grammar.
//
//  Additionally, this example demonstrates how to define a token set usable
//  as the skip parser during parsing, allowing several tokens to be ignored.
//
//  The example demonstrates how to use the add(...)(...) syntax to associate
//  token definitions with the lexer and how token ids can be used in the
//  parser to refer to a token without having to directly reference its
//  definition.
//
//  This example recognizes a very simple programming language having
//  assignment statements and if and while control structures. Look at the
//  file example6.input for an example.
//
//  This example is essentially identical to example4.cpp. The only difference
//  is that we use the self.add() syntax to define tokens and to associate
//  them with the lexer.
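//  For reference, input accepted by this little language has roughly the
//  following shape (a sketch consistent with the grammar defined below; the
//  actual contents of example6.input may differ):
//
//      {
//          a = 5;
//          if (a) {
//              b = a;
//          }
//          while (b) {
//              c = 1;
//          }
//      }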
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexer_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using namespace boost::spirit::qi;
using namespace boost::spirit::lex;
using namespace boost::spirit::arg_names;

using boost::phoenix::val;

///////////////////////////////////////////////////////////////////////////////
//  Token id definitions
///////////////////////////////////////////////////////////////////////////////
enum token_ids
{
    ID_CONSTANT = 1000,
    ID_IF,
    ID_ELSE,
    ID_WHILE,
    ID_IDENTIFIER
};

///////////////////////////////////////////////////////////////////////////////
//  Token definitions
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example6_tokens : lexer_def<Lexer>
{
    typedef typename Lexer::token_set token_set;

    template <typename Self>
    void def (Self& self)
    {
        // define the tokens to match
        identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
        constant = "[0-9]+";

        // define the whitespace to ignore (spaces, tabs, newlines and C-style
        // comments)
        white_space
            =   token_def<>("[ \\t\\n]+")
            |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
            ;

        // associate the tokens and the token set with the lexer
        self = token_def<>('(') | ')' | '{' | '}' | '=' | ';';

        // Token definitions can be added by using the special add(...)(...)
        // syntactic construct shown below.
        // Note that the token definitions added this way expose the iterator
        // pair pointing to the matched input stream as their attribute.
        self.add
            (constant, ID_CONSTANT)
            ("if", ID_IF)
            ("else", ID_ELSE)
            ("while", ID_WHILE)
            (identifier, ID_IDENTIFIER)
        ;

        // add whitespace tokens to another lexer state (here: "WS")
        self("WS") = white_space;
    }

    // The following two tokens have an associated value type: identifier
    // carries a string (the identifier name) and constant carries the
    // matched integer value.
    //
    // Note: any token value type explicitly specified in a token_def<>
    //       declaration needs to be listed during token type definition as
    //       well (see the typedef for token_type in main() below).
    //
    // The conversion of the matched input to an instance of this type occurs
    // once (on first access), which makes token values as efficient as
    // possible. Moreover, token instances are constructed once by the lexer
    // library. From this point on tokens are passed by reference only,
    // avoiding unnecessary copies.
    token_def<std::string> identifier;
    token_def<unsigned int> constant;

    // token set to be used as the skip parser
    token_set white_space;
};

///////////////////////////////////////////////////////////////////////////////
//  Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example6_grammar
  : grammar<Iterator, in_state_skipper<typename Lexer::token_set> >
{
    template <typename TokenDef>
    example6_grammar(TokenDef const& tok)
      : example6_grammar::base_type(program)
    {
        program
            =  +block
            ;

        block
            =   '{' >> *statement >> '}'
            ;

        statement
            =   assignment
            |   if_stmt
            |   while_stmt
            ;

        assignment
            =   (tok.identifier >> '=' >> expression >> ';')
                [
                    std::cout << val("assignment statement to: ")
                              << _1 << "\n"
                ]
            ;

        if_stmt
            =   (   token(ID_IF) >> '(' >> expression >> ')' >> block
                >> -(token(ID_ELSE) >> block)
                )
                [
                    std::cout << val("if expression: ") << _2 << "\n"
                ]
            ;

        while_stmt
            =   (token(ID_WHILE) >> '(' >> expression >> ')' >> block)
                [
                    std::cout << val("while expression: ") << _2 << "\n"
                ]
            ;

        // since expression has a variant return type accommodating both
        // std::string and unsigned integer, either of the two possible
        // values may be returned to the calling rule
        expression
            =   tok.identifier [ _val = _1 ]
            |   tok.constant [ _val = _1 ]
            ;
    }

    typedef typename Lexer::token_set token_set;

    typedef boost::variant<unsigned int, std::string> expression_type;

    rule<Iterator, in_state_skipper<token_set> > program, block, statement;
    rule<Iterator, in_state_skipper<token_set> > assignment, if_stmt;
    rule<Iterator, in_state_skipper<token_set> > while_stmt;

    // the expression is the only rule having a return value
    rule<Iterator, expression_type(), in_state_skipper<token_set> > expression;
};
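///////////////////////////////////////////////////////////////////////////////
//  The helper read_from_file() used in main() below comes from "example.hpp",
//  which is shared by the lexer examples and is not reproduced here. A
//  minimal sketch of what such a helper might look like (the exact signature
//  and error handling are assumptions, not the verbatim contents of
//  example.hpp):
//
//      inline std::string read_from_file(char const* infile)
//      {
//          std::ifstream instream(infile);
//          if (!instream.is_open()) {
//              std::cerr << "Couldn't open file: " << infile << std::endl;
//              exit(-1);
//          }
//          instream.unsetf(std::ios::skipws);  // keep whitespace intact
//          return std::string(
//              std::istreambuf_iterator<char>(instream.rdbuf()),
//              std::istreambuf_iterator<char>());
//      }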
///////////////////////////////////////////////////////////////////////////////
int main()
{
    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    // This is the lexer token type to use. The second template parameter
    // lists all attribute types used for token_def's during token definition
    // (see example6_tokens<> above). Here we use the predefined lexertl token
    // type, but any compatible token type may be used instead.
    //
    // If you don't list any token value types in the following declaration
    // (or just use the default token type: lexertl_token<base_iterator_type>)
    // it will compile and work just fine, just a bit less efficiently. This
    // is because the token value will be generated from the matched input
    // sequence every time it is requested. But as soon as you specify at
    // least one token value type you'll have to list all value types used
    // for the token_def<> declarations in the token definition class above,
    // otherwise compilation errors will occur.
    typedef lexertl_token<
        base_iterator_type, boost::mpl::vector<unsigned int, std::string>
    > token_type;

    // Here we use the lexertl based lexer engine.
    typedef lexertl_lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example6_tokens<lexer_type> example6_tokens;

    // this is the iterator type exposed by the lexer
    typedef lexer<example6_tokens>::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef example6_grammar<iterator_type, lexer_type> example6_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    example6_tokens tokens;                         // Our token definition
    example6_grammar calc(tokens);                  // Our grammar definition

    lexer<example6_tokens> lex(tokens);             // Our lexer

    // read the input file into memory
    std::string str (read_from_file("example6.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = lex.begin(it, str.end());
    iterator_type end = lex.end();

    // Parsing is done based on the token stream, not the character stream
    // read from the input.
    // Note how we use the token set defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer
    // state for the duration of skipping whitespace.
    std::string ws("WS");
    bool r = phrase_parse(iter, end, calc, in_state(ws)[tokens.white_space]);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}
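///////////////////////////////////////////////////////////////////////////////
//  On a successful run, the semantic actions attached to the rules above
//  print one line per recognized statement, followed by the status banner.
//  Note that actions on nested rules fire as soon as the inner rule matches,
//  so an assignment inside an if or while body prints before the enclosing
//  statement's line. For the input sketched near the top of this file, the
//  output would look roughly like this (the exact lines depend on the actual
//  contents of example6.input):
//
//      assignment statement to: a
//      assignment statement to: b
//      if expression: a
//      assignment statement to: c
//      while expression: b
//      -------------------------
//      Parsing succeeded
//      -------------------------
//
//      Bye... :-)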