mirror of
https://github.com/boostorg/parser.git
synced 2026-01-19 04:22:13 +00:00
using the template parameters directly, since this also requires the user to type parser_interface. Fixes #220.
376 lines
12 KiB
C++
376 lines
12 KiB
C++
// Copyright (C) 2020 T. Zachary Laine
|
|
//
|
|
// Distributed under the Boost Software License, Version 1.0. (See
|
|
// accompanying file LICENSE_1_0.txt or copy at
|
|
// http://www.boost.org/LICENSE_1_0.txt)
|
|
//[ extended_callback_parsing_json_example
|
|
#include <boost/parser/parser.hpp>
|
|
#include <boost/parser/transcode_view.hpp>
|
|
|
|
#include <fstream>
|
|
#include <vector>
|
|
#include <climits>
|
|
|
|
|
|
namespace json {
|
|
|
|
namespace bp = ::boost::parser;
|
|
using namespace bp::literals;
|
|
|
|
template<typename Iter>
|
|
struct excessive_nesting : std::runtime_error
|
|
{
|
|
excessive_nesting(Iter it) :
|
|
runtime_error("excessive_nesting"), iter(it)
|
|
{}
|
|
Iter iter;
|
|
};
|
|
|
|
|
|
struct global_state
|
|
{
|
|
int recursive_open_count = 0;
|
|
int max_recursive_open_count = 0;
|
|
};
|
|
|
|
struct double_escape_locals
|
|
{
|
|
int first_surrogate = 0;
|
|
};
|
|
|
|
|
|
bp::rule<class ws> const ws = "whitespace";
|
|
|
|
bp::rule<class string_char, uint32_t> const string_char =
|
|
"code point (code points <= U+001F must be escaped)";
|
|
bp::rule<class four_hex_digits, uint32_t> const hex_4 =
|
|
"four hexadecimal digits";
|
|
bp::rule<class escape_seq, uint32_t> const escape_seq =
|
|
"\\uXXXX hexadecimal escape sequence";
|
|
bp::rule<class escape_double_seq, uint32_t, double_escape_locals> const
|
|
escape_double_seq = "\\uXXXX hexadecimal escape sequence";
|
|
bp::rule<class single_escaped_char, uint32_t> const single_escaped_char =
|
|
"'\"', '\\', '/', 'b', 'f', 'n', 'r', or 't'";
|
|
|
|
bp::callback_rule<class null_tag> const null = "null";
|
|
|
|
// Since we don't create polymorphic values in this parse, we need to be
|
|
// able to report that we parsed a bool, so we need a callback rule for
|
|
// this.
|
|
bp::callback_rule<class bool_tag, bool> const bool_p = "boolean";
|
|
|
|
bp::callback_rule<class string_tag, std::string> const string = "string";
|
|
bp::callback_rule<class number_tag, double> const number = "number";
|
|
|
|
// object_element is broken up into the key (object_element_key) and the
|
|
// whole thing (object_element). This was done because the value after
|
|
// the ':' may have many parts. It may be an array, for example. This
|
|
// implies that we need to report that we have the string part of the
|
|
// object-element, and that the rest -- the value -- is coming.
|
|
bp::callback_rule<class object_element_key_tag, std::string> const
|
|
object_element_key = "string";
|
|
bp::rule<class object_element_tag> const object_element = "object-element";
|
|
|
|
// object gets broken up too, to enable the reporting of the beginning and
|
|
// end of the object when '{' or '}' is parsed, respectively. The same
|
|
// thing is done for array, below.
|
|
bp::callback_rule<class object_open_tag> const object_open = "'{'";
|
|
bp::callback_rule<class object_close_tag> const object_close = "'}'";
|
|
bp::rule<class object_tag> const object = "object";
|
|
|
|
bp::callback_rule<class array_open_tag> const array_open = "'['";
|
|
bp::callback_rule<class array_close_tag> const array_close = "']'";
|
|
bp::rule<class array_tag> const array = "array";
|
|
|
|
// value no longer produces an attribute, and it has no callback either.
|
|
// Each individual possible kind of value (string, array, etc.) gets
|
|
// reported separately.
|
|
bp::rule<class value_tag> const value = "value";
|
|
|
|
|
|
// Since we use these tag types as function parameters in the callbacks,
|
|
// they need to be complete types.
|
|
class null_tag {};
|
|
class bool_tag {};
|
|
class string_tag {};
|
|
class number_tag {};
|
|
class object_element_key_tag {};
|
|
class object_open_tag {};
|
|
class object_close_tag {};
|
|
class array_open_tag {};
|
|
class array_close_tag {};
|
|
|
|
|
|
auto const ws_def = '\x09'_l | '\x0a' | '\x0d' | '\x20';
|
|
|
|
auto first_hex_escape = [](auto & ctx) {
|
|
auto & locals = _locals(ctx);
|
|
uint32_t const cu = _attr(ctx);
|
|
if (!boost::parser::detail::text::high_surrogate(cu))
|
|
_pass(ctx) = false;
|
|
else
|
|
locals.first_surrogate = cu;
|
|
};
|
|
auto second_hex_escape = [](auto & ctx) {
|
|
auto & locals = _locals(ctx);
|
|
uint32_t const cu = _attr(ctx);
|
|
if (!boost::parser::detail::text::low_surrogate(cu)) {
|
|
_pass(ctx) = false;
|
|
} else {
|
|
uint32_t const high_surrogate_min = 0xd800;
|
|
uint32_t const low_surrogate_min = 0xdc00;
|
|
uint32_t const surrogate_offset =
|
|
0x10000 - (high_surrogate_min << 10) - low_surrogate_min;
|
|
uint32_t const first_cu = locals.first_surrogate;
|
|
_val(ctx) = (first_cu << 10) + cu + surrogate_offset;
|
|
}
|
|
};
|
|
|
|
auto const hex_4_def = boost::parser::uint_.base<16>().digits<4>();
|
|
|
|
auto const escape_seq_def = "\\u" > hex_4;
|
|
|
|
auto const escape_double_seq_def =
|
|
escape_seq[first_hex_escape] >> escape_seq[second_hex_escape];
|
|
|
|
bp::symbols<uint32_t> const single_escaped_char_def = {
|
|
{"\"", 0x0022u},
|
|
{"\\", 0x005cu},
|
|
{"/", 0x002fu},
|
|
{"b", 0x0008u},
|
|
{"f", 0x000cu},
|
|
{"n", 0x000au},
|
|
{"r", 0x000du},
|
|
{"t", 0x0009u}};
|
|
|
|
auto const string_char_def = escape_double_seq | escape_seq |
|
|
('\\'_l > single_escaped_char) |
|
|
(bp::cp - bp::char_(0x0000u, 0x001fu));
|
|
|
|
auto const null_def = "null"_l;
|
|
|
|
auto const bool_p_def = bp::bool_;
|
|
|
|
auto const string_def = bp::lexeme['"' >> *(string_char - '"') > '"'];
|
|
|
|
auto parse_double = [](auto & ctx) {
|
|
auto const cp_range = _attr(ctx);
|
|
auto cp_first = cp_range.begin();
|
|
auto const cp_last = cp_range.end();
|
|
|
|
auto const result = bp::prefix_parse(cp_first, cp_last, bp::double_);
|
|
if (result) {
|
|
_val(ctx) = *result;
|
|
} else {
|
|
// This would be more efficient if we used
|
|
// boost::container::small_vector, or std::inplace_vector from
|
|
// C++26.
|
|
std::vector<char> chars(cp_first, cp_last);
|
|
auto const chars_first = &*chars.begin();
|
|
auto chars_last = chars_first + chars.size();
|
|
_val(ctx) = std::strtod(chars_first, &chars_last);
|
|
}
|
|
};
|
|
|
|
auto const number_def =
|
|
bp::raw[bp::lexeme
|
|
[-bp::char_('-') >>
|
|
(bp::char_('1', '9') >> *bp::digit | bp::char_('0')) >>
|
|
-(bp::char_('.') >> +bp::digit) >>
|
|
-(bp::char_("eE") >> -bp::char_("+-") >> +bp::digit)]]
|
|
[parse_double];
|
|
|
|
// The object_element_key parser is exactly the same as the string parser.
|
|
// Note that we did *not* use string here, though; we used string_def. If
|
|
// we had used string, its callback would have been called first, and
|
|
// worse still, since it moves its attribute, the callback for
|
|
// object_element_key would always report the empty string, because the
|
|
// string callback would have consumed it first.
|
|
auto const object_element_key_def = string_def;
|
|
|
|
auto const object_element_def = object_element_key > ':' > value;
|
|
|
|
// This is a very straightforward way to write object_def when we know we
|
|
// don't care about attribute-generating (non-callback) parsing. If we
|
|
// wanted to support both modes in one parser definition, we could have
|
|
// written:
|
|
// auto const object_open_def = eps;
|
|
// auto const object_close_def = eps;
|
|
// auto const object_def = '{' >> object_open >>
|
|
// -(object_element % ',') >
|
|
// '}' >> object_close;
|
|
auto const object_open_def = '{'_l;
|
|
auto const object_close_def = '}'_l;
|
|
auto const object_def = object_open >>
|
|
-(object_element % ',') > object_close;
|
|
|
|
auto const array_open_def = '['_l;
|
|
auto const array_close_def = ']'_l;
|
|
auto const array_def = array_open >> -(value % ',') > array_close;
|
|
|
|
auto const value_def = number | bool_p | null | string | array | object;
|
|
|
|
BOOST_PARSER_DEFINE_RULES(
|
|
ws,
|
|
hex_4,
|
|
escape_seq,
|
|
escape_double_seq,
|
|
single_escaped_char,
|
|
string_char,
|
|
null,
|
|
bool_p,
|
|
string,
|
|
number,
|
|
object_element_key,
|
|
object_element,
|
|
object_open,
|
|
object_close,
|
|
object,
|
|
array_open,
|
|
array_close,
|
|
array,
|
|
value);
|
|
|
|
// The parse function loses its attribute from the return type; now the
|
|
// return type is just bool.
|
|
template<typename Callbacks>
|
|
bool parse(
|
|
std::string_view str,
|
|
std::string_view filename,
|
|
Callbacks const & callbacks,
|
|
int max_recursion = 512)
|
|
{
|
|
auto const range = boost::parser::as_utf32(str);
|
|
using iter_t = decltype(range.begin());
|
|
|
|
if (max_recursion <= 0)
|
|
max_recursion = INT_MAX;
|
|
|
|
global_state globals{0, max_recursion};
|
|
// This is a different error handler from the json.cpp example, just
|
|
// to show different options.
|
|
bp::stream_error_handler error_handler(filename);
|
|
auto const parser = bp::with_error_handler(
|
|
bp::with_globals(value, globals), error_handler);
|
|
|
|
try {
|
|
// This is identical to the parse() call in json.cpp, except that
|
|
// it is callback_parse() instead, and it takes the callbacks
|
|
// parameter.
|
|
return bp::callback_parse(range, parser, ws, callbacks);
|
|
} catch (excessive_nesting<iter_t> const & e) {
|
|
std::string const message = "error: Exceeded maximum number (" +
|
|
std::to_string(max_recursion) +
|
|
") of open arrays and/or objects";
|
|
bp::write_formatted_message(
|
|
std::cout,
|
|
filename,
|
|
range.begin(),
|
|
e.iter,
|
|
range.end(),
|
|
message);
|
|
}
|
|
|
|
return {};
|
|
}
|
|
|
|
}
|
|
|
|
std::string file_slurp(std::ifstream & ifs)
|
|
{
|
|
std::string retval;
|
|
while (ifs) {
|
|
char const c = ifs.get();
|
|
retval += c;
|
|
}
|
|
if (!retval.empty() && retval.back() == -1)
|
|
retval.pop_back();
|
|
return retval;
|
|
}
|
|
|
|
// This is our callbacks-struct. It has a callback for each of the kinds of
|
|
// callback rules in our parser. If one were missing, you'd get a pretty
|
|
// nasty template instantiation error. Note that these are all const members;
|
|
// callback_parse() takes the callbacks object by constant reference.
|
|
struct json_callbacks
|
|
{
|
|
void operator()(json::null_tag) const { std::cout << "JSON null value\n"; }
|
|
void operator()(json::bool_tag, bool b) const
|
|
{
|
|
indent();
|
|
std::cout << "JSON bool " << (b ? "true" : "false") << "\n";
|
|
}
|
|
void operator()(json::string_tag, std::string s) const
|
|
{
|
|
indent();
|
|
std::cout << "JSON string \"" << s << "\"\n";
|
|
}
|
|
void operator()(json::number_tag, double d) const
|
|
{
|
|
indent();
|
|
std::cout << "JSON number " << d << "\n";
|
|
}
|
|
void operator()(json::object_element_key_tag, std::string key) const
|
|
{
|
|
indent();
|
|
std::cout << "JSON object element with key \"" << key
|
|
<< "\" and value...\n";
|
|
}
|
|
void operator()(json::object_open_tag) const
|
|
{
|
|
indent(1);
|
|
std::cout << "Beginning of JSON object.\n";
|
|
}
|
|
void operator()(json::object_close_tag) const
|
|
{
|
|
indent(-1);
|
|
std::cout << "End of JSON object.\n";
|
|
}
|
|
void operator()(json::array_open_tag) const
|
|
{
|
|
indent(1);
|
|
std::cout << "Beginning of JSON array.\n";
|
|
}
|
|
void operator()(json::array_close_tag) const
|
|
{
|
|
indent(-1);
|
|
std::cout << "End of JSON array.\n";
|
|
}
|
|
|
|
void indent(int level_bump = 0) const
|
|
{
|
|
if (level_bump < 0)
|
|
indent_.resize(indent_.size() - 2);
|
|
std::cout << indent_;
|
|
if (0 < level_bump)
|
|
indent_ += " ";
|
|
}
|
|
mutable std::string indent_;
|
|
};
|
|
|
|
int main(int argc, char * argv[])
|
|
{
|
|
if (argc < 2) {
|
|
std::cerr << "A filename to parse is required.\n";
|
|
exit(1);
|
|
}
|
|
|
|
std::ifstream ifs(argv[1]);
|
|
if (!ifs) {
|
|
std::cerr << "Unable to read file '" << argv[1] << "'.\n";
|
|
exit(1);
|
|
}
|
|
|
|
std::string const file_contents = file_slurp(ifs);
|
|
bool success = json::parse(file_contents, argv[1], json_callbacks{});
|
|
if (success) {
|
|
std::cout << "Parse successful!\n";
|
|
} else {
|
|
std::cerr << "Parse failure.\n";
|
|
exit(1);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
//]
|