From d2fc6305bbf51c046dae0919c47979288992cf0f Mon Sep 17 00:00:00 2001 From: Zach Laine Date: Sun, 7 May 2017 15:28:50 -0500 Subject: [PATCH] Another large refactor, this time to support UTF-8 streams. --- test/basic_structures.cpp | 2 +- test/block_style.cpp | 2 +- test/characters.cpp | 2 +- test/flow_style.cpp | 2 +- test/stream.cpp | 2 +- yaml/parser/basic_structures.hpp | 13 ++-- yaml/parser/basic_structures_def.hpp | 33 ++++++++-- yaml/parser/block_styles.hpp | 2 +- yaml/parser/block_styles_def.hpp | 7 +- yaml/parser/characters.hpp | 66 +++++++++++++++---- yaml/parser/characters_def.hpp | 96 ++++++++++------------------ yaml/parser/error_handler.hpp | 23 ++++--- yaml/parser/flow_styles.hpp | 12 ++-- yaml/parser/flow_styles_def.hpp | 44 +++++++------ yaml/parser/stream.hpp | 15 +++-- yaml/parser/stream_def.hpp | 60 +++++++++++------ 16 files changed, 227 insertions(+), 154 deletions(-) diff --git a/test/basic_structures.cpp b/test/basic_structures.cpp index f9e60116..5a8aee40 100644 --- a/test/basic_structures.cpp +++ b/test/basic_structures.cpp @@ -10,5 +10,5 @@ #include #include -typedef std::string::const_iterator char_iterator_t; +using char_iterator_t = yaml::parser::ustring_t::const_iterator; template struct yaml::parser::basic_structures_t; diff --git a/test/block_style.cpp b/test/block_style.cpp index 65d9e16a..cf80834c 100644 --- a/test/block_style.cpp +++ b/test/block_style.cpp @@ -10,5 +10,5 @@ #include #include -typedef std::string::const_iterator char_iterator_t; +using char_iterator_t = yaml::parser::ustring_t::const_iterator; template struct yaml::parser::block_styles_t; diff --git a/test/characters.cpp b/test/characters.cpp index 5c7d46cf..fc0a235d 100644 --- a/test/characters.cpp +++ b/test/characters.cpp @@ -10,5 +10,5 @@ #include #include -typedef std::string::const_iterator char_iterator_t; +using char_iterator_t = yaml::parser::ustring_t::const_iterator; template struct yaml::parser::characters_t; diff --git a/test/flow_style.cpp b/test/flow_style.cpp index 9c3de910..c06873d9 100644 --- a/test/flow_style.cpp +++ b/test/flow_style.cpp @@ -10,5 +10,5 @@ #include #include -typedef std::string::const_iterator char_iterator_t; +using char_iterator_t = yaml::parser::ustring_t::const_iterator; template struct yaml::parser::flow_styles_t; diff --git a/test/stream.cpp b/test/stream.cpp index 7def0acc..344afc2e 100644 --- a/test/stream.cpp +++ b/test/stream.cpp @@ -10,5 +10,5 @@ #include #include -typedef std::string::const_iterator char_iterator_t; +using char_iterator_t = yaml::parser::ustring_t::const_iterator; template struct yaml::parser::stream_t; diff --git a/yaml/parser/basic_structures.hpp b/yaml/parser/basic_structures.hpp index b9133715..22047270 100644 --- a/yaml/parser/basic_structures.hpp +++ b/yaml/parser/basic_structures.hpp @@ -44,6 +44,7 @@ namespace yaml { namespace parser { struct basic_structures_t { using iterator_t = pos_iterator; + using iter_range_t = uchar_range; explicit basic_structures_t (boost::phoenix::function const & error_handler); @@ -52,12 +53,12 @@ namespace yaml { namespace parser { qi::rule indent; // indent exactly n spaces qi::rule indent_lt; // indent <= n spaces qi::rule indent_le; // indent < n spaces - qi::rule separate_in_line; + qi::rule separate_in_line; qi::rule line_prefix; qi::rule l_empty; qi::rule b_l_folded; qi::rule flow_folded; - qi::rule comment_text; + qi::rule comment_text; qi::rule s_b_comment; qi::rule l_comment; qi::rule s_l_comments; @@ -74,12 +75,12 @@ namespace yaml { namespace parser { qi::rule< iterator_t, ast::properties_t(int, context_t), - qi::locals + qi::locals > properties; - qi::rule tag_property; - qi::rule anchor_property; - qi::rule anchor_name; + qi::rule tag_property; + qi::rule anchor_property; + qi::rule anchor_name; qi::rule one_time_eoi; diff --git a/yaml/parser/basic_structures_def.hpp b/yaml/parser/basic_structures_def.hpp index acf06760..d040f103 100644 --- a/yaml/parser/basic_structures_def.hpp +++ b/yaml/parser/basic_structures_def.hpp @@ -84,6 +84,21 @@ namespace yaml { namespace parser { } }; + struct to_str + { + template + struct result { using type = std::string; }; + + template + std::string operator() (boost::iterator_range> range) const + { + using iterator_t = typename boost::iterator_range>::iterator; + boost::u32_to_u8_iterator first(range.begin()); + boost::u32_to_u8_iterator last(range.end()); + return std::string(first, last); + } + }; + } template @@ -92,7 +107,7 @@ namespace yaml { namespace parser { { qi::attr_type attr; qi::uint_type uint_; - qi::char_type char_; + qi::unicode::char_type char_; qi::_val_type _val; qi::_1_type _1; qi::_r1_type _r1; @@ -102,7 +117,7 @@ namespace yaml { namespace parser { qi::_b_type _b; qi::lit_type lit; qi::blank_type blank; - qi::alnum_type alnum; + qi::unicode::alnum_type alnum; qi::eol_type eol; qi::eoi_type eoi; qi::eps_type eps; @@ -123,6 +138,7 @@ namespace yaml { namespace parser { function check_yaml_version; function check_start_of_line; function first_time_eoi; + function to_str; // 6.1. Indentation Spaces @@ -171,7 +187,7 @@ namespace yaml { namespace parser { eol >> (eps(!_r3) | !(lit("...") | "---")) >> *l_empty(_r1, _r2) // b-l-trimmed [71] - >> attr('\n') + >> attr(' ') ; // [74] @@ -269,18 +285,21 @@ namespace yaml { namespace parser { // 6.9 Node Properties // [96] + // TODO: Defer construction of an ast::properties_t until we know + // we'll keep it. properties = ( tag_property[_a = _1] >> -(separate(_r1, _r2) >> anchor_property[_b = *_1]) | anchor_property[_b = _1] >> -(separate(_r1, _r2) >> tag_property[_a = *_1]) ) - [_val = construct(_a, _b)] + [_val = construct(to_str(_a), to_str(_b))] ; // [97] - tag_property %= + tag_property = raw[ lit('!') >> "<" > +uri_char > ">" // verbatim_tag [98] | tag_handle >> +tag_char // shorthand_tag [99] | '!' // non_specific_tag [100] + ] ; // [22] @@ -298,7 +317,7 @@ namespace yaml { namespace parser { //foo: // *a: anchor_property = - '&' >> +(ns_char - indicator) + '&' >> raw[+(ns_char - indicator)] ; // [102] @@ -306,7 +325,7 @@ namespace yaml { namespace parser { // [103] anchor_name = - +(ns_char - char_(",[]{}")) + raw[+(ns_char - char_(",[]{}"))] ; one_time_eoi = diff --git a/yaml/parser/block_styles.hpp b/yaml/parser/block_styles.hpp index 7ea34686..16e1a07e 100644 --- a/yaml/parser/block_styles.hpp +++ b/yaml/parser/block_styles.hpp @@ -67,7 +67,7 @@ namespace yaml { namespace parser { qi::rule chomped_empty; qi::rule strip_empty; qi::rule keep_empty; - qi::rule> trail_comments; + qi::rule> trail_comments; qi::rule< iterator_t, diff --git a/yaml/parser/block_styles_def.hpp b/yaml/parser/block_styles_def.hpp index 6673fa89..14114281 100644 --- a/yaml/parser/block_styles_def.hpp +++ b/yaml/parser/block_styles_def.hpp @@ -77,6 +77,7 @@ namespace yaml { namespace parser { phx::function handle_properties; phx::function chomping; phx::function indentation; + phx::function push_utf8; phx::function seq_spaces; // [201] auto ins = phx::insert(_val, _1); auto pb = phx::push_back(_val, _1); @@ -166,7 +167,7 @@ namespace yaml { namespace parser { // [171] literal_text = - *l_empty(_r1, context_t::block_in) >> indent(_r1) >> +nb_char + *l_empty(_r1, context_t::block_in) >> indent(_r1) >> +nb_char[push_utf8(_val, _1)] ; // [172] @@ -191,7 +192,7 @@ namespace yaml { namespace parser { // [175] folded_text = - indent(_r1) >> ns_char >> *nb_char + indent(_r1) >> ns_char[push_utf8(_val, _1)] >> *nb_char[push_utf8(_val, _1)] ; // [176] @@ -201,7 +202,7 @@ namespace yaml { namespace parser { // [177] spaced_text = - indent(_r1) >> blank >> *nb_char + indent(_r1) >> blank[_val = _1] >> *nb_char[push_utf8(_val, _1)] ; // [178] diff --git a/yaml/parser/characters.hpp b/yaml/parser/characters.hpp index 09ffb787..0de4e061 100644 --- a/yaml/parser/characters.hpp +++ b/yaml/parser/characters.hpp @@ -10,13 +10,13 @@ #define BOOST_SPIRIT_NO_PREDEFINED_TERMINALS #define BOOST_SPIRIT_ACTIONS_ALLOW_ATTR_COMPAT +#define BOOST_SPIRIT_UNICODE #include #include #include #include -#include #include @@ -90,9 +90,55 @@ namespace yaml { namespace parser { } using uchar_t = boost::uint32_t; // Unicode code point + using ustring_t = std::basic_string; // UCS-4 Unicode string + + // TODO: Guard the use of this with a macro, and only use it in debug + // builds; after committing it like that, remove it entirely. + struct parsed_uchar_t + { + parsed_uchar_t (uchar_t c = 0) : value_ (c) {} + uchar_t value_; + + friend std::ostream & operator<< (std::ostream & os, parsed_uchar_t c) + { + std::string utf8; + using insert_iterator_t = std::back_insert_iterator; + insert_iterator_t out_it(utf8); + boost::utf8_output_iterator utf8_it(out_it); + *utf8_it++ = c.value_; + return os << utf8; + } + }; template - using pos_iterator = boost::spirit::classic::position_iterator; + using uchar_range = boost::iterator_range>; + + namespace detail { + + struct push_utf8 + { + template + struct result { using type = void; }; + + void operator() (std::string & utf8, parsed_uchar_t code_point) const + { + using insert_iterator_t = std::back_insert_iterator; + insert_iterator_t out_it(utf8); + boost::utf8_output_iterator utf8_it(out_it); + *utf8_it++ = code_point.value_; + } + + void operator() (std::string & utf8, parsed_uchar_t code_point_1, parsed_uchar_t code_point_2) const + { + using insert_iterator_t = std::back_insert_iterator; + insert_iterator_t out_it(utf8); + boost::utf8_output_iterator utf8_it(out_it); + *utf8_it++ = code_point_1.value_; + *utf8_it++ = code_point_2.value_; + } + }; + + } template struct characters_t @@ -101,16 +147,14 @@ namespace yaml { namespace parser { characters_t (); - qi::rule full_bom; - - qi::rule printable; - qi::rule nb_json; + qi::rule printable; + qi::rule nb_json; qi::rule bom; - qi::rule nb_char; - qi::rule ns_char; - qi::rule uri_char; - qi::rule tag_char; - qi::rule esc_char; + qi::rule nb_char; + qi::rule ns_char; + qi::rule uri_char; + qi::rule tag_char; + qi::rule esc_char; }; } } diff --git a/yaml/parser/characters_def.hpp b/yaml/parser/characters_def.hpp index 2d260033..08dd690c 100644 --- a/yaml/parser/characters_def.hpp +++ b/yaml/parser/characters_def.hpp @@ -15,54 +15,40 @@ #include #include #include +#include namespace yaml { namespace parser { namespace detail { - struct push_utf8 - { - template - struct result { using type = void; }; - - void operator() (std::string & utf8, uchar_t code_point) const - { - using insert_iter = std::back_insert_iterator; - insert_iter out_iter(utf8); - boost::utf8_output_iterator utf8_iter(out_iter); - *utf8_iter++ = code_point; - } - }; - struct push_esc { - template + template struct result { using type = void; }; - void operator() (std::string & utf8, uchar_t c) const + void operator() (parsed_uchar_t & to, uchar_t from) const { - switch (c) + switch (from) { - case ' ': utf8 += ' '; break; - case '\t': utf8 += '\t'; break; - case '0': utf8 += '\0'; break; - case 'a': utf8 += 0x7; break; - case 'b': utf8 += 0x8; break; - case 't': utf8 += '\t'; break; - case 'n': utf8 += 0xa; break; - case 'v': utf8 += '\v'; break; - case 'f': utf8 += '\f'; break; - case 'r': utf8 += '\r'; break; - case 'e': utf8 += 0x1b; break; - case '"': utf8 += '"'; break; - case '/': utf8 += '/'; break; - case '\\': utf8 += '\\'; break; - - case '_': push_utf8()(utf8, 0xa0); break; - case 'N': push_utf8()(utf8, 0x85); break; - case 'L': push_utf8()(utf8, 0x2028); break; - case 'P': push_utf8()(utf8, 0x2029); break; + case ' ': to.value_ = ' '; break; + case '\t': to.value_ = '\t'; break; + case '0': to.value_ = '\0'; break; + case 'a': to.value_ = 0x7; break; + case 'b': to.value_ = 0x8; break; + case 't': to.value_ = '\t'; break; + case 'n': to.value_ = 0xa; break; + case 'v': to.value_ = '\v'; break; + case 'f': to.value_ = '\f'; break; + case 'r': to.value_ = '\r'; break; + case 'e': to.value_ = 0x1b; break; + case '"': to.value_ = '"'; break; + case '/': to.value_ = '/'; break; + case '\\': to.value_ = '\\'; break; + case '_': to.value_ = 0xa0; break; + case 'N': to.value_ = 0x85; break; + case 'L': to.value_ = 0x2028; break; + case 'P': to.value_ = 0x2029; break; } } }; @@ -72,55 +58,43 @@ namespace yaml { namespace parser { template characters_t::characters_t () { - qi::byte_type byte_; - qi::char_type char_; + qi::unicode::char_type char_; qi::_val_type _val; qi::_1_type _1; - qi::lit_type lit; qi::blank_type blank; qi::eol_type eol; - qi::alnum_type alnum; + qi::unicode::alnum_type alnum; qi::hex_type hex; qi::eps_type eps; namespace phx = boost::phoenix; using qi::copy; using qi::uint_parser; + using phx::construct; using phx::function; using phx::ref; uint_parser hex4; uint_parser hex8; - function push_utf8; function push_esc; // 5.2. Character Encodings - full_bom = - byte_('\x00') >> byte_('\x00') >> byte_('\xfe') >> byte_('\xff') [_val = encoding_t::utf32_be] - | byte_('\x00') >> byte_('\x00') >> byte_('\x00') >> byte_ [_val = encoding_t::utf32_be] - | byte_('\xff') >> byte_('\xfe') >> byte_('\x00') >> byte_('\x00') [_val = encoding_t::utf32_le] - | byte_ >> byte_('\x00') >> byte_('\x00') >> byte_('\x00') [_val = encoding_t::utf32_le] - | byte_('\xfe') >> byte_('\xff') [_val = encoding_t::utf16_be] - | byte_('\x00') >> byte_ [_val = encoding_t::utf16_be] - | byte_('\xff') >> byte_('\xfe') [_val = encoding_t::utf16_le] - | byte_ >> byte_('\x00') [_val = encoding_t::utf16_le] - | byte_('\xef') >> byte_('\xbb') >> byte_('\xbf') [_val = encoding_t::utf8] - ; - // [1] printable = - char_("\t\n\f\x20-\x7e") // TODO: This should properly have other chars in it once we can parse UTf8. + char_("\t\n\f") | char_(U'\x20', U'\x7e') // 8 bit + | char_(U'\x85') | char_(U'\xa0', U'\ud7ff') | char_(U'\ue000', U'\ufffd') // 16 bit + | char_(U'\U00010000', U'\U0010ffff') // 32 bit ; // [2] nb_json = - char_("\t\x20-\x7e") // TODO: This should properly be \t | anything >= \x20 + char_(U'\t') | char_(U'\x20', U'\U0010ffff') ; // [3] bom = - byte_('\xfe') >> byte_('\xff') + char_(0xfeff) ; // 5.4. Line Break Characters @@ -140,11 +114,11 @@ namespace yaml { namespace parser { // 5.6. Miscellaneous Characters // [38] - auto word_char = copy(alnum | char_("-")); + auto word_char = copy(alnum | char_('-')); // [39] uri_char = - char_("%") >> hex[push_utf8(_val, _1)] + char_('%') >> hex | word_char | char_("#;/?:@&=+$,_.!~*'()[]") ; @@ -160,9 +134,9 @@ namespace yaml { namespace parser { esc_char = '\\' >> ( - ('x' > hex) [push_utf8(_val, _1)] - | ('u' > hex4) [push_utf8(_val, _1)] - | ('U' > hex8) [push_utf8(_val, _1)] + ('x' > hex) [_val = construct(_1)] + | ('u' > hex4) [_val = construct(_1)] + | ('U' > hex8) [_val = construct(_1)] | char_("0abtnvfre\"/\\N_LP\t ") [push_esc(_val, _1)] ) ; diff --git a/yaml/parser/error_handler.hpp b/yaml/parser/error_handler.hpp index cfd4d34c..f999fd16 100644 --- a/yaml/parser/error_handler.hpp +++ b/yaml/parser/error_handler.hpp @@ -13,6 +13,8 @@ #include #include #include +#include + #include #include @@ -34,6 +36,9 @@ namespace yaml { namespace parser { std::string msg_; }; + template + using pos_iterator = boost::spirit::classic::position_iterator; + struct error_handler_t { template @@ -67,27 +72,29 @@ namespace yaml { namespace parser { warning_fn_(msg); } - template + template void operator() ( - Iter first, - Iter last, - Iter err_pos, + pos_iterator first, + pos_iterator last, + pos_iterator err_pos, boost::spirit::info const & what ) const { - Iter line_start = boost::spirit::get_line_start(first, err_pos); + using iterator_t = pos_iterator; + + iterator_t line_start = boost::spirit::get_line_start(first, err_pos); std::string error_line; if (line_start != last && *line_start == '\r') ++line_start; if (line_start != last && *line_start == '\n') ++line_start; - for (Iter it = line_start; it != last; ++it) { - typename Iter::value_type c = *it; + for (iterator_t it = line_start; it != last; ++it) { + typename iterator_t::value_type c = *it; if (c == '\r' || c == '\n') break; error_line += c; } - typename Iter::position_t const pos = err_pos.get_position(); + typename iterator_t::position_t const pos = err_pos.get_position(); int const line = pos.line; int const column = pos.column; diff --git a/yaml/parser/flow_styles.hpp b/yaml/parser/flow_styles.hpp index 903ceb4f..f02e32a6 100644 --- a/yaml/parser/flow_styles.hpp +++ b/yaml/parser/flow_styles.hpp @@ -56,8 +56,8 @@ namespace yaml { namespace parser { qi::rule alias_node; - qi::rule nb_double_char; - qi::rule ns_double_char; + qi::rule nb_double_char; + qi::rule ns_double_char; qi::rule double_quoted; qi::rule double_text; qi::rule double_escaped; @@ -66,16 +66,16 @@ namespace yaml { namespace parser { qi::rule double_next_line; qi::rule double_multi_line; - qi::rule nb_single_char; - qi::rule ns_single_char; + qi::rule nb_single_char; + qi::rule ns_single_char; qi::rule single_quoted; qi::rule single_text; qi::rule single_in_line; qi::rule single_next_line; qi::rule single_multi_line; - qi::rule plain_first; - qi::rule plain_safe; + qi::rule plain_first; + qi::rule plain_safe; qi::rule plain_char; qi::rule plain; qi::rule plain_in_line; diff --git a/yaml/parser/flow_styles_def.hpp b/yaml/parser/flow_styles_def.hpp index fc2e1f3a..2f307be2 100644 --- a/yaml/parser/flow_styles_def.hpp +++ b/yaml/parser/flow_styles_def.hpp @@ -51,9 +51,10 @@ namespace yaml { namespace parser { qi::attr_type attr; qi::omit_type omit; qi::hold_type hold; - qi::char_type char_; + qi::unicode::char_type char_; qi::_val_type _val; qi::_1_type _1; + qi::_2_type _2; qi::_a_type _a; qi::_r1_type _r1; qi::_r2_type _r2; @@ -68,6 +69,7 @@ namespace yaml { namespace parser { using phx::construct; phx::function handle_properties; + phx::function push_utf8; auto ins = phx::insert(_val, _1); #ifdef BOOST_SPIRIT_DEBUG @@ -118,16 +120,16 @@ namespace yaml { namespace parser { // [110] double_text = eps(_r2 == context_t::flow_out || _r2 == context_t::flow_in) - >> double_multi_line(_r1) + >> double_multi_line(_r1)[_val = _1] | eps(_r2 == context_t::block_key || _r2 == context_t::flow_key) - >> *nb_double_char // double-one-line [111] + >> *nb_double_char[push_utf8(_val, _1)] // double-one-line [111] ; // [112] double_escaped = hold[ *blank >> '\\' - >> eol + >> omit[eol] >> *l_empty(_r1, context_t::flow_in) >> line_prefix(_r1, context_t::flow_in) ] @@ -140,16 +142,16 @@ namespace yaml { namespace parser { // [114] double_in_line = - *hold[*blank >> ns_double_char] + *hold[*blank[_val += _1] >> ns_double_char[push_utf8(_val, _1)]] ; // [115] double_next_line = hold[ - double_break(_r1) + double_break(_r1)[_val == _1] >> -hold[ - ns_double_char - >> double_in_line - >> (double_next_line(_r1) | *blank) + ns_double_char[push_utf8(_val, _1)] + >> double_in_line[_val += _1] + >> (double_next_line(_r1)[_val += _1] | *blank[_val += _1]) ] ] ; @@ -179,23 +181,23 @@ namespace yaml { namespace parser { // [121] single_text = eps(_r2 == context_t::flow_out || _r2 == context_t::flow_in) - >> single_multi_line(_r1) + >> single_multi_line(_r1)[_val = _1] | eps(_r2 == context_t::block_key || _r2 == context_t::flow_key) - >> *nb_single_char + >> *nb_single_char[push_utf8(_val, _1)] ; // [123] single_in_line = - *hold[*blank >> ns_single_char] + *hold[*blank[_val += _1] >> ns_single_char[push_utf8(_val, _1)]] ; // [124] single_next_line = hold[ - flow_folded(_r1, false) + flow_folded(_r1, false)[_val = _1] >> -hold[ - ns_single_char - >> single_in_line - >> (single_next_line(_r1) | *blank) + ns_single_char[push_utf8(_val, _1)] + >> single_in_line[_val += _1] + >> (single_next_line(_r1)[_val += _1] | *blank[_val += _1]) ] ] ; @@ -212,8 +214,8 @@ namespace yaml { namespace parser { // [126] plain_first = - (ns_char - indicator) - | hold[char_("?:-") >> plain_safe(_r1)] + (ns_char - indicator)[push_utf8(_val, _1)] + | hold[char_("?:-") >> plain_safe(_r1)[push_utf8(_val, _1)]] ; // [127] @@ -226,9 +228,9 @@ namespace yaml { namespace parser { // [130] plain_char = - hold[ns_char >> char_('#')] - | hold[char_(':') >> plain_safe(_r1)] - | plain_safe(_r1) - char_(":#") + (ns_char >> char_('#'))[push_utf8(_val, _1, _2)] + | (char_(':') >> plain_safe(_r1))[push_utf8(_val, _1, _2)] + | (plain_safe(_r1) - char_(":#"))[push_utf8(_val, _1)] ; // [131] diff --git a/yaml/parser/stream.hpp b/yaml/parser/stream.hpp index 8b7114d6..36282ba1 100644 --- a/yaml/parser/stream.hpp +++ b/yaml/parser/stream.hpp @@ -8,15 +8,13 @@ #ifndef YAML_PARSER_STREAM_HPP #define YAML_PARSER_STREAM_HPP -#define BOOST_SPIRIT_NO_PREDEFINED_TERMINALS - -#include - - +// TODO: Create a macro for YAML_HEADER_ONLY_INLINE #ifndef YAML_HEADER_ONLY #define YAML_HEADER_ONLY 1 #endif +#include + namespace yaml { namespace parser { @@ -53,7 +51,12 @@ namespace yaml { namespace parser { encoding_t read_bom (std::istream & is); template - encoding_t read_bom (pos_iterator & first, pos_iterator last); + encoding_t read_bom (pos_iterator & first, pos_iterator last, + typename std::enable_if::type* = 0); + + template + encoding_t read_bom (pos_iterator & first, pos_iterator last, + typename std::enable_if::type* = 0); #if YAML_HEADER_ONLY inline diff --git a/yaml/parser/stream_def.hpp b/yaml/parser/stream_def.hpp index 94f53d4c..be8a26bb 100644 --- a/yaml/parser/stream_def.hpp +++ b/yaml/parser/stream_def.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace yaml { namespace parser { @@ -69,7 +70,7 @@ namespace yaml { namespace parser { auto pb = phx::push_back(_val, _1); - auto & full_bom = block_styles_.flow_styles_.basic_structures_.characters_.full_bom; + auto & bom = block_styles_.flow_styles_.basic_structures_.characters_.bom; auto & directive = block_styles_.flow_styles_.basic_structures_.directive; auto & l_comment = block_styles_.flow_styles_.basic_structures_.l_comment; @@ -80,7 +81,7 @@ namespace yaml { namespace parser { // [202] document_prefix = - !full_bom // BOM is read prior to each document. + !bom // BOM is read prior to each document. >> eps[_a = eoi_state_t::not_at_end] >> +l_comment(_a) >> eps(_a == eoi_state_t::not_at_end) ; @@ -92,26 +93,26 @@ namespace yaml { namespace parser { // [206] forbidden = raw[eps][check_start_of_line(_1, _pass)] - >> (-full_bom >> "---" | "...") + >> (-bom >> "---" | "...") >> (eol | blank | eoi) ; // [207] bare_document = - !full_bom // BOM is read prior to each document. + !bom // BOM is read prior to each document. >> block_node(-1, context_t::block_in) - forbidden ; // [208] explicit_document = - !full_bom // BOM is read prior to each document. + !bom // BOM is read prior to each document. >> "---" >> (bare_document | attr(ast::value_t()) >> s_l_comments(_a = eoi_state_t::not_at_end)) ; // [209] directive_document = - !full_bom // BOM is read prior to each document. + !bom // BOM is read prior to each document. >> +directive >> explicit_document ; @@ -127,10 +128,10 @@ namespace yaml { namespace parser { *document_prefix >> -any_document[pb] >> *( - +(document_suffix >> !full_bom) >> *document_prefix >> any_document[pb] + +(document_suffix >> !bom) >> *document_prefix >> any_document[pb] | *document_prefix >> explicit_document[pb] ) - >> *(document_suffix >> !full_bom) >> *document_prefix + >> *(document_suffix >> !bom) >> *document_prefix ; // Allow empty and comment lines at end of input. @@ -156,7 +157,7 @@ namespace yaml { namespace parser { namespace detail { - inline encoding_t read_bom_impl (char const * buf, int & size) + inline encoding_t read_bom_8 (char const * buf, int & size) { auto retval = encoding_t::utf8; @@ -213,7 +214,7 @@ namespace yaml { namespace parser { is.putback(buf[i]); } - auto const retval = detail::read_bom_impl(buf, size); + auto const retval = detail::read_bom_8(buf, size); for (int i = 0; i < size; ++i) { is.get(); @@ -223,7 +224,8 @@ namespace yaml { namespace parser { } template - encoding_t read_bom (pos_iterator & first, pos_iterator last) + encoding_t read_bom (pos_iterator & first, pos_iterator last, + typename std::enable_if::type*) { pos_iterator it = first; int size = 0; @@ -235,13 +237,22 @@ namespace yaml { namespace parser { ++size; } - auto const retval = detail::read_bom_impl(buf, size); + auto const retval = detail::read_bom_8(buf, size); std::advance(first, size); return retval; } + template + encoding_t read_bom (pos_iterator & first, pos_iterator last, + typename std::enable_if::type*) + { + if (first != last && *first == 0xfeff) + ++first; + return encoding_t::utf8; + } + #if YAML_HEADER_ONLY inline #endif @@ -253,8 +264,9 @@ namespace yaml { namespace parser { ) { boost::optional> retval; - using char_iterator_t = std::string::const_iterator; - using iterator_t = stream_t::iterator_t; + using raw_char_iterator_t = boost::u8_to_u32_iterator; + using ustring_t = std::basic_string; + using char_iterator_t = ustring_t::const_iterator; stream_t p( source_file, @@ -266,13 +278,23 @@ namespace yaml { namespace parser { if (!detail::check_encoding(first_encoding, p.error_handler_.f)) return retval; - std::string contents; - std::getline(is, contents, '\0'); + std::string raw_contents; + char buf[4096]; + while (is) { + is.read(buf, sizeof(buf)); + raw_contents.append(buf, buf + is.gcount()); + } + ustring_t contents( + raw_char_iterator_t(raw_contents.begin()), + raw_char_iterator_t(raw_contents.end()) + ); - char_iterator_t sfirst(contents.begin()); - char_iterator_t slast(contents.end()); + char_iterator_t contents_first(contents.begin()); + char_iterator_t contents_last(contents.end()); - iterator_t first(sfirst, slast); + using iterator_t = stream_t::iterator_t; + + iterator_t first(contents_first, contents_last); iterator_t last; first.set_tabchars(1);