diff --git a/include/boost/json/basic_parser.hpp b/include/boost/json/basic_parser.hpp index 85135e0a..a0a6f30d 100644 --- a/include/boost/json/basic_parser.hpp +++ b/include/boost/json/basic_parser.hpp @@ -202,7 +202,7 @@ reserve() sizeof(std::size_t)) * depth() + // array and object state + size sizeof(state) + // value parsing state sizeof(std::size_t) + // string size - sizeof(state)); // comment/utf8 state + sizeof(state)); // comment state } //---------------------------------------------------------- @@ -501,287 +501,6 @@ do_com5: AllowTrailing, AllowBadUTF8>(cs.begin()); } -template -template -const char* -basic_parser:: -validate_utf8(const char* p, const char* end) -{ - // 0 = invalid - // 1 = 2 bytes, second byte [80, BF] - // 2 = 3 bytes, second byte [A0, BF] - // 3 = 3 bytes, second byte [80, BF] - // 4 = 3 bytes, second byte [80, 9F] - // 5 = 4 bytes, second byte [90, BF] - // 6 = 4 bytes, second byte [80, BF] - // 7 = 4 bytes, second byte [80, 8F] - static constexpr char first[128] - { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, - 5, 6, 6, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; - detail::const_stream_wrapper cs(p, end); - unsigned char c; - if(StackEmpty || st_.empty()) - { - // fast path - if(BOOST_JSON_LIKELY( - cs.remain() >= 4)) - { - BOOST_ASSERT(static_cast< - unsigned char>(*cs) > 0x7F); - uint32_t v; - std::memcpy(&v, cs.begin(), 4); - v = detail::little_endian(v); - switch(first[v & 0x0000007F]) - { - // 2 bytes, second byte [80, BF] - case 1: - if(BOOST_JSON_LIKELY( - (v & 0x0000C000) == 0x00008000)) - { - cs += 2; - return cs.begin(); - } - break; - // 3 bytes, second byte [A0, BF] - case 2: - if(BOOST_JSON_LIKELY( - (v & 0x00C0E000) 
== 0x0080A000)) - { - cs += 3; - return cs.begin(); - } - break; - // 3 bytes, second byte [80, BF] - case 3: - if(BOOST_JSON_LIKELY( - (v & 0x00C0C000) == 0x00808000)) - { - cs += 3; - return cs.begin(); - } - break; - // 3 bytes, second byte [80, 9F] - case 4: - if(BOOST_JSON_LIKELY( - (v & 0x00C0E000) == 0x00808000)) - { - cs += 3; - return cs.begin(); - } - break; - // 4 bytes, second byte [90, BF] - case 5: - if(BOOST_JSON_LIKELY( - (v & 0xC0C0FF00) + - 0x7F7F7000 <= 0x00002F00)) - { - cs += 4; - return cs.begin(); - } - break; - // 4 bytes, second byte [80, BF] - case 6: - if(BOOST_JSON_LIKELY( - (v & 0xC0C0C000) == 0x80808000)) - { - cs += 4; - return cs.begin(); - } - break; - // 4 bytes, second byte [80, 8F] - case 7: - if(BOOST_JSON_LIKELY( - (v & 0xC0C0F000) == 0x80808000)) - { - cs += 4; - return cs.begin(); - } - break; - } - return fail(cs.begin(), error::syntax); - } - } - else - { - state st; - st_.pop(st); - switch(st) - { - default:; - case state::utf1: goto do_utf1; - case state::utf2: goto do_utf2; - case state::utf3: goto do_utf3; - case state::utf4: goto do_utf4; - case state::utf5: goto do_utf5; - case state::utf6: goto do_utf6; - case state::utf7: goto do_utf7; - case state::utf8: goto do_utf8; - case state::utf9: goto do_utf9; - case state::utf10: goto do_utf10; - case state::utf11: goto do_utf11; - case state::utf12: goto do_utf12; - case state::utf13: goto do_utf13; - case state::utf14: goto do_utf14; - case state::utf15: goto do_utf15; - case state::utf16: goto do_utf16; - } - } - c = static_cast(*cs); - BOOST_ASSERT(c > 0x7F); - ++cs; - switch(first[c & 0x7F]) - { - // 2 bytes - case 1: -do_utf1: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf1); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 3 bytes, second byte [A0, BF] - case 2: -do_utf2: - if(BOOST_JSON_UNLIKELY(! 
cs)) - return maybe_suspend(cs.begin(), state::utf2); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xE0) != 0xA0)) - break; - ++cs; -do_utf3: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf3); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 3 bytes, second byte [80, BF] - case 3: -do_utf4: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf4); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; -do_utf5: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf5); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 3 bytes, second byte [80, 9F] - case 4: -do_utf6: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf6); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xE0) != 0x80)) - break; - ++cs; -do_utf7: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf7); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 4 bytes, second byte [90, BF] - case 5: -do_utf8: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf8); - if(BOOST_JSON_UNLIKELY( - (*cs + 0x70) > 0x2F)) - break; - ++cs; -do_utf9: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf9); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; -do_utf10: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf10); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 4 bytes, second byte [80, BF] - case 6: -do_utf11: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf11); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; -do_utf12: - if(BOOST_JSON_UNLIKELY(! 
cs)) - return maybe_suspend(cs.begin(), state::utf12); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; -do_utf13: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf13); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - - // 4 bytes, second byte [80, 8F] - case 7: -do_utf14: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf14); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xF0) != 0x80)) - break; - ++cs; -do_utf15: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf15); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; -do_utf16: - if(BOOST_JSON_UNLIKELY(! cs)) - return maybe_suspend(cs.begin(), state::utf16); - if(BOOST_JSON_UNLIKELY( - (*cs & 0xC0) != 0x80)) - break; - ++cs; - return cs.begin(); - } - return fail(cs.begin(), error::syntax); -} - template template const char* @@ -968,17 +687,17 @@ resume_value(const char* p) case state::fal3: case state::fal4: return parse_false(p); - case state::str1: case state::utf17: + case state::str1: return parse_unescaped(p); case state::str2: case state::str3: case state::str4: case state::str5: case state::str6: case state::str7: + case state::str8: case state::sur1: case state::sur2: case state::sur3: case state::sur4: case state::sur5: case state::sur6: - case state::utf18: return parse_escaped(p); @@ -1212,17 +931,17 @@ parse_string(const char* p) switch(st) { default: - case state::str1: case state::utf17: - return parse_unescaped(p); - + case state::str1: + return parse_unescaped(p); + case state::str2: case state::str3: case state::str4: case state::str5: case state::str6: case state::str7: + case state::str8: case state::sur1: case state::sur2: case state::sur3: case state::sur4: case state::sur5: case state::sur6: - case state::utf18: return parse_escaped(p); } @@ -1245,30 +964,23 @@ parse_unescaped(const char* p) constexpr auto on_part = IsKey ? 
&Handler::on_key_part : &Handler::on_string_part; detail::const_stream_wrapper cs(p, end_); - char const* start; std::size_t total; - std::size_t size; - if(! StackEmpty && ! st_.empty()) + if(StackEmpty || st_.empty()) + { + BOOST_ASSERT(*cs == '\x22'); // '"' + ++cs; + total = 0; + } + else { - start = cs.begin(); state st; st_.pop(st); st_.pop(total); - switch(st) - { - default: - case state::str1: goto do_str1; - case state::utf17: goto do_utf17; - } } - BOOST_ASSERT(*cs == '\x22'); // '"' - ++cs; - start = cs.begin(); - total = 0; -do_str1: + char const* start = cs.begin(); cs = detail::count_valid( cs.begin(), cs.end()); - size = cs.used(start); + std::size_t size = cs.used(start); if(BOOST_JSON_UNLIKELY(size > BOOST_JSON_MAX_STRING_SIZE - total)) return fail(cs.begin(), IsKey ? @@ -1277,7 +989,7 @@ do_str1: if(BOOST_JSON_UNLIKELY(! cs)) { // call handler if the string isn't empty - if(BOOST_JSON_LIKELY(cs.begin() > start)) + if(BOOST_JSON_LIKELY(size)) { if(BOOST_JSON_UNLIKELY(! (h_.*on_part)( {start, size}, ec_))) @@ -1287,27 +999,25 @@ do_str1: } if(BOOST_JSON_UNLIKELY(*cs != '\x22')) // '"' { + // sequence is invalid or incomplete if(! AllowBadUTF8 && (*cs & 0x80)) { -do_utf17: - // KRYSTIAN TODO: fix utf-8 validation - cs = validate_utf8(cs.begin(), cs.end()); - if(BOOST_JSON_UNLIKELY(incomplete(cs))) - return suspend_or_fail(state::utf17, total); - goto do_str1; + seq_.save(cs.begin(), cs.remain()); + if(BOOST_JSON_UNLIKELY(seq_.complete())) + return fail(cs.begin(), error::syntax); + return maybe_suspend(cs.end(), state::str8, total); } else if(BOOST_JSON_LIKELY(*cs == '\\')) { // flush unescaped run from input - if(BOOST_JSON_LIKELY(cs.begin() > start)) + if(BOOST_JSON_LIKELY(size)) { if(BOOST_JSON_UNLIKELY(! 
(h_.*on_part)( {start, size}, ec_))) return fail(cs.begin()); } - str_size_ = total; return parse_escaped(cs.begin()); + AllowBadUTF8>(cs.begin(), total); } // illegal control return fail(cs.begin(), error::syntax); @@ -1326,7 +1036,9 @@ template< bool AllowBadUTF8> const char* basic_parser:: -parse_escaped(const char* p) +parse_escaped( + const char* p, + std::size_t total) { //--------------------------------------------------------------- // @@ -1350,7 +1062,6 @@ parse_escaped(const char* p) detail::clipped_const_stream cs(p, end_); detail::buffer temp; int32_t digit; - std::size_t total; char c; cs.clip(temp.max_size()); if(! StackEmpty && ! st_.empty()) @@ -1367,13 +1078,13 @@ parse_escaped(const char* p) case state::str5: goto do_str5; case state::str6: goto do_str6; case state::str7: goto do_str7; + case state::str8: goto do_str8; case state::sur1: goto do_sur1; case state::sur2: goto do_sur2; case state::sur3: goto do_sur3; case state::sur4: goto do_sur4; case state::sur5: goto do_sur5; case state::sur6: goto do_sur6; - case state::utf18: goto do_utf18; } } // Unescaped JSON is never larger than its escaped version. @@ -1382,7 +1093,6 @@ parse_escaped(const char* p) // of the temporary buffer. // handle escaped character BOOST_ASSERT(*cs == '\\'); - total = str_size_; ++cs; do_str3: if(BOOST_JSON_UNLIKELY(! cs)) @@ -1696,12 +1406,13 @@ do_str2: } else if(! AllowBadUTF8 && (c & 0x80)) { -do_utf18: - char const* start = cs.begin(); - cs = validate_utf8(cs.begin(), cs.end()); - if(BOOST_JSON_UNLIKELY(incomplete(cs))) - return suspend_or_fail(state::utf18, total); - temp.append(start, cs.used(start)); + seq_.save(cs.begin(), cs.remain()); + if(BOOST_JSON_UNLIKELY(! seq_.complete())) + return maybe_suspend(cs.end(), state::str8, total); + if(BOOST_JSON_UNLIKELY(! 
seq_.valid())) + return fail(cs.begin(), error::syntax); + temp.append(seq_.sequence(), seq_.length()); + cs += seq_.length(); continue; } else if(BOOST_JSON_LIKELY(c == '\\')) @@ -1715,6 +1426,16 @@ do_utf18: temp.push_back(c); ++cs; } +do_str8: + uint8_t needed = seq_.needed(); + if(BOOST_JSON_UNLIKELY( + ! seq_.append(cs.begin(), cs.remain()))) + return maybe_suspend(cs.end(), state::str8, total); + if(BOOST_JSON_UNLIKELY(! seq_.valid())) + return fail(cs.begin(), error::syntax); + temp.append(seq_.sequence(), seq_.length()); + cs += needed; + goto do_str2; } //---------------------------------------------------------- diff --git a/include/boost/json/detail/basic_parser.hpp b/include/boost/json/detail/basic_parser.hpp index 59827255..9e603fb6 100644 --- a/include/boost/json/detail/basic_parser.hpp +++ b/include/boost/json/detail/basic_parser.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -161,15 +162,9 @@ class basic_parser tru1, tru2, tru3, fal1, fal2, fal3, fal4, str1, str2, str3, str4, - str5, str6, str7, + str5, str6, str7, str8, sur1, sur2, sur3, sur4, sur5, sur6, - utf1, utf2, utf3, - utf4, utf5, utf6, - utf7, utf8, utf9, - utf10, utf11, utf12, - utf13, utf14, utf15, - utf16, utf17, utf18, obj1, obj2, obj3, obj4, obj5, obj6, obj7, obj8, obj9, obj10, obj11, @@ -198,7 +193,7 @@ class basic_parser std::size_t max_depth_ = 32; // how many levels deeper the parser can go std::size_t depth_ = max_depth_; - std::size_t str_size_; + detail::utf8_sequence seq_; unsigned u1_; unsigned u2_; bool more_; // false for final buffer @@ -286,9 +281,6 @@ class basic_parser bool Terminal, bool AllowTrailing, bool AllowBadUTF8> const char* parse_comment(const char* p); - - template - const char* validate_utf8(const char* p, const char* end); template const char* parse_document(const char* p); @@ -322,16 +314,18 @@ class basic_parser bool AllowBadUTF8> const char* parse_string(const char* p); + template + const char* parse_number(const char* 
p); + template const char* parse_unescaped(const char* p); template - const char* parse_escaped(const char* p); - - template - const char* parse_number(const char* p); + const char* parse_escaped( + const char* p, + std::size_t total = 0); public: /** Destructor. diff --git a/include/boost/json/detail/sse2.hpp b/include/boost/json/detail/sse2.hpp index 4950c8ef..7867fc21 100644 --- a/include/boost/json/detail/sse2.hpp +++ b/include/boost/json/detail/sse2.hpp @@ -13,6 +13,7 @@ #define BOOST_JSON_DETAIL_SSE2_HPP #include +#include #include #include #ifdef BOOST_JSON_USE_SSE2 @@ -113,7 +114,8 @@ count_valid( _BitScanForward( &index, w ); m = index; #endif - return p + m; + p += m; + break; } p += 16; @@ -121,10 +123,22 @@ count_valid( while(p != end) { - const char c = *p; + const unsigned char c = *p; if(c == '\x22' || c == '\\' || c < 0x20) break; - ++p; + if(c < 0x80) + { + ++p; + continue; + } + // validate utf-8 + uint16_t first = classify_utf8(c & 0x7F); + uint8_t len = first & 0xFF; + if(BOOST_JSON_UNLIKELY(end - p < len)) + break; + if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) + break; + p += len; } return p; @@ -158,10 +172,22 @@ count_valid( { while(p != end) { - const char c = *p; + const unsigned char c = *p; if(c == '\x22' || c == '\\' || c < 0x20) break; - ++p; + if(c < 0x80) + { + ++p; + continue; + } + // validate utf-8 + uint16_t first = classify_utf8(c & 0x7F); + uint8_t len = first & 0xFF; + if(BOOST_JSON_UNLIKELY(end - p < len)) + break; + if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) + break; + p += len; } return p; diff --git a/include/boost/json/detail/utf8.hpp b/include/boost/json/detail/utf8.hpp new file mode 100644 index 00000000..c382ddb2 --- /dev/null +++ b/include/boost/json/detail/utf8.hpp @@ -0,0 +1,171 @@ +// +// Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// Official repository: https://github.com/cppalliance/json +// + +#ifndef BOOST_JSON_DETAIL_UTF8_HPP +#define BOOST_JSON_DETAIL_UTF8_HPP + +#include +#include +#include + +namespace boost { +namespace json { +namespace detail { + +inline +uint16_t +classify_utf8(char c) +{ + // 0x000 = invalid + // 0x102 = 2 bytes, second byte [80, BF] + // 0x203 = 3 bytes, second byte [A0, BF] + // 0x303 = 3 bytes, second byte [80, BF] + // 0x403 = 3 bytes, second byte [80, 9F] + // 0x504 = 4 bytes, second byte [90, BF] + // 0x604 = 4 bytes, second byte [80, BF] + // 0x704 = 4 bytes, second byte [80, 8F] + static constexpr uint16_t first[128] + { + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + + 0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, + 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, + 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, + 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, + 0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, + 0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, + 0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, + 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, + }; + return first[static_cast(c)]; +} + +inline +bool +is_valid_utf8(const char* p, uint16_t first) +{ + // KRYSTIAN TODO: account for big endian + // also, make a convenience function for + // N-byte loads + uint32_t v; + switch(first >> 8) + { + default: + return false; + // 2 bytes, second byte [80, BF] + case 1: + 
std::memcpy(&v, p, 2); + return (v & 0xC000) == 0x8000; + // 3 bytes, second byte [A0, BF] + case 2: + std::memcpy(&v, p, 3); + return (v & 0xC0E000) == 0x80A000; + // 3 bytes, second byte [80, BF] + case 3: + std::memcpy(&v, p, 3); + return (v & 0xC0C000) == 0x808000; + // 3 bytes, second byte [80, 9F] + case 4: + std::memcpy(&v, p, 3); + return (v & 0xC0E000) == 0x808000; + // 4 bytes, second byte [90, BF] + case 5: + std::memcpy(&v, p, 4); + return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; + // 4 bytes, second byte [80, BF] + case 6: + std::memcpy(&v, p, 4); + return (v & 0xC0C0C000) == 0x80808000; + // 4 bytes, second byte [80, 8F] + case 7: + std::memcpy(&v, p, 4); + return (v & 0xC0C0F000) == 0x80808000; + } +} + +struct utf8_sequence +{ +private: + char seq_[4]; + uint16_t first_; + uint8_t size_; +public: + void + save( + const char* p, + std::size_t remain) noexcept + { + first_ = classify_utf8(*p & 0x7F); + size_ = remain >= length() ? + length() : static_cast(remain); + std::memcpy(seq_, p, size_); + } + + uint8_t + length() const noexcept + { + return first_ & 0xFF; + } + + bool complete() const noexcept + { + return size_ >= length(); + } + + // return true if complete + bool + append( + const char* p, + std::size_t remain) noexcept + { + if(! 
needed()) + return true; + if(remain >= needed()) + { + std::memcpy(seq_ + size_, p, needed()); + size_ = length(); + return true; + } + std::memcpy(seq_ + size_, p, remain); + size_ += static_cast(remain); + return false; + } + + const char* + sequence() const noexcept + { + return seq_; + } + + uint8_t + needed() const noexcept + { + return length() - size_; + } + + bool + valid() const noexcept + { + BOOST_ASSERT(size_ >= length()); + return is_valid_utf8(seq_, first_); + } +}; + +} // detail +} // json +} // boost + +#endif diff --git a/test/basic_parser.cpp b/test/basic_parser.cpp index 502ca529..3c73519d 100644 --- a/test/basic_parser.cpp +++ b/test/basic_parser.cpp @@ -1229,6 +1229,85 @@ public: bad("\"\\t\\t\xf4\x70\x80\x80----------\""); bad("\"\\n\xf4\x80\x70\x80----------\""); bad("\"\\n\xf4\x80\xbf\x70-\\n\xf4\x80\xbf\x70\""); + + class utf8_parser + { + struct handler + { + std::string captured = ""; + bool on_document_begin( error_code& ) { return true; } + bool on_document_end( error_code& ) { return true; } + bool on_object_begin( error_code& ) { return true; } + bool on_object_end( error_code& ) { return true; } + bool on_array_begin( error_code& ) { return true; } + bool on_array_end( error_code& ) { return true; } + bool on_key_part( string_view, error_code& ) { return true; } + bool on_key( string_view, error_code& ) { return true; } + bool on_string_part( string_view sv, error_code& ) + { + captured.append(sv.data(), sv.size()); + return true; + } + bool on_string( string_view sv, error_code& ) + { + captured.append(sv.data(), sv.size()); + return true; + } + bool on_number_part( string_view, error_code&) { return true; } + bool on_int64( std::int64_t, string_view, error_code& ) { return true; } + bool on_uint64( std::uint64_t, string_view, error_code& ) { return true; } + bool on_double( double, string_view, error_code& ) { return true; } + bool on_bool( bool, error_code& ) { return true; } + bool on_null( error_code& ) { return true; } + bool 
on_comment_part( string_view, error_code& ) { return true; } + bool on_comment( string_view, error_code& ) { return true; } + }; + + basic_parser p_; + + public: + std::size_t + write( + bool more, + char const* data, + std::size_t size, + error_code& ec) + { + auto const n = p_.write( + more, data, size, ec); + if(! ec && n < size) + ec = error::extra_data; + return n; + } + + string_view + captured() const noexcept + { + return p_.handler().captured; + } + }; + + const auto check = + [this](string_view expected) + { + good(expected); + utf8_parser p; + for(std::size_t i = 0; i < expected.size(); ++i) + { + error_code ec; + auto more = (i != expected.size() - 1); + auto written = p.write(more, + expected.data() + i, 1, ec); + BOOST_TEST(written == 1); + BOOST_TEST( !ec); + } + BOOST_TEST(p.captured() == + expected.substr(1, expected.size() - 2)); + }; + + check("\"\xd1\x82\""); + check("\"\xd1\x82\xd0\xb5\xd1\x81\xd1\x82\""); + check("\"\xc3\x0b1""and\xc3\xba\""); } void