123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 |
- //
- // Copyright (c) 2020 Krystian Stasiowski ([email protected])
- //
- // Distributed under the Boost Software License, Version 1.0. (See accompanying
- // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
- //
- // Official repository: https://github.com/boostorg/json
- //
- #ifndef BOOST_JSON_DETAIL_UTF8_HPP
- #define BOOST_JSON_DETAIL_UTF8_HPP
- #include <boost/endian/conversion.hpp>
- #include <boost/json/detail/config.hpp>
- #include <cstddef>
- #include <cstring>
- #include <cstdint>
- namespace boost {
- namespace json {
- namespace detail {
- template<int N>
- std::uint32_t
- load_little_endian(void const* p)
- {
- std::uint32_t v = 0;
- std::memcpy(&v, p, N);
- endian::little_to_native_inplace(v);
- return v;
- }
// Look up the classification of a UTF-8 lead byte.
//
// The result packs two fields into one 16-bit value:
//   low byte   total length of the sequence in bytes (2, 3 or 4)
//   high byte  selector for the range the second byte must lie in
//
//   0x000 = invalid
//   0x102 = 2 bytes, second byte [80, BF]
//   0x203 = 3 bytes, second byte [A0, BF]
//   0x303 = 3 bytes, second byte [80, BF]
//   0x403 = 3 bytes, second byte [80, 9F]
//   0x504 = 4 bytes, second byte [90, BF]
//   0x604 = 4 bytes, second byte [80, BF]
//   0x704 = 4 bytes, second byte [80, 8F]
//
// Only the low seven bits of `c` select the table slot, so a byte
// below 0x80 yields the same result as that byte with its top bit set.
// NOTE(review): callers presumably only pass bytes >= 0x80 - confirm.
inline
uint16_t
classify_utf8(char c)
{
    // Row labels give the byte value with the top bit set.
    static constexpr uint16_t lead_class[128]
    {
        // 80..BF: classified as invalid
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 80..87
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 88..8F
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 90..97
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 98..9F
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A0..A7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A8..AF
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B0..B7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B8..BF
        // C0..C1 invalid, C2..DF start a 2-byte sequence
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C0..C7
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C8..CF
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D0..D7
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D8..DF
        // E0..EF start a 3-byte sequence; E0 and ED narrow the second byte
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, // E0..E7
        0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, // E8..EF
        // F0..F4 start a 4-byte sequence, F5..FF invalid
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, // F0..F7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // F8..FF
    };

    auto const slot = static_cast<unsigned char>(c & 0x7F);
    return lead_class[slot];
}
- inline
- bool
- is_valid_utf8(const char* p, uint16_t first)
- {
- uint32_t v;
- switch(first >> 8)
- {
- default:
- return false;
- // 2 bytes, second byte [80, BF]
- case 1:
- v = load_little_endian<2>(p);
- return (v & 0xC000) == 0x8000;
- // 3 bytes, second byte [A0, BF]
- case 2:
- v = load_little_endian<3>(p);
- return (v & 0xC0E000) == 0x80A000;
- // 3 bytes, second byte [80, BF]
- case 3:
- v = load_little_endian<3>(p);
- return (v & 0xC0C000) == 0x808000;
- // 3 bytes, second byte [80, 9F]
- case 4:
- v = load_little_endian<3>(p);
- return (v & 0xC0E000) == 0x808000;
- // 4 bytes, second byte [90, BF]
- case 5:
- v = load_little_endian<4>(p);
- return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
- // 4 bytes, second byte [80, BF]
- case 6:
- v = load_little_endian<4>(p);
- return (v & 0xC0C0C000) == 0x80808000;
- // 4 bytes, second byte [80, 8F]
- case 7:
- v = load_little_endian<4>(p);
- return (v & 0xC0C0F000) == 0x80808000;
- }
- }
- class utf8_sequence
- {
- char seq_[4];
- uint16_t first_;
- uint8_t size_;
- public:
- void
- save(
- const char* p,
- std::size_t remain) noexcept
- {
- first_ = classify_utf8(*p );
- if(remain >= length())
- size_ = length();
- else
- size_ = static_cast<uint8_t>(remain);
- std::memcpy(seq_, p, size_);
- }
- uint8_t
- length() const noexcept
- {
- return first_ & 0xFF;
- }
- bool
- complete() const noexcept
- {
- return size_ >= length();
- }
- // returns true if complete
- bool
- append(
- const char* p,
- std::size_t remain) noexcept
- {
- if(BOOST_JSON_UNLIKELY(needed() == 0))
- return true;
- if(BOOST_JSON_LIKELY(remain >= needed()))
- {
- std::memcpy(
- seq_ + size_, p, needed());
- size_ = length();
- return true;
- }
- if(BOOST_JSON_LIKELY(remain > 0))
- {
- std::memcpy(seq_ + size_, p, remain);
- size_ += static_cast<uint8_t>(remain);
- }
- return false;
- }
- const char*
- data() const noexcept
- {
- return seq_;
- }
- uint8_t
- needed() const noexcept
- {
- return length() - size_;
- }
- bool
- valid() const noexcept
- {
- BOOST_ASSERT(size_ >= length());
- return is_valid_utf8(seq_, first_);
- }
- };
- } // detail
- } // namespace json
- } // namespace boost
- #endif
|