utf8.hpp 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. //
  2. // Copyright (c) 2020 Krystian Stasiowski ([email protected])
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. // Official repository: https://github.com/boostorg/json
  8. //
  9. #ifndef BOOST_JSON_DETAIL_UTF8_HPP
  10. #define BOOST_JSON_DETAIL_UTF8_HPP
  11. #include <boost/endian/conversion.hpp>
  12. #include <boost/json/detail/config.hpp>
  13. #include <cstddef>
  14. #include <cstring>
  15. #include <cstdint>
  16. namespace boost {
  17. namespace json {
  18. namespace detail {
  19. template<int N>
  20. std::uint32_t
  21. load_little_endian(void const* p)
  22. {
  23. std::uint32_t v = 0;
  24. std::memcpy(&v, p, N);
  25. endian::little_to_native_inplace(v);
  26. return v;
  27. }
  // Classify a byte as the first byte of a multi-byte UTF-8 sequence.
  //
  // The result packs two fields:
  //   low byte  - total length of the sequence in bytes (2, 3 or 4)
  //   high byte - which range the second byte must fall in
  //
  //   0x000 = invalid
  //   0x102 = 2 bytes, second byte [80, BF]
  //   0x203 = 3 bytes, second byte [A0, BF]
  //   0x303 = 3 bytes, second byte [80, BF]
  //   0x403 = 3 bytes, second byte [80, 9F]
  //   0x504 = 4 bytes, second byte [90, BF]
  //   0x604 = 4 bytes, second byte [80, BF]
  //   0x704 = 4 bytes, second byte [80, 8F]
  //
  // Only the low seven bits of `c` select the table entry, so a byte below
  // 0x80 yields the same result as the byte 0x80 above it.
  // NOTE(review): callers presumably only pass bytes with the high bit
  // set - confirm at the call sites.
  inline
  uint16_t
  classify_utf8(char c)
  {
      // Entry i describes the byte 0x80 + i.
      static constexpr uint16_t lead_table[128]
      {
          // 80-BF: never a valid first byte
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          // C0-C1 invalid, C2-DF start a 2-byte sequence
          0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          // E0-EF start a 3-byte sequence (E0 and ED are restricted)
          0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
          0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
          // F0-F4 start a 4-byte sequence, F5-FF invalid
          0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
      };
      int const slot = c & 0x7F;
      return lead_table[slot];
  }
  61. inline
  62. bool
  63. is_valid_utf8(const char* p, uint16_t first)
  64. {
  65. uint32_t v;
  66. switch(first >> 8)
  67. {
  68. default:
  69. return false;
  70. // 2 bytes, second byte [80, BF]
  71. case 1:
  72. v = load_little_endian<2>(p);
  73. return (v & 0xC000) == 0x8000;
  74. // 3 bytes, second byte [A0, BF]
  75. case 2:
  76. v = load_little_endian<3>(p);
  77. return (v & 0xC0E000) == 0x80A000;
  78. // 3 bytes, second byte [80, BF]
  79. case 3:
  80. v = load_little_endian<3>(p);
  81. return (v & 0xC0C000) == 0x808000;
  82. // 3 bytes, second byte [80, 9F]
  83. case 4:
  84. v = load_little_endian<3>(p);
  85. return (v & 0xC0E000) == 0x808000;
  86. // 4 bytes, second byte [90, BF]
  87. case 5:
  88. v = load_little_endian<4>(p);
  89. return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
  90. // 4 bytes, second byte [80, BF]
  91. case 6:
  92. v = load_little_endian<4>(p);
  93. return (v & 0xC0C0C000) == 0x80808000;
  94. // 4 bytes, second byte [80, 8F]
  95. case 7:
  96. v = load_little_endian<4>(p);
  97. return (v & 0xC0C0F000) == 0x80808000;
  98. }
  99. }
  100. class utf8_sequence
  101. {
  102. char seq_[4];
  103. uint16_t first_;
  104. uint8_t size_;
  105. public:
  106. void
  107. save(
  108. const char* p,
  109. std::size_t remain) noexcept
  110. {
  111. first_ = classify_utf8(*p );
  112. if(remain >= length())
  113. size_ = length();
  114. else
  115. size_ = static_cast<uint8_t>(remain);
  116. std::memcpy(seq_, p, size_);
  117. }
  118. uint8_t
  119. length() const noexcept
  120. {
  121. return first_ & 0xFF;
  122. }
  123. bool
  124. complete() const noexcept
  125. {
  126. return size_ >= length();
  127. }
  128. // returns true if complete
  129. bool
  130. append(
  131. const char* p,
  132. std::size_t remain) noexcept
  133. {
  134. if(BOOST_JSON_UNLIKELY(needed() == 0))
  135. return true;
  136. if(BOOST_JSON_LIKELY(remain >= needed()))
  137. {
  138. std::memcpy(
  139. seq_ + size_, p, needed());
  140. size_ = length();
  141. return true;
  142. }
  143. if(BOOST_JSON_LIKELY(remain > 0))
  144. {
  145. std::memcpy(seq_ + size_, p, remain);
  146. size_ += static_cast<uint8_t>(remain);
  147. }
  148. return false;
  149. }
  150. const char*
  151. data() const noexcept
  152. {
  153. return seq_;
  154. }
  155. uint8_t
  156. needed() const noexcept
  157. {
  158. return length() - size_;
  159. }
  160. bool
  161. valid() const noexcept
  162. {
  163. BOOST_ASSERT(size_ >= length());
  164. return is_valid_utf8(seq_, first_);
  165. }
  166. };
  167. } // detail
  168. } // namespace json
  169. } // namespace boost
  170. #endif