utf.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // https://www.boost.org/LICENSE_1_0.txt
  6. #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
  7. #define BOOST_LOCALE_UTF_HPP_INCLUDED
  8. #include <boost/locale/config.hpp>
  9. #include <cstdint>
  10. namespace boost { namespace locale {
  11. /// \brief Namespace that holds basic operations on UTF encoded sequences
  12. ///
  13. /// All functions defined in this namespace do not require linking with Boost.Locale library
  14. namespace utf {
  /// \brief The integral type that can hold a Unicode code point
  using code_point = uint32_t;

  /// \brief Special constant that defines illegal code point
  ///
  /// Lies above 0x10FFFF, so it can never collide with a decoded value.
  constexpr code_point illegal = 0xFFFFFFFFu;

  /// \brief Special constant that defines incomplete code point
  ///
  /// Lies above 0x10FFFF, so it can never collide with a decoded value.
  constexpr code_point incomplete = 0xFFFFFFFEu;

  /// Either a length/size or an error (illegal/incomplete)
  using len_or_error = code_point;

  /// \brief Checks whether \a v is a valid code point
  ///
  /// \returns false when \a v is greater than 0x10FFFF or lies in the
  /// surrogate range [0xD800, 0xDFFF]; true otherwise. In particular both
  /// \ref illegal and \ref incomplete are reported as not valid.
  ///
  /// Written as a single return expression so that it is usable in constant
  /// expressions with a C++11 compiler.
  constexpr bool is_valid_codepoint(code_point v) noexcept
  {
      return v <= 0x10FFFF             // inside the Unicode code space
             && !(0xD800 <= v && v <= 0xDFFF); // and not a surrogate
  }
  32. #ifdef BOOST_LOCALE_DOXYGEN
  33. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  34. template<typename CharType, int size = sizeof(CharType)>
  35. struct utf_traits {
  36. /// The type of the character
  37. typedef CharType char_type;
  38. /// Read one code point from the range [p,e) and return it.
  39. ///
  40. /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
  41. /// - If illegal sequence detected returns \ref illegal
  42. ///
  43. /// Requirements
  44. ///
  45. /// - Iterator is valid input iterator
  46. ///
  47. /// Postconditions
  48. ///
  49. /// - p points to the last consumed character
  50. template<typename Iterator>
  51. static code_point decode(Iterator& p, Iterator e);
  52. /// Maximal width of valid sequence in the code units:
  53. ///
  54. /// - UTF-8 - 4
  55. /// - UTF-16 - 2
  56. /// - UTF-32 - 1
  57. static constexpr int max_width;
  58. /// The width of specific code point in the code units.
  59. ///
  60. /// Requirement: value is a valid Unicode code point
  61. /// Returns value in range [1..max_width]
  62. static int width(code_point value);
  63. /// Get the size of the trail part of variable length encoded sequence.
  64. ///
  65. /// Returns -1 if C is not valid lead character
  66. static int trail_length(char_type c);
  67. /// Returns true if c is trail code unit, always false for UTF-32
  68. static bool is_trail(char_type c);
  69. /// Returns true if c is lead code unit, always true of UTF-32
  70. static bool is_lead(char_type c);
  71. /// Convert valid Unicode code point \a value to the UTF sequence.
  72. ///
  73. /// Requirements:
  74. ///
  75. /// - \a value is valid code point
  76. /// - \a out is an output iterator should be able to accept at least width(value) units
  77. ///
  78. /// Returns the iterator past the last written code unit.
  79. template<typename Iterator>
  80. static Iterator encode(code_point value, Iterator out);
  81. /// Decodes valid UTF sequence that is pointed by p into code point.
  82. ///
  83. /// If the sequence is invalid or points to end the behavior is undefined
  84. template<typename Iterator>
  85. static code_point decode_valid(Iterator& p);
  86. };
  87. #else
  88. template<typename CharType, int size = sizeof(CharType)>
  89. struct utf_traits;
  90. template<typename CharType>
  91. struct utf_traits<CharType, 1> {
  92. typedef CharType char_type;
  93. static int trail_length(char_type ci)
  94. {
  95. unsigned char c = ci;
  96. if(c < 128)
  97. return 0;
  98. if(BOOST_UNLIKELY(c < 194))
  99. return -1;
  100. if(c < 224)
  101. return 1;
  102. if(c < 240)
  103. return 2;
  104. if(BOOST_LIKELY(c <= 244))
  105. return 3;
  106. return -1;
  107. }
  108. static constexpr int max_width = 4;
  109. static int width(code_point value)
  110. {
  111. if(value <= 0x7F)
  112. return 1;
  113. else if(value <= 0x7FF)
  114. return 2;
  115. else if(BOOST_LIKELY(value <= 0xFFFF))
  116. return 3;
  117. else
  118. return 4;
  119. }
  120. static bool is_trail(char_type ci)
  121. {
  122. unsigned char c = ci;
  123. return (c & 0xC0) == 0x80;
  124. }
  125. static bool is_lead(char_type ci) { return !is_trail(ci); }
  126. template<typename Iterator>
  127. static code_point decode(Iterator& p, Iterator e)
  128. {
  129. if(BOOST_UNLIKELY(p == e))
  130. return incomplete;
  131. unsigned char lead = *p++;
  132. // First byte is fully validated here
  133. int trail_size = trail_length(lead);
  134. if(BOOST_UNLIKELY(trail_size < 0))
  135. return illegal;
  136. // Ok as only ASCII may be of size = 0
  137. // also optimize for ASCII text
  138. if(trail_size == 0)
  139. return lead;
  140. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  141. // Read the rest
  142. unsigned char tmp;
  143. switch(trail_size) {
  144. case 3:
  145. if(BOOST_UNLIKELY(p == e))
  146. return incomplete;
  147. tmp = *p++;
  148. if(!is_trail(tmp))
  149. return illegal;
  150. c = (c << 6) | (tmp & 0x3F);
  151. BOOST_FALLTHROUGH;
  152. case 2:
  153. if(BOOST_UNLIKELY(p == e))
  154. return incomplete;
  155. tmp = *p++;
  156. if(!is_trail(tmp))
  157. return illegal;
  158. c = (c << 6) | (tmp & 0x3F);
  159. BOOST_FALLTHROUGH;
  160. case 1:
  161. if(BOOST_UNLIKELY(p == e))
  162. return incomplete;
  163. tmp = *p++;
  164. if(!is_trail(tmp))
  165. return illegal;
  166. c = (c << 6) | (tmp & 0x3F);
  167. }
  168. // Check code point validity: no surrogates and
  169. // valid range
  170. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  171. return illegal;
  172. // make sure it is the most compact representation
  173. if(BOOST_UNLIKELY(width(c) != trail_size + 1))
  174. return illegal;
  175. return c;
  176. }
  177. template<typename Iterator>
  178. static code_point decode_valid(Iterator& p)
  179. {
  180. unsigned char lead = *p++;
  181. if(lead < 192)
  182. return lead;
  183. int trail_size;
  184. if(lead < 224)
  185. trail_size = 1;
  186. else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
  187. trail_size = 2;
  188. else
  189. trail_size = 3;
  190. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  191. switch(trail_size) {
  192. case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
  193. case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
  194. case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
  195. }
  196. return c;
  197. }
  198. template<typename Iterator>
  199. static Iterator encode(code_point value, Iterator out)
  200. {
  201. if(value <= 0x7F)
  202. *out++ = static_cast<char_type>(value);
  203. else if(value <= 0x7FF) {
  204. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  205. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  206. } else if(BOOST_LIKELY(value <= 0xFFFF)) {
  207. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  208. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  209. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  210. } else {
  211. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  212. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  213. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  214. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  215. }
  216. return out;
  217. }
  218. }; // utf8
  219. template<typename CharType>
  220. struct utf_traits<CharType, 2> {
  221. typedef CharType char_type;
  222. // See RFC 2781
  223. static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; }
  224. static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; }
  225. static code_point combine_surrogate(uint16_t w1, uint16_t w2)
  226. {
  227. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  228. }
  229. static int trail_length(char_type c)
  230. {
  231. if(is_first_surrogate(c))
  232. return 1;
  233. if(is_second_surrogate(c))
  234. return -1;
  235. return 0;
  236. }
  237. /// Returns true if c is trail code unit, always false for UTF-32
  238. static bool is_trail(char_type c) { return is_second_surrogate(c); }
  239. /// Returns true if c is lead code unit, always true of UTF-32
  240. static bool is_lead(char_type c) { return !is_second_surrogate(c); }
  241. template<typename It>
  242. static code_point decode(It& current, It last)
  243. {
  244. if(BOOST_UNLIKELY(current == last))
  245. return incomplete;
  246. uint16_t w1 = *current++;
  247. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  248. return w1;
  249. if(w1 > 0xDBFF)
  250. return illegal;
  251. if(current == last)
  252. return incomplete;
  253. uint16_t w2 = *current++;
  254. if(w2 < 0xDC00 || 0xDFFF < w2)
  255. return illegal;
  256. return combine_surrogate(w1, w2);
  257. }
  258. template<typename It>
  259. static code_point decode_valid(It& current)
  260. {
  261. uint16_t w1 = *current++;
  262. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  263. return w1;
  264. uint16_t w2 = *current++;
  265. return combine_surrogate(w1, w2);
  266. }
  267. static constexpr int max_width = 2;
  268. static int width(code_point u) { return u >= 0x10000 ? 2 : 1; }
  269. template<typename It>
  270. static It encode(code_point u, It out)
  271. {
  272. if(BOOST_LIKELY(u <= 0xFFFF))
  273. *out++ = static_cast<char_type>(u);
  274. else {
  275. u -= 0x10000;
  276. *out++ = static_cast<char_type>(0xD800 | (u >> 10));
  277. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  278. }
  279. return out;
  280. }
  281. }; // utf16;
  282. template<typename CharType>
  283. struct utf_traits<CharType, 4> {
  284. typedef CharType char_type;
  285. static int trail_length(char_type c)
  286. {
  287. if(is_valid_codepoint(c))
  288. return 0;
  289. return -1;
  290. }
  291. static bool is_trail(char_type /*c*/) { return false; }
  292. static bool is_lead(char_type /*c*/) { return true; }
  293. template<typename It>
  294. static code_point decode_valid(It& current)
  295. {
  296. return *current++;
  297. }
  298. template<typename It>
  299. static code_point decode(It& current, It last)
  300. {
  301. if(BOOST_UNLIKELY(current == last))
  302. return boost::locale::utf::incomplete;
  303. code_point c = *current++;
  304. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  305. return boost::locale::utf::illegal;
  306. return c;
  307. }
  308. static constexpr int max_width = 1;
  309. static int width(code_point /*u*/) { return 1; }
  310. template<typename It>
  311. static It encode(code_point u, It out)
  312. {
  313. *out++ = static_cast<char_type>(u);
  314. return out;
  315. }
  316. }; // utf32
  317. #endif
  318. } // namespace utf
  319. }} // namespace boost::locale
  320. #endif