utf.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. // Copyright (c) 2020 Alexander Grund
  4. //
  5. // Distributed under the Boost Software License, Version 1.0.
  6. // https://www.boost.org/LICENSE_1_0.txt
  7. #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
  8. #define BOOST_NOWIDE_UTF_HPP_INCLUDED
  9. #include <boost/nowide/config.hpp>
  10. #include <cstdint>
  11. namespace boost {
  12. namespace nowide {
  13. ///
  14. /// \brief Namespace that holds basic operations on UTF encoded sequences
  15. ///
  16. /// All functions defined in this namespace do not require linking with Boost.Nowide library.
  17. /// Extracted from Boost.Locale
  18. ///
  19. namespace utf {
  20. ///
  21. /// \brief The integral type that can hold a Unicode code point
  22. ///
  23. using code_point = uint32_t;
  24. ///
  25. /// \brief Special constant that defines illegal code point
  26. ///
  27. static const code_point illegal = 0xFFFFFFFFu;
  28. ///
  29. /// \brief Special constant that defines incomplete code point
  30. ///
  31. static const code_point incomplete = 0xFFFFFFFEu;
  32. ///
  33. /// \brief the function checks if \a v is a valid code point
  34. ///
  35. inline bool is_valid_codepoint(code_point v)
  36. {
  37. if(v > 0x10FFFF)
  38. return false;
  39. if(0xD800 <= v && v <= 0xDFFF) // surrogates
  40. return false;
  41. return true;
  42. }
  43. #ifdef BOOST_NOWIDE_DOXYGEN
  44. ///
  45. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  46. ///
  47. template<typename CharType, int size = sizeof(CharType)>
  48. struct utf_traits
  49. {
  50. ///
  51. /// The type of the character
  52. ///
  53. using char_type = CharType;
  54. ///
  55. /// Read one code point from the range [p,e) and return it.
  56. ///
  57. /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
  58. /// - If illegal sequence detected returns \ref illegal
  59. ///
  60. /// Requirements
  61. ///
  62. /// - Iterator is valid input iterator
  63. ///
  64. /// Postconditions
  65. ///
  66. /// - p points to the last consumed character
  67. ///
  68. template<typename Iterator>
  69. static code_point decode(Iterator& p, Iterator e);
  70. ///
  71. /// Maximal width of valid sequence in the code units:
  72. ///
  73. /// - UTF-8 - 4
  74. /// - UTF-16 - 2
  75. /// - UTF-32 - 1
  76. ///
  77. static const int max_width;
  78. ///
  79. /// The width of specific code point in the code units.
  80. ///
  81. /// Requirement: value is a valid Unicode code point
  82. /// Returns value in range [1..max_width]
  83. ///
  84. static int width(code_point value);
  85. ///
  86. /// Get the size of the trail part of variable length encoded sequence.
  87. ///
  88. /// Returns -1 if C is not valid lead character
  89. ///
  90. static int trail_length(char_type c);
  91. ///
  92. /// Returns true if c is trail code unit, always false for UTF-32
  93. ///
  94. static bool is_trail(char_type c);
  95. ///
  96. /// Returns true if c is lead code unit, always true of UTF-32
  97. ///
  98. static bool is_lead(char_type c);
  99. ///
  100. /// Convert valid Unicode code point \a value to the UTF sequence.
  101. ///
  102. /// Requirements:
  103. ///
  104. /// - \a value is valid code point
  105. /// - \a out is an output iterator should be able to accept at least width(value) units
  106. ///
  107. /// Returns the iterator past the last written code unit.
  108. ///
  109. template<typename Iterator>
  110. static Iterator encode(code_point value, Iterator out);
  111. ///
  112. /// Decodes valid UTF sequence that is pointed by p into code point.
  113. ///
  114. /// If the sequence is invalid or points to end the behavior is undefined
  115. ///
  116. template<typename Iterator>
  117. static code_point decode_valid(Iterator& p);
  118. };
  119. #else
  120. template<typename CharType, int size = sizeof(CharType)>
  121. struct utf_traits;
  122. template<typename CharType>
  123. struct utf_traits<CharType, 1>
  124. {
  125. using char_type = CharType;
  126. static int trail_length(char_type ci)
  127. {
  128. unsigned char c = ci;
  129. if(c < 128)
  130. return 0;
  131. if(BOOST_UNLIKELY(c < 194))
  132. return -1;
  133. if(c < 224)
  134. return 1;
  135. if(c < 240)
  136. return 2;
  137. if(BOOST_LIKELY(c <= 244))
  138. return 3;
  139. return -1;
  140. }
  141. static const int max_width = 4;
  142. static int width(code_point value)
  143. {
  144. if(value <= 0x7F)
  145. {
  146. return 1;
  147. } else if(value <= 0x7FF)
  148. {
  149. return 2;
  150. } else if(BOOST_LIKELY(value <= 0xFFFF))
  151. {
  152. return 3;
  153. } else
  154. {
  155. return 4;
  156. }
  157. }
  158. static bool is_trail(char_type ci)
  159. {
  160. unsigned char c = ci;
  161. return (c & 0xC0) == 0x80;
  162. }
  163. static bool is_lead(char_type ci)
  164. {
  165. return !is_trail(ci);
  166. }
  167. template<typename Iterator>
  168. static code_point decode(Iterator& p, Iterator e)
  169. {
  170. if(BOOST_UNLIKELY(p == e))
  171. return incomplete;
  172. unsigned char lead = *p++;
  173. // First byte is fully validated here
  174. int trail_size = trail_length(lead);
  175. if(BOOST_UNLIKELY(trail_size < 0))
  176. return illegal;
  177. // OK as only ASCII may be of size = 0
  178. // also optimize for ASCII text
  179. if(trail_size == 0)
  180. return lead;
  181. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  182. // Read the rest
  183. unsigned char tmp;
  184. switch(trail_size)
  185. {
  186. case 3:
  187. if(BOOST_UNLIKELY(p == e))
  188. return incomplete;
  189. tmp = *p++;
  190. if(!is_trail(tmp))
  191. return illegal;
  192. c = (c << 6) | (tmp & 0x3F);
  193. BOOST_NOWIDE_FALLTHROUGH;
  194. case 2:
  195. if(BOOST_UNLIKELY(p == e))
  196. return incomplete;
  197. tmp = *p++;
  198. if(!is_trail(tmp))
  199. return illegal;
  200. c = (c << 6) | (tmp & 0x3F);
  201. BOOST_NOWIDE_FALLTHROUGH;
  202. case 1:
  203. if(BOOST_UNLIKELY(p == e))
  204. return incomplete;
  205. tmp = *p++;
  206. if(!is_trail(tmp))
  207. return illegal;
  208. c = (c << 6) | (tmp & 0x3F);
  209. }
  210. // Check code point validity:
  211. // - no surrogates and valid range
  212. // - most compact representation
  213. if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
  214. {
  215. p -= trail_size;
  216. return illegal;
  217. }
  218. return c;
  219. }
  220. template<typename Iterator>
  221. static code_point decode_valid(Iterator& p)
  222. {
  223. unsigned char lead = *p++;
  224. if(lead < 192)
  225. return lead;
  226. int trail_size;
  227. if(lead < 224)
  228. trail_size = 1;
  229. else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
  230. trail_size = 2;
  231. else
  232. trail_size = 3;
  233. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  234. switch(trail_size)
  235. {
  236. case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  237. case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  238. case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
  239. }
  240. return c;
  241. }
  242. template<typename Iterator>
  243. static Iterator encode(code_point value, Iterator out)
  244. {
  245. if(value <= 0x7F)
  246. {
  247. *out++ = static_cast<char_type>(value);
  248. } else if(value <= 0x7FF)
  249. {
  250. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  251. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  252. } else if(BOOST_LIKELY(value <= 0xFFFF))
  253. {
  254. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  255. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  256. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  257. } else
  258. {
  259. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  260. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  261. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  262. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  263. }
  264. return out;
  265. }
  266. }; // utf8
  267. template<typename CharType>
  268. struct utf_traits<CharType, 2>
  269. {
  270. using char_type = CharType;
  271. // See RFC 2781
  272. static bool is_single_codepoint(uint16_t x)
  273. {
  274. // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
  275. return x <= 0xD7FF || x >= 0xE000;
  276. }
  277. static bool is_first_surrogate(uint16_t x)
  278. {
  279. // Range [U+D800, 0+DBFF]: High surrogate
  280. return 0xD800 <= x && x <= 0xDBFF;
  281. }
  282. static bool is_second_surrogate(uint16_t x)
  283. {
  284. // Range [U+DC00, 0+DFFF]: Low surrogate
  285. return 0xDC00 <= x && x <= 0xDFFF;
  286. }
  287. static code_point combine_surrogate(uint16_t w1, uint16_t w2)
  288. {
  289. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  290. }
  291. static int trail_length(char_type c)
  292. {
  293. if(is_first_surrogate(c))
  294. return 1;
  295. if(is_second_surrogate(c))
  296. return -1;
  297. return 0;
  298. }
  299. /// Return true if c is trail code unit, always false for UTF-32
  300. static bool is_trail(char_type c)
  301. {
  302. return is_second_surrogate(c);
  303. }
  304. /// Return true if c is lead code unit, always true of UTF-32
  305. static bool is_lead(char_type c)
  306. {
  307. return !is_second_surrogate(c);
  308. }
  309. template<typename It>
  310. static code_point decode(It& current, It last)
  311. {
  312. if(BOOST_UNLIKELY(current == last))
  313. return incomplete;
  314. uint16_t w1 = *current++;
  315. if(BOOST_LIKELY(is_single_codepoint(w1)))
  316. {
  317. return w1;
  318. }
  319. // Now it's either a high or a low surrogate, the latter is invalid
  320. if(w1 >= 0xDC00)
  321. return illegal;
  322. if(current == last)
  323. return incomplete;
  324. uint16_t w2 = *current++;
  325. if(!is_second_surrogate(w2))
  326. return illegal;
  327. return combine_surrogate(w1, w2);
  328. }
  329. template<typename It>
  330. static code_point decode_valid(It& current)
  331. {
  332. uint16_t w1 = *current++;
  333. if(BOOST_LIKELY(is_single_codepoint(w1)))
  334. {
  335. return w1;
  336. }
  337. uint16_t w2 = *current++;
  338. return combine_surrogate(w1, w2);
  339. }
  340. static const int max_width = 2;
  341. static int width(code_point u) // LCOV_EXCL_LINE
  342. {
  343. return u >= 0x10000 ? 2 : 1;
  344. }
  345. template<typename It>
  346. static It encode(code_point u, It out)
  347. {
  348. if(BOOST_LIKELY(u <= 0xFFFF))
  349. {
  350. *out++ = static_cast<char_type>(u);
  351. } else
  352. {
  353. u -= 0x10000;
  354. *out++ = static_cast<char_type>(0xD800 | (u >> 10));
  355. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  356. }
  357. return out;
  358. }
  359. }; // utf16;
  360. template<typename CharType>
  361. struct utf_traits<CharType, 4>
  362. {
  363. using char_type = CharType;
  364. static int trail_length(char_type c)
  365. {
  366. if(is_valid_codepoint(c))
  367. return 0;
  368. return -1;
  369. }
  370. static bool is_trail(char_type /*c*/)
  371. {
  372. return false;
  373. }
  374. static bool is_lead(char_type /*c*/)
  375. {
  376. return true;
  377. }
  378. template<typename It>
  379. static code_point decode_valid(It& current)
  380. {
  381. return *current++;
  382. }
  383. template<typename It>
  384. static code_point decode(It& current, It last)
  385. {
  386. if(BOOST_UNLIKELY(current == last))
  387. return incomplete;
  388. code_point c = *current++;
  389. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  390. return illegal;
  391. return c;
  392. }
  393. static const int max_width = 1;
  394. static int width(code_point /*u*/)
  395. {
  396. return 1;
  397. }
  398. template<typename It>
  399. static It encode(code_point u, It out)
  400. {
  401. *out++ = static_cast<char_type>(u);
  402. return out;
  403. }
  404. }; // utf32
  405. #endif
  406. } // namespace utf
  407. } // namespace nowide
  408. } // namespace boost
  409. #endif