utf8_codecvt_facet.ipp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  2. // utf8_codecvt_facet.ipp
  3. // Copyright (c) 2001 Ronald Garcia, Indiana University ([email protected])
  4. // Andrew Lumsdaine, Indiana University ([email protected]).
  5. // Use, modification and distribution is subject to the Boost Software
  6. // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
  7. // http://www.boost.org/LICENSE_1_0.txt)
  8. // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
  9. // learn how this file should be used.
  10. #include <boost/detail/utf8_codecvt_facet.hpp>
  11. #include <cstdlib> // for multi-byte conversion routines
  12. #include <cassert>
  13. #include <boost/limits.hpp>
  14. #include <boost/config.hpp>
  15. // If we don't have wstring, then Unicode support
  16. // is not available anyway, so we don't need to even
  17. // compile this file. This also fixes the problem
  18. // with mingw, which can compile this file, but will
  19. // generate link error when building DLL.
  20. #ifndef BOOST_NO_STD_WSTRING
  21. BOOST_UTF8_BEGIN_NAMESPACE
  22. /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
  23. // implementation for wchar_t
  24. namespace detail {
  25. inline const wchar_t * get_octet1_modifier_table() BOOST_NOEXCEPT
  26. {
  27. static const wchar_t octet1_modifier_table[] = {
  28. 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
  29. };
  30. return octet1_modifier_table;
  31. }
  32. } // namespace detail
  33. BOOST_UTF8_DECL utf8_codecvt_facet::utf8_codecvt_facet(
  34. std::size_t no_locale_manage
  35. ) :
  36. std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
  37. {}
  38. BOOST_UTF8_DECL utf8_codecvt_facet::~utf8_codecvt_facet()
  39. {}
  40. // Translate incoming UTF-8 into UCS-4
  41. BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_in(
  42. std::mbstate_t& /*state*/,
  43. const char * from,
  44. const char * from_end,
  45. const char * & from_next,
  46. wchar_t * to,
  47. wchar_t * to_end,
  48. wchar_t * & to_next
  49. ) const {
  50. // Basic algorithm: The first octet determines how many
  51. // octets total make up the UCS-4 character. The remaining
  52. // "continuing octets" all begin with "10". To convert, subtract
  53. // the amount that specifies the number of octets from the first
  54. // octet. Subtract 0x80 (1000 0000) from each continuing octet,
  55. // then mash the whole lot together. Note that each continuing
  56. // octet only uses 6 bits as unique values, so only shift by
  57. // multiples of 6 to combine.
  58. const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table();
  59. while (from != from_end && to != to_end) {
  60. // Error checking on the first octet
  61. if (invalid_leading_octet(*from)) {
  62. from_next = from;
  63. to_next = to;
  64. return std::codecvt_base::error;
  65. }
  66. // The first octet is adjusted by a value dependent upon
  67. // the number of "continuing octets" encoding the character
  68. const int cont_octet_count = get_cont_octet_count(*from);
  69. // The unsigned char conversion is necessary in case char is
  70. // signed (I learned this the hard way)
  71. wchar_t ucs_result =
  72. (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
  73. // Invariants:
  74. // 1) At the start of the loop, 'i' continuing characters have been
  75. // processed
  76. // 2) *from points to the next continuing character to be processed.
  77. int i = 0;
  78. while (i != cont_octet_count && from != from_end) {
  79. // Error checking on continuing characters
  80. if (invalid_continuing_octet(*from)) {
  81. from_next = from;
  82. to_next = to;
  83. return std::codecvt_base::error;
  84. }
  85. ucs_result *= (1 << 6);
  86. // each continuing character has an extra (10xxxxxx)b attached to
  87. // it that must be removed.
  88. ucs_result += (unsigned char)(*from++) - 0x80;
  89. ++i;
  90. }
  91. // If the buffer ends with an incomplete unicode character...
  92. if (from == from_end && i != cont_octet_count) {
  93. // rewind "from" to before the current character translation
  94. from_next = from - (i + 1);
  95. to_next = to;
  96. return std::codecvt_base::partial;
  97. }
  98. *to++ = ucs_result;
  99. }
  100. from_next = from;
  101. to_next = to;
  102. // Were we done converting or did we run out of destination space?
  103. if (from == from_end)
  104. return std::codecvt_base::ok;
  105. else
  106. return std::codecvt_base::partial;
  107. }
  108. BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out(
  109. std::mbstate_t& /*state*/,
  110. const wchar_t * from,
  111. const wchar_t * from_end,
  112. const wchar_t * & from_next,
  113. char * to,
  114. char * to_end,
  115. char * & to_next
  116. ) const
  117. {
  118. const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table();
  119. wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
  120. while (from != from_end && to != to_end) {
  121. // Check for invalid UCS-4 character
  122. if (*from > max_wchar) {
  123. from_next = from;
  124. to_next = to;
  125. return std::codecvt_base::error;
  126. }
  127. int cont_octet_count = get_cont_octet_out_count(*from);
  128. // RG - comment this formula better
  129. int shift_exponent = cont_octet_count * 6;
  130. // Process the first character
  131. *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
  132. (unsigned char)(*from / (1 << shift_exponent)));
  133. // Process the continuation characters
  134. // Invariants: At the start of the loop:
  135. // 1) 'i' continuing octets have been generated
  136. // 2) '*to' points to the next location to place an octet
  137. // 3) shift_exponent is 6 more than needed for the next octet
  138. int i = 0;
  139. while (i != cont_octet_count && to != to_end) {
  140. shift_exponent -= 6;
  141. *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
  142. ++i;
  143. }
  144. // If we filled up the out buffer before encoding the character
  145. if (to == to_end && i != cont_octet_count) {
  146. from_next = from;
  147. to_next = to - (i + 1);
  148. return std::codecvt_base::partial;
  149. }
  150. ++from;
  151. }
  152. from_next = from;
  153. to_next = to;
  154. // Were we done or did we run out of destination space
  155. if (from == from_end)
  156. return std::codecvt_base::ok;
  157. else
  158. return std::codecvt_base::partial;
  159. }
  160. // How many char objects can I process to get <= max_limit
  161. // wchar_t objects?
  162. BOOST_UTF8_DECL int utf8_codecvt_facet::do_length(
  163. std::mbstate_t &,
  164. const char * from,
  165. const char * from_end,
  166. std::size_t max_limit
  167. ) const
  168. #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
  169. throw()
  170. #endif
  171. {
  172. const char * from_next = from;
  173. for (std::size_t char_count = 0u; char_count < max_limit && from_next < from_end; ++char_count) {
  174. unsigned int octet_count = get_octet_count(*from_next);
  175. // The buffer may represent incomplete characters, so terminate early if one is found
  176. if (octet_count > static_cast<std::size_t>(from_end - from_next))
  177. break;
  178. from_next += octet_count;
  179. }
  180. return static_cast<int>(from_next - from);
  181. }
  182. BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count(
  183. unsigned char lead_octet
  184. ) {
  185. // if the 0-bit (MSB) is 0, then 1 character
  186. if (lead_octet <= 0x7f) return 1;
  187. // Otherwise the count number of consecutive 1 bits starting at MSB
  188. // assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
  189. if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
  190. else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
  191. else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
  192. else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
  193. else return 6;
  194. }
  namespace detail {

  // Number of continuation octets needed to encode 'word' in UTF-8.
  // General version, used whenever sizeof(wchar_t) is not 4: at most two
  // continuation octets are ever reported.
  template<std::size_t s>
  inline int get_cont_octet_out_count_impl(wchar_t word) {
      return (word < 0x80) ? 0
           : (word < 0x800) ? 1
           : 2;
  }

  // Version for a 4-byte wchar_t, where up to five continuation octets may
  // be required.
  template<>
  inline int get_cont_octet_out_count_impl<4>(wchar_t word) {
      if (word < 0x80)
          return 0;
      if (word < 0x800)
          return 1;

      // Note that the following code will generate warnings on some platforms
      // where wchar_t is defined as UCS2. The warnings are superfluous as the
      // specialization is never instantiated with such compilers, but this
      // can cause problems if warnings are being treated as errors, so we guard
      // against that. Including <boost/detail/utf8_codecvt_facet.hpp> as we do
      // should be enough to get WCHAR_MAX defined.
  #if !defined(WCHAR_MAX)
  #   error WCHAR_MAX not defined!
  #endif
      // cope with VC++ 7.1 or earlier having invalid WCHAR_MAX
  #if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier
      return 2;
  #elif WCHAR_MAX > 0x10000
      // word is at least 0x800 here; test the largest range first.
      if (word >= 0x4000000)
          return 5;
      if (word >= 0x200000)
          return 4;
      if (word >= 0x10000)
          return 3;
      return 2;
  #else
      return 2;
  #endif
  }

  } // namespace detail
  242. // How many "continuing octets" will be needed for this word
  243. // == total octets - 1.
  244. BOOST_UTF8_DECL int utf8_codecvt_facet::get_cont_octet_out_count(
  245. wchar_t word
  246. ) {
  247. return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
  248. }
  249. BOOST_UTF8_END_NAMESPACE
  250. #endif