unicode_iterator.hpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE unicode_iterator.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
  16. */
  17. /****************************************************************************
  18. Contents:
  19. ~~~~~~~~~
  20. 1) Read Only, Input Adapters:
  21. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  22. template <class BaseIterator, class U8Type = ::boost::uint8_t>
  23. class u32_to_u8_iterator;
  24. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
  25. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  26. class u8_to_u32_iterator;
  27. Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
  28. template <class BaseIterator, class U16Type = ::boost::uint16_t>
  29. class u32_to_u16_iterator;
  30. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
  31. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  32. class u16_to_u32_iterator;
  33. Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
  34. 2) Single pass output iterator adapters:
  35. template <class BaseIterator>
  36. class utf8_output_iterator;
  37. Accepts UTF-32 code points and forwards them on as UTF-8 code points.
  38. template <class BaseIterator>
  39. class utf16_output_iterator;
  40. Accepts UTF-32 code points and forwards them on as UTF-16 code points.
  41. ****************************************************************************/
  42. #ifndef BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
  43. #define BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
  44. #include <boost/cstdint.hpp>
  45. #include <boost/regex/config.hpp>
  46. #include <boost/static_assert.hpp>
  47. #include <boost/throw_exception.hpp>
  48. #include <stdexcept>
  49. #ifndef BOOST_NO_STD_LOCALE
  50. #include <sstream>
  51. #include <ios>
  52. #endif
  53. #include <limits.h> // CHAR_BIT
  54. #ifdef BOOST_REGEX_CXX03
  55. #else
  56. #endif
  57. namespace boost{
  58. namespace detail{
  59. static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
  60. static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
  61. static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
  62. inline bool is_high_surrogate(::boost::uint16_t v)
  63. {
  64. return (v & 0xFFFFFC00u) == 0xd800u;
  65. }
  66. inline bool is_low_surrogate(::boost::uint16_t v)
  67. {
  68. return (v & 0xFFFFFC00u) == 0xdc00u;
  69. }
  // Returns true when v is any surrogate (leading or trailing): every bit
  // selected by 0xFFFFF800 must match the 0xD800 pattern.
  template <class T>
  inline bool is_surrogate(T v)
  {
     const bool matches_surrogate_pattern = ((v & 0xFFFFF800u) == 0xd800);
     return matches_surrogate_pattern;
  }
  75. inline unsigned utf8_byte_count(boost::uint8_t c)
  76. {
  77. // if the most significant bit with a zero in it is in position
  78. // 8-N then there are N bytes in this UTF-8 sequence:
  79. boost::uint8_t mask = 0x80u;
  80. unsigned result = 0;
  81. while(c & mask)
  82. {
  83. ++result;
  84. mask >>= 1;
  85. }
  86. return (result == 0) ? 1 : ((result > 4) ? 4 : result);
  87. }
  88. inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
  89. {
  90. return utf8_byte_count(c) - 1;
  91. }
  92. #ifdef BOOST_MSVC
  93. #pragma warning(push)
  94. #pragma warning(disable:4100)
  95. #endif
  96. #ifndef BOOST_NO_EXCEPTIONS
  97. BOOST_NORETURN
  98. #endif
  99. inline void invalid_utf32_code_point(::boost::uint32_t val)
  100. {
  101. #ifndef BOOST_NO_STD_LOCALE
  102. std::stringstream ss;
  103. ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
  104. std::out_of_range e(ss.str());
  105. #else
  106. std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
  107. #endif
  108. boost::throw_exception(e);
  109. }
  110. #ifdef BOOST_MSVC
  111. #pragma warning(pop)
  112. #endif
  113. } // namespace detail
  114. template <class BaseIterator, class U16Type = ::boost::uint16_t>
  115. class u32_to_u16_iterator
  116. {
  117. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  118. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  119. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
  120. BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
  121. #endif
  122. public:
  123. typedef std::ptrdiff_t difference_type;
  124. typedef U16Type value_type;
  125. typedef value_type const* pointer;
  126. typedef value_type const reference;
  127. typedef std::bidirectional_iterator_tag iterator_category;
  128. reference operator*()const
  129. {
  130. if(m_current == 2)
  131. extract_current();
  132. return m_values[m_current];
  133. }
  134. bool operator==(const u32_to_u16_iterator& that)const
  135. {
  136. if(m_position == that.m_position)
  137. {
  138. // Both m_currents must be equal, or both even
  139. // this is the same as saying their sum must be even:
  140. return (m_current + that.m_current) & 1u ? false : true;
  141. }
  142. return false;
  143. }
  144. bool operator!=(const u32_to_u16_iterator& that)const
  145. {
  146. return !(*this == that);
  147. }
  148. u32_to_u16_iterator& operator++()
  149. {
  150. // if we have a pending read then read now, so that we know whether
  151. // to skip a position, or move to a low-surrogate:
  152. if(m_current == 2)
  153. {
  154. // pending read:
  155. extract_current();
  156. }
  157. // move to the next surrogate position:
  158. ++m_current;
  159. // if we've reached the end skip a position:
  160. if(m_values[m_current] == 0)
  161. {
  162. m_current = 2;
  163. ++m_position;
  164. }
  165. return *this;
  166. }
  167. u32_to_u16_iterator operator++(int)
  168. {
  169. u32_to_u16_iterator r(*this);
  170. ++(*this);
  171. return r;
  172. }
  173. u32_to_u16_iterator& operator--()
  174. {
  175. if(m_current != 1)
  176. {
  177. // decrementing an iterator always leads to a valid position:
  178. --m_position;
  179. extract_current();
  180. m_current = m_values[1] ? 1 : 0;
  181. }
  182. else
  183. {
  184. m_current = 0;
  185. }
  186. return *this;
  187. }
  188. u32_to_u16_iterator operator--(int)
  189. {
  190. u32_to_u16_iterator r(*this);
  191. --(*this);
  192. return r;
  193. }
  194. BaseIterator base()const
  195. {
  196. return m_position;
  197. }
  198. // construct:
  199. u32_to_u16_iterator() : m_position(), m_current(0)
  200. {
  201. m_values[0] = 0;
  202. m_values[1] = 0;
  203. m_values[2] = 0;
  204. }
  205. u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
  206. {
  207. m_values[0] = 0;
  208. m_values[1] = 0;
  209. m_values[2] = 0;
  210. }
  211. private:
  212. void extract_current()const
  213. {
  214. // begin by checking for a code point out of range:
  215. ::boost::uint32_t v = *m_position;
  216. if(v >= 0x10000u)
  217. {
  218. if(v > 0x10FFFFu)
  219. detail::invalid_utf32_code_point(*m_position);
  220. // split into two surrogates:
  221. m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
  222. m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  223. m_current = 0;
  224. BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
  225. BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
  226. }
  227. else
  228. {
  229. // 16-bit code point:
  230. m_values[0] = static_cast<U16Type>(*m_position);
  231. m_values[1] = 0;
  232. m_current = 0;
  233. // value must not be a surrogate:
  234. if(detail::is_surrogate(m_values[0]))
  235. detail::invalid_utf32_code_point(*m_position);
  236. }
  237. }
  238. BaseIterator m_position;
  239. mutable U16Type m_values[3];
  240. mutable unsigned m_current;
  241. };
  242. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  243. class u16_to_u32_iterator
  244. {
  245. // special values for pending iterator reads:
  246. BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
  247. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  248. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  249. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
  250. BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
  251. #endif
  252. public:
  253. typedef std::ptrdiff_t difference_type;
  254. typedef U32Type value_type;
  255. typedef value_type const* pointer;
  256. typedef value_type const reference;
  257. typedef std::bidirectional_iterator_tag iterator_category;
  258. reference operator*()const
  259. {
  260. if(m_value == pending_read)
  261. extract_current();
  262. return m_value;
  263. }
  264. bool operator==(const u16_to_u32_iterator& that)const
  265. {
  266. return m_position == that.m_position;
  267. }
  268. bool operator!=(const u16_to_u32_iterator& that)const
  269. {
  270. return !(*this == that);
  271. }
  272. u16_to_u32_iterator& operator++()
  273. {
  274. // skip high surrogate first if there is one:
  275. if(detail::is_high_surrogate(*m_position)) ++m_position;
  276. ++m_position;
  277. m_value = pending_read;
  278. return *this;
  279. }
  280. u16_to_u32_iterator operator++(int)
  281. {
  282. u16_to_u32_iterator r(*this);
  283. ++(*this);
  284. return r;
  285. }
  286. u16_to_u32_iterator& operator--()
  287. {
  288. --m_position;
  289. // if we have a low surrogate then go back one more:
  290. if(detail::is_low_surrogate(*m_position))
  291. --m_position;
  292. m_value = pending_read;
  293. return *this;
  294. }
  295. u16_to_u32_iterator operator--(int)
  296. {
  297. u16_to_u32_iterator r(*this);
  298. --(*this);
  299. return r;
  300. }
  301. BaseIterator base()const
  302. {
  303. return m_position;
  304. }
  305. // construct:
  306. u16_to_u32_iterator() : m_position()
  307. {
  308. m_value = pending_read;
  309. }
  310. u16_to_u32_iterator(BaseIterator b) : m_position(b)
  311. {
  312. m_value = pending_read;
  313. }
  314. //
  315. // Range checked version:
  316. //
  317. u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  318. {
  319. m_value = pending_read;
  320. //
  321. // The range must not start with a low surrogate, or end in a high surrogate,
  322. // otherwise we run the risk of running outside the underlying input range.
  323. // Likewise b must not be located at a low surrogate.
  324. //
  325. boost::uint16_t val;
  326. if(start != end)
  327. {
  328. if((b != start) && (b != end))
  329. {
  330. val = *b;
  331. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  332. invalid_code_point(val);
  333. }
  334. val = *start;
  335. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  336. invalid_code_point(val);
  337. val = *--end;
  338. if(detail::is_high_surrogate(val))
  339. invalid_code_point(val);
  340. }
  341. }
  342. private:
  343. static void invalid_code_point(::boost::uint16_t val)
  344. {
  345. #ifndef BOOST_NO_STD_LOCALE
  346. std::stringstream ss;
  347. ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
  348. std::out_of_range e(ss.str());
  349. #else
  350. std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
  351. #endif
  352. boost::throw_exception(e);
  353. }
  354. void extract_current()const
  355. {
  356. m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
  357. // if the last value is a high surrogate then adjust m_position and m_value as needed:
  358. if(detail::is_high_surrogate(*m_position))
  359. {
  360. // precondition; next value must have be a low-surrogate:
  361. BaseIterator next(m_position);
  362. ::boost::uint16_t t = *++next;
  363. if((t & 0xFC00u) != 0xDC00u)
  364. invalid_code_point(t);
  365. m_value = (m_value - detail::high_surrogate_base) << 10;
  366. m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
  367. }
  368. // postcondition; result must not be a surrogate:
  369. if(detail::is_surrogate(m_value))
  370. invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
  371. }
  372. BaseIterator m_position;
  373. mutable U32Type m_value;
  374. };
  375. template <class BaseIterator, class U8Type = ::boost::uint8_t>
  376. class u32_to_u8_iterator
  377. {
  378. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  379. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  380. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
  381. BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
  382. #endif
  383. public:
  384. typedef std::ptrdiff_t difference_type;
  385. typedef U8Type value_type;
  386. typedef value_type const* pointer;
  387. typedef value_type const reference;
  388. typedef std::bidirectional_iterator_tag iterator_category;
  389. reference operator*()const
  390. {
  391. if(m_current == 4)
  392. extract_current();
  393. return m_values[m_current];
  394. }
  395. bool operator==(const u32_to_u8_iterator& that)const
  396. {
  397. if(m_position == that.m_position)
  398. {
  399. // either the m_current's must be equal, or one must be 0 and
  400. // the other 4: which means neither must have bits 1 or 2 set:
  401. return (m_current == that.m_current)
  402. || (((m_current | that.m_current) & 3) == 0);
  403. }
  404. return false;
  405. }
  406. bool operator!=(const u32_to_u8_iterator& that)const
  407. {
  408. return !(*this == that);
  409. }
  410. u32_to_u8_iterator& operator++()
  411. {
  412. // if we have a pending read then read now, so that we know whether
  413. // to skip a position, or move to a low-surrogate:
  414. if(m_current == 4)
  415. {
  416. // pending read:
  417. extract_current();
  418. }
  419. // move to the next surrogate position:
  420. ++m_current;
  421. // if we've reached the end skip a position:
  422. if(m_values[m_current] == 0)
  423. {
  424. m_current = 4;
  425. ++m_position;
  426. }
  427. return *this;
  428. }
  429. u32_to_u8_iterator operator++(int)
  430. {
  431. u32_to_u8_iterator r(*this);
  432. ++(*this);
  433. return r;
  434. }
  435. u32_to_u8_iterator& operator--()
  436. {
  437. if((m_current & 3) == 0)
  438. {
  439. --m_position;
  440. extract_current();
  441. m_current = 3;
  442. while(m_current && (m_values[m_current] == 0))
  443. --m_current;
  444. }
  445. else
  446. --m_current;
  447. return *this;
  448. }
  449. u32_to_u8_iterator operator--(int)
  450. {
  451. u32_to_u8_iterator r(*this);
  452. --(*this);
  453. return r;
  454. }
  455. BaseIterator base()const
  456. {
  457. return m_position;
  458. }
  459. // construct:
  460. u32_to_u8_iterator() : m_position(), m_current(0)
  461. {
  462. m_values[0] = 0;
  463. m_values[1] = 0;
  464. m_values[2] = 0;
  465. m_values[3] = 0;
  466. m_values[4] = 0;
  467. }
  468. u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
  469. {
  470. m_values[0] = 0;
  471. m_values[1] = 0;
  472. m_values[2] = 0;
  473. m_values[3] = 0;
  474. m_values[4] = 0;
  475. }
  476. private:
  477. void extract_current()const
  478. {
  479. boost::uint32_t c = *m_position;
  480. if(c > 0x10FFFFu)
  481. detail::invalid_utf32_code_point(c);
  482. if(c < 0x80u)
  483. {
  484. m_values[0] = static_cast<unsigned char>(c);
  485. m_values[1] = static_cast<unsigned char>(0u);
  486. m_values[2] = static_cast<unsigned char>(0u);
  487. m_values[3] = static_cast<unsigned char>(0u);
  488. }
  489. else if(c < 0x800u)
  490. {
  491. m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
  492. m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  493. m_values[2] = static_cast<unsigned char>(0u);
  494. m_values[3] = static_cast<unsigned char>(0u);
  495. }
  496. else if(c < 0x10000u)
  497. {
  498. m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
  499. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  500. m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  501. m_values[3] = static_cast<unsigned char>(0u);
  502. }
  503. else
  504. {
  505. m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
  506. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  507. m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  508. m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  509. }
  510. m_current= 0;
  511. }
  512. BaseIterator m_position;
  513. mutable U8Type m_values[5];
  514. mutable unsigned m_current;
  515. };
  516. template <class BaseIterator, class U32Type = ::boost::uint32_t>
  517. class u8_to_u32_iterator
  518. {
  519. // special values for pending iterator reads:
  520. BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
  521. #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
  522. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  523. BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
  524. BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
  525. #endif
  526. public:
  527. typedef std::ptrdiff_t difference_type;
  528. typedef U32Type value_type;
  529. typedef value_type const* pointer;
  530. typedef value_type const reference;
  531. typedef std::bidirectional_iterator_tag iterator_category;
  532. reference operator*()const
  533. {
  534. if(m_value == pending_read)
  535. extract_current();
  536. return m_value;
  537. }
  538. bool operator==(const u8_to_u32_iterator& that)const
  539. {
  540. return m_position == that.m_position;
  541. }
  542. bool operator!=(const u8_to_u32_iterator& that)const
  543. {
  544. return !(*this == that);
  545. }
  546. u8_to_u32_iterator& operator++()
  547. {
  548. // We must not start with a continuation character:
  549. if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
  550. invalid_sequence();
  551. // skip high surrogate first if there is one:
  552. unsigned c = detail::utf8_byte_count(*m_position);
  553. if(m_value == pending_read)
  554. {
  555. // Since we haven't read in a value, we need to validate the code points:
  556. for(unsigned i = 0; i < c; ++i)
  557. {
  558. ++m_position;
  559. // We must have a continuation byte:
  560. if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
  561. invalid_sequence();
  562. }
  563. }
  564. else
  565. {
  566. std::advance(m_position, c);
  567. }
  568. m_value = pending_read;
  569. return *this;
  570. }
  571. u8_to_u32_iterator operator++(int)
  572. {
  573. u8_to_u32_iterator r(*this);
  574. ++(*this);
  575. return r;
  576. }
  577. u8_to_u32_iterator& operator--()
  578. {
  579. // Keep backtracking until we don't have a trailing character:
  580. unsigned count = 0;
  581. while((*--m_position & 0xC0u) == 0x80u) ++count;
  582. // now check that the sequence was valid:
  583. if(count != detail::utf8_trailing_byte_count(*m_position))
  584. invalid_sequence();
  585. m_value = pending_read;
  586. return *this;
  587. }
  588. u8_to_u32_iterator operator--(int)
  589. {
  590. u8_to_u32_iterator r(*this);
  591. --(*this);
  592. return r;
  593. }
  594. BaseIterator base()const
  595. {
  596. return m_position;
  597. }
  598. // construct:
  599. u8_to_u32_iterator() : m_position()
  600. {
  601. m_value = pending_read;
  602. }
  603. u8_to_u32_iterator(BaseIterator b) : m_position(b)
  604. {
  605. m_value = pending_read;
  606. }
  607. //
  608. // Checked constructor:
  609. //
  610. u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  611. {
  612. m_value = pending_read;
  613. //
  614. // We must not start with a continuation character, or end with a
  615. // truncated UTF-8 sequence otherwise we run the risk of going past
  616. // the start/end of the underlying sequence:
  617. //
  618. if(start != end)
  619. {
  620. unsigned char v = *start;
  621. if((v & 0xC0u) == 0x80u)
  622. invalid_sequence();
  623. if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
  624. invalid_sequence();
  625. BaseIterator pos = end;
  626. do
  627. {
  628. v = *--pos;
  629. }
  630. while((start != pos) && ((v & 0xC0u) == 0x80u));
  631. std::ptrdiff_t extra = detail::utf8_byte_count(v);
  632. if(std::distance(pos, end) < extra)
  633. invalid_sequence();
  634. }
  635. }
  636. private:
  637. static void invalid_sequence()
  638. {
  639. std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
  640. boost::throw_exception(e);
  641. }
  642. void extract_current()const
  643. {
  644. m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
  645. // we must not have a continuation character:
  646. if((m_value & 0xC0u) == 0x80u)
  647. invalid_sequence();
  648. // see how many extra bytes we have:
  649. unsigned extra = detail::utf8_trailing_byte_count(*m_position);
  650. // extract the extra bits, 6 from each extra byte:
  651. BaseIterator next(m_position);
  652. for(unsigned c = 0; c < extra; ++c)
  653. {
  654. ++next;
  655. m_value <<= 6;
  656. // We must have a continuation byte:
  657. if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
  658. invalid_sequence();
  659. m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
  660. }
  661. // we now need to remove a few of the leftmost bits, but how many depends
  662. // upon how many extra bytes we've extracted:
  663. static const boost::uint32_t masks[4] =
  664. {
  665. 0x7Fu,
  666. 0x7FFu,
  667. 0xFFFFu,
  668. 0x1FFFFFu,
  669. };
  670. m_value &= masks[extra];
  671. // check the result is in range:
  672. if(m_value > static_cast<U32Type>(0x10FFFFu))
  673. invalid_sequence();
  674. // The result must not be a surrogate:
  675. if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
  676. invalid_sequence();
  677. // We should not have had an invalidly encoded UTF8 sequence:
  678. if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
  679. invalid_sequence();
  680. }
  681. BaseIterator m_position;
  682. mutable U32Type m_value;
  683. };
  684. template <class BaseIterator>
  685. class utf16_output_iterator
  686. {
  687. public:
  688. typedef void difference_type;
  689. typedef void value_type;
  690. typedef boost::uint32_t* pointer;
  691. typedef boost::uint32_t& reference;
  692. typedef std::output_iterator_tag iterator_category;
  693. utf16_output_iterator(const BaseIterator& b)
  694. : m_position(b){}
  695. utf16_output_iterator(const utf16_output_iterator& that)
  696. : m_position(that.m_position){}
  697. utf16_output_iterator& operator=(const utf16_output_iterator& that)
  698. {
  699. m_position = that.m_position;
  700. return *this;
  701. }
  702. const utf16_output_iterator& operator*()const
  703. {
  704. return *this;
  705. }
  706. void operator=(boost::uint32_t val)const
  707. {
  708. push(val);
  709. }
  710. utf16_output_iterator& operator++()
  711. {
  712. return *this;
  713. }
  714. utf16_output_iterator& operator++(int)
  715. {
  716. return *this;
  717. }
  718. BaseIterator base()const
  719. {
  720. return m_position;
  721. }
  722. private:
  723. void push(boost::uint32_t v)const
  724. {
  725. if(v >= 0x10000u)
  726. {
  727. // begin by checking for a code point out of range:
  728. if(v > 0x10FFFFu)
  729. detail::invalid_utf32_code_point(v);
  730. // split into two surrogates:
  731. *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
  732. *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  733. }
  734. else
  735. {
  736. // 16-bit code point:
  737. // value must not be a surrogate:
  738. if(detail::is_surrogate(v))
  739. detail::invalid_utf32_code_point(v);
  740. *m_position++ = static_cast<boost::uint16_t>(v);
  741. }
  742. }
  743. mutable BaseIterator m_position;
  744. };
  745. template <class BaseIterator>
  746. class utf8_output_iterator
  747. {
  748. public:
  749. typedef void difference_type;
  750. typedef void value_type;
  751. typedef boost::uint32_t* pointer;
  752. typedef boost::uint32_t& reference;
  753. typedef std::output_iterator_tag iterator_category;
  754. utf8_output_iterator(const BaseIterator& b)
  755. : m_position(b){}
  756. utf8_output_iterator(const utf8_output_iterator& that)
  757. : m_position(that.m_position){}
  758. utf8_output_iterator& operator=(const utf8_output_iterator& that)
  759. {
  760. m_position = that.m_position;
  761. return *this;
  762. }
  763. const utf8_output_iterator& operator*()const
  764. {
  765. return *this;
  766. }
  767. void operator=(boost::uint32_t val)const
  768. {
  769. push(val);
  770. }
  771. utf8_output_iterator& operator++()
  772. {
  773. return *this;
  774. }
  775. utf8_output_iterator& operator++(int)
  776. {
  777. return *this;
  778. }
  779. BaseIterator base()const
  780. {
  781. return m_position;
  782. }
  783. private:
  784. void push(boost::uint32_t c)const
  785. {
  786. if(c > 0x10FFFFu)
  787. detail::invalid_utf32_code_point(c);
  788. if(c < 0x80u)
  789. {
  790. *m_position++ = static_cast<unsigned char>(c);
  791. }
  792. else if(c < 0x800u)
  793. {
  794. *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
  795. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  796. }
  797. else if(c < 0x10000u)
  798. {
  799. *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
  800. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  801. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  802. }
  803. else
  804. {
  805. *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
  806. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  807. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  808. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  809. }
  810. }
  811. mutable BaseIterator m_position;
  812. };
  813. } // namespace boost
  814. #endif // BOOST_REGEX_V4_UNICODE_ITERATOR_HPP