index.hpp 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0.
  5. // https://www.boost.org/LICENSE_1_0.txt
  6. #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  7. #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  8. #include <boost/locale/boundary/boundary_point.hpp>
  9. #include <boost/locale/boundary/facets.hpp>
  10. #include <boost/locale/boundary/segment.hpp>
  11. #include <boost/locale/boundary/types.hpp>
  12. #include <boost/iterator/iterator_facade.hpp>
  13. #include <algorithm>
  14. #include <cstdint>
  15. #include <iterator>
  16. #include <locale>
  17. #include <memory>
  18. #include <stdexcept>
  19. #include <string>
  20. #include <type_traits>
  21. #include <vector>
  22. #ifdef BOOST_MSVC
  23. # pragma warning(push)
  24. # pragma warning(disable : 4275 4251 4231 4660)
  25. #endif
  26. namespace boost { namespace locale { namespace boundary {
  27. ///
  28. /// \defgroup boundary Boundary Analysis
  29. ///
  30. /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence
  31. /// boundaries
  32. ///
  33. /// @{
  34. ///
  35. /// \cond INTERNAL
  36. namespace detail {
  37. template<typename Char>
  38. const boundary_indexing<Char>& get_boundary_indexing(const std::locale& l)
  39. {
  40. using facet_type = boundary_indexing<Char>;
  41. if(!std::has_facet<facet_type>(l))
  42. throw std::runtime_error("Locale was generated without segmentation support!");
  43. return std::use_facet<facet_type>(l);
  44. }
  45. template<typename IteratorType,
  46. typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
  47. struct mapping_traits {
  48. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  49. static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
  50. {
  51. std::basic_string<char_type> str(b, e);
  52. return get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
  53. }
  54. };
  /// Compile-time test: is \a SomeIteratorType one of the iterator types that are treated
  /// as addressing contiguous storage of \a CharType?
  ///
  /// The accepted types are raw pointers and the (const) iterators of std::basic_string
  /// and std::vector instantiated for \a CharType.
  template<typename CharType, typename SomeIteratorType>
  struct linear_iterator_traits {
  private:
      typedef std::basic_string<CharType> string_type;
      typedef std::vector<CharType> vector_type;

      // Shorthand for "SomeIteratorType is exactly Candidate"
      template<typename Candidate>
      using matches = std::is_same<SomeIteratorType, Candidate>;

  public:
      static constexpr bool is_linear =
        matches<CharType*>::value || matches<const CharType*>::value
        || matches<typename string_type::iterator>::value
        || matches<typename string_type::const_iterator>::value
        || matches<typename vector_type::iterator>::value
        || matches<typename vector_type::const_iterator>::value;
  };
  64. template<typename IteratorType>
  65. struct mapping_traits<IteratorType, std::random_access_iterator_tag> {
  66. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  67. static index_type map(boundary_type t, IteratorType b, IteratorType e, const std::locale& l)
  68. {
  69. index_type result;
  70. // Optimize for most common cases
  71. //
  72. // C++11 requires that string is continuous in memory and all known
  73. // string implementations do this because of c_str() support.
  74. if(linear_iterator_traits<char_type, IteratorType>::is_linear && b != e) {
  75. const char_type* begin = &*b;
  76. const char_type* end = begin + (e - b);
  77. index_type tmp = get_boundary_indexing<char_type>(l).map(t, begin, end);
  78. result.swap(tmp);
  79. } else {
  80. std::basic_string<char_type> str(b, e);
  81. index_type tmp = get_boundary_indexing<char_type>(l).map(t, str.c_str(), str.c_str() + str.size());
  82. result.swap(tmp);
  83. }
  84. return result;
  85. }
  86. };
  87. template<typename BaseIterator>
  88. class mapping {
  89. public:
  90. typedef BaseIterator base_iterator;
  91. typedef typename std::iterator_traits<base_iterator>::value_type char_type;
  92. mapping(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc) :
  93. index_(new index_type()), begin_(begin), end_(end)
  94. {
  95. index_type idx = detail::mapping_traits<base_iterator>::map(type, begin, end, loc);
  96. index_->swap(idx);
  97. }
  98. mapping() {}
  99. const index_type& index() const { return *index_; }
  100. base_iterator begin() const { return begin_; }
  101. base_iterator end() const { return end_; }
  102. private:
  103. std::shared_ptr<index_type> index_;
  104. base_iterator begin_, end_;
  105. };
  106. template<typename BaseIterator>
  107. class segment_index_iterator : public boost::iterator_facade<segment_index_iterator<BaseIterator>,
  108. segment<BaseIterator>,
  109. boost::bidirectional_traversal_tag,
  110. const segment<BaseIterator>&> {
  111. public:
  112. typedef BaseIterator base_iterator;
  113. typedef mapping<base_iterator> mapping_type;
  114. typedef segment<base_iterator> segment_type;
  115. segment_index_iterator() : current_(0, 0), map_(nullptr), mask_(0), full_select_(false) {}
  116. segment_index_iterator(base_iterator p, const mapping_type* map, rule_type mask, bool full_select) :
  117. map_(map), mask_(mask), full_select_(full_select)
  118. {
  119. set(p);
  120. }
  121. segment_index_iterator(bool is_begin, const mapping_type* map, rule_type mask, bool full_select) :
  122. map_(map), mask_(mask), full_select_(full_select)
  123. {
  124. if(is_begin)
  125. set_begin();
  126. else
  127. set_end();
  128. }
  129. const segment_type& dereference() const { return value_; }
  130. bool equal(const segment_index_iterator& other) const
  131. {
  132. return map_ == other.map_ && current_.second == other.current_.second;
  133. }
  134. void increment()
  135. {
  136. std::pair<size_t, size_t> next = current_;
  137. if(full_select_) {
  138. next.first = next.second;
  139. while(next.second < size()) {
  140. next.second++;
  141. if(valid_offset(next.second))
  142. break;
  143. }
  144. if(next.second == size())
  145. next.first = next.second - 1;
  146. } else {
  147. while(next.second < size()) {
  148. next.first = next.second;
  149. next.second++;
  150. if(valid_offset(next.second))
  151. break;
  152. }
  153. }
  154. update_current(next);
  155. }
  156. void decrement()
  157. {
  158. std::pair<size_t, size_t> next = current_;
  159. if(full_select_) {
  160. while(next.second > 1) {
  161. next.second--;
  162. if(valid_offset(next.second))
  163. break;
  164. }
  165. next.first = next.second;
  166. while(next.first > 0) {
  167. next.first--;
  168. if(valid_offset(next.first))
  169. break;
  170. }
  171. } else {
  172. while(next.second > 1) {
  173. next.second--;
  174. if(valid_offset(next.second))
  175. break;
  176. }
  177. next.first = next.second - 1;
  178. }
  179. update_current(next);
  180. }
  181. private:
  182. void set_end()
  183. {
  184. current_.first = size() - 1;
  185. current_.second = size();
  186. value_ = segment_type(map_->end(), map_->end(), 0);
  187. }
  188. void set_begin()
  189. {
  190. current_.first = current_.second = 0;
  191. value_ = segment_type(map_->begin(), map_->begin(), 0);
  192. increment();
  193. }
  194. void set(base_iterator p)
  195. {
  196. const auto b = map_->index().begin(), e = map_->index().end();
  197. auto boundary_point = std::upper_bound(b, e, break_info(std::distance(map_->begin(), p)));
  198. while(boundary_point != e && (boundary_point->rule & mask_) == 0)
  199. ++boundary_point;
  200. current_.first = current_.second = boundary_point - b;
  201. if(full_select_) {
  202. while(current_.first > 0) {
  203. current_.first--;
  204. if(valid_offset(current_.first))
  205. break;
  206. }
  207. } else {
  208. if(current_.first > 0)
  209. current_.first--;
  210. }
  211. value_.first = map_->begin();
  212. std::advance(value_.first, get_offset(current_.first));
  213. value_.second = value_.first;
  214. std::advance(value_.second, get_offset(current_.second) - get_offset(current_.first));
  215. update_rule();
  216. }
  217. void update_current(std::pair<size_t, size_t> pos)
  218. {
  219. std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
  220. std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
  221. std::advance(value_.first, first_diff);
  222. std::advance(value_.second, second_diff);
  223. current_ = pos;
  224. update_rule();
  225. }
  226. void update_rule()
  227. {
  228. if(current_.second != size())
  229. value_.rule(index()[current_.second].rule);
  230. }
  231. size_t get_offset(size_t ind) const
  232. {
  233. if(ind == size())
  234. return index().back().offset;
  235. return index()[ind].offset;
  236. }
  237. bool valid_offset(size_t offset) const
  238. {
  239. return offset == 0 || offset == size() // make sure we not acess index[size]
  240. || (index()[offset].rule & mask_) != 0;
  241. }
  242. size_t size() const { return index().size(); }
  243. const index_type& index() const { return map_->index(); }
  244. segment_type value_;
  245. std::pair<size_t, size_t> current_;
  246. const mapping_type* map_;
  247. rule_type mask_;
  248. bool full_select_;
  249. };
  250. template<typename BaseIterator>
  251. class boundary_point_index_iterator : public boost::iterator_facade<boundary_point_index_iterator<BaseIterator>,
  252. boundary_point<BaseIterator>,
  253. boost::bidirectional_traversal_tag,
  254. const boundary_point<BaseIterator>&> {
  255. public:
  256. typedef BaseIterator base_iterator;
  257. typedef mapping<base_iterator> mapping_type;
  258. typedef boundary_point<base_iterator> boundary_point_type;
  259. boundary_point_index_iterator() : current_(0), map_(nullptr), mask_(0) {}
  260. boundary_point_index_iterator(bool is_begin, const mapping_type* map, rule_type mask) :
  261. map_(map), mask_(mask)
  262. {
  263. if(is_begin)
  264. set_begin();
  265. else
  266. set_end();
  267. }
  268. boundary_point_index_iterator(base_iterator p, const mapping_type* map, rule_type mask) :
  269. map_(map), mask_(mask)
  270. {
  271. set(p);
  272. }
  273. const boundary_point_type& dereference() const { return value_; }
  274. bool equal(const boundary_point_index_iterator& other) const
  275. {
  276. return map_ == other.map_ && current_ == other.current_;
  277. }
  278. void increment()
  279. {
  280. size_t next = current_;
  281. while(next < size()) {
  282. next++;
  283. if(valid_offset(next))
  284. break;
  285. }
  286. update_current(next);
  287. }
  288. void decrement()
  289. {
  290. size_t next = current_;
  291. while(next > 0) {
  292. next--;
  293. if(valid_offset(next))
  294. break;
  295. }
  296. update_current(next);
  297. }
  298. private:
  299. void set_end()
  300. {
  301. current_ = size();
  302. value_ = boundary_point_type(map_->end(), 0);
  303. }
  304. void set_begin()
  305. {
  306. current_ = 0;
  307. value_ = boundary_point_type(map_->begin(), 0);
  308. }
  309. void set(base_iterator p)
  310. {
  311. size_t dist = std::distance(map_->begin(), p);
  312. const auto b = index().begin(), e = index().end();
  313. const auto ptr = std::lower_bound(b, e, break_info(dist));
  314. if(ptr == e)
  315. current_ = size() - 1;
  316. else
  317. current_ = ptr - b;
  318. while(!valid_offset(current_))
  319. current_++;
  320. std::ptrdiff_t diff = get_offset(current_) - dist;
  321. std::advance(p, diff);
  322. value_.iterator(p);
  323. update_rule();
  324. }
  325. void update_current(size_t pos)
  326. {
  327. std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
  328. base_iterator i = value_.iterator();
  329. std::advance(i, diff);
  330. current_ = pos;
  331. value_.iterator(i);
  332. update_rule();
  333. }
  334. void update_rule()
  335. {
  336. if(current_ != size())
  337. value_.rule(index()[current_].rule);
  338. }
  339. size_t get_offset(size_t ind) const
  340. {
  341. if(ind == size())
  342. return index().back().offset;
  343. return index()[ind].offset;
  344. }
  345. bool valid_offset(size_t offset) const
  346. {
  347. return offset == 0 || offset + 1 >= size() // last and first are always valid regardless of mark
  348. || (index()[offset].rule & mask_) != 0;
  349. }
  350. size_t size() const { return index().size(); }
  351. const index_type& index() const { return map_->index(); }
  352. boundary_point_type value_;
  353. size_t current_;
  354. const mapping_type* map_;
  355. rule_type mask_;
  356. };
  357. } // namespace detail
  358. /// \endcond
  359. template<typename BaseIterator>
  360. class segment_index;
  361. template<typename BaseIterator>
  362. class boundary_point_index;
  363. /// \brief This class holds an index of segments in the text range and allows to iterate over them
  364. ///
  365. /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators
  366. /// to the \ref segment objects.
  367. ///
  368. /// It provides two options on way of selecting segments:
  369. ///
  370. /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to
  371. /// various masks %as \ref word_any.
  372. /// \n
  373. /// The default is to select any types of boundaries.
  374. /// \n
  375. /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators
  376. /// would iterate only over the words containing Kana letters and \ref word_any would select all types of
  377. /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text
  378. /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be",
  379. /// instead of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?".
  380. /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous
  381. /// %boundary point does not fit the selected rule.
  382. /// \n
  383. /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?".
  384. /// \n
  385. /// This text contains three %boundary points separating it to sentences by different rules:
  386. /// - The exclamation mark "!" ends the sentence "Hello!"
  387. /// - The line feed that splits the sentence "How\nare you?" into two parts.
  388. /// - The question mark that ends the second sentence.
  389. /// \n
  390. /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would
  391. /// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required
  392. /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include
  393. /// all the text up to previous valid %boundary point and would return two expected sentences:
  394. /// "Hello!" and "How\nare you?".
  395. ///
  396. /// This class allows to find a segment according to the given iterator in range using \ref find() member
  397. /// function.
  398. ///
  399. /// \note
  400. ///
  401. /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text
  402. /// invalidates existing iterators and they can't be used any more.
  403. /// - segment_index can be created from boundary_point_index or other segment_index that was created with
  404. /// same \ref boundary_type. This is very fast operation %as they shared same index
  405. /// and it does not require its regeneration.
  406. ///
  407. /// \see
  408. ///
  409. /// - \ref boundary_point_index
  410. /// - \ref segment
  411. /// - \ref boundary_point
  412. template<typename BaseIterator>
  413. class segment_index {
  414. public:
  415. /// The type of the iterator used to iterate over the original text
  416. typedef BaseIterator base_iterator;
  417. #ifdef BOOST_LOCALE_DOXYGEN
  418. /// The bidirectional iterator that iterates over \ref value_type objects.
  419. ///
  420. /// - The iterators may be invalidated by use of any non-const member function
  421. /// including but not limited to \ref rule(rule_type) and \ref full_select(bool).
  422. /// - The returned value_type object is valid %as long %as iterator points to it.
  423. /// So this following code is wrong %as t used after p was updated:
  424. /// \code
  425. /// segment_index<some_iterator>::iterator p=index.begin();
  426. /// segment<some_iterator> &t = *p;
  427. /// ++p;
  428. /// std::cout << t.str() << std::endl;
  429. /// \endcode
  430. typedef unspecified_iterator_type iterator;
  431. /// \copydoc iterator
  432. typedef unspecified_iterator_type const_iterator;
  433. #else
  434. typedef detail::segment_index_iterator<base_iterator> iterator;
  435. typedef detail::segment_index_iterator<base_iterator> const_iterator;
  436. #endif
  437. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  438. /// an object that represents selected segment.
  439. typedef segment<base_iterator> value_type;
  440. /// Default constructor.
  441. ///
  442. /// \note
  443. ///
  444. /// When this object is constructed by default it does not include a valid index, thus
  445. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  446. /// behavior
  447. segment_index() : mask_(0xFFFFFFFFu), full_select_(false) {}
  448. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  449. /// in range [begin,end) using a rule \a mask for locale \a loc.
  450. segment_index(boundary_type type,
  451. base_iterator begin,
  452. base_iterator end,
  453. rule_type mask,
  454. const std::locale& loc = std::locale()) :
  455. map_(type, begin, end, loc),
  456. mask_(mask), full_select_(false)
  457. {}
  458. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  459. /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc.
  460. segment_index(boundary_type type,
  461. base_iterator begin,
  462. base_iterator end,
  463. const std::locale& loc = std::locale()) :
  464. map_(type, begin, end, loc),
  465. mask_(0xFFFFFFFFu), full_select_(false)
  466. {}
  467. /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information
  468. /// and used default rule (all possible segments)
  469. ///
  470. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  471. /// range it is much better to create one from another rather then indexing the same
  472. /// range twice.
  473. ///
  474. /// \note \ref rule() flags are not copied
  475. segment_index(const boundary_point_index<base_iterator>&);
  476. /// Copy an index from a \ref boundary_point_index. It copies all indexing information
  477. /// and uses the default rule (all possible segments)
  478. ///
  479. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  480. /// range it is much better to create one from another rather then indexing the same
  481. /// range twice.
  482. ///
  483. /// \note \ref rule() flags are not copied
  484. segment_index& operator=(const boundary_point_index<base_iterator>&);
  485. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  486. /// in range [begin,end) for locale \a loc.
  487. ///
  488. /// \note \ref rule() and \ref full_select() remain unchanged.
  489. void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
  490. {
  491. map_ = mapping_type(type, begin, end, loc);
  492. }
  493. /// Get the \ref iterator on the beginning of the segments range.
  494. ///
  495. /// Preconditions: the segment_index should have a mapping
  496. ///
  497. /// \note
  498. ///
  499. /// The returned iterator is invalidated by access to any non-const member functions of this object
  500. iterator begin() const
  501. {
  502. return iterator(true, &map_, mask_, full_select_);
  503. }
  504. /// Get the \ref iterator on the ending of the segments range.
  505. ///
  506. /// Preconditions: the segment_index should have a mapping
  507. ///
  508. /// The returned iterator is invalidated by access to any non-const member functions of this object
  509. iterator end() const
  510. {
  511. return iterator(false, &map_, mask_, full_select_);
  512. }
  513. /// Find a first valid segment following a position \a p.
  514. ///
  515. /// If \a p is inside a valid segment this segment is selected:
  516. ///
  517. /// For example: For \ref word %boundary analysis with \ref word_any rule():
  518. ///
  519. /// - "to| be or ", would point to "be",
  520. /// - "t|o be or ", would point to "to",
  521. /// - "to be or| ", would point to end.
  522. ///
  523. ///
  524. /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator
  525. /// to the text in the mapped range.
  526. ///
  527. /// The returned iterator is invalidated by access to any non-const member functions of this object
  528. iterator find(base_iterator p) const
  529. {
  530. return iterator(p, &map_, mask_, full_select_);
  531. }
  532. /// Get the mask of rules that are used
  533. rule_type rule() const
  534. {
  535. return mask_;
  536. }
  537. /// Set the mask of rules that are used
  538. void rule(rule_type v)
  539. {
  540. mask_ = v;
  541. }
  542. /// Get the full_select property value - should segment include in the range
  543. /// values that not belong to specific \ref rule() or not.
  544. ///
  545. /// The default value is false.
  546. ///
  547. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  548. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  549. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  550. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  551. /// following part "are you?"
  552. bool full_select() const
  553. {
  554. return full_select_;
  555. }
  556. /// Set the full_select property value - should segment include in the range
  557. /// values that not belong to specific \ref rule() or not.
  558. ///
  559. /// The default value is false.
  560. ///
  561. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  562. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  563. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  564. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  565. /// following part "are you?"
  566. void full_select(bool v)
  567. {
  568. full_select_ = v;
  569. }
  570. private:
  571. friend class boundary_point_index<base_iterator>;
  572. typedef detail::mapping<base_iterator> mapping_type;
  573. mapping_type map_;
  574. rule_type mask_;
  575. bool full_select_;
  576. };
  577. /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating
  578. /// over them.
  579. ///
  580. /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  581. /// to the \ref boundary_point objects.
  582. ///
  583. /// It provides an option that affects selecting %boundary points according to different rules:
  584. /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific
  585. /// types of %boundary points like \ref sentence_term.
  586. ///
  587. /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default
  588. /// rule is used the %boundary points would be:
  589. ///
  590. /// - "|Hello! How\nare you?"
  591. /// - "Hello! |How\nare you?"
  592. /// - "Hello! How\n|are you?"
  593. /// - "Hello! How\nare you?|"
  594. ///
  595. /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be:
  596. ///
  597. /// - "|Hello! How\nare you?"
  598. /// - "Hello! |How\nare you?"
  599. /// - "Hello! How\nare you?|"
  600. ///
  601. /// Such that a %boundary point defined by a line feed character would be ignored.
  602. ///
  603. /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member
  604. /// function.
  605. ///
  606. /// \note
  607. /// - Even an empty text range [x,x) considered to have a one %boundary point x.
  608. /// - \a a and \a b points of the range [a,b) are always considered %boundary points
  609. /// regardless the rules used.
  610. /// - Changing the \ref rule() option, or of course re-indexing the text,
  611. /// invalidates existing iterators and they can't be used any more.
  612. /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with
  613. /// same \ref boundary_type. This is very fast operation %as they shared same index
  614. /// and it does not require its regeneration.
  615. ///
  616. /// \see
  617. ///
  618. /// - \ref segment_index
  619. /// - \ref boundary_point
  620. /// - \ref segment
  621. template<typename BaseIterator>
  622. class boundary_point_index {
  623. public:
  624. /// The type of the iterator used to iterate over the original text
  625. typedef BaseIterator base_iterator;
  626. #ifdef BOOST_LOCALE_DOXYGEN
  627. /// The bidirectional iterator that iterates over \ref value_type objects.
  628. ///
  629. /// - The iterators may be invalidated by use of any non-const member function
  630. /// including but not limited to \ref rule(rule_type) member function.
  631. /// - The returned value_type object is valid %as long %as iterator points to it.
  632. /// So this following code is wrong %as t used after p was updated:
  633. /// \code
  634. /// boundary_point_index<some_iterator>::iterator p=index.begin();
  635. /// boundary_point<some_iterator> &t = *p;
  636. /// ++p;
  637. /// rule_type r = t->rule();
  638. /// \endcode
  639. ///
  640. typedef unspecified_iterator_type iterator;
  641. /// \copydoc iterator
  642. typedef unspecified_iterator_type const_iterator;
  643. #else
  644. typedef detail::boundary_point_index_iterator<base_iterator> iterator;
  645. typedef detail::boundary_point_index_iterator<base_iterator> const_iterator;
  646. #endif
  647. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  648. /// an object that represents the selected \ref boundary_point "boundary point".
  649. typedef boundary_point<base_iterator> value_type;
  650. /// Default constructor.
  651. ///
  652. /// \note
  653. ///
  654. /// When this object is constructed by default it does not include a valid index, thus
  655. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  656. /// behavior
  657. boundary_point_index() : mask_(0xFFFFFFFFu) {}
  658. /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  659. /// in range [begin,end) using a rule \a mask for locale \a loc.
  660. boundary_point_index(boundary_type type,
  661. base_iterator begin,
  662. base_iterator end,
  663. rule_type mask,
  664. const std::locale& loc = std::locale()) :
  665. map_(type, begin, end, loc),
  666. mask_(mask)
  667. {}
  668. /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  669. /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc.
  670. boundary_point_index(boundary_type type,
  671. base_iterator begin,
  672. base_iterator end,
  673. const std::locale& loc = std::locale()) :
  674. map_(type, begin, end, loc),
  675. mask_(0xFFFFFFFFu)
  676. {}
  677. /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information
  678. /// and uses the default rule (all possible %boundary points)
  679. ///
  680. /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
  681. /// range it is much better to create one from another rather than indexing the same
  682. /// range twice.
  683. ///
  684. /// \note \ref rule() flags are not copied
  685. boundary_point_index(const segment_index<base_iterator>& other);
  686. /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information
  687. /// and keeps the current \ref rule() unchanged
  688. ///
  689. /// This operation is very cheap, so if you use boundary_point_index and segment_index on the same text
  690. /// range it is much better to create one from another rather than indexing the same
  691. /// range twice.
  692. ///
  693. /// \note \ref rule() flags are not copied
  694. boundary_point_index& operator=(const segment_index<base_iterator>& other);
  695. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  696. /// in range [begin,end) for locale \a loc.
  697. ///
  698. /// \note \ref rule() remains unchanged.
  699. void map(boundary_type type, base_iterator begin, base_iterator end, const std::locale& loc = std::locale())
  700. {
  701. map_ = mapping_type(type, begin, end, loc);
  702. }
  703. /// Get the \ref iterator on the beginning of the %boundary points range.
  704. ///
  705. /// Preconditions: this boundary_point_index should have a mapping
  706. ///
  707. /// \note
  708. ///
  709. /// The returned iterator is invalidated by access to any non-const member functions of this object
  710. iterator begin() const
  711. {
  712. return iterator(true, &map_, mask_);
  713. }
  714. /// Get the \ref iterator on the ending of the %boundary points range.
  715. ///
  716. /// Preconditions: this boundary_point_index should have a mapping
  717. ///
  718. /// \note
  719. ///
  720. /// The returned iterator is invalidated by access to any non-const member functions of this object
  721. iterator end() const
  722. {
  723. return iterator(false, &map_, mask_);
  724. }
  725. /// Find a first valid %boundary point on a position \a p or following it.
  726. ///
  727. /// For example: For \ref word %boundary analysis of the text "to be or"
  728. ///
  729. /// - "|to be", would return %boundary point at "|to be",
  730. /// - "t|o be", would point to "to| be"
  731. ///
  732. /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator
  733. /// to the text in the mapped range.
  734. ///
  735. /// The returned iterator is invalidated by access to any non-const member functions of this object
  736. iterator find(base_iterator p) const
  737. {
  738. return iterator(p, &map_, mask_);
  739. }
  740. /// Get the mask of rules that are used
  741. rule_type rule() const
  742. {
  743. return mask_;
  744. }
  745. /// Set the mask of rules that are used
  746. void rule(rule_type v)
  747. {
  748. mask_ = v;
  749. }
  750. private:
  751. friend class segment_index<base_iterator>;
  752. typedef detail::mapping<base_iterator> mapping_type;
  753. mapping_type map_;
  754. rule_type mask_;
  755. };
  756. /// \cond INTERNAL
  757. template<typename BaseIterator>
  758. segment_index<BaseIterator>::segment_index(const boundary_point_index<BaseIterator>& other) :
  759. map_(other.map_), mask_(0xFFFFFFFFu), full_select_(false)
  760. {}
  761. template<typename BaseIterator>
  762. boundary_point_index<BaseIterator>::boundary_point_index(const segment_index<BaseIterator>& other) :
  763. map_(other.map_), mask_(0xFFFFFFFFu)
  764. {}
  765. template<typename BaseIterator>
  766. segment_index<BaseIterator>& segment_index<BaseIterator>::operator=(const boundary_point_index<BaseIterator>& other)
  767. {
  768. map_ = other.map_;
  769. return *this;
  770. }
  771. template<typename BaseIterator>
  772. boundary_point_index<BaseIterator>&
  773. boundary_point_index<BaseIterator>::operator=(const segment_index<BaseIterator>& other)
  774. {
  775. map_ = other.map_;
  776. return *this;
  777. }
  778. /// \endcond
  779. typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef
  780. typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef
  781. #ifndef BOOST_LOCALE_NO_CXX20_STRING8
  782. typedef segment_index<std::u8string::const_iterator> u8ssegment_index; ///< convenience typedef
  783. #endif
  784. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  785. typedef segment_index<std::u16string::const_iterator> u16ssegment_index; ///< convenience typedef
  786. #endif
  787. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  788. typedef segment_index<std::u32string::const_iterator> u32ssegment_index; ///< convenience typedef
  789. #endif
  790. typedef segment_index<const char*> csegment_index; ///< convenience typedef
  791. typedef segment_index<const wchar_t*> wcsegment_index; ///< convenience typedef
  792. #ifdef __cpp_char8_t
  793. typedef segment_index<const char8_t*> u8csegment_index; ///< convenience typedef
  794. #endif
  795. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  796. typedef segment_index<const char16_t*> u16csegment_index; ///< convenience typedef
  797. #endif
  798. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  799. typedef segment_index<const char32_t*> u32csegment_index; ///< convenience typedef
  800. #endif
  801. typedef boundary_point_index<std::string::const_iterator> sboundary_point_index; ///< convenience typedef
  802. typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index; ///< convenience typedef
  803. #ifndef BOOST_LOCALE_NO_CXX20_STRING8
  804. typedef boundary_point_index<std::u8string::const_iterator> u8sboundary_point_index; ///< convenience typedef
  805. #endif
  806. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  807. typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index; ///< convenience typedef
  808. #endif
  809. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  810. typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index; ///< convenience typedef
  811. #endif
  812. typedef boundary_point_index<const char*> cboundary_point_index; ///< convenience typedef
  813. typedef boundary_point_index<const wchar_t*> wcboundary_point_index; ///< convenience typedef
  814. #ifdef __cpp_char8_t
  815. typedef boundary_point_index<const char8_t*> u8cboundary_point_index; ///< convenience typedef
  816. #endif
  817. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  818. typedef boundary_point_index<const char16_t*> u16cboundary_point_index; ///< convenience typedef
  819. #endif
  820. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  821. typedef boundary_point_index<const char32_t*> u32cboundary_point_index; ///< convenience typedef
  822. #endif
  823. }}} // namespace boost::locale::boundary
  824. ///
  825. /// \example boundary.cpp
  826. /// Example of using segment_index
  827. /// \example wboundary.cpp
  828. /// Example of using segment_index over wide strings
  829. ///
  830. #ifdef BOOST_MSVC
  831. # pragma warning(pop)
  832. #endif
  833. #endif