token_functions.hpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653
  1. // Boost token_functions.hpp ------------------------------------------------//
  2. // Copyright John R. Bandela 2001.
  3. // Distributed under the Boost Software License, Version 1.0. (See
  4. // accompanying file LICENSE_1_0.txt or copy at
  5. // http://www.boost.org/LICENSE_1_0.txt)
  6. // See http://www.boost.org/libs/tokenizer/ for documentation.
  7. // Revision History:
  8. // 01 Oct 2004 Joaquin M Lopez Munoz
  9. // Workaround for a problem with string::assign in msvc-stlport
  10. // 06 Apr 2004 John Bandela
  11. // Fixed a bug involving using char_delimiter with a true input iterator
  12. // 28 Nov 2003 Robert Zeh and John Bandela
  13. // Converted into "fast" functions that avoid using += when
  14. // the supplied iterator isn't an input_iterator; based on
  15. // some work done at Archelon and a version that was checked into
  16. // the boost CVS for a short period of time.
  17. // 20 Feb 2002 John Maddock
  18. // Removed using namespace std declarations and added
  19. // workaround for BOOST_NO_STDC_NAMESPACE (the library
  20. // can be safely mixed with regex).
  21. // 06 Feb 2002 Jeremy Siek
  22. // Added char_separator.
  23. // 02 Feb 2002 Jeremy Siek
  24. // Removed tabs and a little cleanup.
  25. #ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
  26. #define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
  27. #include <vector>
  28. #include <stdexcept>
  29. #include <string>
  30. #include <cctype>
  31. #include <algorithm> // for find_if
  32. #include <boost/config.hpp>
  33. #include <boost/assert.hpp>
  34. #include <boost/type_traits/conditional.hpp>
  35. #include <boost/type_traits/is_pointer.hpp>
  36. #include <boost/detail/workaround.hpp>
  37. #include <boost/throw_exception.hpp>
  38. #if !defined(BOOST_NO_CWCTYPE)
  39. #include <cwctype>
  40. #endif
  41. //
  42. // the following must not be macros if we are to prefix them
  43. // with std:: (they shouldn't be macros anyway...)
  44. //
  45. #ifdef ispunct
  46. # undef ispunct
  47. #endif
  48. #ifdef iswpunct
  49. # undef iswpunct
  50. #endif
  51. #ifdef isspace
  52. # undef isspace
  53. #endif
  54. #ifdef iswspace
  55. # undef iswspace
  56. #endif
  57. //
  58. // fix namespace problems:
  59. //
  60. #ifdef BOOST_NO_STDC_NAMESPACE
  61. namespace std{
  62. using ::ispunct;
  63. using ::isspace;
  64. #if !defined(BOOST_NO_CWCTYPE)
  65. using ::iswpunct;
  66. using ::iswspace;
  67. #endif
  68. }
  69. #endif
  70. namespace boost{
  71. //===========================================================================
  72. // The escaped_list_separator class. Which is a model of TokenizerFunction
  // An escaped list is a super-set of what is commonly known as a comma
  // separated value (csv) list. It is separated into fields by a comma or
  // other character. If the delimiting character is inside quotes, then it is
  // counted as a regular character. To allow for embedded quotes in a field,
  // there can be escape sequences using the \ much like C.
  78. // The role of the comma, the quotation mark, and the escape
  79. // character (backslash \), can be assigned to other characters.
  // Exception type raised by escaped_list_separator when it meets a
  // malformed escape sequence. The message is forwarded unchanged to
  // std::runtime_error, so it is available through what().
  struct escaped_list_error : public std::runtime_error
  {
    escaped_list_error(const std::string& what_arg)
      : std::runtime_error(what_arg)
    {
    }
  };
  83. // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  84. // MSVC does not like the following typename
  85. template <class Char,
  86. class Traits = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
  87. class escaped_list_separator {
  88. private:
  89. typedef std::basic_string<Char,Traits> string_type;
  90. struct char_eq {
  91. Char e_;
  92. char_eq(Char e):e_(e) { }
  93. bool operator()(Char c) {
  94. return Traits::eq(e_,c);
  95. }
  96. };
  97. string_type escape_;
  98. string_type c_;
  99. string_type quote_;
  100. bool last_;
  101. bool is_escape(Char e) {
  102. char_eq f(e);
  103. return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
  104. }
  105. bool is_c(Char e) {
  106. char_eq f(e);
  107. return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
  108. }
  109. bool is_quote(Char e) {
  110. char_eq f(e);
  111. return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
  112. }
  113. template <typename iterator, typename Token>
  114. void do_escape(iterator& next,iterator end,Token& tok) {
  115. if (++next == end)
  116. BOOST_THROW_EXCEPTION(escaped_list_error(std::string("cannot end with escape")));
  117. if (Traits::eq(*next,'n')) {
  118. tok+='\n';
  119. return;
  120. }
  121. else if (is_quote(*next)) {
  122. tok+=*next;
  123. return;
  124. }
  125. else if (is_c(*next)) {
  126. tok+=*next;
  127. return;
  128. }
  129. else if (is_escape(*next)) {
  130. tok+=*next;
  131. return;
  132. }
  133. else
  134. BOOST_THROW_EXCEPTION(escaped_list_error(std::string("unknown escape sequence")));
  135. }
  136. public:
  137. explicit escaped_list_separator(Char e = '\\',
  138. Char c = ',',Char q = '\"')
  139. : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
  140. escaped_list_separator(string_type e, string_type c, string_type q)
  141. : escape_(e), c_(c), quote_(q), last_(false) { }
  142. void reset() {last_=false;}
  143. template <typename InputIterator, typename Token>
  144. bool operator()(InputIterator& next,InputIterator end,Token& tok) {
  145. bool bInQuote = false;
  146. tok = Token();
  147. if (next == end) {
  148. if (last_) {
  149. last_ = false;
  150. return true;
  151. }
  152. else
  153. return false;
  154. }
  155. last_ = false;
  156. for (;next != end;++next) {
  157. if (is_escape(*next)) {
  158. do_escape(next,end,tok);
  159. }
  160. else if (is_c(*next)) {
  161. if (!bInQuote) {
  162. // If we are not in quote, then we are done
  163. ++next;
  164. // The last character was a c, that means there is
  165. // 1 more blank field
  166. last_ = true;
  167. return true;
  168. }
  169. else tok+=*next;
  170. }
  171. else if (is_quote(*next)) {
  172. bInQuote=!bInQuote;
  173. }
  174. else {
  175. tok += *next;
  176. }
  177. }
  178. return true;
  179. }
  180. };
  181. //===========================================================================
  182. // The classes here are used by offset_separator and char_separator to implement
  183. // faster assigning of tokens using assign instead of +=
  184. namespace tokenizer_detail {
  185. //===========================================================================
  186. // Tokenizer was broken for wide character separators, at least on Windows, since
  187. // CRT functions isspace etc only expect values in [0, 0xFF]. Debug build asserts
  188. // if higher values are passed in. The traits extension class should take care of this.
  189. // Assuming that the conditional will always get optimized out in the function
  190. // implementations, argument types are not a problem since both forms of character classifiers
  191. // expect an int.
  192. #if !defined(BOOST_NO_CWCTYPE)
  // Adds isspace()/ispunct() classifiers to a character traits class.
  // N is expected to be sizeof(char_type). The primary template serves
  // multi-byte character types and forwards to the wide-character
  // classifiers from <cwctype>.
  template<typename traits, int N>
  struct traits_extension_details : public traits {
    typedef typename traits::char_type char_type;
    static bool isspace(char_type c)
    {
      return std::iswspace(c) != 0;
    }
    static bool ispunct(char_type c)
    {
      return std::iswpunct(c) != 0;
    }
  };

  // Specialization for one-byte character types, forwarding to the narrow
  // classifiers from <cctype>.
  template<typename traits>
  struct traits_extension_details<traits, 1> : public traits {
    typedef typename traits::char_type char_type;
    static bool isspace(char_type c)
    {
      // The <cctype> classifiers require a value representable as
      // unsigned char (or EOF). Where char is signed, bytes >= 0x80 are
      // negative, so convert first instead of passing them through as-is.
      return std::isspace(static_cast<unsigned char>(c)) != 0;
    }
    static bool ispunct(char_type c)
    {
      // Same unsigned char conversion as in isspace() above.
      return std::ispunct(static_cast<unsigned char>(c)) != 0;
    }
  };
  217. #endif
  218. // In case there is no cwctype header, we implement the checks manually.
  219. // We make use of the fact that the tested categories should fit in ASCII.
  220. template<typename traits>
  221. struct traits_extension : public traits {
  222. typedef typename traits::char_type char_type;
  223. static bool isspace(char_type c)
  224. {
  225. #if !defined(BOOST_NO_CWCTYPE)
  226. return traits_extension_details<traits, sizeof(char_type)>::isspace(c);
  227. #else
  228. return static_cast< unsigned >(c) <= 255 && std::isspace(c) != 0;
  229. #endif
  230. }
  231. static bool ispunct(char_type c)
  232. {
  233. #if !defined(BOOST_NO_CWCTYPE)
  234. return traits_extension_details<traits, sizeof(char_type)>::ispunct(c);
  235. #else
  236. return static_cast< unsigned >(c) <= 255 && std::ispunct(c) != 0;
  237. #endif
  238. }
  239. };
  240. // The assign_or_plus_equal struct contains functions that implement
  241. // assign, +=, and clearing based on the iterator type. The
  242. // generic case does nothing for plus_equal and clearing, while
  243. // passing through the call for assign.
  244. //
  245. // When an input iterator is being used, the situation is reversed.
  246. // The assign method does nothing, plus_equal invokes operator +=,
  247. // and the clearing method sets the supplied token to the default
  248. // token constructor's result.
  249. //
  // Strategy used when the iterator category is anything other than
  // std::input_iterator_tag: the token is produced by a single range
  // assign(), so per-character appends and the initial clear are no-ops.
  template<class Category>
  struct assign_or_plus_equal {

    // Replace the token's contents with the range [first, last).
    template<class Iter, class Tok>
    static void assign(Iter first, Iter last, Tok &token) {
      token.assign(first, last);
    }

    // Nothing to do: the token is filled by assign().
    template<class Tok, class Elem>
    static void plus_equal(Tok &, const Elem &) { }

    // Nothing to do: assign() overwrites whatever the token held.
    template<class Tok>
    static void clear(Tok &) { }
  };

  // Strategy for input iterators, where the range cannot be revisited:
  // the token is reset up front and then grown one element at a time,
  // while assign() does nothing.
  template <>
  struct assign_or_plus_equal<std::input_iterator_tag> {

    // Nothing to do: the token was already built by plus_equal().
    template<class Iter, class Tok>
    static void assign(Iter , Iter , Tok &) { }

    // Append one element to the token.
    template<class Tok, class Elem>
    static void plus_equal(Tok &token, const Elem &element) {
      token += element;
    }

    // Reset the token to a default-constructed value.
    template<class Tok>
    static void clear(Tok &token) {
      token = Tok();
    }
  };
  // Iterator category reported for raw pointers: always random access,
  // whatever the pointee type is.
  template<class Pointer>
  struct pointer_iterator_category
  {
    typedef std::random_access_iterator_tag type;
  };
  // Iterator category of a class-type iterator, read from its nested
  // iterator_category typedef.
  template<class Iter>
  struct class_iterator_category
  {
    typedef typename Iter::iterator_category type;
  };
  285. // This portably gets the iterator_tag without partial template specialization
  286. template<class Iterator>
  287. struct get_iterator_category{
  288. typedef typename conditional<is_pointer<Iterator>::value,
  289. pointer_iterator_category<Iterator>,
  290. class_iterator_category<Iterator>
  291. >::type cat;
  292. typedef typename cat::type iterator_category;
  293. };
  294. } // namespace tokenizer_detail
  295. //===========================================================================
  296. // The offset_separator class, which is a model of TokenizerFunction.
  297. // Offset breaks a string into tokens based on a range of offsets
  298. class offset_separator {
  299. private:
  300. std::vector<int> offsets_;
  301. unsigned int current_offset_;
  302. bool wrap_offsets_;
  303. bool return_partial_last_;
  304. public:
  305. template <typename Iter>
  306. offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
  307. bool return_partial_last = true)
  308. : offsets_(begin,end), current_offset_(0),
  309. wrap_offsets_(wrap_offsets),
  310. return_partial_last_(return_partial_last) { }
  311. offset_separator()
  312. : offsets_(1,1), current_offset_(),
  313. wrap_offsets_(true), return_partial_last_(true) { }
  314. void reset() {
  315. current_offset_ = 0;
  316. }
  317. template <typename InputIterator, typename Token>
  318. bool operator()(InputIterator& next, InputIterator end, Token& tok)
  319. {
  320. typedef tokenizer_detail::assign_or_plus_equal<
  321. BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
  322. InputIterator
  323. >::iterator_category
  324. > assigner;
  325. BOOST_ASSERT(!offsets_.empty());
  326. assigner::clear(tok);
  327. InputIterator start(next);
  328. if (next == end)
  329. return false;
  330. if (current_offset_ == offsets_.size())
  331. {
  332. if (wrap_offsets_)
  333. current_offset_=0;
  334. else
  335. return false;
  336. }
  337. int c = offsets_[current_offset_];
  338. int i = 0;
  339. for (; i < c; ++i) {
  340. if (next == end)break;
  341. assigner::plus_equal(tok,*next++);
  342. }
  343. assigner::assign(start,next,tok);
  344. if (!return_partial_last_)
  345. if (i < (c-1) )
  346. return false;
  347. ++current_offset_;
  348. return true;
  349. }
  350. };
  351. //===========================================================================
  352. // The char_separator class breaks a sequence of characters into
  353. // tokens based on the character delimiters (very much like bad old
  354. // strtok). A delimiter character can either be kept or dropped. A
  355. // kept delimiter shows up as an output token, whereas a dropped
  356. // delimiter does not.
  357. // This class replaces the char_delimiters_separator class. The
  358. // constructor for the char_delimiters_separator class was too
  359. // confusing and needed to be deprecated. However, because of the
  360. // default arguments to the constructor, adding the new constructor
  361. // would cause ambiguity, so instead I deprecated the whole class.
  362. // The implementation of the class was also simplified considerably.
  // Selects whether char_separator reports empty tokens or skips them.
  enum empty_token_policy
  {
    drop_empty_tokens,
    keep_empty_tokens
  };
  364. // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  365. template <typename Char,
  366. typename Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
  367. class char_separator
  368. {
  369. typedef tokenizer_detail::traits_extension<Tr> Traits;
  370. typedef std::basic_string<Char,Tr> string_type;
  371. public:
  372. explicit
  373. char_separator(const Char* dropped_delims,
  374. const Char* kept_delims = 0,
  375. empty_token_policy empty_tokens = drop_empty_tokens)
  376. : m_dropped_delims(dropped_delims),
  377. m_use_ispunct(false),
  378. m_use_isspace(false),
  379. m_empty_tokens(empty_tokens),
  380. m_output_done(false)
  381. {
  382. // Borland workaround
  383. if (kept_delims)
  384. m_kept_delims = kept_delims;
  385. }
  386. // use ispunct() for kept delimiters and isspace for dropped.
  387. explicit
  388. char_separator()
  389. : m_use_ispunct(true),
  390. m_use_isspace(true),
  391. m_empty_tokens(drop_empty_tokens),
  392. m_output_done(false) { }
  393. void reset() { }
  394. template <typename InputIterator, typename Token>
  395. bool operator()(InputIterator& next, InputIterator end, Token& tok)
  396. {
  397. typedef tokenizer_detail::assign_or_plus_equal<
  398. BOOST_DEDUCED_TYPENAME tokenizer_detail::get_iterator_category<
  399. InputIterator
  400. >::iterator_category
  401. > assigner;
  402. assigner::clear(tok);
  403. // skip past all dropped_delims
  404. if (m_empty_tokens == drop_empty_tokens)
  405. for (; next != end && is_dropped(*next); ++next)
  406. { }
  407. InputIterator start(next);
  408. if (m_empty_tokens == drop_empty_tokens) {
  409. if (next == end)
  410. return false;
  411. // if we are on a kept_delims move past it and stop
  412. if (is_kept(*next)) {
  413. assigner::plus_equal(tok,*next);
  414. ++next;
  415. } else
  416. // append all the non delim characters
  417. for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
  418. assigner::plus_equal(tok,*next);
  419. }
  420. else { // m_empty_tokens == keep_empty_tokens
  421. // Handle empty token at the end
  422. if (next == end)
  423. {
  424. if (m_output_done == false)
  425. {
  426. m_output_done = true;
  427. assigner::assign(start,next,tok);
  428. return true;
  429. }
  430. else
  431. return false;
  432. }
  433. if (is_kept(*next)) {
  434. if (m_output_done == false)
  435. m_output_done = true;
  436. else {
  437. assigner::plus_equal(tok,*next);
  438. ++next;
  439. m_output_done = false;
  440. }
  441. }
  442. else if (m_output_done == false && is_dropped(*next)) {
  443. m_output_done = true;
  444. }
  445. else {
  446. if (is_dropped(*next))
  447. start=++next;
  448. for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
  449. assigner::plus_equal(tok,*next);
  450. m_output_done = true;
  451. }
  452. }
  453. assigner::assign(start,next,tok);
  454. return true;
  455. }
  456. private:
  457. string_type m_kept_delims;
  458. string_type m_dropped_delims;
  459. bool m_use_ispunct;
  460. bool m_use_isspace;
  461. empty_token_policy m_empty_tokens;
  462. bool m_output_done;
  463. bool is_kept(Char E) const
  464. {
  465. if (m_kept_delims.length())
  466. return m_kept_delims.find(E) != string_type::npos;
  467. else if (m_use_ispunct) {
  468. return Traits::ispunct(E) != 0;
  469. } else
  470. return false;
  471. }
  472. bool is_dropped(Char E) const
  473. {
  474. if (m_dropped_delims.length())
  475. return m_dropped_delims.find(E) != string_type::npos;
  476. else if (m_use_isspace) {
  477. return Traits::isspace(E) != 0;
  478. } else
  479. return false;
  480. }
  481. };
  482. //===========================================================================
  // The following class is DEPRECATED, use class char_separator instead.
  484. //
  485. // The char_delimiters_separator class, which is a model of
  486. // TokenizerFunction. char_delimiters_separator breaks a string
  487. // into tokens based on character delimiters. There are 2 types of
  488. // delimiters. returnable delimiters can be returned as
  489. // tokens. These are often punctuation. nonreturnable delimiters
  490. // cannot be returned as tokens. These are often whitespace
  491. // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
  492. template <class Char,
  493. class Tr = BOOST_DEDUCED_TYPENAME std::basic_string<Char>::traits_type >
  494. class char_delimiters_separator {
  495. private:
  496. typedef tokenizer_detail::traits_extension<Tr> Traits;
  497. typedef std::basic_string<Char,Tr> string_type;
  498. string_type returnable_;
  499. string_type nonreturnable_;
  500. bool return_delims_;
  501. bool no_ispunct_;
  502. bool no_isspace_;
  503. bool is_ret(Char E)const
  504. {
  505. if (returnable_.length())
  506. return returnable_.find(E) != string_type::npos;
  507. else{
  508. if (no_ispunct_) {return false;}
  509. else{
  510. int r = Traits::ispunct(E);
  511. return r != 0;
  512. }
  513. }
  514. }
  515. bool is_nonret(Char E)const
  516. {
  517. if (nonreturnable_.length())
  518. return nonreturnable_.find(E) != string_type::npos;
  519. else{
  520. if (no_isspace_) {return false;}
  521. else{
  522. int r = Traits::isspace(E);
  523. return r != 0;
  524. }
  525. }
  526. }
  527. public:
  528. explicit char_delimiters_separator(bool return_delims = false,
  529. const Char* returnable = 0,
  530. const Char* nonreturnable = 0)
  531. : returnable_(returnable ? returnable : string_type().c_str()),
  532. nonreturnable_(nonreturnable ? nonreturnable:string_type().c_str()),
  533. return_delims_(return_delims), no_ispunct_(returnable!=0),
  534. no_isspace_(nonreturnable!=0) { }
  535. void reset() { }
  536. public:
  537. template <typename InputIterator, typename Token>
  538. bool operator()(InputIterator& next, InputIterator end,Token& tok) {
  539. tok = Token();
  540. // skip past all nonreturnable delims
  541. // skip past the returnable only if we are not returning delims
  542. for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
  543. && !return_delims_ ) );++next) { }
  544. if (next == end) {
  545. return false;
  546. }
  547. // if we are to return delims and we are one a returnable one
  548. // move past it and stop
  549. if (is_ret(*next) && return_delims_) {
  550. tok+=*next;
  551. ++next;
  552. }
  553. else
  554. // append all the non delim characters
  555. for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
  556. tok+=*next;
  557. return true;
  558. }
  559. };
  560. } //namespace boost
  561. #endif