perl_matcher.hpp 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. /*
  2. *
  3. * Copyright (c) 2002
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. #ifndef BOOST_REGEX_MATCHER_HPP
  12. #define BOOST_REGEX_MATCHER_HPP
  13. #include <boost/regex/v4/iterator_category.hpp>
  14. #ifdef BOOST_MSVC
  15. #pragma warning(push)
  16. #pragma warning(disable: 4103)
  17. #endif
  18. #ifdef BOOST_HAS_ABI_HEADERS
  19. # include BOOST_ABI_PREFIX
  20. #endif
  21. #ifdef BOOST_MSVC
  22. #pragma warning(pop)
  23. #endif
  24. #ifdef BOOST_MSVC
  25. # pragma warning(push)
  26. #pragma warning(disable : 4251)
  27. #if BOOST_MSVC < 1700
  28. # pragma warning(disable : 4231)
  29. #endif
  30. # if BOOST_MSVC < 1600
  31. # pragma warning(disable : 4660)
  32. # endif
  33. #if BOOST_MSVC < 1910
  34. #pragma warning(disable:4800)
  35. #endif
  36. #endif
  37. namespace boost{
  38. namespace BOOST_REGEX_DETAIL_NS{
  39. //
  40. // error checking API:
  41. //
  42. inline void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type, match_flag_type mf)
  43. {
  44. //
  45. // can't mix match_extra with POSIX matching rules:
  46. //
  47. if ((mf & match_extra) && (mf & match_posix))
  48. {
  49. std::logic_error msg("Usage Error: Can't mix regular expression captures with POSIX matching rules");
  50. throw_exception(msg);
  51. }
  52. }
  53. //
  54. // function can_start:
  55. //
  // Generic start-map lookup.
  // Characters outside [0, 1 << CHAR_BIT) have no entry in 'map', so they
  // are always reported as possible starting characters; everything else
  // is looked up in 'map' and tested against 'mask'.
  template <class charT>
  inline bool can_start(charT c, const unsigned char* map, unsigned char mask)
  {
      if(c < static_cast<charT>(0))
          return true;
      if(c >= static_cast<charT>(1 << CHAR_BIT))
          return true;
      return (map[c] & mask) != 0;
  }
  // Start-map lookup for plain char: the value is reinterpreted as an
  // unsigned char so that it can index the map directly.
  inline bool can_start(char c, const unsigned char* map, unsigned char mask)
  {
      const unsigned char slot = static_cast<unsigned char>(c);
      return (map[slot] & mask) != 0;
  }
  // Start-map lookup for signed char: the value is reinterpreted as an
  // unsigned char so that it can index the map directly.
  inline bool can_start(signed char c, const unsigned char* map, unsigned char mask)
  {
      const unsigned char slot = static_cast<unsigned char>(c);
      return (map[slot] & mask) != 0;
  }
  // Start-map lookup for unsigned char: every value is a valid map index.
  inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask)
  {
      const unsigned char entry = map[c];
      return (entry & mask) != 0;
  }
  // Start-map lookup for unsigned short: values at or beyond 1 << CHAR_BIT
  // have no map entry and are always treated as possible starts.
  inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask)
  {
      if(c >= (1 << CHAR_BIT))
          return true;
      return (map[c] & mask) != 0;
  }
  77. #if !defined(__hpux) && !defined(__WINSCW__)// WCHAR_MIN not usable in pp-directives.
  78. #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T)
  // Start-map lookup for wchar_t (this overload is only compiled when
  // WCHAR_MIN == 0): values at or beyond 1 << CHAR_BIT have no map entry
  // and are always treated as possible starts.
  inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask)
  {
      if(c >= static_cast<wchar_t>(1u << CHAR_BIT))
          return true;
      return (map[c] & mask) != 0;
  }
  83. #endif
  84. #endif
  85. #if !defined(BOOST_NO_INTRINSIC_WCHAR_T)
  // Start-map lookup for unsigned int: values at or beyond 1 << CHAR_BIT
  // have no map entry and are always treated as possible starts.
  inline bool can_start(unsigned int c, const unsigned char* map, unsigned char mask)
  {
      if(c >= static_cast<unsigned int>(1u << CHAR_BIT))
          return true;
      return (map[c] & mask) != 0;
  }
  90. #endif
  //
  // Unfortunately the Rogue Wave standard library appears to have a bug
  // in std::basic_string::compare that results in erroneous answers
  // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version
  // 0x020101). The test case was:
  // {39135,0} < {0xff,0}
  // which succeeds when it should not.
  //
  99. #ifndef _RWSTD_VER
  // Three-way comparison of a std::basic_string against a null-terminated
  // character string. An empty 'p' is considered equal both to an empty
  // string and to a string holding exactly one null character; all other
  // cases defer to basic_string::compare.
  template <class C, class T, class A>
  inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
  {
      const bool pattern_is_empty = (0 == *p);
      if(pattern_is_empty)
      {
          if(s.empty())
              return 0;
          if((s.size() == 1) && (s[0] == 0))
              return 0;
      }
      return s.compare(p);
  }
  110. #else
  // Three-way comparison of a std::basic_string against a null-terminated
  // character string (variant compiled when _RWSTD_VER is defined).
  // An empty 'p' is considered equal both to an empty string and to a
  // string holding exactly one null character; all other cases defer to
  // basic_string::compare.
  template <class C, class T, class A>
  inline int string_compare(const std::basic_string<C,T,A>& s, const C* p)
  {
      const bool pattern_is_empty = (0 == *p);
      if(pattern_is_empty)
      {
          if(s.empty())
              return 0;
          if((s.size() == 1) && (s[0] == 0))
              return 0;
      }
      return s.compare(p);
  }
  // Narrow-string overload: compares through std::strcmp on the string's
  // C representation instead of basic_string::compare.
  inline int string_compare(const std::string& s, const char* p)
  {
      const char* const lhs = s.c_str();
      return std::strcmp(lhs, p);
  }
  123. # ifndef BOOST_NO_WREGEX
  // Wide-string overload: compares through std::wcscmp on the string's
  // C representation instead of basic_string::compare.
  inline int string_compare(const std::wstring& s, const wchar_t* p)
  {
      const wchar_t* const lhs = s.c_str();
      return std::wcscmp(lhs, p);
  }
  126. #endif
  127. #endif
  // Three-way comparison of an arbitrary sequence (anything with size()
  // and operator[]) against a null-terminated character string.
  // Returns 0 when equal, otherwise the difference between the first pair
  // of differing elements; when the whole of 's' matches a prefix of 'p'
  // the result is minus the next element of 'p'.
  template <class Seq, class C>
  inline int string_compare(const Seq& s, const C* p)
  {
      std::size_t pos = 0;
      // advance over the common prefix:
      for(; pos < s.size(); ++pos)
      {
          if(!(p[pos] == s[pos]))
              break;
      }
      if(pos == s.size())
          return -static_cast<int>(p[pos]);
      return static_cast<int>(s[pos]) - static_cast<int>(p[pos]);
  }
  // shorthand used by the set-matching code:
  # define STR_COMP(s,p) string_compare(s,p)
  // Returns a pointer to the element just after the first null terminator
  // found at or after 'p'.
  template<class charT>
  inline const charT* re_skip_past_null(const charT* p)
  {
      const charT terminator = static_cast<charT>(0);
      for(; *p != terminator; ++p)
      {
          // scanning for the terminator
      }
      return p + 1;
  }
  145. template <class iterator, class charT, class traits_type, class char_classT>
  146. iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
  147. iterator last,
  148. const re_set_long<char_classT>* set_,
  149. const regex_data<charT, traits_type>& e, bool icase)
  150. {
  151. const charT* p = reinterpret_cast<const charT*>(set_+1);
  152. iterator ptr;
  153. unsigned int i;
  154. //bool icase = e.m_flags & regex_constants::icase;
  155. if(next == last) return next;
  156. typedef typename traits_type::string_type traits_string_type;
  157. const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits);
  158. // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never
  159. // referenced
  160. (void)traits_inst;
  161. // try and match a single character, could be a multi-character
  162. // collating element...
  163. for(i = 0; i < set_->csingles; ++i)
  164. {
  165. ptr = next;
  166. if(*p == static_cast<charT>(0))
  167. {
  168. // treat null string as special case:
  169. if(traits_inst.translate(*ptr, icase))
  170. {
  171. ++p;
  172. continue;
  173. }
  174. return set_->isnot ? next : (ptr == next) ? ++next : ptr;
  175. }
  176. else
  177. {
  178. while(*p && (ptr != last))
  179. {
  180. if(traits_inst.translate(*ptr, icase) != *p)
  181. break;
  182. ++p;
  183. ++ptr;
  184. }
  185. if(*p == static_cast<charT>(0)) // if null we've matched
  186. return set_->isnot ? next : (ptr == next) ? ++next : ptr;
  187. p = re_skip_past_null(p); // skip null
  188. }
  189. }
  190. charT col = traits_inst.translate(*next, icase);
  191. if(set_->cranges || set_->cequivalents)
  192. {
  193. traits_string_type s1;
  194. //
  195. // try and match a range, NB only a single character can match
  196. if(set_->cranges)
  197. {
  198. if((e.m_flags & regex_constants::collate) == 0)
  199. s1.assign(1, col);
  200. else
  201. {
  202. charT a[2] = { col, charT(0), };
  203. s1 = traits_inst.transform(a, a + 1);
  204. }
  205. for(i = 0; i < set_->cranges; ++i)
  206. {
  207. if(STR_COMP(s1, p) >= 0)
  208. {
  209. do{ ++p; }while(*p);
  210. ++p;
  211. if(STR_COMP(s1, p) <= 0)
  212. return set_->isnot ? next : ++next;
  213. }
  214. else
  215. {
  216. // skip first string
  217. do{ ++p; }while(*p);
  218. ++p;
  219. }
  220. // skip second string
  221. do{ ++p; }while(*p);
  222. ++p;
  223. }
  224. }
  225. //
  226. // try and match an equivalence class, NB only a single character can match
  227. if(set_->cequivalents)
  228. {
  229. charT a[2] = { col, charT(0), };
  230. s1 = traits_inst.transform_primary(a, a +1);
  231. for(i = 0; i < set_->cequivalents; ++i)
  232. {
  233. if(STR_COMP(s1, p) == 0)
  234. return set_->isnot ? next : ++next;
  235. // skip string
  236. do{ ++p; }while(*p);
  237. ++p;
  238. }
  239. }
  240. }
  241. if(traits_inst.isctype(col, set_->cclasses) == true)
  242. return set_->isnot ? next : ++next;
  243. if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false))
  244. return set_->isnot ? next : ++next;
  245. return set_->isnot ? ++next : next;
  246. }
  //
  // repeater_count:
  // One entry in an intrusive, singly linked stack of repeat counters.
  // Constructing an entry pushes it onto the stack whose top pointer is
  // '*stack'; destroying it restores the previous top.
  //
  template <class BidiIterator>
  class repeater_count
  {
      repeater_count** stack;   // location of the stack's top-of-stack pointer
      repeater_count* next;     // the entry that was on top before this one
      int state_id;             // id given at construction, -1 for the base entry
      std::size_t count;        // the number of iterations so far
      BidiIterator start_pos;   // where the last repeat started

      // Walks down the stack from 'p' looking for an entry whose id is 'n'.
      // Returns that entry, or 0 when the bottom of the stack is reached or
      // an entry with id (-2 - current_recursion_id) is met first.
      repeater_count* unwind_until(int n, repeater_count* p, int current_recursion_id)
      {
          for(;;)
          {
              if(!p || (p->state_id == n))
                  return p;
              if(p->state_id == -2 - current_recursion_id)
                  return 0;
              p = p->next;
              if(p && (p->state_id < 0))
              {
                  p = unwind_until(p->state_id, p, current_recursion_id);
                  if(!p)
                      return 0;
                  p = p->next;
              }
          }
      }

  public:
      // Creates the base entry: it is not linked to anything and is not
      // pushed onto the stack.
      repeater_count(repeater_count** s)
          : stack(s), next(0), state_id(-1), count(0), start_pos() {}

      // Pushes a new entry with id 'i' onto the stack at '*s'.
      // The count restarts at zero unless an earlier entry with the same id
      // is found further down the stack, in which case its count and start
      // position are inherited.
      repeater_count(int i, repeater_count** s, BidiIterator start, int current_recursion_id)
          : stack(s), next(*s), state_id(i), start_pos(start)
      {
          *stack = this;
          count = 0;
          const bool fresh_start = (state_id > next->state_id) && (next->state_id >= 0);
          if(!fresh_start)
          {
              repeater_count* found = unwind_until(state_id, next, current_recursion_id);
              if(found)
              {
                  count = found->count;
                  start_pos = found->start_pos;
              }
          }
      }

      // Pops this entry; the base entry (next == 0) leaves the stack alone.
      ~repeater_count()
      {
          if(next)
              *stack = next;
      }

      std::size_t get_count() { return count; }
      int get_id() { return state_id; }
      std::size_t operator++() { return ++count; }

      // Called when a new repeat is about to start: if at least one repeat
      // has completed and it consumed nothing (pos == start_pos) the count
      // jumps to 'max' and true is returned; otherwise 'pos' is remembered
      // as the new start position and false is returned.
      bool check_null_repeat(const BidiIterator& pos, std::size_t max)
      {
          const bool was_null = (count != 0) && (pos == start_pos);
          if(was_null)
              count = max;
          else
              start_pos = pos;
          return was_null;
      }
  };
  // defined by the matcher implementation:
  struct saved_state;

  //
  // Identifiers for the different kinds of saved state.
  // The values are consecutive, and saved_state_count is the number of
  // identifiers that precede it.
  //
  enum saved_state_type
  {
      saved_type_end                     = 0,
      saved_type_paren                   = 1,
      saved_type_recurse                 = 2,
      saved_type_assertion               = 3,
      saved_state_alt                    = 4,
      saved_state_repeater_count         = 5,
      saved_state_extra_block            = 6,
      saved_state_greedy_single_repeat   = 7,
      saved_state_rep_slow_dot           = 8,
      saved_state_rep_fast_dot           = 9,
      saved_state_rep_char               = 10,
      saved_state_rep_short_set          = 11,
      saved_state_rep_long_set           = 12,
      saved_state_non_greedy_long_repeat = 13,
      saved_state_count                  = 14
  };
  336. #ifdef BOOST_MSVC
  337. # pragma warning(push)
  338. #if BOOST_MSVC >= 1800
  339. #pragma warning(disable:26495)
  340. #endif
  341. #endif
  342. template <class Results>
  343. struct recursion_info
  344. {
  345. typedef typename Results::value_type value_type;
  346. typedef typename value_type::iterator iterator;
  347. int idx;
  348. const re_syntax_base* preturn_address;
  349. Results results;
  350. repeater_count<iterator>* repeater_stack;
  351. iterator location_of_start;
  352. };
  353. #ifdef BOOST_MSVC
  354. # pragma warning(pop)
  355. #endif
  356. template <class BidiIterator, class Allocator, class traits>
  357. class perl_matcher
  358. {
  359. public:
  360. typedef typename traits::char_type char_type;
  361. typedef perl_matcher<BidiIterator, Allocator, traits> self_type;
  362. typedef bool (self_type::*matcher_proc_type)();
  363. typedef std::size_t traits_size_type;
  364. typedef typename is_byte<char_type>::width_type width_type;
  365. typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type;
  366. typedef match_results<BidiIterator, Allocator> results_type;
  367. perl_matcher(BidiIterator first, BidiIterator end,
  368. match_results<BidiIterator, Allocator>& what,
  369. const basic_regex<char_type, traits>& e,
  370. match_flag_type f,
  371. BidiIterator l_base)
  372. : m_result(what), base(first), last(end),
  373. position(first), backstop(l_base), re(e), traits_inst(e.get_traits()),
  374. m_independent(false), next_count(&rep_obj), rep_obj(&next_count)
  375. #ifdef BOOST_REGEX_NON_RECURSIVE
  376. , m_recursions(0)
  377. #endif
  378. {
  379. construct_init(e, f);
  380. }
  381. bool match();
  382. bool find();
  383. void setf(match_flag_type f)
  384. { m_match_flags |= f; }
  385. void unsetf(match_flag_type f)
  386. { m_match_flags &= ~f; }
  387. private:
  388. void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f);
  389. bool find_imp();
  390. bool match_imp();
  391. #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD
  392. typedef bool (perl_matcher::*protected_proc_type)();
  393. bool protected_call(protected_proc_type);
  394. #endif
  395. void estimate_max_state_count(std::random_access_iterator_tag*);
  396. void estimate_max_state_count(void*);
  397. bool match_prefix();
  398. bool match_all_states();
  399. // match procs, stored in s_match_vtable:
  400. bool match_startmark();
  401. bool match_endmark();
  402. bool match_literal();
  403. bool match_start_line();
  404. bool match_end_line();
  405. bool match_wild();
  406. bool match_match();
  407. bool match_word_boundary();
  408. bool match_within_word();
  409. bool match_word_start();
  410. bool match_word_end();
  411. bool match_buffer_start();
  412. bool match_buffer_end();
  413. bool match_backref();
  414. bool match_long_set();
  415. bool match_set();
  416. bool match_jump();
  417. bool match_alt();
  418. bool match_rep();
  419. bool match_combining();
  420. bool match_soft_buffer_end();
  421. bool match_restart_continue();
  422. bool match_long_set_repeat();
  423. bool match_set_repeat();
  424. bool match_char_repeat();
  425. bool match_dot_repeat_fast();
  426. bool match_dot_repeat_slow();
  427. bool match_dot_repeat_dispatch()
  428. {
  429. return ::boost::is_random_access_iterator<BidiIterator>::value ? match_dot_repeat_fast() : match_dot_repeat_slow();
  430. }
  431. bool match_backstep();
  432. bool match_assert_backref();
  433. bool match_toggle_case();
  434. #ifdef BOOST_REGEX_RECURSIVE
  435. bool backtrack_till_match(std::size_t count);
  436. #endif
  437. bool match_recursion();
  438. bool match_fail();
  439. bool match_accept();
  440. bool match_commit();
  441. bool match_then();
  442. bool skip_until_paren(int index, bool match = true);
  443. // find procs stored in s_find_vtable:
  444. bool find_restart_any();
  445. bool find_restart_word();
  446. bool find_restart_line();
  447. bool find_restart_buf();
  448. bool find_restart_lit();
  449. private:
  450. // final result structure to be filled in:
  451. match_results<BidiIterator, Allocator>& m_result;
  452. // temporary result for POSIX matches:
  453. scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match;
  454. // pointer to actual result structure to fill in:
  455. match_results<BidiIterator, Allocator>* m_presult;
  456. // start of sequence being searched:
  457. BidiIterator base;
  458. // end of sequence being searched:
  459. BidiIterator last;
  460. // current character being examined:
  461. BidiIterator position;
  462. // where to restart next search after failed match attempt:
  463. BidiIterator restart;
  464. // where the current search started from, acts as base for $` during grep:
  465. BidiIterator search_base;
  466. // how far we can go back when matching lookbehind:
  467. BidiIterator backstop;
  468. // the expression being examined:
  469. const basic_regex<char_type, traits>& re;
  470. // the expression's traits class:
  471. const ::boost::regex_traits_wrapper<traits>& traits_inst;
  472. // the next state in the machine being matched:
  473. const re_syntax_base* pstate;
  474. // matching flags in use:
  475. match_flag_type m_match_flags;
  476. // how many states we have examined so far:
  477. std::ptrdiff_t state_count;
  478. // max number of states to examine before giving up:
  479. std::ptrdiff_t max_state_count;
  480. // whether we should ignore case or not:
  481. bool icase;
  482. // set to true when (position == last), indicates that we may have a partial match:
  483. bool m_has_partial_match;
  484. // set to true whenever we get a match:
  485. bool m_has_found_match;
  486. // set to true whenever we're inside an independent sub-expression:
  487. bool m_independent;
  488. // the current repeat being examined:
  489. repeater_count<BidiIterator>* next_count;
  490. // the first repeat being examined (top of linked list):
  491. repeater_count<BidiIterator> rep_obj;
  492. // the mask to pass when matching word boundaries:
  493. typename traits::char_class_type m_word_mask;
  494. // the bitmask to use when determining whether a match_any matches a newline or not:
  495. unsigned char match_any_mask;
  496. // recursion information:
  497. std::vector<recursion_info<results_type> > recursion_stack;
  498. #ifdef BOOST_REGEX_RECURSIVE
  499. // Set to false by a (*COMMIT):
  500. bool m_can_backtrack;
  501. bool m_have_accept;
  502. bool m_have_then;
  503. #endif
  504. #ifdef BOOST_REGEX_NON_RECURSIVE
  505. //
  506. // additional members for non-recursive version:
  507. //
  508. typedef bool (self_type::*unwind_proc_type)(bool);
  509. void extend_stack();
  510. bool unwind(bool);
  511. bool unwind_end(bool);
  512. bool unwind_paren(bool);
  513. bool unwind_recursion_stopper(bool);
  514. bool unwind_assertion(bool);
  515. bool unwind_alt(bool);
  516. bool unwind_repeater_counter(bool);
  517. bool unwind_extra_block(bool);
  518. bool unwind_greedy_single_repeat(bool);
  519. bool unwind_slow_dot_repeat(bool);
  520. bool unwind_fast_dot_repeat(bool);
  521. bool unwind_char_repeat(bool);
  522. bool unwind_short_set_repeat(bool);
  523. bool unwind_long_set_repeat(bool);
  524. bool unwind_non_greedy_repeat(bool);
  525. bool unwind_recursion(bool);
  526. bool unwind_recursion_pop(bool);
  527. bool unwind_commit(bool);
  528. bool unwind_then(bool);
  529. bool unwind_case(bool);
  530. void destroy_single_repeat();
  531. void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
  532. void push_recursion_stopper();
  533. void push_assertion(const re_syntax_base* ps, bool positive);
  534. void push_alt(const re_syntax_base* ps);
  535. void push_repeater_count(int i, repeater_count<BidiIterator>** s);
  536. void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id);
  537. void push_non_greedy_repeat(const re_syntax_base* ps);
  538. void push_recursion(int idx, const re_syntax_base* p, results_type* presults, results_type* presults2);
  539. void push_recursion_pop();
  540. void push_case_change(bool);
  541. // pointer to base of stack:
  542. saved_state* m_stack_base;
  543. // pointer to current stack position:
  544. saved_state* m_backup_state;
  545. // how many memory blocks have we used up?:
  546. unsigned used_block_count;
  547. // determines what value to return when unwinding from recursion,
  548. // allows for mixed recursive/non-recursive algorithm:
  549. bool m_recursive_result;
  550. // We have unwound to a lookahead/lookbehind, used by COMMIT/PRUNE/SKIP:
  551. bool m_unwound_lookahead;
  552. // We have unwound to an alternative, used by THEN:
  553. bool m_unwound_alt;
  554. // We are unwinding a commit - used by independent subs to determine whether to stop there or carry on unwinding:
  555. //bool m_unwind_commit;
  556. // Recursion limit:
  557. unsigned m_recursions;
  558. #endif
  559. #ifdef BOOST_MSVC
  560. # pragma warning(push)
  561. #if BOOST_MSVC >= 1800
  562. #pragma warning(disable:26495)
  563. #endif
  564. #endif
  565. // these operations aren't allowed, so are declared private,
  566. // bodies are provided to keep explicit-instantiation requests happy:
  567. perl_matcher& operator=(const perl_matcher&)
  568. {
  569. return *this;
  570. }
  571. perl_matcher(const perl_matcher& that)
  572. : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {}
  573. #ifdef BOOST_MSVC
  574. # pragma warning(pop)
  575. #endif
  576. };
  577. } // namespace BOOST_REGEX_DETAIL_NS
  578. #ifdef BOOST_MSVC
  579. # pragma warning(pop)
  580. #endif
  581. #ifdef BOOST_MSVC
  582. #pragma warning(push)
  583. #pragma warning(disable: 4103)
  584. #endif
  585. #ifdef BOOST_HAS_ABI_HEADERS
  586. # include BOOST_ABI_SUFFIX
  587. #endif
  588. #ifdef BOOST_MSVC
  589. #pragma warning(pop)
  590. #endif
  591. } // namespace boost
  592. //
  593. // include the implementation of perl_matcher:
  594. //
  595. #ifdef BOOST_REGEX_RECURSIVE
  596. #include <boost/regex/v4/perl_matcher_recursive.hpp>
  597. #else
  598. #include <boost/regex/v4/perl_matcher_non_recursive.hpp>
  599. #endif
  600. // this one has to be last:
  601. #include <boost/regex/v4/perl_matcher_common.hpp>
  602. #endif