basic_regex_creator.hpp 53 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE basic_regex_creator.cpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Declares template class basic_regex_creator which fills in
  16. * the data members of a regex_data object.
  17. */
  18. #ifndef BOOST_REGEX_V5_BASIC_REGEX_CREATOR_HPP
  19. #define BOOST_REGEX_V5_BASIC_REGEX_CREATOR_HPP
  20. #ifdef BOOST_REGEX_MSVC
  21. # pragma warning(push)
  22. #pragma warning(disable:4459)
  23. #if BOOST_REGEX_MSVC < 1910
  24. #pragma warning(disable:4800)
  25. #endif
  26. #endif
  27. #include <set>
  28. namespace boost{
  29. namespace BOOST_REGEX_DETAIL_NS{
  30. template <class charT>
  31. struct digraph : public std::pair<charT, charT>
  32. {
  33. digraph() : std::pair<charT, charT>(charT(0), charT(0)){}
  34. digraph(charT c1) : std::pair<charT, charT>(c1, charT(0)){}
  35. digraph(charT c1, charT c2) : std::pair<charT, charT>(c1, c2)
  36. {}
  37. digraph(const digraph<charT>& d) : std::pair<charT, charT>(d.first, d.second){}
  38. digraph<charT>& operator=(const digraph<charT>&) = default;
  39. template <class Seq>
  40. digraph(const Seq& s) : std::pair<charT, charT>()
  41. {
  42. BOOST_REGEX_ASSERT(s.size() <= 2);
  43. BOOST_REGEX_ASSERT(s.size());
  44. this->first = s[0];
  45. this->second = (s.size() > 1) ? s[1] : 0;
  46. }
  47. };
  48. template <class charT, class traits>
  49. class basic_char_set
  50. {
  51. public:
  52. typedef digraph<charT> digraph_type;
  53. typedef typename traits::string_type string_type;
  54. typedef typename traits::char_class_type m_type;
  55. basic_char_set()
  56. {
  57. m_negate = false;
  58. m_has_digraphs = false;
  59. m_classes = 0;
  60. m_negated_classes = 0;
  61. m_empty = true;
  62. }
  63. void add_single(const digraph_type& s)
  64. {
  65. m_singles.insert(s);
  66. if(s.second)
  67. m_has_digraphs = true;
  68. m_empty = false;
  69. }
  70. void add_range(const digraph_type& first, const digraph_type& end)
  71. {
  72. m_ranges.push_back(first);
  73. m_ranges.push_back(end);
  74. if(first.second)
  75. {
  76. m_has_digraphs = true;
  77. add_single(first);
  78. }
  79. if(end.second)
  80. {
  81. m_has_digraphs = true;
  82. add_single(end);
  83. }
  84. m_empty = false;
  85. }
  86. void add_class(m_type m)
  87. {
  88. m_classes |= m;
  89. m_empty = false;
  90. }
  91. void add_negated_class(m_type m)
  92. {
  93. m_negated_classes |= m;
  94. m_empty = false;
  95. }
  96. void add_equivalent(const digraph_type& s)
  97. {
  98. m_equivalents.insert(s);
  99. if(s.second)
  100. {
  101. m_has_digraphs = true;
  102. add_single(s);
  103. }
  104. m_empty = false;
  105. }
  106. void negate()
  107. {
  108. m_negate = true;
  109. //m_empty = false;
  110. }
  111. //
  112. // accessor functions:
  113. //
  114. bool has_digraphs()const
  115. {
  116. return m_has_digraphs;
  117. }
  118. bool is_negated()const
  119. {
  120. return m_negate;
  121. }
  122. typedef typename std::vector<digraph_type>::const_iterator list_iterator;
  123. typedef typename std::set<digraph_type>::const_iterator set_iterator;
  124. set_iterator singles_begin()const
  125. {
  126. return m_singles.begin();
  127. }
  128. set_iterator singles_end()const
  129. {
  130. return m_singles.end();
  131. }
  132. list_iterator ranges_begin()const
  133. {
  134. return m_ranges.begin();
  135. }
  136. list_iterator ranges_end()const
  137. {
  138. return m_ranges.end();
  139. }
  140. set_iterator equivalents_begin()const
  141. {
  142. return m_equivalents.begin();
  143. }
  144. set_iterator equivalents_end()const
  145. {
  146. return m_equivalents.end();
  147. }
  148. m_type classes()const
  149. {
  150. return m_classes;
  151. }
  152. m_type negated_classes()const
  153. {
  154. return m_negated_classes;
  155. }
  156. bool empty()const
  157. {
  158. return m_empty;
  159. }
  160. private:
  161. std::set<digraph_type> m_singles; // a list of single characters to match
  162. std::vector<digraph_type> m_ranges; // a list of end points of our ranges
  163. bool m_negate; // true if the set is to be negated
  164. bool m_has_digraphs; // true if we have digraphs present
  165. m_type m_classes; // character classes to match
  166. m_type m_negated_classes; // negated character classes to match
  167. bool m_empty; // whether we've added anything yet
  168. std::set<digraph_type> m_equivalents; // a list of equivalence classes
  169. };
  170. template <class charT, class traits>
  171. class basic_regex_creator
  172. {
  173. public:
  174. basic_regex_creator(regex_data<charT, traits>* data);
  175. std::ptrdiff_t getoffset(void* addr)
  176. {
  177. return getoffset(addr, m_pdata->m_data.data());
  178. }
  179. std::ptrdiff_t getoffset(const void* addr, const void* base)
  180. {
  181. return static_cast<const char*>(addr) - static_cast<const char*>(base);
  182. }
  183. re_syntax_base* getaddress(std::ptrdiff_t off)
  184. {
  185. return getaddress(off, m_pdata->m_data.data());
  186. }
  187. re_syntax_base* getaddress(std::ptrdiff_t off, void* base)
  188. {
  189. return static_cast<re_syntax_base*>(static_cast<void*>(static_cast<char*>(base) + off));
  190. }
  191. void init(unsigned l_flags)
  192. {
  193. m_pdata->m_flags = l_flags;
  194. m_icase = l_flags & regex_constants::icase;
  195. }
  196. regbase::flag_type flags()
  197. {
  198. return m_pdata->m_flags;
  199. }
  200. void flags(regbase::flag_type f)
  201. {
  202. m_pdata->m_flags = f;
  203. if(m_icase != static_cast<bool>(f & regbase::icase))
  204. {
  205. m_icase = static_cast<bool>(f & regbase::icase);
  206. }
  207. }
  208. re_syntax_base* append_state(syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
  209. re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
  210. re_literal* append_literal(charT c);
  211. re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set);
  212. re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, false>*);
  213. re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, true>*);
  214. void finalize(const charT* p1, const charT* p2);
  215. protected:
  216. regex_data<charT, traits>* m_pdata; // pointer to the basic_regex_data struct we are filling in
  217. const ::boost::regex_traits_wrapper<traits>&
  218. m_traits; // convenience reference to traits class
  219. re_syntax_base* m_last_state; // the last state we added
  220. bool m_icase; // true for case insensitive matches
  221. unsigned m_repeater_id; // the state_id of the next repeater
  222. bool m_has_backrefs; // true if there are actually any backrefs
  223. std::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for;
  224. bool m_has_recursions; // set when we have recursive expressions to fixup
  225. std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression
  226. typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character
  227. typename traits::char_class_type m_mask_space; // mask used to determine if a character is a word character
  228. typename traits::char_class_type m_lower_mask; // mask used to determine if a character is a lowercase character
  229. typename traits::char_class_type m_upper_mask; // mask used to determine if a character is an uppercase character
  230. typename traits::char_class_type m_alpha_mask; // mask used to determine if a character is an alphabetic character
  231. private:
  232. basic_regex_creator& operator=(const basic_regex_creator&);
  233. basic_regex_creator(const basic_regex_creator&);
  234. void fixup_pointers(re_syntax_base* state);
  235. void fixup_recursions(re_syntax_base* state);
  236. void create_startmaps(re_syntax_base* state);
  237. int calculate_backstep(re_syntax_base* state);
  238. void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask);
  239. unsigned get_restart_type(re_syntax_base* state);
  240. void set_all_masks(unsigned char* bits, unsigned char);
  241. bool is_bad_repeat(re_syntax_base* pt);
  242. void set_bad_repeat(re_syntax_base* pt);
  243. syntax_element_type get_repeat_type(re_syntax_base* state);
  244. void probe_leading_repeat(re_syntax_base* state);
  245. };
  246. template <class charT, class traits>
  247. basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
  248. : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_icase(false), m_repeater_id(0),
  249. m_has_backrefs(false), m_bad_repeats(0), m_has_recursions(false), m_word_mask(0), m_mask_space(0), m_lower_mask(0), m_upper_mask(0), m_alpha_mask(0)
  250. {
  251. m_pdata->m_data.clear();
  252. m_pdata->m_status = ::boost::regex_constants::error_ok;
  253. static const charT w = 'w';
  254. static const charT s = 's';
  255. static const charT l[5] = { 'l', 'o', 'w', 'e', 'r', };
  256. static const charT u[5] = { 'u', 'p', 'p', 'e', 'r', };
  257. static const charT a[5] = { 'a', 'l', 'p', 'h', 'a', };
  258. m_word_mask = m_traits.lookup_classname(&w, &w +1);
  259. m_mask_space = m_traits.lookup_classname(&s, &s +1);
  260. m_lower_mask = m_traits.lookup_classname(l, l + 5);
  261. m_upper_mask = m_traits.lookup_classname(u, u + 5);
  262. m_alpha_mask = m_traits.lookup_classname(a, a + 5);
  263. m_pdata->m_word_mask = m_word_mask;
  264. BOOST_REGEX_ASSERT(m_word_mask != 0);
  265. BOOST_REGEX_ASSERT(m_mask_space != 0);
  266. BOOST_REGEX_ASSERT(m_lower_mask != 0);
  267. BOOST_REGEX_ASSERT(m_upper_mask != 0);
  268. BOOST_REGEX_ASSERT(m_alpha_mask != 0);
  269. }
  270. template <class charT, class traits>
  271. re_syntax_base* basic_regex_creator<charT, traits>::append_state(syntax_element_type t, std::size_t s)
  272. {
  273. // if the state is a backref then make a note of it:
  274. if(t == syntax_element_backref)
  275. this->m_has_backrefs = true;
  276. // append a new state, start by aligning our last one:
  277. m_pdata->m_data.align();
  278. // set the offset to the next state in our last one:
  279. if(m_last_state)
  280. m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
  281. // now actually extend our data:
  282. m_last_state = static_cast<re_syntax_base*>(m_pdata->m_data.extend(s));
  283. // fill in boilerplate options in the new state:
  284. m_last_state->next.i = 0;
  285. m_last_state->type = t;
  286. return m_last_state;
  287. }
  288. template <class charT, class traits>
  289. re_syntax_base* basic_regex_creator<charT, traits>::insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s)
  290. {
  291. // append a new state, start by aligning our last one:
  292. m_pdata->m_data.align();
  293. // set the offset to the next state in our last one:
  294. if(m_last_state)
  295. m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
  296. // remember the last state position:
  297. std::ptrdiff_t off = getoffset(m_last_state) + s;
  298. // now actually insert our data:
  299. re_syntax_base* new_state = static_cast<re_syntax_base*>(m_pdata->m_data.insert(pos, s));
  300. // fill in boilerplate options in the new state:
  301. new_state->next.i = s;
  302. new_state->type = t;
  303. m_last_state = getaddress(off);
  304. return new_state;
  305. }
  306. template <class charT, class traits>
  307. re_literal* basic_regex_creator<charT, traits>::append_literal(charT c)
  308. {
  309. re_literal* result;
  310. // start by seeing if we have an existing re_literal we can extend:
  311. if((0 == m_last_state) || (m_last_state->type != syntax_element_literal))
  312. {
  313. // no existing re_literal, create a new one:
  314. result = static_cast<re_literal*>(append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
  315. result->length = 1;
  316. *static_cast<charT*>(static_cast<void*>(result+1)) = m_traits.translate(c, m_icase);
  317. }
  318. else
  319. {
  320. // we have an existing re_literal, extend it:
  321. std::ptrdiff_t off = getoffset(m_last_state);
  322. m_pdata->m_data.extend(sizeof(charT));
  323. m_last_state = result = static_cast<re_literal*>(getaddress(off));
  324. charT* characters = static_cast<charT*>(static_cast<void*>(result+1));
  325. characters[result->length] = m_traits.translate(c, m_icase);
  326. result->length += 1;
  327. }
  328. return result;
  329. }
  330. template <class charT, class traits>
  331. inline re_syntax_base* basic_regex_creator<charT, traits>::append_set(
  332. const basic_char_set<charT, traits>& char_set)
  333. {
  334. typedef std::integral_constant<bool, (sizeof(charT) == 1) > truth_type;
  335. return char_set.has_digraphs()
  336. ? append_set(char_set, static_cast<std::integral_constant<bool, false>*>(0))
  337. : append_set(char_set, static_cast<truth_type*>(0));
  338. }
  339. template <class charT, class traits>
  340. re_syntax_base* basic_regex_creator<charT, traits>::append_set(
  341. const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, false>*)
  342. {
  343. typedef typename traits::string_type string_type;
  344. typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
  345. typedef typename basic_char_set<charT, traits>::set_iterator set_iterator;
  346. typedef typename traits::char_class_type m_type;
  347. re_set_long<m_type>* result = static_cast<re_set_long<m_type>*>(append_state(syntax_element_long_set, sizeof(re_set_long<m_type>)));
  348. //
  349. // fill in the basics:
  350. //
  351. result->csingles = static_cast<unsigned int>(std::distance(char_set.singles_begin(), char_set.singles_end()));
  352. result->cranges = static_cast<unsigned int>(std::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2;
  353. result->cequivalents = static_cast<unsigned int>(std::distance(char_set.equivalents_begin(), char_set.equivalents_end()));
  354. result->cclasses = char_set.classes();
  355. result->cnclasses = char_set.negated_classes();
  356. if(flags() & regbase::icase)
  357. {
  358. // adjust classes as needed:
  359. if(((result->cclasses & m_lower_mask) == m_lower_mask) || ((result->cclasses & m_upper_mask) == m_upper_mask))
  360. result->cclasses |= m_alpha_mask;
  361. if(((result->cnclasses & m_lower_mask) == m_lower_mask) || ((result->cnclasses & m_upper_mask) == m_upper_mask))
  362. result->cnclasses |= m_alpha_mask;
  363. }
  364. result->isnot = char_set.is_negated();
  365. result->singleton = !char_set.has_digraphs();
  366. //
  367. // remember where the state is for later:
  368. //
  369. std::ptrdiff_t offset = getoffset(result);
  370. //
  371. // now extend with all the singles:
  372. //
  373. item_iterator first, last;
  374. set_iterator sfirst, slast;
  375. sfirst = char_set.singles_begin();
  376. slast = char_set.singles_end();
  377. while(sfirst != slast)
  378. {
  379. charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (sfirst->first == static_cast<charT>(0) ? 1 : sfirst->second ? 3 : 2)));
  380. p[0] = m_traits.translate(sfirst->first, m_icase);
  381. if(sfirst->first == static_cast<charT>(0))
  382. {
  383. p[0] = 0;
  384. }
  385. else if(sfirst->second)
  386. {
  387. p[1] = m_traits.translate(sfirst->second, m_icase);
  388. p[2] = 0;
  389. }
  390. else
  391. p[1] = 0;
  392. ++sfirst;
  393. }
  394. //
  395. // now extend with all the ranges:
  396. //
  397. first = char_set.ranges_begin();
  398. last = char_set.ranges_end();
  399. while(first != last)
  400. {
  401. // first grab the endpoints of the range:
  402. digraph<charT> c1 = *first;
  403. c1.first = this->m_traits.translate(c1.first, this->m_icase);
  404. c1.second = this->m_traits.translate(c1.second, this->m_icase);
  405. ++first;
  406. digraph<charT> c2 = *first;
  407. c2.first = this->m_traits.translate(c2.first, this->m_icase);
  408. c2.second = this->m_traits.translate(c2.second, this->m_icase);
  409. ++first;
  410. string_type s1, s2;
  411. // different actions now depending upon whether collation is turned on:
  412. if(flags() & regex_constants::collate)
  413. {
  414. // we need to transform our range into sort keys:
  415. charT a1[3] = { c1.first, c1.second, charT(0), };
  416. charT a2[3] = { c2.first, c2.second, charT(0), };
  417. s1 = this->m_traits.transform(a1, (a1[1] ? a1+2 : a1+1));
  418. s2 = this->m_traits.transform(a2, (a2[1] ? a2+2 : a2+1));
  419. if(s1.empty())
  420. s1 = string_type(1, charT(0));
  421. if(s2.empty())
  422. s2 = string_type(1, charT(0));
  423. }
  424. else
  425. {
  426. if(c1.second)
  427. {
  428. s1.insert(s1.end(), c1.first);
  429. s1.insert(s1.end(), c1.second);
  430. }
  431. else
  432. s1 = string_type(1, c1.first);
  433. if(c2.second)
  434. {
  435. s2.insert(s2.end(), c2.first);
  436. s2.insert(s2.end(), c2.second);
  437. }
  438. else
  439. s2.insert(s2.end(), c2.first);
  440. }
  441. if(s1 > s2)
  442. {
  443. // Oops error:
  444. return 0;
  445. }
  446. charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s1.size() + s2.size() + 2) ) );
  447. BOOST_REGEX_DETAIL_NS::copy(s1.begin(), s1.end(), p);
  448. p[s1.size()] = charT(0);
  449. p += s1.size() + 1;
  450. BOOST_REGEX_DETAIL_NS::copy(s2.begin(), s2.end(), p);
  451. p[s2.size()] = charT(0);
  452. }
  453. //
  454. // now process the equivalence classes:
  455. //
  456. sfirst = char_set.equivalents_begin();
  457. slast = char_set.equivalents_end();
  458. while(sfirst != slast)
  459. {
  460. string_type s;
  461. if(sfirst->second)
  462. {
  463. charT cs[3] = { sfirst->first, sfirst->second, charT(0), };
  464. s = m_traits.transform_primary(cs, cs+2);
  465. }
  466. else
  467. s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
  468. if(s.empty())
  469. return 0; // invalid or unsupported equivalence class
  470. charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) );
  471. BOOST_REGEX_DETAIL_NS::copy(s.begin(), s.end(), p);
  472. p[s.size()] = charT(0);
  473. ++sfirst;
  474. }
  475. //
  476. // finally reset the address of our last state:
  477. //
  478. m_last_state = result = static_cast<re_set_long<m_type>*>(getaddress(offset));
  479. return result;
  480. }
  481. template<class T>
  482. inline bool char_less(T t1, T t2)
  483. {
  484. return t1 < t2;
  485. }
  486. inline bool char_less(char t1, char t2)
  487. {
  488. return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
  489. }
  490. inline bool char_less(signed char t1, signed char t2)
  491. {
  492. return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
  493. }
  494. template <class charT, class traits>
  495. re_syntax_base* basic_regex_creator<charT, traits>::append_set(
  496. const basic_char_set<charT, traits>& char_set, std::integral_constant<bool, true>*)
  497. {
  498. typedef typename traits::string_type string_type;
  499. typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
  500. typedef typename basic_char_set<charT, traits>::set_iterator set_iterator;
  501. re_set* result = static_cast<re_set*>(append_state(syntax_element_set, sizeof(re_set)));
  502. bool negate = char_set.is_negated();
  503. std::memset(result->_map, 0, sizeof(result->_map));
  504. //
  505. // handle singles first:
  506. //
  507. item_iterator first, last;
  508. set_iterator sfirst, slast;
  509. sfirst = char_set.singles_begin();
  510. slast = char_set.singles_end();
  511. while(sfirst != slast)
  512. {
  513. for(unsigned int i = 0; i < (1 << CHAR_BIT); ++i)
  514. {
  515. if(this->m_traits.translate(static_cast<charT>(i), this->m_icase)
  516. == this->m_traits.translate(sfirst->first, this->m_icase))
  517. result->_map[i] = true;
  518. }
  519. ++sfirst;
  520. }
  521. //
  522. // OK now handle ranges:
  523. //
  524. first = char_set.ranges_begin();
  525. last = char_set.ranges_end();
  526. while(first != last)
  527. {
  528. // first grab the endpoints of the range:
  529. charT c1 = this->m_traits.translate(first->first, this->m_icase);
  530. ++first;
  531. charT c2 = this->m_traits.translate(first->first, this->m_icase);
  532. ++first;
  533. // different actions now depending upon whether collation is turned on:
  534. if(flags() & regex_constants::collate)
  535. {
  536. // we need to transform our range into sort keys:
  537. charT c3[2] = { c1, charT(0), };
  538. string_type s1 = this->m_traits.transform(c3, c3+1);
  539. c3[0] = c2;
  540. string_type s2 = this->m_traits.transform(c3, c3+1);
  541. if(s1 > s2)
  542. {
  543. // Oops error:
  544. return 0;
  545. }
  546. BOOST_REGEX_ASSERT(c3[1] == charT(0));
  547. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  548. {
  549. c3[0] = static_cast<charT>(i);
  550. string_type s3 = this->m_traits.transform(c3, c3 +1);
  551. if((s1 <= s3) && (s3 <= s2))
  552. result->_map[i] = true;
  553. }
  554. }
  555. else
  556. {
  557. if(char_less(c2, c1))
  558. {
  559. // Oops error:
  560. return 0;
  561. }
  562. // everything in range matches:
  563. std::memset(result->_map + static_cast<unsigned char>(c1), true, static_cast<unsigned char>(1u) + static_cast<unsigned char>(static_cast<unsigned char>(c2) - static_cast<unsigned char>(c1)));
  564. }
  565. }
  566. //
  567. // and now the classes:
  568. //
  569. typedef typename traits::char_class_type m_type;
  570. m_type m = char_set.classes();
  571. if(flags() & regbase::icase)
  572. {
  573. // adjust m as needed:
  574. if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
  575. m |= m_alpha_mask;
  576. }
  577. if(m != 0)
  578. {
  579. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  580. {
  581. if(this->m_traits.isctype(static_cast<charT>(i), m))
  582. result->_map[i] = true;
  583. }
  584. }
  585. //
  586. // and now the negated classes:
  587. //
  588. m = char_set.negated_classes();
  589. if(flags() & regbase::icase)
  590. {
  591. // adjust m as needed:
  592. if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
  593. m |= m_alpha_mask;
  594. }
  595. if(m != 0)
  596. {
  597. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  598. {
  599. if(0 == this->m_traits.isctype(static_cast<charT>(i), m))
  600. result->_map[i] = true;
  601. }
  602. }
  603. //
  604. // now process the equivalence classes:
  605. //
  606. sfirst = char_set.equivalents_begin();
  607. slast = char_set.equivalents_end();
  608. while(sfirst != slast)
  609. {
  610. string_type s;
  611. BOOST_REGEX_ASSERT(static_cast<charT>(0) == sfirst->second);
  612. s = m_traits.transform_primary(&sfirst->first, &sfirst->first+1);
  613. if(s.empty())
  614. return 0; // invalid or unsupported equivalence class
  615. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  616. {
  617. charT c[2] = { (static_cast<charT>(i)), charT(0), };
  618. string_type s2 = this->m_traits.transform_primary(c, c+1);
  619. if(s == s2)
  620. result->_map[i] = true;
  621. }
  622. ++sfirst;
  623. }
  624. if(negate)
  625. {
  626. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  627. {
  628. result->_map[i] = !(result->_map[i]);
  629. }
  630. }
  631. return result;
  632. }
  633. template <class charT, class traits>
  634. void basic_regex_creator<charT, traits>::finalize(const charT* p1, const charT* p2)
  635. {
  636. if(this->m_pdata->m_status)
  637. return;
  638. // we've added all the states we need, now finish things off.
  639. // start by adding a terminating state:
  640. append_state(syntax_element_match);
  641. // extend storage to store original expression:
  642. std::ptrdiff_t len = p2 - p1;
  643. m_pdata->m_expression_len = len;
  644. charT* ps = static_cast<charT*>(m_pdata->m_data.extend(sizeof(charT) * (1 + (p2 - p1))));
  645. m_pdata->m_expression = ps;
  646. BOOST_REGEX_DETAIL_NS::copy(p1, p2, ps);
  647. ps[p2 - p1] = 0;
  648. // fill in our other data...
  649. // successful parsing implies a zero status:
  650. m_pdata->m_status = 0;
  651. // get the first state of the machine:
  652. m_pdata->m_first_state = static_cast<re_syntax_base*>(m_pdata->m_data.data());
  653. // fixup pointers in the machine:
  654. fixup_pointers(m_pdata->m_first_state);
  655. if(m_has_recursions)
  656. {
  657. m_pdata->m_has_recursions = true;
  658. fixup_recursions(m_pdata->m_first_state);
  659. if(this->m_pdata->m_status)
  660. return;
  661. }
  662. else
  663. m_pdata->m_has_recursions = false;
  664. // create nested startmaps:
  665. create_startmaps(m_pdata->m_first_state);
  666. // create main startmap:
  667. std::memset(m_pdata->m_startmap, 0, sizeof(m_pdata->m_startmap));
  668. m_pdata->m_can_be_null = 0;
  669. m_bad_repeats = 0;
  670. if(m_has_recursions)
  671. m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
  672. create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all);
  673. // get the restart type:
  674. m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state);
  675. // optimise a leading repeat if there is one:
  676. probe_leading_repeat(m_pdata->m_first_state);
  677. }
  678. template <class charT, class traits>
  679. void basic_regex_creator<charT, traits>::fixup_pointers(re_syntax_base* state)
  680. {
  681. while(state)
  682. {
  683. switch(state->type)
  684. {
  685. case syntax_element_recurse:
  686. m_has_recursions = true;
  687. if(state->next.i)
  688. state->next.p = getaddress(state->next.i, state);
  689. else
  690. state->next.p = 0;
  691. break;
  692. case syntax_element_rep:
  693. case syntax_element_dot_rep:
  694. case syntax_element_char_rep:
  695. case syntax_element_short_set_rep:
  696. case syntax_element_long_set_rep:
  697. // set the state_id of this repeat:
  698. static_cast<re_repeat*>(state)->state_id = m_repeater_id++;
  699. BOOST_REGEX_FALLTHROUGH;
  700. case syntax_element_alt:
  701. std::memset(static_cast<re_alt*>(state)->_map, 0, sizeof(static_cast<re_alt*>(state)->_map));
  702. static_cast<re_alt*>(state)->can_be_null = 0;
  703. BOOST_REGEX_FALLTHROUGH;
  704. case syntax_element_jump:
  705. static_cast<re_jump*>(state)->alt.p = getaddress(static_cast<re_jump*>(state)->alt.i, state);
  706. BOOST_REGEX_FALLTHROUGH;
  707. default:
  708. if(state->next.i)
  709. state->next.p = getaddress(state->next.i, state);
  710. else
  711. state->next.p = 0;
  712. }
  713. state = state->next.p;
  714. }
  715. }
  716. template <class charT, class traits>
  717. void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
  718. {
  719. re_syntax_base* base = state;
  720. while(state)
  721. {
  722. switch(state->type)
  723. {
  724. case syntax_element_assert_backref:
  725. {
  726. // just check that the index is valid:
  727. int idx = static_cast<const re_brace*>(state)->index;
  728. if(idx < 0)
  729. {
  730. idx = -idx-1;
  731. if(idx >= hash_value_mask)
  732. {
  733. idx = m_pdata->get_id(idx);
  734. if(idx <= 0)
  735. {
  736. // check of sub-expression that doesn't exist:
  737. if(0 == this->m_pdata->m_status) // update the error code if not already set
  738. this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
  739. //
  740. // clear the expression, we should be empty:
  741. //
  742. this->m_pdata->m_expression = 0;
  743. this->m_pdata->m_expression_len = 0;
  744. //
  745. // and throw if required:
  746. //
  747. if(0 == (this->flags() & regex_constants::no_except))
  748. {
  749. std::string message = "Encountered a forward reference to a marked sub-expression that does not exist.";
  750. boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
  751. e.raise();
  752. }
  753. }
  754. }
  755. }
  756. }
  757. break;
  758. case syntax_element_recurse:
  759. {
  760. bool ok = false;
  761. re_syntax_base* p = base;
  762. std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
  763. if(idx >= hash_value_mask)
  764. {
  765. //
  766. // There may be more than one capture group with this hash, just do what Perl
  767. // does and recurse to the leftmost:
  768. //
  769. idx = m_pdata->get_id(static_cast<int>(idx));
  770. }
  771. if(idx < 0)
  772. {
  773. ok = false;
  774. }
  775. else
  776. {
  777. while(p)
  778. {
  779. if((p->type == syntax_element_startmark) && (static_cast<re_brace*>(p)->index == idx))
  780. {
  781. //
  782. // We've found the target of the recursion, set the jump target:
  783. //
  784. static_cast<re_jump*>(state)->alt.p = p;
  785. ok = true;
  786. //
  787. // Now scan the target for nested repeats:
  788. //
  789. p = p->next.p;
  790. int next_rep_id = 0;
  791. while(p)
  792. {
  793. switch(p->type)
  794. {
  795. case syntax_element_rep:
  796. case syntax_element_dot_rep:
  797. case syntax_element_char_rep:
  798. case syntax_element_short_set_rep:
  799. case syntax_element_long_set_rep:
  800. next_rep_id = static_cast<re_repeat*>(p)->state_id;
  801. break;
  802. case syntax_element_endmark:
  803. if(static_cast<const re_brace*>(p)->index == idx)
  804. next_rep_id = -1;
  805. break;
  806. default:
  807. break;
  808. }
  809. if(next_rep_id)
  810. break;
  811. p = p->next.p;
  812. }
  813. if(next_rep_id > 0)
  814. {
  815. static_cast<re_recurse*>(state)->state_id = next_rep_id - 1;
  816. }
  817. break;
  818. }
  819. p = p->next.p;
  820. }
  821. }
  822. if(!ok)
  823. {
  824. // recursion to sub-expression that doesn't exist:
  825. if(0 == this->m_pdata->m_status) // update the error code if not already set
  826. this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
  827. //
  828. // clear the expression, we should be empty:
  829. //
  830. this->m_pdata->m_expression = 0;
  831. this->m_pdata->m_expression_len = 0;
  832. //
  833. // and throw if required:
  834. //
  835. if(0 == (this->flags() & regex_constants::no_except))
  836. {
  837. std::string message = "Encountered a forward reference to a recursive sub-expression that does not exist.";
  838. boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
  839. e.raise();
  840. }
  841. }
  842. }
  843. break;
  844. default:
  845. break;
  846. }
  847. state = state->next.p;
  848. }
  849. }
  850. template <class charT, class traits>
  851. void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state)
  852. {
  853. // non-recursive implementation:
  854. // create the last map in the machine first, so that earlier maps
  855. // can make use of the result...
  856. //
  857. // This was originally a recursive implementation, but that caused stack
  858. // overflows with complex expressions on small stacks (think COM+).
  859. // start by saving the case setting:
  860. bool l_icase = m_icase;
  861. std::vector<std::pair<bool, re_syntax_base*> > v;
  862. while(state)
  863. {
  864. switch(state->type)
  865. {
  866. case syntax_element_toggle_case:
  867. // we need to track case changes here:
  868. m_icase = static_cast<re_case*>(state)->icase;
  869. state = state->next.p;
  870. continue;
  871. case syntax_element_alt:
  872. case syntax_element_rep:
  873. case syntax_element_dot_rep:
  874. case syntax_element_char_rep:
  875. case syntax_element_short_set_rep:
  876. case syntax_element_long_set_rep:
  877. // just push the state onto our stack for now:
  878. v.push_back(std::pair<bool, re_syntax_base*>(m_icase, state));
  879. state = state->next.p;
  880. break;
  881. case syntax_element_backstep:
  882. // we need to calculate how big the backstep is:
  883. static_cast<re_brace*>(state)->index
  884. = this->calculate_backstep(state->next.p);
  885. if(static_cast<re_brace*>(state)->index < 0)
  886. {
  887. // Oops error:
  888. if(0 == this->m_pdata->m_status) // update the error code if not already set
  889. this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
  890. //
  891. // clear the expression, we should be empty:
  892. //
  893. this->m_pdata->m_expression = 0;
  894. this->m_pdata->m_expression_len = 0;
  895. //
  896. // and throw if required:
  897. //
  898. if(0 == (this->flags() & regex_constants::no_except))
  899. {
  900. std::string message = "Invalid lookbehind assertion encountered in the regular expression.";
  901. boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
  902. e.raise();
  903. }
  904. }
  905. BOOST_REGEX_FALLTHROUGH;
  906. default:
  907. state = state->next.p;
  908. }
  909. }
  910. // now work through our list, building all the maps as we go:
  911. while(!v.empty())
  912. {
  913. // Initialize m_recursion_checks if we need it:
  914. if(m_has_recursions)
  915. m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
  916. const std::pair<bool, re_syntax_base*>& p = v.back();
  917. m_icase = p.first;
  918. state = p.second;
  919. v.pop_back();
  920. // Build maps:
  921. m_bad_repeats = 0;
  922. create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take);
  923. m_bad_repeats = 0;
  924. if(m_has_recursions)
  925. m_recursion_checks.assign(1 + m_pdata->m_mark_count, 0u);
  926. create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip);
  927. // adjust the type of the state to allow for faster matching:
  928. state->type = this->get_repeat_type(state);
  929. }
  930. // restore case sensitivity:
  931. m_icase = l_icase;
  932. }
  933. template <class charT, class traits>
  934. int basic_regex_creator<charT, traits>::calculate_backstep(re_syntax_base* state)
  935. {
  936. typedef typename traits::char_class_type m_type;
  937. int result = 0;
  938. while(state)
  939. {
  940. switch(state->type)
  941. {
  942. case syntax_element_startmark:
  943. if((static_cast<re_brace*>(state)->index == -1)
  944. || (static_cast<re_brace*>(state)->index == -2))
  945. {
  946. state = static_cast<re_jump*>(state->next.p)->alt.p->next.p;
  947. continue;
  948. }
  949. else if(static_cast<re_brace*>(state)->index == -3)
  950. {
  951. state = state->next.p->next.p;
  952. continue;
  953. }
  954. break;
  955. case syntax_element_endmark:
  956. if((static_cast<re_brace*>(state)->index == -1)
  957. || (static_cast<re_brace*>(state)->index == -2))
  958. return result;
  959. break;
  960. case syntax_element_literal:
  961. result += static_cast<re_literal*>(state)->length;
  962. break;
  963. case syntax_element_wild:
  964. case syntax_element_set:
  965. result += 1;
  966. break;
  967. case syntax_element_dot_rep:
  968. case syntax_element_char_rep:
  969. case syntax_element_short_set_rep:
  970. case syntax_element_backref:
  971. case syntax_element_rep:
  972. case syntax_element_combining:
  973. case syntax_element_long_set_rep:
  974. case syntax_element_backstep:
  975. {
  976. re_repeat* rep = static_cast<re_repeat *>(state);
  977. // adjust the type of the state to allow for faster matching:
  978. state->type = this->get_repeat_type(state);
  979. if((state->type == syntax_element_dot_rep)
  980. || (state->type == syntax_element_char_rep)
  981. || (state->type == syntax_element_short_set_rep))
  982. {
  983. if(rep->max != rep->min)
  984. return -1;
  985. if (static_cast<std::size_t>((std::numeric_limits<int>::max)() - result) < rep->min)
  986. return -1; // protection against overflow, we can't calculate a backstep in this case and the expression is probably ill-formed.
  987. result += static_cast<int>(rep->min);
  988. state = rep->alt.p;
  989. continue;
  990. }
  991. else if(state->type == syntax_element_long_set_rep)
  992. {
  993. BOOST_REGEX_ASSERT(rep->next.p->type == syntax_element_long_set);
  994. if(static_cast<re_set_long<m_type>*>(rep->next.p)->singleton == 0)
  995. return -1;
  996. if(rep->max != rep->min)
  997. return -1;
  998. result += static_cast<int>(rep->min);
  999. state = rep->alt.p;
  1000. continue;
  1001. }
  1002. }
  1003. return -1;
  1004. case syntax_element_long_set:
  1005. if(static_cast<re_set_long<m_type>*>(state)->singleton == 0)
  1006. return -1;
  1007. result += 1;
  1008. break;
  1009. case syntax_element_jump:
  1010. state = static_cast<re_jump*>(state)->alt.p;
  1011. continue;
  1012. case syntax_element_alt:
  1013. {
  1014. int r1 = calculate_backstep(state->next.p);
  1015. int r2 = calculate_backstep(static_cast<re_alt*>(state)->alt.p);
  1016. if((r1 < 0) || (r1 != r2))
  1017. return -1;
  1018. return result + r1;
  1019. }
  1020. default:
  1021. break;
  1022. }
  1023. state = state->next.p;
  1024. }
  1025. return -1;
  1026. }
  1027. struct recursion_saver
  1028. {
  1029. std::vector<unsigned char> saved_state;
  1030. std::vector<unsigned char>* state;
  1031. recursion_saver(std::vector<unsigned char>* p) : saved_state(*p), state(p) {}
  1032. ~recursion_saver()
  1033. {
  1034. state->swap(saved_state);
  1035. }
  1036. };
  1037. template <class charT, class traits>
  1038. void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask)
  1039. {
  1040. recursion_saver saved_recursions(&m_recursion_checks);
  1041. int not_last_jump = 1;
  1042. re_syntax_base* recursion_start = 0;
  1043. int recursion_sub = 0;
  1044. re_syntax_base* recursion_restart = 0;
  1045. // track case sensitivity:
  1046. bool l_icase = m_icase;
  1047. while(state)
  1048. {
  1049. switch(state->type)
  1050. {
  1051. case syntax_element_toggle_case:
  1052. l_icase = static_cast<re_case*>(state)->icase;
  1053. state = state->next.p;
  1054. break;
  1055. case syntax_element_literal:
  1056. {
  1057. // don't set anything in *pnull, set each element in l_map
  1058. // that could match the first character in the literal:
  1059. if(l_map)
  1060. {
  1061. l_map[0] |= mask_init;
  1062. charT first_char = *static_cast<charT*>(static_cast<void*>(static_cast<re_literal*>(state) + 1));
  1063. for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
  1064. {
  1065. if(m_traits.translate(static_cast<charT>(i), l_icase) == first_char)
  1066. l_map[i] |= mask;
  1067. }
  1068. }
  1069. return;
  1070. }
  1071. case syntax_element_end_line:
  1072. {
  1073. // next character must be a line separator (if there is one):
  1074. if(l_map)
  1075. {
  1076. l_map[0] |= mask_init;
  1077. l_map[static_cast<unsigned>('\n')] |= mask;
  1078. l_map[static_cast<unsigned>('\r')] |= mask;
  1079. l_map[static_cast<unsigned>('\f')] |= mask;
  1080. l_map[0x85] |= mask;
  1081. }
  1082. // now figure out if we can match a NULL string at this point:
  1083. if(pnull)
  1084. create_startmap(state->next.p, 0, pnull, mask);
  1085. return;
  1086. }
  1087. case syntax_element_recurse:
  1088. {
  1089. BOOST_REGEX_ASSERT(static_cast<const re_jump*>(state)->alt.p->type == syntax_element_startmark);
  1090. recursion_sub = static_cast<re_brace*>(static_cast<const re_jump*>(state)->alt.p)->index;
  1091. if(m_recursion_checks[recursion_sub] & 1u)
  1092. {
  1093. // Infinite recursion!!
  1094. if(0 == this->m_pdata->m_status) // update the error code if not already set
  1095. this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
  1096. //
  1097. // clear the expression, we should be empty:
  1098. //
  1099. this->m_pdata->m_expression = 0;
  1100. this->m_pdata->m_expression_len = 0;
  1101. //
  1102. // and throw if required:
  1103. //
  1104. if(0 == (this->flags() & regex_constants::no_except))
  1105. {
  1106. std::string message = "Encountered an infinite recursion.";
  1107. boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
  1108. e.raise();
  1109. }
  1110. }
  1111. else if(recursion_start == 0)
  1112. {
  1113. recursion_start = state;
  1114. recursion_restart = state->next.p;
  1115. state = static_cast<re_jump*>(state)->alt.p;
  1116. m_recursion_checks[recursion_sub] |= 1u;
  1117. break;
  1118. }
  1119. m_recursion_checks[recursion_sub] |= 1u;
  1120. // can't handle nested recursion here...
  1121. BOOST_REGEX_FALLTHROUGH;
  1122. }
  1123. case syntax_element_backref:
  1124. // can be null, and any character can match:
  1125. if(pnull)
  1126. *pnull |= mask;
  1127. BOOST_REGEX_FALLTHROUGH;
  1128. case syntax_element_wild:
  1129. {
  1130. // can't be null, any character can match:
  1131. set_all_masks(l_map, mask);
  1132. return;
  1133. }
  1134. case syntax_element_accept:
  1135. case syntax_element_match:
  1136. {
  1137. // must be null, any character can match:
  1138. set_all_masks(l_map, mask);
  1139. if(pnull)
  1140. *pnull |= mask;
  1141. return;
  1142. }
  1143. case syntax_element_word_start:
  1144. {
  1145. // recurse, then AND with all the word characters:
  1146. create_startmap(state->next.p, l_map, pnull, mask);
  1147. if(l_map)
  1148. {
  1149. l_map[0] |= mask_init;
  1150. for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
  1151. {
  1152. if(!m_traits.isctype(static_cast<charT>(i), m_word_mask))
  1153. l_map[i] &= static_cast<unsigned char>(~mask);
  1154. }
  1155. }
  1156. return;
  1157. }
  1158. case syntax_element_word_end:
  1159. {
  1160. // recurse, then AND with all the word characters:
  1161. create_startmap(state->next.p, l_map, pnull, mask);
  1162. if(l_map)
  1163. {
  1164. l_map[0] |= mask_init;
  1165. for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
  1166. {
  1167. if(m_traits.isctype(static_cast<charT>(i), m_word_mask))
  1168. l_map[i] &= static_cast<unsigned char>(~mask);
  1169. }
  1170. }
  1171. return;
  1172. }
  1173. case syntax_element_buffer_end:
  1174. {
  1175. // we *must be null* :
  1176. if(pnull)
  1177. *pnull |= mask;
  1178. return;
  1179. }
  1180. case syntax_element_long_set:
  1181. if(l_map)
  1182. {
  1183. typedef typename traits::char_class_type m_type;
  1184. if(static_cast<re_set_long<m_type>*>(state)->singleton)
  1185. {
  1186. l_map[0] |= mask_init;
  1187. for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
  1188. {
  1189. charT c = static_cast<charT>(i);
  1190. if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<m_type>*>(state), *m_pdata, l_icase))
  1191. l_map[i] |= mask;
  1192. }
  1193. }
  1194. else
  1195. set_all_masks(l_map, mask);
  1196. }
  1197. return;
  1198. case syntax_element_set:
  1199. if(l_map)
  1200. {
  1201. l_map[0] |= mask_init;
  1202. for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
  1203. {
  1204. if(static_cast<re_set*>(state)->_map[
  1205. static_cast<unsigned char>(m_traits.translate(static_cast<charT>(i), l_icase))])
  1206. l_map[i] |= mask;
  1207. }
  1208. }
  1209. return;
  1210. case syntax_element_jump:
  1211. // take the jump:
  1212. state = static_cast<re_alt*>(state)->alt.p;
  1213. not_last_jump = -1;
  1214. break;
  1215. case syntax_element_alt:
  1216. case syntax_element_rep:
  1217. case syntax_element_dot_rep:
  1218. case syntax_element_char_rep:
  1219. case syntax_element_short_set_rep:
  1220. case syntax_element_long_set_rep:
  1221. {
  1222. re_alt* rep = static_cast<re_alt*>(state);
  1223. if(rep->_map[0] & mask_init)
  1224. {
  1225. if(l_map)
  1226. {
  1227. // copy previous results:
  1228. l_map[0] |= mask_init;
  1229. for(unsigned int i = 0; i <= UCHAR_MAX; ++i)
  1230. {
  1231. if(rep->_map[i] & mask_any)
  1232. l_map[i] |= mask;
  1233. }
  1234. }
  1235. if(pnull)
  1236. {
  1237. if(rep->can_be_null & mask_any)
  1238. *pnull |= mask;
  1239. }
  1240. }
  1241. else
  1242. {
  1243. // we haven't created a startmap for this alternative yet
  1244. // so take the union of the two options:
  1245. if(is_bad_repeat(state))
  1246. {
  1247. set_all_masks(l_map, mask);
  1248. if(pnull)
  1249. *pnull |= mask;
  1250. return;
  1251. }
  1252. set_bad_repeat(state);
  1253. create_startmap(state->next.p, l_map, pnull, mask);
  1254. if((state->type == syntax_element_alt)
  1255. || (static_cast<re_repeat*>(state)->min == 0)
  1256. || (not_last_jump == 0))
  1257. create_startmap(rep->alt.p, l_map, pnull, mask);
  1258. }
  1259. }
  1260. return;
  1261. case syntax_element_soft_buffer_end:
  1262. // match newline or null:
  1263. if(l_map)
  1264. {
  1265. l_map[0] |= mask_init;
  1266. l_map[static_cast<unsigned>('\n')] |= mask;
  1267. l_map[static_cast<unsigned>('\r')] |= mask;
  1268. }
  1269. if(pnull)
  1270. *pnull |= mask;
  1271. return;
  1272. case syntax_element_endmark:
  1273. // need to handle independent subs as a special case:
  1274. if(static_cast<re_brace*>(state)->index < 0)
  1275. {
  1276. // can be null, any character can match:
  1277. set_all_masks(l_map, mask);
  1278. if(pnull)
  1279. *pnull |= mask;
  1280. return;
  1281. }
  1282. else if(recursion_start && (recursion_sub != 0) && (recursion_sub == static_cast<re_brace*>(state)->index))
  1283. {
  1284. // recursion termination:
  1285. recursion_start = 0;
  1286. state = recursion_restart;
  1287. break;
  1288. }
  1289. //
  1290. // Normally we just go to the next state... but if this sub-expression is
  1291. // the target of a recursion, then we might be ending a recursion, in which
  1292. // case we should check whatever follows that recursion, as well as whatever
  1293. // follows this state:
  1294. //
  1295. if(m_pdata->m_has_recursions && static_cast<re_brace*>(state)->index)
  1296. {
  1297. bool ok = false;
  1298. re_syntax_base* p = m_pdata->m_first_state;
  1299. while(p)
  1300. {
  1301. if(p->type == syntax_element_recurse)
  1302. {
  1303. re_brace* p2 = static_cast<re_brace*>(static_cast<re_jump*>(p)->alt.p);
  1304. if((p2->type == syntax_element_startmark) && (p2->index == static_cast<re_brace*>(state)->index))
  1305. {
  1306. ok = true;
  1307. break;
  1308. }
  1309. }
  1310. p = p->next.p;
  1311. }
  1312. if(ok && ((m_recursion_checks[static_cast<re_brace*>(state)->index] & 2u) == 0))
  1313. {
  1314. m_recursion_checks[static_cast<re_brace*>(state)->index] |= 2u;
  1315. create_startmap(p->next.p, l_map, pnull, mask);
  1316. }
  1317. }
  1318. state = state->next.p;
  1319. break;
  1320. case syntax_element_commit:
  1321. set_all_masks(l_map, mask);
  1322. // Continue scanning so we can figure out whether we can be null:
  1323. state = state->next.p;
  1324. break;
  1325. case syntax_element_startmark:
  1326. // need to handle independent subs as a special case:
  1327. if(static_cast<re_brace*>(state)->index == -3)
  1328. {
  1329. state = state->next.p->next.p;
  1330. break;
  1331. }
  1332. BOOST_REGEX_FALLTHROUGH;
  1333. default:
  1334. state = state->next.p;
  1335. }
  1336. ++not_last_jump;
  1337. }
  1338. }
  1339. template <class charT, class traits>
  1340. unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* state)
  1341. {
  1342. //
  1343. // find out how the machine starts, so we can optimise the search:
  1344. //
  1345. while(state)
  1346. {
  1347. switch(state->type)
  1348. {
  1349. case syntax_element_startmark:
  1350. case syntax_element_endmark:
  1351. state = state->next.p;
  1352. continue;
  1353. case syntax_element_start_line:
  1354. return regbase::restart_line;
  1355. case syntax_element_word_start:
  1356. return regbase::restart_word;
  1357. case syntax_element_buffer_start:
  1358. return regbase::restart_buf;
  1359. case syntax_element_restart_continue:
  1360. return regbase::restart_continue;
  1361. default:
  1362. state = 0;
  1363. continue;
  1364. }
  1365. }
  1366. return regbase::restart_any;
  1367. }
  1368. template <class charT, class traits>
  1369. void basic_regex_creator<charT, traits>::set_all_masks(unsigned char* bits, unsigned char mask)
  1370. {
  1371. //
  1372. // set mask in all of bits elements,
  1373. // if bits[0] has mask_init not set then we can
  1374. // optimise this to a call to memset:
  1375. //
  1376. if(bits)
  1377. {
  1378. if(bits[0] == 0)
  1379. (std::memset)(bits, mask, 1u << CHAR_BIT);
  1380. else
  1381. {
  1382. for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
  1383. bits[i] |= mask;
  1384. }
  1385. bits[0] |= mask_init;
  1386. }
  1387. }
  1388. template <class charT, class traits>
  1389. bool basic_regex_creator<charT, traits>::is_bad_repeat(re_syntax_base* pt)
  1390. {
  1391. switch(pt->type)
  1392. {
  1393. case syntax_element_rep:
  1394. case syntax_element_dot_rep:
  1395. case syntax_element_char_rep:
  1396. case syntax_element_short_set_rep:
  1397. case syntax_element_long_set_rep:
  1398. {
  1399. unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
  1400. if(state_id >= sizeof(m_bad_repeats) * CHAR_BIT)
  1401. return true; // run out of bits, assume we can't traverse this one.
  1402. static const std::uintmax_t one = 1uL;
  1403. return m_bad_repeats & (one << state_id);
  1404. }
  1405. default:
  1406. return false;
  1407. }
  1408. }
  1409. template <class charT, class traits>
  1410. void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt)
  1411. {
  1412. switch(pt->type)
  1413. {
  1414. case syntax_element_rep:
  1415. case syntax_element_dot_rep:
  1416. case syntax_element_char_rep:
  1417. case syntax_element_short_set_rep:
  1418. case syntax_element_long_set_rep:
  1419. {
  1420. unsigned state_id = static_cast<re_repeat*>(pt)->state_id;
  1421. static const std::uintmax_t one = 1uL;
  1422. if(state_id <= sizeof(m_bad_repeats) * CHAR_BIT)
  1423. m_bad_repeats |= (one << state_id);
  1424. }
  1425. break;
  1426. default:
  1427. break;
  1428. }
  1429. }
  1430. template <class charT, class traits>
  1431. syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state)
  1432. {
  1433. typedef typename traits::char_class_type m_type;
  1434. if(state->type == syntax_element_rep)
  1435. {
  1436. // check to see if we are repeating a single state:
  1437. if(state->next.p->next.p->next.p == static_cast<re_alt*>(state)->alt.p)
  1438. {
  1439. switch(state->next.p->type)
  1440. {
  1441. case BOOST_REGEX_DETAIL_NS::syntax_element_wild:
  1442. return BOOST_REGEX_DETAIL_NS::syntax_element_dot_rep;
  1443. case BOOST_REGEX_DETAIL_NS::syntax_element_literal:
  1444. return BOOST_REGEX_DETAIL_NS::syntax_element_char_rep;
  1445. case BOOST_REGEX_DETAIL_NS::syntax_element_set:
  1446. return BOOST_REGEX_DETAIL_NS::syntax_element_short_set_rep;
  1447. case BOOST_REGEX_DETAIL_NS::syntax_element_long_set:
  1448. if(static_cast<BOOST_REGEX_DETAIL_NS::re_set_long<m_type>*>(state->next.p)->singleton)
  1449. return BOOST_REGEX_DETAIL_NS::syntax_element_long_set_rep;
  1450. break;
  1451. default:
  1452. break;
  1453. }
  1454. }
  1455. }
  1456. return state->type;
  1457. }
  1458. template <class charT, class traits>
  1459. void basic_regex_creator<charT, traits>::probe_leading_repeat(re_syntax_base* state)
  1460. {
  1461. // enumerate our states, and see if we have a leading repeat
  1462. // for which failed search restarts can be optimized;
  1463. do
  1464. {
  1465. switch(state->type)
  1466. {
  1467. case syntax_element_startmark:
  1468. if(static_cast<re_brace*>(state)->index >= 0)
  1469. {
  1470. state = state->next.p;
  1471. continue;
  1472. }
  1473. #ifdef BOOST_REGEX_MSVC
  1474. # pragma warning(push)
  1475. #pragma warning(disable:6011)
  1476. #endif
  1477. if((static_cast<re_brace*>(state)->index == -1)
  1478. || (static_cast<re_brace*>(state)->index == -2))
  1479. {
  1480. // skip past the zero width assertion:
  1481. state = static_cast<const re_jump*>(state->next.p)->alt.p->next.p;
  1482. continue;
  1483. }
  1484. #ifdef BOOST_REGEX_MSVC
  1485. # pragma warning(pop)
  1486. #endif
  1487. if(static_cast<re_brace*>(state)->index == -3)
  1488. {
  1489. // Have to skip the leading jump state:
  1490. state = state->next.p->next.p;
  1491. continue;
  1492. }
  1493. return;
  1494. case syntax_element_endmark:
  1495. case syntax_element_start_line:
  1496. case syntax_element_end_line:
  1497. case syntax_element_word_boundary:
  1498. case syntax_element_within_word:
  1499. case syntax_element_word_start:
  1500. case syntax_element_word_end:
  1501. case syntax_element_buffer_start:
  1502. case syntax_element_buffer_end:
  1503. case syntax_element_restart_continue:
  1504. state = state->next.p;
  1505. break;
  1506. case syntax_element_dot_rep:
  1507. case syntax_element_char_rep:
  1508. case syntax_element_short_set_rep:
  1509. case syntax_element_long_set_rep:
  1510. if(this->m_has_backrefs == 0)
  1511. static_cast<re_repeat*>(state)->leading = true;
  1512. BOOST_REGEX_FALLTHROUGH;
  1513. default:
  1514. return;
  1515. }
  1516. }while(state);
  1517. }
  1518. } // namespace BOOST_REGEX_DETAIL_NS
  1519. } // namespace boost
  1520. #ifdef BOOST_REGEX_MSVC
  1521. # pragma warning(pop)
  1522. #endif
  1523. #endif