states.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. /*
  2. *
  3. * Copyright (c) 1998-2002
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE states.cpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Declares internal state machine structures.
  16. */
  17. #ifndef BOOST_REGEX_V4_STATES_HPP
  18. #define BOOST_REGEX_V4_STATES_HPP
  19. #ifdef BOOST_MSVC
  20. #pragma warning(push)
  21. #pragma warning(disable: 4103)
  22. #endif
  23. #ifdef BOOST_HAS_ABI_HEADERS
  24. # include BOOST_ABI_PREFIX
  25. #endif
  26. #ifdef BOOST_MSVC
  27. #pragma warning(pop)
  28. #endif
  29. namespace boost{
  30. namespace BOOST_REGEX_DETAIL_NS{
  31. /*** mask_type *******************************************************
  32. Whenever we have a choice of two alternatives, we use an array of bytes
  33. to indicate which of the two alternatives it is possible to take for any
  34. given input character. If mask_take is set, then we can take the next
  35. state, and if mask_skip is set then we can take the alternative.
  36. ***********************************************************************/
  37. enum mask_type
  38. {
  39. mask_take = 1,
  40. mask_skip = 2,
  41. mask_init = 4,
  42. mask_any = mask_skip | mask_take,
  43. mask_all = mask_any
  44. };
  45. /*** helpers **********************************************************
  46. These helpers let us use function overload resolution to detect whether
  47. we have narrow or wide character strings:
  48. ***********************************************************************/
  49. struct _narrow_type{};
  50. struct _wide_type{};
  51. template <class charT> struct is_byte;
  52. template<> struct is_byte<char> { typedef _narrow_type width_type; };
  53. template<> struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
  54. template<> struct is_byte<signed char> { typedef _narrow_type width_type; };
  55. template <class charT> struct is_byte { typedef _wide_type width_type; };
  56. /*** enum syntax_element_type ******************************************
  57. Every record in the state machine falls into one of the following types:
  58. ***********************************************************************/
  59. enum syntax_element_type
  60. {
  61. // start of a marked sub-expression, or perl-style (?...) extension
  62. syntax_element_startmark = 0,
  63. // end of a marked sub-expression, or perl-style (?...) extension
  64. syntax_element_endmark = syntax_element_startmark + 1,
  65. // any sequence of literal characters
  66. syntax_element_literal = syntax_element_endmark + 1,
  67. // start of line assertion: ^
  68. syntax_element_start_line = syntax_element_literal + 1,
  69. // end of line assertion $
  70. syntax_element_end_line = syntax_element_start_line + 1,
  71. // match any character: .
  72. syntax_element_wild = syntax_element_end_line + 1,
  73. // end of expression: we have a match when we get here
  74. syntax_element_match = syntax_element_wild + 1,
  75. // perl style word boundary: \b
  76. syntax_element_word_boundary = syntax_element_match + 1,
  77. // perl style within word boundary: \B
  78. syntax_element_within_word = syntax_element_word_boundary + 1,
  79. // start of word assertion: \<
  80. syntax_element_word_start = syntax_element_within_word + 1,
  81. // end of word assertion: \>
  82. syntax_element_word_end = syntax_element_word_start + 1,
  83. // start of buffer assertion: \`
  84. syntax_element_buffer_start = syntax_element_word_end + 1,
  85. // end of buffer assertion: \'
  86. syntax_element_buffer_end = syntax_element_buffer_start + 1,
  87. // backreference to previously matched sub-expression
  88. syntax_element_backref = syntax_element_buffer_end + 1,
  89. // either a wide character set [..] or one with multicharacter collating elements:
  90. syntax_element_long_set = syntax_element_backref + 1,
  91. // narrow character set: [...]
  92. syntax_element_set = syntax_element_long_set + 1,
  93. // jump to a new state in the machine:
  94. syntax_element_jump = syntax_element_set + 1,
  95. // choose between two production states:
  96. syntax_element_alt = syntax_element_jump + 1,
  97. // a repeat
  98. syntax_element_rep = syntax_element_alt + 1,
  99. // match a combining character sequence
  100. syntax_element_combining = syntax_element_rep + 1,
  101. // perl style soft buffer end: \z
  102. syntax_element_soft_buffer_end = syntax_element_combining + 1,
  103. // perl style continuation: \G
  104. syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
  105. // single character repeats:
  106. syntax_element_dot_rep = syntax_element_restart_continue + 1,
  107. syntax_element_char_rep = syntax_element_dot_rep + 1,
  108. syntax_element_short_set_rep = syntax_element_char_rep + 1,
  109. syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
  110. // a backstep for lookbehind repeats:
  111. syntax_element_backstep = syntax_element_long_set_rep + 1,
  112. // an assertion that a mark was matched:
  113. syntax_element_assert_backref = syntax_element_backstep + 1,
  114. syntax_element_toggle_case = syntax_element_assert_backref + 1,
  115. // a recursive expression:
  116. syntax_element_recurse = syntax_element_toggle_case + 1,
  117. // Verbs:
  118. syntax_element_fail = syntax_element_recurse + 1,
  119. syntax_element_accept = syntax_element_fail + 1,
  120. syntax_element_commit = syntax_element_accept + 1,
  121. syntax_element_then = syntax_element_commit + 1
  122. };
  123. #ifdef BOOST_REGEX_DEBUG
  124. // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
  125. std::ostream& operator<<(std::ostream&, syntax_element_type);
  126. #endif
  127. struct re_syntax_base;
  128. /*** union offset_type ************************************************
  129. Points to another state in the machine. During machine construction
  130. we use integral offsets, but these are converted to pointers before
  131. execution of the machine.
  132. ***********************************************************************/
  133. union offset_type
  134. {
  135. re_syntax_base* p;
  136. std::ptrdiff_t i;
  137. };
  138. /*** struct re_syntax_base ********************************************
  139. Base class for all states in the machine.
  140. ***********************************************************************/
  141. struct re_syntax_base
  142. {
  143. syntax_element_type type; // what kind of state this is
  144. offset_type next; // next state in the machine
  145. };
  146. /*** struct re_brace **************************************************
  147. A marked parenthesis.
  148. ***********************************************************************/
  149. struct re_brace : public re_syntax_base
  150. {
  151. // The index to match, can be zero (don't mark the sub-expression)
  152. // or negative (for perl style (?...) extensions):
  153. int index;
  154. bool icase;
  155. };
  156. /*** struct re_dot **************************************************
  157. Match anything.
  158. ***********************************************************************/
  159. enum
  160. {
  161. dont_care = 1,
  162. force_not_newline = 0,
  163. force_newline = 2,
  164. test_not_newline = 2,
  165. test_newline = 3
  166. };
  167. struct re_dot : public re_syntax_base
  168. {
  169. unsigned char mask;
  170. };
  171. /*** struct re_literal ************************************************
  172. A string of literals, following this structure will be an
  173. array of characters: charT[length]
  174. ***********************************************************************/
  175. struct re_literal : public re_syntax_base
  176. {
  177. unsigned int length;
  178. };
  179. /*** struct re_case ************************************************
  180. Indicates whether we are moving to a case insensive block or not
  181. ***********************************************************************/
  182. struct re_case : public re_syntax_base
  183. {
  184. bool icase;
  185. };
  186. /*** struct re_set_long ***********************************************
  187. A wide character set of characters, following this structure will be
  188. an array of type charT:
  189. First csingles null-terminated strings
  190. Then 2 * cranges NULL terminated strings
  191. Then cequivalents NULL terminated strings
  192. ***********************************************************************/
  193. template <class mask_type>
  194. struct re_set_long : public re_syntax_base
  195. {
  196. unsigned int csingles, cranges, cequivalents;
  197. mask_type cclasses;
  198. mask_type cnclasses;
  199. bool isnot;
  200. bool singleton;
  201. };
  202. /*** struct re_set ****************************************************
  203. A set of narrow-characters, matches any of _map which is none-zero
  204. ***********************************************************************/
  205. struct re_set : public re_syntax_base
  206. {
  207. unsigned char _map[1 << CHAR_BIT];
  208. };
  209. /*** struct re_jump ***************************************************
  210. Jump to a new location in the machine (not next).
  211. ***********************************************************************/
  212. struct re_jump : public re_syntax_base
  213. {
  214. offset_type alt; // location to jump to
  215. };
  216. /*** struct re_alt ***************************************************
  217. Jump to a new location in the machine (possibly next).
  218. ***********************************************************************/
  219. struct re_alt : public re_jump
  220. {
  221. unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump
  222. unsigned int can_be_null; // true if we match a NULL string
  223. };
  224. /*** struct re_repeat *************************************************
  225. Repeat a section of the machine
  226. ***********************************************************************/
  227. struct re_repeat : public re_alt
  228. {
  229. std::size_t min, max; // min and max allowable repeats
  230. int state_id; // Unique identifier for this repeat
  231. bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches)
  232. bool greedy; // True if this is a greedy repeat
  233. };
  234. /*** struct re_recurse ************************************************
  235. Recurse to a particular subexpression.
  236. **********************************************************************/
  237. struct re_recurse : public re_jump
  238. {
  239. int state_id; // identifier of first nested repeat within the recursion.
  240. };
  241. /*** struct re_commit *************************************************
  242. Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens
  243. if no match is found and we start searching forward.
  244. **********************************************************************/
  245. enum commit_type
  246. {
  247. commit_prune,
  248. commit_skip,
  249. commit_commit
  250. };
  251. struct re_commit : public re_syntax_base
  252. {
  253. commit_type action;
  254. };
  255. /*** enum re_jump_size_type *******************************************
  256. Provides compiled size of re_jump structure (allowing for trailing alignment).
  257. We provide this so we know how manybytes to insert when constructing the machine
  258. (The value of padding_mask is defined in regex_raw_buffer.hpp).
  259. ***********************************************************************/
  260. enum re_jump_size_type
  261. {
  262. re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
  263. re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
  264. re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
  265. };
  266. /*** proc re_is_set_member *********************************************
  267. Forward declaration: we'll need this one later...
  268. ***********************************************************************/
  269. template<class charT, class traits>
  270. struct regex_data;
  271. template <class iterator, class charT, class traits_type, class char_classT>
  272. iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
  273. iterator last,
  274. const re_set_long<char_classT>* set_,
  275. const regex_data<charT, traits_type>& e, bool icase);
  276. } // namespace BOOST_REGEX_DETAIL_NS
  277. } // namespace boost
  278. #ifdef BOOST_MSVC
  279. #pragma warning(push)
  280. #pragma warning(disable: 4103)
  281. #endif
  282. #ifdef BOOST_HAS_ABI_HEADERS
  283. # include BOOST_ABI_SUFFIX
  284. #endif
  285. #ifdef BOOST_MSVC
  286. #pragma warning(pop)
  287. #endif
  288. #endif