123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372 |
///////////////////////////////////////////////////////////////////////////////
/// \file regex_token_iterator.hpp
/// Contains the definition of regex_token_iterator, an STL-compatible iterator
/// for tokenizing a string using a regular expression.
//
// Copyright 2008 Eric Niebler. Distributed under the Boost
// Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
- #ifndef BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
- #define BOOST_XPRESSIVE_REGEX_TOKEN_ITERATOR_HPP_EAN_10_04_2005
- // MS compatible compilers support #pragma once
- #if defined(_MSC_VER)
- # pragma once
- #endif
- #include <vector>
- #include <boost/assert.hpp>
- #include <boost/mpl/assert.hpp>
- #include <boost/type_traits/is_same.hpp>
- #include <boost/type_traits/is_convertible.hpp>
- #include <boost/xpressive/regex_iterator.hpp>
- namespace boost { namespace xpressive { namespace detail
- {
- //////////////////////////////////////////////////////////////////////////
- // regex_token_iterator_impl
- //
- template<typename BidiIter>
- struct regex_token_iterator_impl
- : counted_base<regex_token_iterator_impl<BidiIter> >
- {
- typedef sub_match<BidiIter> value_type;
- regex_token_iterator_impl
- (
- BidiIter begin
- , BidiIter cur
- , BidiIter end
- , BidiIter next_search
- , basic_regex<BidiIter> const &rex
- , regex_constants::match_flag_type flags = regex_constants::match_default
- , std::vector<int> subs = std::vector<int>(1, 0)
- , int n = -2
- , bool not_null = false
- )
- : iter_(begin, cur, end, next_search, rex, flags, not_null)
- , result_()
- , n_((-2 == n) ? (int)subs.size() - 1 : n)
- , subs_()
- {
- BOOST_ASSERT(0 != subs.size());
- this->subs_.swap(subs);
- }
- bool next()
- {
- if(-1 != this->n_)
- {
- BidiIter cur = this->iter_.state_.cur_;
- if(0 != (++this->n_ %= (int)this->subs_.size()) || this->iter_.next())
- {
- this->result_ = (-1 == this->subs_[ this->n_ ])
- ? this->iter_.what_.prefix()
- : this->iter_.what_[ this->subs_[ this->n_ ] ];
- return true;
- }
- else if(-1 == this->subs_[ this->n_-- ] && cur != this->iter_.state_.end_)
- {
- this->result_ = value_type(cur, this->iter_.state_.end_, true);
- return true;
- }
- }
- return false;
- }
- bool equal_to(regex_token_iterator_impl<BidiIter> const &that) const
- {
- return this->iter_.equal_to(that.iter_) && this->n_ == that.n_;
- }
- regex_iterator_impl<BidiIter> iter_;
- value_type result_;
- int n_;
- std::vector<int> subs_;
- };
/// Converts a sub-match designator to its integer index. For a plain \c int
/// this is the identity.
///
/// \param i The sub-match index.
/// \return \c i, unchanged.
inline int get_mark_number(int i)
{
    return i;
}
/// Wraps a single sub-match index in a one-element vector.
///
/// \param subs The sub-match index.
/// \return A vector whose only element is \c subs.
inline std::vector<int> to_vector(int subs)
{
    return std::vector<int>(1, subs);
}
/// Pass-through overload: a vector of \c int is already in the required form,
/// so it is returned by reference without copying.
///
/// \param subs The sub-match indices.
/// \return A reference to \c subs itself.
inline std::vector<int> const &to_vector(std::vector<int> const &subs)
{
    return subs;
}
- template<typename Int, std::size_t Size>
- inline std::vector<int> to_vector(Int const (&sub_matches)[ Size ])
- {
- // so that people can specify sub-match indices inline with
- // string literals, like "\1\2\3", leave off the trailing '\0'
- std::size_t const size = Size - is_same<Int, char>::value;
- std::vector<int> vect(size);
- for(std::size_t i = 0; i < size; ++i)
- {
- vect[i] = get_mark_number(sub_matches[i]);
- }
- return vect;
- }
- template<typename Int>
- inline std::vector<int> to_vector(std::vector<Int> const &sub_matches)
- {
- BOOST_MPL_ASSERT((is_convertible<Int, int>));
- return std::vector<int>(sub_matches.begin(), sub_matches.end());
- }
- } // namespace detail
- //////////////////////////////////////////////////////////////////////////
- // regex_token_iterator
- //
- template<typename BidiIter>
- struct regex_token_iterator
- {
- typedef basic_regex<BidiIter> regex_type;
- typedef typename iterator_value<BidiIter>::type char_type;
- typedef sub_match<BidiIter> value_type;
- typedef std::ptrdiff_t difference_type;
- typedef value_type const *pointer;
- typedef value_type const &reference;
- typedef std::forward_iterator_tag iterator_category;
- /// INTERNAL ONLY
- typedef detail::regex_token_iterator_impl<BidiIter> impl_type_;
- /// \post \c *this is the end of sequence iterator.
- regex_token_iterator()
- : impl_()
- {
- }
- /// \param begin The beginning of the character range to search.
- /// \param end The end of the character range to search.
- /// \param rex The regex pattern to search for.
- /// \pre \c [begin,end) is a valid range.
- regex_token_iterator
- (
- BidiIter begin
- , BidiIter end
- , basic_regex<BidiIter> const &rex
- )
- : impl_()
- {
- if(0 != rex.regex_id())
- {
- this->impl_ = new impl_type_(begin, begin, end, begin, rex);
- this->next_();
- }
- }
- /// \param begin The beginning of the character range to search.
- /// \param end The end of the character range to search.
- /// \param rex The regex pattern to search for.
- /// \param args A let() expression with argument bindings for semantic actions.
- /// \pre \c [begin,end) is a valid range.
- template<typename LetExpr>
- regex_token_iterator
- (
- BidiIter begin
- , BidiIter end
- , basic_regex<BidiIter> const &rex
- , detail::let_<LetExpr> const &args
- )
- : impl_()
- {
- if(0 != rex.regex_id())
- {
- this->impl_ = new impl_type_(begin, begin, end, begin, rex);
- detail::bind_args(args, this->impl_->iter_.what_);
- this->next_();
- }
- }
- /// \param begin The beginning of the character range to search.
- /// \param end The end of the character range to search.
- /// \param rex The regex pattern to search for.
- /// \param subs A range of integers designating sub-matches to be treated as tokens.
- /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
- /// \pre \c [begin,end) is a valid range.
- /// \pre \c subs is either an integer greater or equal to -1,
- /// or else an array or non-empty \c std::vector\<\> of such integers.
- template<typename Subs>
- regex_token_iterator
- (
- BidiIter begin
- , BidiIter end
- , basic_regex<BidiIter> const &rex
- , Subs const &subs
- , regex_constants::match_flag_type flags = regex_constants::match_default
- )
- : impl_()
- {
- if(0 != rex.regex_id())
- {
- this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
- this->next_();
- }
- }
- /// \param begin The beginning of the character range to search.
- /// \param end The end of the character range to search.
- /// \param rex The regex pattern to search for.
- /// \param subs A range of integers designating sub-matches to be treated as tokens.
- /// \param args A let() expression with argument bindings for semantic actions.
- /// \param flags Optional match flags, used to control how the expression is matched against the sequence. (See match_flag_type.)
- /// \pre \c [begin,end) is a valid range.
- /// \pre \c subs is either an integer greater or equal to -1,
- /// or else an array or non-empty \c std::vector\<\> of such integers.
- template<typename Subs, typename LetExpr>
- regex_token_iterator
- (
- BidiIter begin
- , BidiIter end
- , basic_regex<BidiIter> const &rex
- , Subs const &subs
- , detail::let_<LetExpr> const &args
- , regex_constants::match_flag_type flags = regex_constants::match_default
- )
- : impl_()
- {
- if(0 != rex.regex_id())
- {
- this->impl_ = new impl_type_(begin, begin, end, begin, rex, flags, detail::to_vector(subs));
- detail::bind_args(args, this->impl_->iter_.what_);
- this->next_();
- }
- }
- /// \post <tt>*this == that</tt>
- regex_token_iterator(regex_token_iterator<BidiIter> const &that)
- : impl_(that.impl_) // COW
- {
- }
- /// \post <tt>*this == that</tt>
- regex_token_iterator<BidiIter> &operator =(regex_token_iterator<BidiIter> const &that)
- {
- this->impl_ = that.impl_; // COW
- return *this;
- }
- friend bool operator ==(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
- {
- if(!left.impl_ || !right.impl_)
- {
- return !left.impl_ && !right.impl_;
- }
- return left.impl_->equal_to(*right.impl_);
- }
- friend bool operator !=(regex_token_iterator<BidiIter> const &left, regex_token_iterator<BidiIter> const &right)
- {
- return !(left == right);
- }
- value_type const &operator *() const
- {
- return this->impl_->result_;
- }
- value_type const *operator ->() const
- {
- return &this->impl_->result_;
- }
- /// If N == -1 then sets *this equal to the end of sequence iterator.
- /// Otherwise if N+1 \< subs.size(), then increments N and sets result equal to
- /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
- /// Otherwise if what.prefix().first != what[0].second and if the element match_prev_avail is
- /// not set in flags then sets it. Then locates the next match as if by calling
- /// regex_search(what[0].second, end, what, *pre, flags), with the following variation:
- /// in the event that the previous match found was of zero length (what[0].length() == 0)
- /// then attempts to find a non-zero length match starting at what[0].second, only if that
- /// fails and provided what[0].second != suffix().second does it look for a (possibly zero
- /// length) match starting from what[0].second + 1. If such a match is found then sets N
- /// equal to zero, and sets result equal to
- /// ((subs[N] == -1) ? value_type(what.prefix().str()) : value_type(what[subs[N]].str())).
- /// Otherwise if no further matches were found, then let last_end be the endpoint of the last
- /// match that was found. Then if last_end != end and subs[0] == -1 sets N equal to -1 and
- /// sets result equal to value_type(last_end, end). Otherwise sets *this equal to the end
- /// of sequence iterator.
- regex_token_iterator<BidiIter> &operator ++()
- {
- this->fork_(); // un-share the implementation
- this->next_();
- return *this;
- }
- regex_token_iterator<BidiIter> operator ++(int)
- {
- regex_token_iterator<BidiIter> tmp(*this);
- ++*this;
- return tmp;
- }
- private:
- /// INTERNAL ONLY
- void fork_()
- {
- if(1 != this->impl_->use_count())
- {
- intrusive_ptr<impl_type_> clone = new impl_type_
- (
- this->impl_->iter_.state_.begin_
- , this->impl_->iter_.state_.cur_
- , this->impl_->iter_.state_.end_
- , this->impl_->iter_.state_.next_search_
- , this->impl_->iter_.rex_
- , this->impl_->iter_.flags_
- , this->impl_->subs_
- , this->impl_->n_
- , this->impl_->iter_.not_null_
- );
- // only copy the match_results struct if we have to. Note: if the next call
- // to impl_->next() will return false or call regex_search, we don't need to
- // copy the match_results struct.
- if(-1 != this->impl_->n_ && this->impl_->n_ + 1 != static_cast<int>(this->impl_->subs_.size()))
- {
- // BUGBUG This is expensive -- it causes the sequence_stack to be cleared.
- // Find a better way
- clone->iter_.what_ = this->impl_->iter_.what_;
- }
- else
- {
- // At the very least, copy the action args
- detail::core_access<BidiIter>::get_action_args(clone->iter_.what_)
- = detail::core_access<BidiIter>::get_action_args(this->impl_->iter_.what_);
- }
- this->impl_.swap(clone);
- }
- }
- /// INTERNAL ONLY
- void next_()
- {
- BOOST_ASSERT(this->impl_ && 1 == this->impl_->use_count());
- if(!this->impl_->next())
- {
- this->impl_ = 0;
- }
- }
- intrusive_ptr<impl_type_> impl_;
- };
- }} // namespace boost::xpressive
- #endif
|