/////////////////////////////////////////////////////////////////////////////// /// \file regex_compiler.hpp /// Contains the definition of regex_compiler, a factory for building regex objects /// from strings. // // Copyright 2008 Eric Niebler. Distributed under the Boost // Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 #define BOOST_XPRESSIVE_REGEX_COMPILER_HPP_EAN_10_04_2005 // MS compatible compilers support #pragma once #if defined(_MSC_VER) && (_MSC_VER >= 1020) # pragma once #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace boost { namespace xpressive { /////////////////////////////////////////////////////////////////////////////// // regex_compiler // /// \brief Class template regex_compiler is a factory for building basic_regex objects from a string. /// /// Class template regex_compiler is used to construct a basic_regex object from a string. The string /// should contain a valid regular expression. You can imbue a regex_compiler object with a locale, /// after which all basic_regex objects created with that regex_compiler object will use that locale. /// After creating a regex_compiler object, and optionally imbueing it with a locale, you can call the /// compile() method to construct a basic_regex object, passing it the string representing the regular /// expression. You can call compile() multiple times on the same regex_compiler object. Two basic_regex /// objects compiled from the same string will have different regex_id's. template struct regex_compiler { typedef BidiIter iterator_type; typedef typename iterator_value::type char_type; typedef regex_constants::syntax_option_type flag_type; typedef RegexTraits traits_type; typedef typename traits_type::string_type string_type; typedef typename traits_type::locale_type locale_type; typedef typename traits_type::char_class_type char_class_type; explicit regex_compiler(RegexTraits const &traits = RegexTraits()) : mark_count_(0) , hidden_mark_count_(0) , traits_(traits) , upper_(0) , self_() , rules_() { this->upper_ = lookup_classname(this->rxtraits(), "upper"); } /////////////////////////////////////////////////////////////////////////// // imbue /// Specify the locale to be used by a regex_compiler. /// /// \param loc The locale that this regex_compiler should use. /// \return The previous locale. locale_type imbue(locale_type loc) { locale_type oldloc = this->traits_.imbue(loc); this->upper_ = lookup_classname(this->rxtraits(), "upper"); return oldloc; } /////////////////////////////////////////////////////////////////////////// // getloc /// Get the locale used by a regex_compiler. /// /// \return The locale used by this regex_compiler. locale_type getloc() const { return this->traits_.getloc(); } /////////////////////////////////////////////////////////////////////////// // compile /// Builds a basic_regex object from a range of characters. /// /// \param begin The beginning of a range of characters representing the /// regular expression to compile. /// \param end The end of a range of characters representing the /// regular expression to compile. /// \param flags Optional bitmask that determines how the pat string is /// interpreted. (See syntax_option_type.) /// \return A basic_regex object corresponding to the regular expression /// represented by the character range. /// \pre InputIter is a model of the InputIterator concept. /// \pre [begin,end) is a valid range. /// \pre The range of characters specified by [begin,end) contains a /// valid string-based representation of a regular expression. /// \throw regex_error when the range of characters has invalid regular /// expression syntax. template basic_regex compile(InputIter begin, InputIter end, flag_type flags = regex_constants::ECMAScript) { typedef typename iterator_category::type category; return this->compile_(begin, end, flags, category()); } /// \overload /// template typename disable_if, basic_regex >::type compile(InputRange const &pat, flag_type flags = regex_constants::ECMAScript) { return this->compile(boost::begin(pat), boost::end(pat), flags); } /// \overload /// basic_regex compile(char_type const *begin, flag_type flags = regex_constants::ECMAScript) { BOOST_ASSERT(0 != begin); char_type const *end = begin + std::char_traits::length(begin); return this->compile(begin, end, flags); } /// \overload /// basic_regex compile(char_type const *begin, std::size_t size, flag_type flags) { BOOST_ASSERT(0 != begin); char_type const *end = begin + size; return this->compile(begin, end, flags); } /////////////////////////////////////////////////////////////////////////// // operator[] /// Return a reference to the named regular expression. If no such named /// regular expression exists, create a new regular expression and return /// a reference to it. /// /// \param name A std::string containing the name of the regular expression. /// \pre The string is not empty. /// \throw bad_alloc on allocation failure. basic_regex &operator [](string_type const &name) { BOOST_ASSERT(!name.empty()); return this->rules_[name]; } /// \overload /// basic_regex const &operator [](string_type const &name) const { BOOST_ASSERT(!name.empty()); return this->rules_[name]; } private: typedef detail::escape_value escape_value; typedef detail::alternate_matcher, RegexTraits> alternate_matcher; /////////////////////////////////////////////////////////////////////////// // compile_ /// INTERNAL ONLY template basic_regex compile_(FwdIter begin, FwdIter end, flag_type flags, std::forward_iterator_tag) { BOOST_MPL_ASSERT((is_same::type>)); using namespace regex_constants; this->reset(); this->traits_.flags(flags); basic_regex rextmp, *prex = &rextmp; FwdIter tmp = begin; // Check if this regex is a named rule: string_type name; if(token_group_begin == this->traits_.get_token(tmp, end) && BOOST_XPR_ENSURE_(tmp != end, error_paren, "mismatched parenthesis") && token_rule_assign == this->traits_.get_group_type(tmp, end, name)) { begin = tmp; BOOST_XPR_ENSURE_ ( begin != end && token_group_end == this->traits_.get_token(begin, end) , error_paren , "mismatched parenthesis" ); prex = &this->rules_[name]; } this->self_ = detail::core_access::get_regex_impl(*prex); // at the top level, a regex is a sequence of alternates detail::sequence seq = this->parse_alternates(begin, end); BOOST_XPR_ENSURE_(begin == end, error_paren, "mismatched parenthesis"); // terminate the sequence seq += detail::make_dynamic(detail::end_matcher()); // bundle the regex information into a regex_impl object detail::common_compile(seq.xpr().matchable(), *this->self_, this->rxtraits()); this->self_->traits_ = new detail::traits_holder(this->rxtraits()); this->self_->mark_count_ = this->mark_count_; this->self_->hidden_mark_count_ = this->hidden_mark_count_; // References changed, update dependencies. this->self_->tracking_update(); this->self_.reset(); return *prex; } /////////////////////////////////////////////////////////////////////////// // compile_ /// INTERNAL ONLY template basic_regex compile_(InputIter begin, InputIter end, flag_type flags, std::input_iterator_tag) { string_type pat(begin, end); return this->compile_(boost::begin(pat), boost::end(pat), flags, std::forward_iterator_tag()); } /////////////////////////////////////////////////////////////////////////// // reset /// INTERNAL ONLY void reset() { this->mark_count_ = 0; this->hidden_mark_count_ = 0; this->traits_.flags(regex_constants::ECMAScript); } /////////////////////////////////////////////////////////////////////////// // regex_traits /// INTERNAL ONLY traits_type &rxtraits() { return this->traits_.traits(); } /////////////////////////////////////////////////////////////////////////// // regex_traits /// INTERNAL ONLY traits_type const &rxtraits() const { return this->traits_.traits(); } /////////////////////////////////////////////////////////////////////////// // parse_alternates /// INTERNAL ONLY template detail::sequence parse_alternates(FwdIter &begin, FwdIter end) { using namespace regex_constants; int count = 0; FwdIter tmp = begin; detail::sequence seq; do switch(++count) { case 1: seq = this->parse_sequence(tmp, end); break; case 2: seq = detail::make_dynamic(alternate_matcher()) | seq; // fall-through default: seq |= this->parse_sequence(tmp, end); } while((begin = tmp) != end && token_alternate == this->traits_.get_token(tmp, end)); return seq; } /////////////////////////////////////////////////////////////////////////// // parse_group /// INTERNAL ONLY template detail::sequence parse_group(FwdIter &begin, FwdIter end) { using namespace regex_constants; int mark_nbr = 0; bool keeper = false; bool lookahead = false; bool lookbehind = false; bool negative = false; string_type name; detail::sequence seq, seq_end; FwdIter tmp = FwdIter(); syntax_option_type old_flags = this->traits_.flags(); switch(this->traits_.get_group_type(begin, end, name)) { case token_no_mark: // Don't process empty groups like (?:) or (?i) // BUGBUG this doesn't handle the degenerate (?:)+ correctly if(token_group_end == this->traits_.get_token(tmp = begin, end)) { return this->parse_atom(begin = tmp, end); } break; case token_negative_lookahead: negative = true; // fall-through case token_positive_lookahead: lookahead = true; break; case token_negative_lookbehind: negative = true; // fall-through case token_positive_lookbehind: lookbehind = true; break; case token_independent_sub_expression: keeper = true; break; case token_comment: while(BOOST_XPR_ENSURE_(begin != end, error_paren, "mismatched parenthesis")) { switch(this->traits_.get_token(begin, end)) { case token_group_end: return this->parse_atom(begin, end); case token_escape: BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence"); case token_literal: ++begin; default:; } } break; case token_recurse: BOOST_XPR_ENSURE_ ( begin != end && token_group_end == this->traits_.get_token(begin, end) , error_paren , "mismatched parenthesis" ); return detail::make_dynamic(detail::regex_byref_matcher(this->self_)); case token_rule_assign: BOOST_THROW_EXCEPTION( regex_error(error_badrule, "rule assignments must be at the front of the regex") ); break; case token_rule_ref: { typedef detail::core_access access; BOOST_XPR_ENSURE_ ( begin != end && token_group_end == this->traits_.get_token(begin, end) , error_paren , "mismatched parenthesis" ); basic_regex &rex = this->rules_[name]; shared_ptr > impl = access::get_regex_impl(rex); this->self_->track_reference(*impl); return detail::make_dynamic(detail::regex_byref_matcher(impl)); } case token_named_mark: mark_nbr = static_cast(++this->mark_count_); for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) { BOOST_XPR_ENSURE_(this->self_->named_marks_[i].name_ != name, error_badmark, "named mark already exists"); } this->self_->named_marks_.push_back(detail::named_mark(name, this->mark_count_)); seq = detail::make_dynamic(detail::mark_begin_matcher(mark_nbr)); seq_end = detail::make_dynamic(detail::mark_end_matcher(mark_nbr)); break; case token_named_mark_ref: BOOST_XPR_ENSURE_ ( begin != end && token_group_end == this->traits_.get_token(begin, end) , error_paren , "mismatched parenthesis" ); for(std::size_t i = 0; i < this->self_->named_marks_.size(); ++i) { if(this->self_->named_marks_[i].name_ == name) { mark_nbr = static_cast(this->self_->named_marks_[i].mark_nbr_); return detail::make_backref_xpression ( mark_nbr, this->traits_.flags(), this->rxtraits() ); } } BOOST_THROW_EXCEPTION(regex_error(error_badmark, "invalid named back-reference")); break; default: mark_nbr = static_cast(++this->mark_count_); seq = detail::make_dynamic(detail::mark_begin_matcher(mark_nbr)); seq_end = detail::make_dynamic(detail::mark_end_matcher(mark_nbr)); break; } // alternates seq += this->parse_alternates(begin, end); seq += seq_end; BOOST_XPR_ENSURE_ ( begin != end && token_group_end == this->traits_.get_token(begin, end) , error_paren , "mismatched parenthesis" ); typedef detail::shared_matchable xpr_type; if(lookahead) { seq += detail::make_independent_end_xpression(seq.pure()); detail::lookahead_matcher lookahead(seq.xpr(), negative, seq.pure()); seq = detail::make_dynamic(lookahead); } else if(lookbehind) { seq += detail::make_independent_end_xpression(seq.pure()); detail::lookbehind_matcher lookbehind(seq.xpr(), seq.width().value(), negative, seq.pure()); seq = detail::make_dynamic(lookbehind); } else if(keeper) // independent sub-expression { seq += detail::make_independent_end_xpression(seq.pure()); detail::keeper_matcher keeper(seq.xpr(), seq.pure()); seq = detail::make_dynamic(keeper); } // restore the modifiers this->traits_.flags(old_flags); return seq; } /////////////////////////////////////////////////////////////////////////// // parse_charset /// INTERNAL ONLY template detail::sequence parse_charset(FwdIter &begin, FwdIter end) { detail::compound_charset chset; // call out to a helper to actually parse the character set detail::parse_charset(begin, end, chset, this->traits_); return detail::make_charset_xpression ( chset , this->rxtraits() , this->traits_.flags() ); } /////////////////////////////////////////////////////////////////////////// // parse_atom /// INTERNAL ONLY template detail::sequence parse_atom(FwdIter &begin, FwdIter end) { using namespace regex_constants; escape_value esc = { 0, 0, 0, detail::escape_char }; FwdIter old_begin = begin; switch(this->traits_.get_token(begin, end)) { case token_literal: return detail::make_literal_xpression ( this->parse_literal(begin, end), this->traits_.flags(), this->rxtraits() ); case token_any: return detail::make_any_xpression(this->traits_.flags(), this->rxtraits()); case token_assert_begin_sequence: return detail::make_dynamic(detail::assert_bos_matcher()); case token_assert_end_sequence: return detail::make_dynamic(detail::assert_eos_matcher()); case token_assert_begin_line: return detail::make_assert_begin_line(this->traits_.flags(), this->rxtraits()); case token_assert_end_line: return detail::make_assert_end_line(this->traits_.flags(), this->rxtraits()); case token_assert_word_boundary: return detail::make_assert_word(detail::word_boundary(), this->rxtraits()); case token_assert_not_word_boundary: return detail::make_assert_word(detail::word_boundary(), this->rxtraits()); case token_assert_word_begin: return detail::make_assert_word(detail::word_begin(), this->rxtraits()); case token_assert_word_end: return detail::make_assert_word(detail::word_end(), this->rxtraits()); case token_escape: esc = this->parse_escape(begin, end); switch(esc.type_) { case detail::escape_mark: return detail::make_backref_xpression ( esc.mark_nbr_, this->traits_.flags(), this->rxtraits() ); case detail::escape_char: return detail::make_char_xpression ( esc.ch_, this->traits_.flags(), this->rxtraits() ); case detail::escape_class: return detail::make_posix_charset_xpression ( esc.class_ , this->is_upper_(*begin++) , this->traits_.flags() , this->rxtraits() ); } case token_group_begin: return this->parse_group(begin, end); case token_charset_begin: return this->parse_charset(begin, end); case token_invalid_quantifier: BOOST_THROW_EXCEPTION(regex_error(error_badrepeat, "quantifier not expected")); break; case token_quote_meta_begin: return detail::make_literal_xpression ( this->parse_quote_meta(begin, end), this->traits_.flags(), this->rxtraits() ); case token_quote_meta_end: BOOST_THROW_EXCEPTION( regex_error( error_escape , "found quote-meta end without corresponding quote-meta begin" ) ); break; case token_end_of_pattern: break; default: begin = old_begin; break; } return detail::sequence(); } /////////////////////////////////////////////////////////////////////////// // parse_quant /// INTERNAL ONLY template detail::sequence parse_quant(FwdIter &begin, FwdIter end) { BOOST_ASSERT(begin != end); detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; detail::sequence seq = this->parse_atom(begin, end); // BUGBUG this doesn't handle the degenerate (?:)+ correctly if(!seq.empty() && begin != end && detail::quant_none != seq.quant()) { if(this->traits_.get_quant_spec(begin, end, spec)) { BOOST_ASSERT(spec.min_ <= spec.max_); if(0 == spec.max_) // quant {0,0} is degenerate -- matches nothing. { seq = this->parse_quant(begin, end); } else { seq.repeat(spec); } } } return seq; } /////////////////////////////////////////////////////////////////////////// // parse_sequence /// INTERNAL ONLY template detail::sequence parse_sequence(FwdIter &begin, FwdIter end) { detail::sequence seq; while(begin != end) { detail::sequence seq_quant = this->parse_quant(begin, end); // did we find a quantified atom? if(seq_quant.empty()) break; // chain it to the end of the xpression sequence seq += seq_quant; } return seq; } /////////////////////////////////////////////////////////////////////////// // parse_literal // scan ahead looking for char literals to be globbed together into a string literal /// INTERNAL ONLY template string_type parse_literal(FwdIter &begin, FwdIter end) { using namespace regex_constants; BOOST_ASSERT(begin != end); BOOST_ASSERT(token_literal == this->traits_.get_token(begin, end)); escape_value esc = { 0, 0, 0, detail::escape_char }; string_type literal(1, *begin); for(FwdIter prev = begin, tmp = ++begin; begin != end; prev = begin, begin = tmp) { detail::quant_spec spec = { 0, 0, false, &this->hidden_mark_count_ }; if(this->traits_.get_quant_spec(tmp, end, spec)) { if(literal.size() != 1) { begin = prev; literal.erase(boost::prior(literal.end())); } return literal; } else switch(this->traits_.get_token(tmp, end)) { case token_escape: esc = this->parse_escape(tmp, end); if(detail::escape_char != esc.type_) return literal; literal.insert(literal.end(), esc.ch_); break; case token_literal: literal.insert(literal.end(), *tmp++); break; default: return literal; } } return literal; } /////////////////////////////////////////////////////////////////////////// // parse_quote_meta // scan ahead looking for char literals to be globbed together into a string literal /// INTERNAL ONLY template string_type parse_quote_meta(FwdIter &begin, FwdIter end) { using namespace regex_constants; FwdIter old_begin = begin, old_end; while(end != (old_end = begin)) { switch(this->traits_.get_token(begin, end)) { case token_quote_meta_end: return string_type(old_begin, old_end); case token_escape: BOOST_XPR_ENSURE_(begin != end, error_escape, "incomplete escape sequence"); case token_invalid_quantifier: case token_literal: ++begin; default:; } } return string_type(old_begin, begin); } /////////////////////////////////////////////////////////////////////////////// // parse_escape /// INTERNAL ONLY template escape_value parse_escape(FwdIter &begin, FwdIter end) { BOOST_XPR_ENSURE_(begin != end, regex_constants::error_escape, "incomplete escape sequence"); // first, check to see if this can be a backreference if(0 < this->rxtraits().value(*begin, 10)) { // Parse at most 3 decimal digits. FwdIter tmp = begin; int mark_nbr = detail::toi(tmp, end, this->rxtraits(), 10, 999); // If the resulting number could conceivably be a backref, then it is. if(10 > mark_nbr || mark_nbr <= static_cast(this->mark_count_)) { begin = tmp; escape_value esc = {0, mark_nbr, 0, detail::escape_mark}; return esc; } } // Not a backreference, defer to the parse_escape helper return detail::parse_escape(begin, end, this->traits_); } bool is_upper_(char_type ch) const { return 0 != this->upper_ && this->rxtraits().isctype(ch, this->upper_); } std::size_t mark_count_; std::size_t hidden_mark_count_; CompilerTraits traits_; typename RegexTraits::char_class_type upper_; shared_ptr > self_; std::map > rules_; }; }} // namespace boost::xpressive #endif