/////////////////////////////////////////////////////////////////////////////// // parse_charset.hpp // // Copyright 2008 Eric Niebler. Distributed under the Boost // Software License, Version 1.0. (See accompanying file // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005 // MS compatible compilers support #pragma once #if defined(_MSC_VER) && (_MSC_VER >= 1020) # pragma once #endif #include #include #include #include #include #include #include #include #include namespace boost { namespace xpressive { namespace detail { enum escape_type { escape_char , escape_mark , escape_class }; /////////////////////////////////////////////////////////////////////////////// // escape_value // template struct escape_value { Char ch_; int mark_nbr_; Class class_; escape_type type_; }; /////////////////////////////////////////////////////////////////////////////// // char_overflow_handler // struct char_overflow_handler { void operator ()(numeric::range_check_result result) const // throw(regex_error) { if(numeric::cInRange != result) { BOOST_THROW_EXCEPTION( regex_error( regex_constants::error_escape , "character escape too large to fit in target character type" ) ); } } }; /////////////////////////////////////////////////////////////////////////////// // parse_escape // template escape_value::type, typename CompilerTraits::regex_traits::char_class_type> parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr) { using namespace regex_constants; typedef typename iterator_value::type char_type; typedef typename CompilerTraits::regex_traits regex_traits; typedef typename regex_traits::char_class_type char_class_type; // define an unsigned type the same size as char_type typedef typename boost::uint_t::least uchar_t; BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type)); typedef numeric::conversion_traits converstion_traits; BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found"); numeric::converter converter; escape_value esc = { 0, 0, 0, escape_char }; bool const icase = (0 != (regex_constants::icase_ & tr.flags())); regex_traits const &rxtraits = tr.traits(); FwdIter tmp; esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase); if(0 != esc.class_) { esc.type_ = escape_class; return esc; } if(-1 != rxtraits.value(*begin, 8)) { esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777)); return esc; } switch(*begin) { // bell character case BOOST_XPR_CHAR_(char_type, 'a'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a'); ++begin; break; // escape character case BOOST_XPR_CHAR_(char_type, 'e'): esc.ch_ = converter(27); ++begin; break; // control character case BOOST_XPR_CHAR_(char_type, 'c'): BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); BOOST_XPR_ENSURE_ ( rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin) || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin) , error_escape , "invalid escape control letter; must be one of a-z or A-Z" ); // Convert to character according to ECMA-262, section 15.10.2.10: esc.ch_ = converter(*begin % 32); ++begin; break; // formfeed character case BOOST_XPR_CHAR_(char_type, 'f'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f'); ++begin; break; // newline case BOOST_XPR_CHAR_(char_type, 'n'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n'); ++begin; break; // return case BOOST_XPR_CHAR_(char_type, 'r'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r'); ++begin; break; // horizontal tab case BOOST_XPR_CHAR_(char_type, 't'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t'); ++begin; break; // vertical tab case BOOST_XPR_CHAR_(char_type, 'v'): esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v'); ++begin; break; // hex escape sequence case BOOST_XPR_CHAR_(char_type, 'x'): BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); tmp = begin; esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff)); BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : " "must be \\x HexDigit HexDigit"); break; // Unicode escape sequence case BOOST_XPR_CHAR_(char_type, 'u'): BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found"); tmp = begin; esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff)); BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : " "must be \\u HexDigit HexDigit HexDigit HexDigit"); break; // backslash case BOOST_XPR_CHAR_(char_type, '\\'): //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\'); //++begin; //break; // all other escaped characters represent themselves default: esc.ch_ = *begin; ++begin; break; } return esc; } ////////////////////////////////////////////////////////////////////////// // parse_charset // template inline void parse_charset ( FwdIter &begin , FwdIter end , compound_charset &chset , CompilerTraits &tr ) { using namespace regex_constants; typedef typename RegexTraits::char_type char_type; typedef typename RegexTraits::char_class_type char_class_type; BOOST_ASSERT(begin != end); RegexTraits const &rxtraits = tr.traits(); bool const icase = (0 != (regex_constants::icase_ & tr.flags())); FwdIter iprev = FwdIter(); escape_value esc = {0, 0, 0, escape_char}; bool invert = false; // check to see if we have an inverse charset if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end)) { begin = iprev; invert = true; } // skip the end token if-and-only-if it is the first token in the charset if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end)) { for(; begin != iprev; ++begin) { chset.set_char(*begin, rxtraits, icase); } } compiler_token_type tok; char_type ch_prev = char_type(), ch_next = char_type(); bool have_prev = false; BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); // remember the current position and grab the next token iprev = begin; tok = tr.get_charset_token(begin, end); do { BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); if(token_charset_hyphen == tok && have_prev) { // remember the current position FwdIter iprev2 = begin; have_prev = false; // ch_prev is lower bound of a range switch(tr.get_charset_token(begin, end)) { case token_charset_hyphen: case token_charset_invert: begin = iprev2; // un-get these tokens and fall through case token_literal: ch_next = *begin++; BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); chset.set_range(ch_prev, ch_next, rxtraits, icase); continue; case token_charset_backspace: ch_next = char_type(8); // backspace BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range"); chset.set_range(ch_prev, ch_next, rxtraits, icase); continue; case token_escape: esc = parse_escape(begin, end, tr); if(escape_char == esc.type_) { BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range"); chset.set_range(ch_prev, esc.ch_, rxtraits, icase); continue; } case token_charset_end: // fall through default: // not a range. begin = iprev; // backup to hyphen token chset.set_char(ch_prev, rxtraits, icase); chset.set_char(*begin++, rxtraits, icase); continue; } } if(have_prev) { chset.set_char(ch_prev, rxtraits, icase); have_prev = false; } switch(tok) { case token_charset_hyphen: case token_charset_invert: case token_charset_end: case token_posix_charset_end: begin = iprev; // un-get these tokens ch_prev = *begin++; have_prev = true; continue; case token_charset_backspace: ch_prev = char_type(8); // backspace have_prev = true; continue; case token_posix_charset_begin: { FwdIter tmp = begin, start = begin; bool invert = (token_charset_invert == tr.get_charset_token(tmp, end)); if(invert) { begin = start = tmp; } while(token_literal == (tok = tr.get_charset_token(begin, end))) { tmp = ++begin; BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found"); } if(token_posix_charset_end == tok) { char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase); BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name"); chset.set_class(chclass, invert); continue; } begin = iprev; // un-get this token ch_prev = *begin++; have_prev = true; } continue; case token_escape: esc = parse_escape(begin, end, tr); if(escape_char == esc.type_) { ch_prev = esc.ch_; have_prev = true; } else if(escape_class == esc.type_) { char_class_type upper_ = lookup_classname(rxtraits, "upper"); BOOST_ASSERT(0 != upper_); chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_)); } else { BOOST_ASSERT(false); } continue; default: ch_prev = *begin++; have_prev = true; continue; } } while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"), token_charset_end != (tok = tr.get_charset_token(begin, end))); if(have_prev) { chset.set_char(ch_prev, rxtraits, icase); } if(invert) { chset.inverse(); } } }}} // namespace boost::xpressive::detail #endif