/* * * Copyright (c) 1998-2002 * John Maddock * * Use, modification and distribution are subject to the * Boost Software License, Version 1.0. (See accompanying file * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) * */ /* * LOCATION: see http://www.boost.org for most recent version. * FILE states.cpp * VERSION see * DESCRIPTION: Declares internal state machine structures. */ #ifndef BOOST_REGEX_V4_STATES_HPP #define BOOST_REGEX_V4_STATES_HPP #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable: 4103) #endif #ifdef BOOST_HAS_ABI_HEADERS # include BOOST_ABI_PREFIX #endif #ifdef BOOST_MSVC #pragma warning(pop) #endif namespace boost{ namespace re_detail{ /*** mask_type ******************************************************* Whenever we have a choice of two alternatives, we use an array of bytes to indicate which of the two alternatives it is possible to take for any given input character. If mask_take is set, then we can take the next state, and if mask_skip is set then we can take the alternative. ***********************************************************************/ enum mask_type { mask_take = 1, mask_skip = 2, mask_init = 4, mask_any = mask_skip | mask_take, mask_all = mask_any }; /*** helpers ********************************************************** These helpers let us use function overload resolution to detect whether we have narrow or wide character strings: ***********************************************************************/ struct _narrow_type{}; struct _wide_type{}; template struct is_byte; template<> struct is_byte { typedef _narrow_type width_type; }; template<> struct is_byte{ typedef _narrow_type width_type; }; template<> struct is_byte { typedef _narrow_type width_type; }; template struct is_byte { typedef _wide_type width_type; }; /*** enum syntax_element_type ****************************************** Every record in the state machine falls into one of the following types: ***********************************************************************/ enum syntax_element_type { // start of a marked sub-expression, or perl-style (?...) extension syntax_element_startmark = 0, // end of a marked sub-expression, or perl-style (?...) extension syntax_element_endmark = syntax_element_startmark + 1, // any sequence of literal characters syntax_element_literal = syntax_element_endmark + 1, // start of line assertion: ^ syntax_element_start_line = syntax_element_literal + 1, // end of line assertion $ syntax_element_end_line = syntax_element_start_line + 1, // match any character: . syntax_element_wild = syntax_element_end_line + 1, // end of expression: we have a match when we get here syntax_element_match = syntax_element_wild + 1, // perl style word boundary: \b syntax_element_word_boundary = syntax_element_match + 1, // perl style within word boundary: \B syntax_element_within_word = syntax_element_word_boundary + 1, // start of word assertion: \< syntax_element_word_start = syntax_element_within_word + 1, // end of word assertion: \> syntax_element_word_end = syntax_element_word_start + 1, // start of buffer assertion: \` syntax_element_buffer_start = syntax_element_word_end + 1, // end of buffer assertion: \' syntax_element_buffer_end = syntax_element_buffer_start + 1, // backreference to previously matched sub-expression syntax_element_backref = syntax_element_buffer_end + 1, // either a wide character set [..] or one with multicharacter collating elements: syntax_element_long_set = syntax_element_backref + 1, // narrow character set: [...] syntax_element_set = syntax_element_long_set + 1, // jump to a new state in the machine: syntax_element_jump = syntax_element_set + 1, // choose between two production states: syntax_element_alt = syntax_element_jump + 1, // a repeat syntax_element_rep = syntax_element_alt + 1, // match a combining character sequence syntax_element_combining = syntax_element_rep + 1, // perl style soft buffer end: \z syntax_element_soft_buffer_end = syntax_element_combining + 1, // perl style continuation: \G syntax_element_restart_continue = syntax_element_soft_buffer_end + 1, // single character repeats: syntax_element_dot_rep = syntax_element_restart_continue + 1, syntax_element_char_rep = syntax_element_dot_rep + 1, syntax_element_short_set_rep = syntax_element_char_rep + 1, syntax_element_long_set_rep = syntax_element_short_set_rep + 1, // a backstep for lookbehind repeats: syntax_element_backstep = syntax_element_long_set_rep + 1, // an assertion that a mark was matched: syntax_element_assert_backref = syntax_element_backstep + 1, syntax_element_toggle_case = syntax_element_assert_backref + 1 }; #ifdef BOOST_REGEX_DEBUG // dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion std::ostream& operator<<(std::ostream&, syntax_element_type); #endif struct re_syntax_base; /*** union offset_type ************************************************ Points to another state in the machine. During machine construction we use integral offsets, but these are converted to pointers before execution of the machine. ***********************************************************************/ union offset_type { re_syntax_base* p; std::ptrdiff_t i; }; /*** struct re_syntax_base ******************************************** Base class for all states in the machine. ***********************************************************************/ struct re_syntax_base { syntax_element_type type; // what kind of state this is offset_type next; // next state in the machine }; /*** struct re_brace ************************************************** A marked parenthesis. ***********************************************************************/ struct re_brace : public re_syntax_base { // The index to match, can be zero (don't mark the sub-expression) // or negative (for perl style (?...) extentions): int index; }; /*** struct re_dot ************************************************** Match anything. ***********************************************************************/ enum { dont_care = 1, force_not_newline = 0, force_newline = 2, test_not_newline = 2, test_newline = 3 }; struct re_dot : public re_syntax_base { unsigned char mask; }; /*** struct re_literal ************************************************ A string of literals, following this structure will be an array of characters: charT[length] ***********************************************************************/ struct re_literal : public re_syntax_base { unsigned int length; }; /*** struct re_case ************************************************ Indicates whether we are moving to a case insensive block or not ***********************************************************************/ struct re_case : public re_syntax_base { bool icase; }; /*** struct re_set_long *********************************************** A wide character set of characters, following this structure will be an array of type charT: First csingles null-terminated strings Then 2 * cranges NULL terminated strings Then cequivalents NULL terminated strings ***********************************************************************/ template struct re_set_long : public re_syntax_base { unsigned int csingles, cranges, cequivalents; mask_type cclasses; mask_type cnclasses; bool isnot; bool singleton; }; /*** struct re_set **************************************************** A set of narrow-characters, matches any of _map which is none-zero ***********************************************************************/ struct re_set : public re_syntax_base { unsigned char _map[1 << CHAR_BIT]; }; /*** struct re_jump *************************************************** Jump to a new location in the machine (not next). ***********************************************************************/ struct re_jump : public re_syntax_base { offset_type alt; // location to jump to }; /*** struct re_alt *************************************************** Jump to a new location in the machine (possibly next). ***********************************************************************/ struct re_alt : public re_jump { unsigned char _map[1 << CHAR_BIT]; // which characters can take the jump unsigned int can_be_null; // true if we match a NULL string }; /*** struct re_repeat ************************************************* Repeat a section of the machine ***********************************************************************/ struct re_repeat : public re_alt { std::size_t min, max; // min and max allowable repeats int id; // Unique identifier for this repeat bool leading; // True if this repeat is at the start of the machine (lets us optimize some searches) bool greedy; // True if this is a greedy repeat }; /*** enum re_jump_size_type ******************************************* Provides compiled size of re_jump structure (allowing for trailing alignment). We provide this so we know how manybytes to insert when constructing the machine (The value of padding_mask is defined in regex_raw_buffer.hpp). ***********************************************************************/ enum re_jump_size_type { re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) }; /*** proc re_is_set_member ********************************************* Forward declaration: we'll need this one later... ***********************************************************************/ template struct regex_data; template iterator BOOST_REGEX_CALL re_is_set_member(iterator next, iterator last, const re_set_long* set_, const regex_data& e, bool icase); } // namespace re_detail } // namespace boost #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable: 4103) #endif #ifdef BOOST_HAS_ABI_HEADERS # include BOOST_ABI_SUFFIX #endif #ifdef BOOST_MSVC #pragma warning(pop) #endif #endif