| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718 |
- #
- # Secret Labs' Regular Expression Engine
- #
- # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
- #
- # This version of the SRE library can be redistributed under CNRI's
- # Python 1.6 license. For any other use, please contact Secret Labs
- # AB (info@pythonware.com).
- #
- # Portions of this engine have been developed in cooperation with
- # CNRI. Hewlett-Packard provided funding for 1.6 integration and
- # other compatibility work.
- #
- # 2010-01-16 mrab Python front-end re-written and extended
- r"""Support for regular expressions (RE).
- This module provides regular expression matching operations similar to those
- found in Perl. It supports both 8-bit and Unicode strings; both the pattern and
- the strings being processed can contain null bytes and characters outside the
- US ASCII range.
- Regular expressions can contain both special and ordinary characters. Most
- ordinary characters, like "A", "a", or "0", are the simplest regular
- expressions; they simply match themselves. You can concatenate ordinary
- characters, so last matches the string 'last'.
- There are a few differences between the old (legacy) behaviour and the new
- (enhanced) behaviour, which are indicated by VERSION0 or VERSION1.
- The special characters are:
- "." Matches any character except a newline.
- "^" Matches the start of the string.
- "$" Matches the end of the string or just before the
- newline at the end of the string.
- "*" Matches 0 or more (greedy) repetitions of the preceding
- RE. Greedy means that it will match as many repetitions
- as possible.
- "+" Matches 1 or more (greedy) repetitions of the preceding
- RE.
- "?" Matches 0 or 1 (greedy) of the preceding RE.
- *?,+?,?? Non-greedy versions of the previous three special
- characters.
- *+,++,?+ Possessive versions of the previous three special
- characters.
- {m,n} Matches from m to n repetitions of the preceding RE.
- {m,n}? Non-greedy version of the above.
- {m,n}+ Possessive version of the above.
- {...} Fuzzy matching constraints.
- "\\" Either escapes special characters or signals a special
- sequence.
- [...] Indicates a set of characters. A "^" as the first
- character indicates a complementing set.
- "|" A|B, creates an RE that will match either A or B.
- (...) Matches the RE inside the parentheses. The contents are
- captured and can be retrieved or matched later in the
- string.
- (?flags-flags) VERSION1: Sets/clears the flags for the remainder of
- the group or pattern; VERSION0: Sets the flags for the
- entire pattern.
- (?:...) Non-capturing version of regular parentheses.
- (?>...) Atomic non-capturing version of regular parentheses.
- (?flags-flags:...) Non-capturing version of regular parentheses with local
- flags.
- (?P<name>...) The substring matched by the group is accessible by
- name.
- (?<name>...) The substring matched by the group is accessible by
- name.
- (?P=name) Matches the text matched earlier by the group named
- name.
- (?#...) A comment; ignored.
- (?=...) Matches if ... matches next, but doesn't consume the
- string.
- (?!...) Matches if ... doesn't match next.
- (?<=...) Matches if preceded by ....
- (?<!...) Matches if not preceded by ....
- (?(id)yes|no) Matches yes pattern if group id matched, the (optional)
- no pattern otherwise.
- (?(DEFINE)...) If there's no group called "DEFINE", then ... will be
- ignored, but any group definitions will be available.
- (?|...|...) (?|A|B), creates an RE that will match either A or B,
- but reuses capture group numbers across the
- alternatives.
- (*FAIL) Forces matching to fail, which means immediate
- backtracking.
- (*F) Abbreviation for (*FAIL).
- (*PRUNE) Discards the current backtracking information. Its
- effect doesn't extend outside an atomic group or a
- lookaround.
- (*SKIP) Similar to (*PRUNE), except that it also sets where in
- the text the next attempt at matching the entire
- pattern will start. Its effect doesn't extend outside
- an atomic group or a lookaround.
- The fuzzy matching constraints are: "i" to permit insertions, "d" to permit
- deletions, "s" to permit substitutions, "e" to permit any of these. Limits are
- optional with "<=" and "<". If any type of error is provided then any type not
- provided is not permitted.
- A cost equation may be provided.
- Examples:
- (?:fuzzy){i<=2}
- (?:fuzzy){i<=1,s<=2,d<=1,1i+1s+1d<3}
- VERSION1: Set operators are supported, and a set can include nested sets. The
- set operators, in order of increasing precedence, are:
- || Set union ("x||y" means "x or y").
- ~~ (double tilde) Symmetric set difference ("x~~y" means "x or y, but not
- both").
- && Set intersection ("x&&y" means "x and y").
- -- (double dash) Set difference ("x--y" means "x but not y").
- Implicit union, ie, simple juxtaposition like in [ab], has the highest
- precedence.
- VERSION0 and VERSION1:
- The special sequences consist of "\\" and a character from the list below. If
- the ordinary character is not on the list, then the resulting RE will match the
- second character.
- \number Matches the contents of the group of the same number if
- number is no more than 2 digits, otherwise the character
- with the 3-digit octal code.
- \a Matches the bell character.
- \A Matches only at the start of the string.
- \b Matches the empty string, but only at the start or end of a
- word.
- \B Matches the empty string, but not at the start or end of a
- word.
- \d Matches any decimal digit; equivalent to the set [0-9] when
- matching a bytestring or a Unicode string with the ASCII
- flag, or the whole range of Unicode digits when matching a
- Unicode string.
- \D Matches any non-digit character; equivalent to [^\d].
- \f Matches the formfeed character.
- \g<name> Matches the text matched by the group named name.
- \G Matches the empty string, but only at the position where
- the search started.
- \h Matches horizontal whitespace.
- \K Keeps only what follows for the entire match.
- \L<name> Named list. The list is provided as a keyword argument.
- \m Matches the empty string, but only at the start of a word.
- \M Matches the empty string, but only at the end of a word.
- \n Matches the newline character.
- \N{name} Matches the named character.
- \p{name=value} Matches the character if its property has the specified
- value.
- \P{name=value} Matches the character if its property doesn't have the
- specified value.
- \r Matches the carriage-return character.
- \s Matches any whitespace character; equivalent to
- [ \t\n\r\f\v].
- \S Matches any non-whitespace character; equivalent to [^\s].
- \t Matches the tab character.
- \uXXXX Matches the Unicode codepoint with 4-digit hex code XXXX.
- \UXXXXXXXX Matches the Unicode codepoint with 8-digit hex code
- XXXXXXXX.
- \v Matches the vertical tab character.
- \w Matches any alphanumeric character; equivalent to
- [a-zA-Z0-9_] when matching a bytestring or a Unicode string
- with the ASCII flag, or the whole range of Unicode
- alphanumeric characters (letters plus digits plus
- underscore) when matching a Unicode string. With LOCALE, it
- will match the set [0-9_] plus characters defined as
- letters for the current locale.
- \W Matches the complement of \w; equivalent to [^\w].
- \xXX Matches the character with 2-digit hex code XX.
- \X Matches a grapheme.
- \Z Matches only at the end of the string.
- \\ Matches a literal backslash.
- This module exports the following functions:
- match Match a regular expression pattern at the beginning of a string.
- fullmatch Match a regular expression pattern against all of a string.
- search Search a string for the presence of a pattern.
- sub Substitute occurrences of a pattern found in a string using a
- template string.
- subf Substitute occurrences of a pattern found in a string using a
- format string.
- subn Same as sub, but also return the number of substitutions made.
- subfn Same as subf, but also return the number of substitutions made.
- split Split a string by the occurrences of a pattern. VERSION1: will
- split at zero-width match; VERSION0: won't split at zero-width
- match.
- splititer Return an iterator yielding the parts of a split string.
- findall Find all occurrences of a pattern in a string.
- finditer Return an iterator yielding a match object for each match.
- compile Compile a pattern into a Pattern object.
- purge Clear the regular expression cache.
- escape Backslash all non-alphanumerics or special characters in a
- string.
- Most of the functions support a concurrent parameter: if True, the GIL will be
- released during matching, allowing other Python threads to run concurrently. If
- the string changes during matching, the behaviour is undefined. This parameter
- is not needed when working on the builtin (immutable) string classes.
- Some of the functions in this module take flags as optional parameters. Most of
- these flags can also be set within an RE:
- A a ASCII Make \w, \W, \b, \B, \d, and \D match the
- corresponding ASCII character categories. Default
- when matching a bytestring.
- B b BESTMATCH Find the best fuzzy match (default is first).
- D DEBUG Print the parsed pattern.
- E e ENHANCEMATCH Attempt to improve the fit after finding the first
- fuzzy match.
- F f FULLCASE Use full case-folding when performing
- case-insensitive matching in Unicode.
- I i IGNORECASE Perform case-insensitive matching.
- L L LOCALE Make \w, \W, \b, \B, \d, and \D dependent on the
- current locale. (One byte per character only.)
- M m MULTILINE "^" matches the beginning of lines (after a newline)
- as well as the string. "$" matches the end of lines
- (before a newline) as well as the end of the string.
- P p POSIX Perform POSIX-standard matching (leftmost longest).
- R r REVERSE Searches backwards.
- S s DOTALL "." matches any character at all, including the
- newline.
- U u UNICODE Make \w, \W, \b, \B, \d, and \D dependent on the
- Unicode locale. Default when matching a Unicode
- string.
- V0 V0 VERSION0 Turn on the old legacy behaviour.
- V1 V1 VERSION1 Turn on the new enhanced behaviour. This flag
- includes the FULLCASE flag.
- W w WORD Make \b and \B work with default Unicode word breaks
- and make ".", "^" and "$" work with Unicode line
- breaks.
- X x VERBOSE Ignore whitespace and comments for nicer looking REs.
- This module also defines an exception 'error'.
- """
# Names made available by a star-import of this module. The order is kept
# exactly as before; only the layout has changed.
__all__ = [
    # Functions, the default-version setting and the Scanner name.
    "compile", "DEFAULT_VERSION", "escape", "findall", "finditer",
    "fullmatch", "match", "purge", "search", "split", "splititer",
    "sub", "subf", "subfn", "subn", "template", "Scanner",
    # Flags, listed as short-name/long-name pairs.
    "A", "ASCII",
    "B", "BESTMATCH",
    "D", "DEBUG",
    "E", "ENHANCEMATCH",
    "S", "DOTALL",
    "F", "FULLCASE",
    "I", "IGNORECASE",
    "L", "LOCALE",
    "M", "MULTILINE",
    "P", "POSIX",
    "R", "REVERSE",
    "T", "TEMPLATE",
    "U", "UNICODE",
    "V0", "VERSION0",
    "V1", "VERSION1",
    "X", "VERBOSE",
    "W", "WORD",
    # The exception, the 'compile' alias and the module metadata.
    "error", "Regex", "__version__", "__doc__",
]

__version__ = "2.5.83"
- # --------------------------------------------------------------------
- # Public interface.
def match(pattern, string, flags=0, pos=None, endpos=None, partial=False,
          concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Apply the pattern at the start of the string.

    The pattern is compiled (or fetched from the cache) with the given flags;
    extra keyword arguments are handed to the compiler as named lists.
    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.match(string, pos, endpos, concurrent, partial, timeout)
def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False,
              concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Apply the pattern against all of the string.

    The pattern is compiled (or fetched from the cache) with the given flags;
    extra keyword arguments are handed to the compiler as named lists.
    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.fullmatch(string, pos, endpos, concurrent, partial,
                              timeout)
def search(pattern, string, flags=0, pos=None, endpos=None, partial=False,
           concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Search through the string for a match to the pattern.

    The pattern is compiled (or fetched from the cache) with the given flags;
    extra keyword arguments are handed to the compiler as named lists.
    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.search(string, pos, endpos, concurrent, partial, timeout)
def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
        concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Replace occurrences of the pattern in string with repl.

    The leftmost (or, for a reverse pattern, rightmost) non-overlapping
    occurrences are replaced and the resulting string is returned. repl is
    either a string, in which case its backslash escapes are processed, or a
    callable that receives the match object and returns the replacement
    string.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.sub(repl, string, count, pos, endpos, concurrent, timeout)
def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
         concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Replace occurrences of the pattern in string using a format.

    The leftmost (or, for a reverse pattern, rightmost) non-overlapping
    occurrences are replaced and the resulting string is returned. format is
    either a string, which is treated as a format string, or a callable that
    receives the match object and returns the replacement string.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.subf(format, string, count, pos, endpos, concurrent,
                         timeout)
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
         concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Like sub, but return a 2-tuple (new_string, number).

    new_string is the source string with the leftmost (or, for a reverse
    pattern, rightmost) non-overlapping occurrences of the pattern replaced
    by repl; number is how many substitutions were made. repl is either a
    string, in which case its backslash escapes are processed, or a callable
    that receives the match object and returns the replacement string.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.subn(repl, string, count, pos, endpos, concurrent, timeout)
def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
          concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Like subf, but return a 2-tuple (new_string, number).

    new_string is the source string with the leftmost (or, for a reverse
    pattern, rightmost) non-overlapping occurrences of the pattern replaced
    using format; number is how many substitutions were made. format is
    either a string, which is treated as a format string, or a callable that
    receives the match object and returns the replacement string.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.subfn(format, string, count, pos, endpos, concurrent,
                          timeout)
def split(pattern, string, maxsplit=0, flags=0, concurrent=None, timeout=None,
          ignore_unused=False, **kwargs):
    """Split the source string at the occurrences of the pattern.

    Returns a list of the resulting substrings. When the pattern contains
    capturing parentheses, the text of all of its groups is included in the
    list as well. A nonzero maxsplit limits the number of splits; whatever is
    left of the string then becomes the final element of the list.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.split(string, maxsplit, concurrent, timeout)
def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
              timeout=None, ignore_unused=False, **kwargs):
    """Return an iterator yielding the parts of a split string."""
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.splititer(string, maxsplit, concurrent, timeout)
def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
            concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return a list of all matches in the string.

    Matches may overlap if overlapped is True. If the pattern has one or more
    groups, a list of groups is returned instead; with more than one group
    that is a list of tuples. Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.findall(string, pos, endpos, overlapped, concurrent,
                            timeout)
def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
             partial=False, concurrent=None, timeout=None, ignore_unused=False,
             **kwargs):
    """Return an iterator over all matches in the string.

    Matches may overlap if overlapped is True. The iterator yields a match
    object for each match. Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled.finditer(string, pos, endpos, overlapped, concurrent,
                             partial, timeout)
def compile(pattern, flags=0, ignore_unused=False, **kwargs):
    """Compile a regular expression pattern and return the pattern object.

    Extra keyword arguments are handed to the compiler as named lists.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs)
    return compiled
def purge():
    """Clear the regular expression cache.

    Empties both the compiled-pattern cache and the record of which patterns
    are locale-sensitive.
    """
    for store in (_cache, _locale_sensitive):
        store.clear()
def template(pattern, flags=0):
    """Compile a template pattern and return the pattern object.

    The TEMPLATE flag is added to the supplied flags; no named lists are
    passed to the compiler.
    """
    template_flags = flags | TEMPLATE
    return _compile(pattern, template_flags, False, {})
def escape(pattern, special_only=True, literal_spaces=False):
    """Escape a string so that it can be used as a literal in a pattern.

    When special_only is True only the special characters (and whitespace)
    get a backslash; otherwise every non-alphanumeric character does. A NUL
    character is written as the escape "\\000". When literal_spaces is True,
    plain spaces are left unescaped. A bytes pattern gives a bytes result.
    """
    as_bytes = isinstance(pattern, bytes)

    # Do the work on text: latin-1 maps every byte to the code point with the
    # same value, so the round trip back to bytes is lossless.
    text = pattern.decode("latin-1") if as_bytes else pattern

    def escaped(ch):
        # Return the (possibly escaped) form of a single character.
        if ch == " " and literal_spaces:
            return ch

        if special_only:
            if ch in _METACHARS or ch.isspace():
                return "\\" + ch
            return "\\000" if ch == "\x00" else ch

        if ch in _ALNUM:
            return ch
        return "\\000" if ch == "\x00" else "\\" + ch

    result = "".join(map(escaped, text))

    return result.encode("latin-1") if as_bytes else result
- # --------------------------------------------------------------------
- # Internals.
- import regex._regex_core as _regex_core
- import regex._regex as _regex
- from threading import RLock as _RLock
- from locale import getpreferredencoding as _getpreferredencoding
- from regex._regex_core import *
- from regex._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError,
- _UnscopedFlagSet, _check_group_features, _compile_firstset,
- _compile_replacement, _flatten_code, _fold_case, _get_required_string,
- _parse_pattern, _shrink_cache)
- from regex._regex_core import (ALNUM as _ALNUM, Info as _Info, OP as _OP, Source
- as _Source, Fuzzy as _Fuzzy)
# VERSION0 selects the old behaviour, which is compatible with the original
# 're' module; VERSION1 selects the new behaviour, which differs slightly.
# The core module is given the same default.
DEFAULT_VERSION = VERSION0
_regex_core.DEFAULT_VERSION = DEFAULT_VERSION

# Characters that 'escape' treats as special.
_METACHARS = frozenset("()[]{}?*+|^$\\.-#&~")

# Caches for the patterns and replacements, plus the lock taken while the
# pattern cache is being shrunk.
_cache_lock = _RLock()
_cache = dict()
_named_args = dict()
_replacement_cache = dict()
_locale_sensitive = dict()

# Maximum sizes of the pattern cache and of the replacement cache.
_MAXCACHE = _MAXREPCACHE = 500
def _compile(pattern, flags, ignore_unused, kwargs):
    """Compiles a regular expression to a PatternObject.

    Unless the DEBUG flag is set, the result is looked up in, and afterwards
    stored in, the module-level pattern cache.

    Parameters:
        pattern: a str or bytes pattern, or an already-compiled Pattern
            (which is returned unchanged).
        flags: the flags passed by the caller.
        ignore_unused: if false, a keyword argument that the pattern doesn't
            use as a named list causes ValueError.
        kwargs: a dict of the keyword arguments (named lists) supplied.

    Returns:
        The compiled pattern object.

    Raises:
        error: the pattern can't be parsed, has an unbalanced parenthesis, or
            a named list known to be needed wasn't supplied.
        ValueError: flags were given with a compiled pattern, the flags
            conflict, or there's an unused keyword argument.
        TypeError: pattern is neither a string nor a compiled pattern.
    """

    # Refresh this module's DEFAULT_VERSION from the 'regex' package when it
    # can be imported from there (presumably so that a value changed by the
    # user is picked up); otherwise keep the current one.
    global DEFAULT_VERSION
    try:
        from regex import DEFAULT_VERSION
    except ImportError:
        pass

    # We won't bother to cache the pattern if we're debugging.
    debugging = (flags & DEBUG) != 0

    # What locale is this pattern using?
    # A pattern that hasn't been seen before defaults to "might be".
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getpreferredencoding()
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    if not debugging:
        try:
            # Do we know what keyword arguments are needed?
            # (args_key is used again when the result is stored below.)
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, v in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        raise error("missing named list: {!r}".format(k))

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
                           DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, Pattern):
        # Already compiled: hand it back, but it can't take new flags.
        if flags:
            raise ValueError("cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    global_flags = flags

    # Parse the pattern, starting again with the updated global flags each
    # time the parser raises _UnscopedFlagSet.
    while True:
        caught_exception = None
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # The error is re-created and raised outside the 'except' clause
        # (presumably to keep the traceback short).
        if caught_exception:
            raise error(caught_exception.msg, caught_exception.pattern,
                        caught_exception.pos)

    # The parser stopped before the end of the pattern.
    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Check the global flags for conflicts.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No encoding flag was given, so pick one from the type of the pattern.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references.
    caught_exception = None
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    # As above, the error is re-created and raised outside the 'except'.
    if caught_exception:
        raise error(caught_exception.msg, caught_exception.pattern,
                    caught_exception.pos)

    # Should we print the parsed pattern?
    if flags & DEBUG:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info, reverse)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags)

    # Build the named lists.
    # named_lists keeps the values as supplied, by name; named_list_indexes
    # holds, by index, the values that are case-folded when the reference has
    # case flags.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for key, index in info.named_lists_used.items():
        name, case_flags = key
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    # Any unused keyword arguments, possibly resulting from a typo?
    unused_kwargs = set(kwargs) - set(named_lists)
    if unused_kwargs and not ignore_unused:
        any_one = next(iter(unused_kwargs))
        raise ValueError('unused keyword argument {!a}'.format(any_one))

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # Is there a group call to the pattern as a whole?
    key = (0, reverse, fuzzy)
    ref = info.call_refs.get(key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Get the first set, if possible.
        # When there is one, its code is put in front of the pattern's code.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            pass

    # The named capture groups.
    # This is info.group_index with its keys and values swapped.
    index_group = dict((v, n) for n, v in info.group_index.items())

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
                                      info.group_index, index_group,
                                      named_lists, named_list_indexes,
                                      req_offset, req_chars, req_flags,
                                      info.group_count)

    # Do we need to reduce the size of the cache?
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if not debugging:
        # Only a pattern with the LOCALE flag is keyed on the locale.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
                       DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern
def _compile_replacement_helper(pattern, template):
    """Compiles a replacement template.

    This function is called by the _regex module. The result is a list in
    which runs of literal text (str or bytes, matching the type of the
    template) alternate with the group references produced by
    '_compile_replacement'. Results are cached per pattern and template.
    """
    # Have we seen this before?
    cache_key = (pattern.pattern, pattern.flags, template)
    cached = _replacement_cache.get(cache_key)
    if cached is not None:
        return cached

    # The replacement cache is simply emptied when it becomes full.
    if len(_replacement_cache) >= _MAXREPCACHE:
        _replacement_cache.clear()

    is_unicode = isinstance(template, str)
    source = _Source(template)

    def make_string(char_codes):
        # Build a literal of the same type as the template.
        if is_unicode:
            return "".join(map(chr, char_codes))
        return bytes(char_codes)

    compiled = []
    pending = []

    while True:
        ch = source.get()
        if not ch:
            break

        if ch != "\\":
            # An ordinary character: add its code to the current literal.
            pending.append(ord(ch))
            continue

        # '_compile_replacement' will return either an int group reference
        # or a string literal. It returns items (plural) in order to handle
        # a 2-character literal (an invalid escape sequence).
        is_group, items = _compile_replacement(source, pattern, is_unicode)
        if not is_group:
            pending.extend(items)
            continue

        # It's a group, so first flush the literal.
        if pending:
            compiled.append(make_string(pending))
            pending = []
        compiled.extend(items)

    # Flush whatever literal remains.
    if pending:
        compiled.append(make_string(pending))

    _replacement_cache[cache_key] = compiled

    return compiled
# Pattern and Match are defined here, after all the support objects have been
# defined, by taking the types of an empty pattern and of its match against
# the empty string.
_empty_pattern = _compile('', 0, False, {})
Pattern = type(_empty_pattern)
Match = type(_empty_pattern.match(''))
del _empty_pattern

# An alias for the 'compile' function, so that the repr of a pattern object is
# eval-able.
Regex = compile
- # Register myself for pickling.
- import copyreg as _copy_reg
def _pickle(pattern):
    """Return the (callable, arguments) pair used to pickle a pattern."""
    rebuild = _regex.compile
    state = pattern._pickled_data
    return rebuild, state


# Use '_pickle' as the reduction function for Pattern objects.
_copy_reg.pickle(Pattern, _pickle)
|