From 262941d9ded2322b033e2593418c5082916b39f7 Mon Sep 17 00:00:00 2001 From: Shen-Ta Hsieh Date: Wed, 19 Mar 2025 12:03:58 +0800 Subject: [PATCH] Update sre_parser from cpython 3.13.2 source --- gixy/core/regexp.py | 188 +++-- gixy/core/sre_parse/_constants.py | 230 ++++++ gixy/core/sre_parse/_parser.py | 1082 ++++++++++++++++++++++++++ gixy/core/sre_parse/sre_constants.py | 226 ------ gixy/core/sre_parse/sre_parse.py | 827 -------------------- tests/core/test_regexp.py | 1 + 6 files changed, 1423 insertions(+), 1131 deletions(-) create mode 100644 gixy/core/sre_parse/_constants.py create mode 100644 gixy/core/sre_parse/_parser.py delete mode 100644 gixy/core/sre_parse/sre_constants.py delete mode 100644 gixy/core/sre_parse/sre_parse.py diff --git a/gixy/core/regexp.py b/gixy/core/regexp.py index ab180ea..a2df6fe 100644 --- a/gixy/core/regexp.py +++ b/gixy/core/regexp.py @@ -4,7 +4,7 @@ import random import itertools from cached_property import cached_property -import gixy.core.sre_parse.sre_parse as sre_parse +from .sre_parse import _parser LOG = logging.getLogger(__name__) @@ -22,28 +22,28 @@ FIX_NAMED_GROUPS_RE = re.compile(r"(?|')") CATEGORIES = { # TODO(buglloc): unicode? - sre_parse.CATEGORY_SPACE: sre_parse.WHITESPACE, - sre_parse.CATEGORY_NOT_SPACE: _build_reverse_list(sre_parse.WHITESPACE), - sre_parse.CATEGORY_DIGIT: sre_parse.DIGITS, - sre_parse.CATEGORY_NOT_DIGIT: _build_reverse_list(sre_parse.DIGITS), - sre_parse.CATEGORY_WORD: frozenset('abcdefghijklmnopqrstuvwxyz' - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - '0123456789_'), - sre_parse.CATEGORY_NOT_WORD: _build_reverse_list(frozenset('abcdefghijklmnopqrstuvwxyz' - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' - '0123456789_')), - sre_parse.CATEGORY_LINEBREAK: frozenset('\n'), - sre_parse.CATEGORY_NOT_LINEBREAK: _build_reverse_list(frozenset('\n')), + _parser.CATEGORY_SPACE: _parser.WHITESPACE, + _parser.CATEGORY_NOT_SPACE: _build_reverse_list(_parser.WHITESPACE), + _parser.CATEGORY_DIGIT: _parser.DIGITS, + _parser.CATEGORY_NOT_DIGIT: _build_reverse_list(_parser.DIGITS), + _parser.CATEGORY_WORD: frozenset('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789_'), + _parser.CATEGORY_NOT_WORD: _build_reverse_list(frozenset('abcdefghijklmnopqrstuvwxyz' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '0123456789_')), + _parser.CATEGORY_LINEBREAK: frozenset('\n'), + _parser.CATEGORY_NOT_LINEBREAK: _build_reverse_list(frozenset('\n')), 'ANY': [chr(x) for x in range(1, 127) if x != 10] } CATEGORIES_NAMES = { - sre_parse.CATEGORY_DIGIT: r'\d', - sre_parse.CATEGORY_NOT_DIGIT: r'\D', - sre_parse.CATEGORY_SPACE: r'\s', - sre_parse.CATEGORY_NOT_SPACE: r'\S', - sre_parse.CATEGORY_WORD: r'\w', - sre_parse.CATEGORY_NOT_WORD: r'\W', + _parser.CATEGORY_DIGIT: r'\d', + _parser.CATEGORY_NOT_DIGIT: r'\D', + _parser.CATEGORY_SPACE: r'\s', + _parser.CATEGORY_NOT_SPACE: r'\S', + _parser.CATEGORY_WORD: r'\w', + _parser.CATEGORY_NOT_WORD: r'\W', } @@ -55,22 +55,22 @@ def extract_groups(parsed, top=True): if not token: # Skip empty tokens pass - elif token[0] == sre_parse.SUBPATTERN: - if isinstance(token[1][0], int): - # Captured group index can't be a string. E.g. 
for pattern "(?:la)" group name is "None" - result[token[1][0]] = token[1][1] - result.update(extract_groups(token[1][1], False)) - elif token[0] == sre_parse.MIN_REPEAT: + elif token[0] == _parser.SUBPATTERN: + # (group, add_flags, del_flags, pattern) + if token[1][0] is not None: + result[token[1][0]] = token[1][3] + result.update(extract_groups(token[1][3], False)) + elif token[0] == _parser.MIN_REPEAT: result.update(extract_groups(token[1][2], False)) - elif token[0] == sre_parse.MAX_REPEAT: + elif token[0] == _parser.MAX_REPEAT: result.update(extract_groups(token[1][2], False)) - elif token[0] == sre_parse.BRANCH: + elif token[0] == _parser.BRANCH: result.update(extract_groups(token[1][1], False)) - elif token[0] == sre_parse.SUBPATTERN: - result.update(extract_groups(token[1][1], False)) - elif token[0] == sre_parse.IN: + elif token[0] == _parser.IN: result.update(extract_groups(token[1], False)) - elif isinstance(token, sre_parse.SubPattern): + elif token[0] == _parser.ATOMIC_GROUP: + result.update(extract_groups(token[1], False)) + elif isinstance(token, _parser.SubPattern): result.update(extract_groups(token, False)) return result @@ -155,7 +155,7 @@ class Token(object): class AnyToken(Token): - type = sre_parse.ANY + type = _parser.ANY def can_contain(self, char, skip_literal=True): return char in CATEGORIES['ANY'] @@ -174,7 +174,7 @@ class AnyToken(Token): class LiteralToken(Token): - type = sre_parse.LITERAL + type = _parser.LITERAL def _parse(self): self.char = chr(self.token[1]) @@ -195,7 +195,7 @@ class LiteralToken(Token): class NotLiteralToken(Token): - type = sre_parse.NOT_LITERAL + type = _parser.NOT_LITERAL def _parse(self): self.char = chr(self.token[1]) @@ -219,7 +219,7 @@ class NotLiteralToken(Token): class RangeToken(Token): - type = sre_parse.RANGE + type = _parser.RANGE def _parse(self): self.left_code = self.token[1][0] @@ -244,7 +244,7 @@ class RangeToken(Token): class CategoryToken(Token): - type = sre_parse.CATEGORY + type = _parser.CATEGORY def _parse(self): self.char_list = CATEGORIES.get(self.token[1], ['']) @@ -267,7 +267,7 @@ class CategoryToken(Token): class MinRepeatToken(Token): - type = sre_parse.MIN_REPEAT + type = _parser.MIN_REPEAT def _parse(self): self._parse_childs(self.token[1][2]) @@ -336,15 +336,15 @@ class MinRepeatToken(Token): return '{childs}{{{count}}}?'.format(childs=childs, count=self.min) if self.min == 0 and self.max == 1: return '{childs}?'.format(childs=childs) - if self.min == 0 and self.max == sre_parse.MAXREPEAT: + if self.min == 0 and self.max == _parser.MAXREPEAT: return '{childs}*?'.format(childs=childs) - if self.min == 1 and self.max == sre_parse.MAXREPEAT: + if self.min == 1 and self.max == _parser.MAXREPEAT: return '{childs}+?'.format(childs=childs) return '{childs}{{{min},{max}}}?'.format(childs=childs, min=self.min, max=self.max) class MaxRepeatToken(Token): - type = sre_parse.MAX_REPEAT + type = _parser.MAX_REPEAT def _parse(self): self._parse_childs(self.token[1][2]) @@ -413,22 +413,22 @@ class MaxRepeatToken(Token): return '{childs}{{{count}}}'.format(childs=childs, count=self.min) if self.min == 0 and self.max == 1: return '{childs}?'.format(childs=childs) - if self.min == 0 and self.max == sre_parse.MAXREPEAT: + if self.min == 0 and self.max == _parser.MAXREPEAT: return '{childs}*'.format(childs=childs) - if self.min == 1 and self.max == sre_parse.MAXREPEAT: + if self.min == 1 and self.max == _parser.MAXREPEAT: return '{childs}+'.format(childs=childs) return '{childs}{{{min},{max}}}'.format(childs=childs, 
min=self.min, max=self.max) class BranchToken(Token): - type = sre_parse.BRANCH + type = _parser.BRANCH def _parse(self): self.childs = [] for token in self.token[1][1]: if not token: self.childs.append(EmptyToken(token=token, parent=self.parent, regexp=self.regexp)) - elif isinstance(token, sre_parse.SubPattern): + elif isinstance(token, _parser.SubPattern): self.childs.append(InternalSubpatternToken(token=token, parent=self.parent, regexp=self.regexp)) else: raise RuntimeError('Unexpected token {0} in branch'.format(token)) @@ -464,13 +464,13 @@ class BranchToken(Token): class SubpatternToken(Token): - type = sre_parse.SUBPATTERN + type = _parser.SUBPATTERN def _parse(self): - self._parse_childs(self.token[1][1]) + # (group, add_flags, del_flags, pattern) + self._parse_childs(self.token[1][3]) self.group = self.token[1][0] - if isinstance(self.group, int): - # Captured group index can't be a string. E.g. for pattern "(?:la)" group name is "None" + if self.group is not None: self._reg_group(self.group) def can_contain(self, char, skip_literal=True): @@ -540,7 +540,7 @@ class SubpatternToken(Token): class InternalSubpatternToken(Token): - type = sre_parse.SUBPATTERN + type = _parser.SUBPATTERN def _parse(self): self._parse_childs(self.token) @@ -609,8 +609,41 @@ class InternalSubpatternToken(Token): return ''.join(str(x) for x in self.childs) +class AtomicGroupToken(Token): + type = _parser.ATOMIC_GROUP + + def _parse(self): + self._parse_childs(self.token[1]) + + def can_contain(self, char, skip_literal=True): + for child in self.childs: + if child.can_contain(char, skip_literal=skip_literal): + return True + return False + + def must_contain(self, char): + return False + + def can_startswith(self, char, strict=False): + return any(x.can_startswith(char, strict) for x in self.childs) + + def must_startswith(self, char, strict=False): + return False + + def generate(self, context): + res = [] + for child in self.childs: + res.append(child.generate(context)) + + return _gen_combinator(res) + [''] + + def __str__(self): + childs = ''.join(str(x) for x in self.childs) + return '(?>{childs})'.format(childs=childs) + + class InToken(Token): - type = sre_parse.IN + type = _parser.IN def _parse(self): self.childs = parse(self.token[1], self) @@ -682,11 +715,11 @@ class InToken(Token): class AtToken(Token): - type = sre_parse.AT + type = _parser.AT def _parse(self): - self.begin = self.token[1] == sre_parse.AT_BEGINNING - self.end = self.token[1] == sre_parse.AT_END + self.begin = self.token[1] == _parser.AT_BEGINNING + self.end = self.token[1] == _parser.AT_END def can_contain(self, char, skip_literal=True): return False @@ -711,7 +744,7 @@ class AtToken(Token): class NegateToken(Token): - type = sre_parse.NEGATE + type = _parser.NEGATE def can_contain(self, char, skip_literal=True): return False @@ -733,7 +766,7 @@ class NegateToken(Token): class GroupRefToken(Token): - type = sre_parse.GROUPREF + type = _parser.GROUPREF def _parse(self): self.id = self.token[1] @@ -759,7 +792,7 @@ class GroupRefToken(Token): class AssertToken(Token): - type = sre_parse.ASSERT + type = _parser.ASSERT def can_contain(self, char, skip_literal=True): # TODO(buglloc): Do it! @@ -777,7 +810,7 @@ class AssertToken(Token): class AssertNotToken(Token): - type = sre_parse.ASSERT_NOT + type = _parser.ASSERT_NOT def can_contain(self, char, skip_literal=True): # TODO(buglloc): Do it! 
@@ -822,35 +855,37 @@ def parse(sre_obj, parent=None, regexp=None): for token in sre_obj: if not token: result.append(EmptyToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.ANY: + elif token[0] == _parser.ANY: result.append(AnyToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.LITERAL: + elif token[0] == _parser.LITERAL: result.append(LiteralToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.NOT_LITERAL: + elif token[0] == _parser.NOT_LITERAL: result.append(NotLiteralToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.RANGE: + elif token[0] == _parser.RANGE: result.append(RangeToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.CATEGORY: + elif token[0] == _parser.CATEGORY: result.append(CategoryToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.MIN_REPEAT: + elif token[0] == _parser.MIN_REPEAT: result.append(MinRepeatToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.MAX_REPEAT: + elif token[0] == _parser.MAX_REPEAT: result.append(MaxRepeatToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.BRANCH: + elif token[0] == _parser.BRANCH: result.append(BranchToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.SUBPATTERN: + elif token[0] == _parser.SUBPATTERN: result.append(SubpatternToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.IN: + elif token[0] == _parser.ATOMIC_GROUP: + result.append(AtomicGroupToken(token=token, parent=parent, regexp=regexp)) + elif token[0] == _parser.IN: result.append(InToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.NEGATE: + elif token[0] == _parser.NEGATE: result.append(NegateToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.AT: + elif token[0] == _parser.AT: result.append(AtToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.GROUPREF: + elif token[0] == _parser.GROUPREF: result.append(GroupRefToken(token=token, parent=parent, regexp=regexp)) - elif token[0] == sre_parse.ASSERT: + elif token[0] == _parser.ASSERT: pass # TODO(buglloc): Do it! - elif token[0] == sre_parse.ASSERT_NOT: + elif token[0] == _parser.ASSERT_NOT: pass # TODO(buglloc): Do it! 
else: LOG.info('Unexpected token "{0}"'.format(token[0])) @@ -996,13 +1031,10 @@ class Regexp(object): @cached_property def groups(self): - # self.root.parse() result = {} - # for name, token in self._groups.items(): - # result[name] = Regexp(str(self), root=token, strict=True, case_sensitive=self.case_sensitive) for name, parsed in extract_groups(self.parsed).items(): result[name] = Regexp('compiled', _parsed=parsed, strict=True, case_sensitive=self.case_sensitive) - for name, group in self.parsed.pattern.groupdict.items(): + for name, group in self.parsed.state.groupdict.items(): result[name] = result[group] return result @@ -1022,8 +1054,8 @@ class Regexp(object): return self._parsed try: - self._parsed = sre_parse.parse(FIX_NAMED_GROUPS_RE.sub('(?P<\\1>', self.source)) - except sre_parse.error as e: + self._parsed = _parser.parse(FIX_NAMED_GROUPS_RE.sub('(?P<\\1>', self.source)) + except _parser.error as e: LOG.fatal('Failed to parse regex: %s (%s)', self.source, str(e)) raise e diff --git a/gixy/core/sre_parse/_constants.py b/gixy/core/sre_parse/_constants.py new file mode 100644 index 0000000..c3b86bc --- /dev/null +++ b/gixy/core/sre_parse/_constants.py @@ -0,0 +1,230 @@ +# flake8: noqa +# Copied from cpython 3.13.2 + +# +# Secret Labs' Regular Expression Engine +# +# various symbols used by the regular expression engine. +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license. For any other use, please contact Secret Labs +# AB (info@pythonware.com). +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. +# + +"""Internal support module for sre""" + +# update when constants are added or removed + +MAGIC = 20230612 + +from _sre import MAXREPEAT, MAXGROUPS + +# SRE standard exception (access as sre.error) +# should this really be here? + +class PatternError(Exception): + """Exception raised for invalid regular expressions. 
+ + Attributes: + + msg: The unformatted error message + pattern: The regular expression pattern + pos: The index in the pattern where compilation failed (may be None) + lineno: The line corresponding to pos (may be None) + colno: The column corresponding to pos (may be None) + """ + + __module__ = 're' + + def __init__(self, msg, pattern=None, pos=None): + self.msg = msg + self.pattern = pattern + self.pos = pos + if pattern is not None and pos is not None: + msg = '%s at position %d' % (msg, pos) + if isinstance(pattern, str): + newline = '\n' + else: + newline = b'\n' + self.lineno = pattern.count(newline, 0, pos) + 1 + self.colno = pos - pattern.rfind(newline, 0, pos) + if newline in pattern: + msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno) + else: + self.lineno = self.colno = None + super().__init__(msg) + + +# Backward compatibility after renaming in 3.13 +error = PatternError + +class _NamedIntConstant(int): + def __new__(cls, value, name): + self = super(_NamedIntConstant, cls).__new__(cls, value) + self.name = name + return self + + def __repr__(self): + return self.name + + __reduce__ = None + +MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT') + +def _makecodes(*names): + items = [_NamedIntConstant(i, name) for i, name in enumerate(names)] + globals().update({item.name: item for item in items}) + return items + +# operators +OPCODES = _makecodes( + # failure=0 success=1 (just because it looks better that way :-) + 'FAILURE', 'SUCCESS', + + 'ANY', 'ANY_ALL', + 'ASSERT', 'ASSERT_NOT', + 'AT', + 'BRANCH', + 'CATEGORY', + 'CHARSET', 'BIGCHARSET', + 'GROUPREF', 'GROUPREF_EXISTS', + 'IN', + 'INFO', + 'JUMP', + 'LITERAL', + 'MARK', + 'MAX_UNTIL', + 'MIN_UNTIL', + 'NOT_LITERAL', + 'NEGATE', + 'RANGE', + 'REPEAT', + 'REPEAT_ONE', + 'SUBPATTERN', + 'MIN_REPEAT_ONE', + 'ATOMIC_GROUP', + 'POSSESSIVE_REPEAT', + 'POSSESSIVE_REPEAT_ONE', + + 'GROUPREF_IGNORE', + 'IN_IGNORE', + 'LITERAL_IGNORE', + 'NOT_LITERAL_IGNORE', + + 'GROUPREF_LOC_IGNORE', + 'IN_LOC_IGNORE', + 'LITERAL_LOC_IGNORE', + 'NOT_LITERAL_LOC_IGNORE', + + 'GROUPREF_UNI_IGNORE', + 'IN_UNI_IGNORE', + 'LITERAL_UNI_IGNORE', + 'NOT_LITERAL_UNI_IGNORE', + 'RANGE_UNI_IGNORE', + + # The following opcodes are only occurred in the parser output, + # but not in the compiled code. 
+ 'MIN_REPEAT', 'MAX_REPEAT', +) +del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT + +# positions +ATCODES = _makecodes( + 'AT_BEGINNING', 'AT_BEGINNING_LINE', 'AT_BEGINNING_STRING', + 'AT_BOUNDARY', 'AT_NON_BOUNDARY', + 'AT_END', 'AT_END_LINE', 'AT_END_STRING', + + 'AT_LOC_BOUNDARY', 'AT_LOC_NON_BOUNDARY', + + 'AT_UNI_BOUNDARY', 'AT_UNI_NON_BOUNDARY', +) + +# categories +CHCODES = _makecodes( + 'CATEGORY_DIGIT', 'CATEGORY_NOT_DIGIT', + 'CATEGORY_SPACE', 'CATEGORY_NOT_SPACE', + 'CATEGORY_WORD', 'CATEGORY_NOT_WORD', + 'CATEGORY_LINEBREAK', 'CATEGORY_NOT_LINEBREAK', + + 'CATEGORY_LOC_WORD', 'CATEGORY_LOC_NOT_WORD', + + 'CATEGORY_UNI_DIGIT', 'CATEGORY_UNI_NOT_DIGIT', + 'CATEGORY_UNI_SPACE', 'CATEGORY_UNI_NOT_SPACE', + 'CATEGORY_UNI_WORD', 'CATEGORY_UNI_NOT_WORD', + 'CATEGORY_UNI_LINEBREAK', 'CATEGORY_UNI_NOT_LINEBREAK', +) + + +# replacement operations for "ignore case" mode +OP_IGNORE = { + LITERAL: LITERAL_IGNORE, + NOT_LITERAL: NOT_LITERAL_IGNORE, +} + +OP_LOCALE_IGNORE = { + LITERAL: LITERAL_LOC_IGNORE, + NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, +} + +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + +AT_MULTILINE = { + AT_BEGINNING: AT_BEGINNING_LINE, + AT_END: AT_END_LINE +} + +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + +CH_LOCALE = { + CATEGORY_DIGIT: CATEGORY_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, + CATEGORY_WORD: CATEGORY_LOC_WORD, + CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK +} + +CH_UNICODE = { + CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, + CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, + CATEGORY_SPACE: CATEGORY_UNI_SPACE, + CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, + CATEGORY_WORD: CATEGORY_UNI_WORD, + CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, + CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, + CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK +} + +# flags +SRE_FLAG_IGNORECASE = 2 # case insensitive +SRE_FLAG_LOCALE = 4 # honour system locale +SRE_FLAG_MULTILINE = 8 # treat target as multiline string +SRE_FLAG_DOTALL = 16 # treat target as a single string +SRE_FLAG_UNICODE = 32 # use unicode "locale" +SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments +SRE_FLAG_DEBUG = 128 # debugging +SRE_FLAG_ASCII = 256 # use ascii "locale" + +# flags for INFO primitive +SRE_INFO_PREFIX = 1 # has prefix +SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) +SRE_INFO_CHARSET = 4 # pattern starts with character from given set diff --git a/gixy/core/sre_parse/_parser.py b/gixy/core/sre_parse/_parser.py new file mode 100644 index 0000000..7696e1b --- /dev/null +++ b/gixy/core/sre_parse/_parser.py @@ -0,0 +1,1082 @@ +# flake8: noqa +# Copied from cpython 3.13.2, and preserved non-capturing groups + +# +# Secret Labs' Regular Expression Engine +# +# convert re-style regular expression to sre pattern +# +# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. +# +# This version of the SRE library can be redistributed under CNRI's +# Python 1.6 license. For any other use, please contact Secret Labs +# AB (info@pythonware.com). +# +# Portions of this engine have been developed in cooperation with +# CNRI. Hewlett-Packard provided funding for 1.6 integration and +# other compatibility work. 
+# + +"""Internal support module for sre""" + +# XXX: show string offset and offending character for all errors + +from ._constants import * + +SPECIAL_CHARS = ".\\[{()*+?^$|" +REPEAT_CHARS = "*+?{" + +DIGITS = frozenset("0123456789") + +OCTDIGITS = frozenset("01234567") +HEXDIGITS = frozenset("0123456789abcdefABCDEF") +ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") + +WHITESPACE = frozenset(" \t\n\r\v\f") + +_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT}) +_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) + +ESCAPES = { + r"\a": (LITERAL, ord("\a")), + r"\b": (LITERAL, ord("\b")), + r"\f": (LITERAL, ord("\f")), + r"\n": (LITERAL, ord("\n")), + r"\r": (LITERAL, ord("\r")), + r"\t": (LITERAL, ord("\t")), + r"\v": (LITERAL, ord("\v")), + r"\\": (LITERAL, ord("\\")) +} + +CATEGORIES = { + r"\A": (AT, AT_BEGINNING_STRING), # start of string + r"\b": (AT, AT_BOUNDARY), + r"\B": (AT, AT_NON_BOUNDARY), + r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), + r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), + r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), + r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), + r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), + r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), + r"\Z": (AT, AT_END_STRING), # end of string +} + +FLAGS = { + # standard flags + "i": SRE_FLAG_IGNORECASE, + "L": SRE_FLAG_LOCALE, + "m": SRE_FLAG_MULTILINE, + "s": SRE_FLAG_DOTALL, + "x": SRE_FLAG_VERBOSE, + # extensions + "a": SRE_FLAG_ASCII, + "u": SRE_FLAG_UNICODE, +} + +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG + +# Maximal value returned by SubPattern.getwidth(). +# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. +MAXWIDTH = 1 << 64 + +class State: + # keeps track of state for parsing + def __init__(self): + self.flags = 0 + self.groupdict = {} + self.groupwidths = [None] # group 0 + self.lookbehindgroups = None + self.grouprefpos = {} + @property + def groups(self): + return len(self.groupwidths) + def opengroup(self, name=None): + gid = self.groups + self.groupwidths.append(None) + if self.groups > MAXGROUPS: + raise error("too many groups") + if name is not None: + ogid = self.groupdict.get(name, None) + if ogid is not None: + raise error("redefinition of group name %r as group %d; " + "was group %d" % (name, gid, ogid)) + self.groupdict[name] = gid + return gid + def closegroup(self, gid, p): + self.groupwidths[gid] = p.getwidth() + def checkgroup(self, gid): + return gid < self.groups and self.groupwidths[gid] is not None + + def checklookbehindgroup(self, gid, source): + if self.lookbehindgroups is not None: + if not self.checkgroup(gid): + raise source.error('cannot refer to an open group') + if gid >= self.lookbehindgroups: + raise source.error('cannot refer to group defined in the same ' + 'lookbehind subpattern') + +class SubPattern: + # a subpattern, in intermediate form + def __init__(self, state, data=None): + self.state = state + if data is None: + data = [] + self.data = data + self.width = None + + def dump(self, level=0): + seqtypes = (tuple, list) + for op, av in self.data: + print(level*" " + str(op), end='') + if op is IN: + # member sublanguage + print() + for op, a in av: + print((level+1)*" " + str(op), a) + elif op is BRANCH: + print() + for i, a in enumerate(av[1]): + if i: + print(level*" " + "OR") + a.dump(level+1) + elif op is GROUPREF_EXISTS: + condgroup, item_yes, item_no = av + print('', condgroup) + item_yes.dump(level+1) + if item_no: + 
print(level*" " + "ELSE") + item_no.dump(level+1) + elif isinstance(av, SubPattern): + print() + av.dump(level+1) + elif isinstance(av, seqtypes): + nl = False + for a in av: + if isinstance(a, SubPattern): + if not nl: + print() + a.dump(level+1) + nl = True + else: + if not nl: + print(' ', end='') + print(a, end='') + nl = False + if not nl: + print() + else: + print('', av) + def __repr__(self): + return repr(self.data) + def __len__(self): + return len(self.data) + def __delitem__(self, index): + del self.data[index] + def __getitem__(self, index): + if isinstance(index, slice): + return SubPattern(self.state, self.data[index]) + return self.data[index] + def __setitem__(self, index, code): + self.data[index] = code + def insert(self, index, code): + self.data.insert(index, code) + def append(self, code): + self.data.append(code) + def getwidth(self): + # determine the width (min, max) for this subpattern + if self.width is not None: + return self.width + lo = hi = 0 + for op, av in self.data: + if op is BRANCH: + i = MAXWIDTH + j = 0 + for av in av[1]: + l, h = av.getwidth() + i = min(i, l) + j = max(j, h) + lo = lo + i + hi = hi + j + elif op is ATOMIC_GROUP: + i, j = av.getwidth() + lo = lo + i + hi = hi + j + elif op is SUBPATTERN: + i, j = av[-1].getwidth() + lo = lo + i + hi = hi + j + elif op in _REPEATCODES: + i, j = av[2].getwidth() + lo = lo + i * av[0] + if av[1] == MAXREPEAT and j: + hi = MAXWIDTH + else: + hi = hi + j * av[1] + elif op in _UNITCODES: + lo = lo + 1 + hi = hi + 1 + elif op is GROUPREF: + i, j = self.state.groupwidths[av] + lo = lo + i + hi = hi + j + elif op is GROUPREF_EXISTS: + i, j = av[1].getwidth() + if av[2] is not None: + l, h = av[2].getwidth() + i = min(i, l) + j = max(j, h) + else: + i = 0 + lo = lo + i + hi = hi + j + elif op is SUCCESS: + break + self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) + return self.width + +class Tokenizer: + def __init__(self, string): + self.istext = isinstance(string, str) + self.string = string + if not self.istext: + string = str(string, 'latin1') + self.decoded_string = string + self.index = 0 + self.next = None + self.__next() + def __next(self): + index = self.index + try: + char = self.decoded_string[index] + except IndexError: + self.next = None + return + if char == "\\": + index += 1 + try: + char += self.decoded_string[index] + except IndexError: + raise error("bad escape (end of pattern)", + self.string, len(self.string) - 1) from None + self.index = index + 1 + self.next = char + def match(self, char): + if char == self.next: + self.__next() + return True + return False + def get(self): + this = self.next + self.__next() + return this + def getwhile(self, n, charset): + result = '' + for _ in range(n): + c = self.next + if c not in charset: + break + result += c + self.__next() + return result + def getuntil(self, terminator, name): + result = '' + while True: + c = self.next + self.__next() + if c is None: + if not result: + raise self.error("missing " + name) + raise self.error("missing %s, unterminated name" % terminator, + len(result)) + if c == terminator: + if not result: + raise self.error("missing " + name, 1) + break + result += c + return result + @property + def pos(self): + return self.index - len(self.next or '') + def tell(self): + return self.index - len(self.next or '') + def seek(self, index): + self.index = index + self.__next() + + def error(self, msg, offset=0): + if not self.istext: + msg = msg.encode('ascii', 'backslashreplace').decode('ascii') + return error(msg, self.string, 
self.tell() - offset) + + def checkgroupname(self, name, offset): + if not (self.istext or name.isascii()): + msg = "bad character in group name %a" % name + raise self.error(msg, len(name) + offset) + if not name.isidentifier(): + msg = "bad character in group name %r" % name + raise self.error(msg, len(name) + offset) + +def _class_escape(source, escape): + # handle escape code inside character class + code = ESCAPES.get(escape) + if code: + return code + code = CATEGORIES.get(escape) + if code and code[0] is IN: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape (exactly two digits) + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. \N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c in OCTDIGITS: + # octal escape (up to three digits) + escape += source.getwhile(2, OCTDIGITS) + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, len(escape)) + return LITERAL, c + elif c in DIGITS: + raise ValueError + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error('bad escape %s' % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _escape(source, escape, state): + # handle escape code in expression + code = CATEGORIES.get(escape) + if code: + return code + code = ESCAPES.get(escape) + if code: + return code + try: + c = escape[1:2] + if c == "x": + # hexadecimal escape + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise source.error("incomplete escape %s" % escape, len(escape)) + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise source.error("incomplete escape %s" % escape, len(escape)) + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c + elif c == "N" and source.istext: + import unicodedata + # named unicode escape e.g. 
\N{EM DASH} + if not source.match('{'): + raise source.error("missing {") + charname = source.getuntil('}', 'character name') + try: + c = ord(unicodedata.lookup(charname)) + except (KeyError, TypeError): + raise source.error("undefined character name %r" % charname, + len(charname) + len(r'\N{}')) from None + return LITERAL, c + elif c == "0": + # octal escape + escape += source.getwhile(2, OCTDIGITS) + return LITERAL, int(escape[1:], 8) + elif c in DIGITS: + # octal escape *or* decimal group reference (sigh) + if source.next in DIGITS: + escape += source.get() + if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and + source.next in OCTDIGITS): + # got three octal digits; this is an octal escape + escape += source.get() + c = int(escape[1:], 8) + if c > 0o377: + raise source.error('octal escape value %s outside of ' + 'range 0-0o377' % escape, + len(escape)) + return LITERAL, c + # not an octal escape, so this is a group reference + group = int(escape[1:]) + if group < state.groups: + if not state.checkgroup(group): + raise source.error("cannot refer to an open group", + len(escape)) + state.checklookbehindgroup(group, source) + return GROUPREF, group + raise source.error("invalid group reference %d" % group, len(escape) - 1) + if len(escape) == 2: + if c in ASCIILETTERS: + raise source.error("bad escape %s" % escape, len(escape)) + return LITERAL, ord(escape[1]) + except ValueError: + pass + raise source.error("bad escape %s" % escape, len(escape)) + +def _uniq(items): + return list(dict.fromkeys(items)) + +def _parse_sub(source, state, verbose, nested): + # parse an alternation: a|b|c + + items = [] + itemsappend = items.append + sourcematch = source.match + start = source.tell() + while True: + itemsappend(_parse(source, state, verbose, nested + 1, + not nested and not items)) + if not sourcematch("|"): + break + if not nested: + verbose = state.flags & SRE_FLAG_VERBOSE + + if len(items) == 1: + return items[0] + + subpattern = SubPattern(state) + + # check if all items share a common prefix + while True: + prefix = None + for item in items: + if not item: + break + if prefix is None: + prefix = item[0] + elif item[0] != prefix: + break + else: + # all subitems start with a common "prefix". 
+ # move it out of the branch + for item in items: + del item[0] + subpattern.append(prefix) + continue # check next one + break + + # check if the branch can be replaced by a character set + set = [] + for item in items: + if len(item) != 1: + break + op, av = item[0] + if op is LITERAL: + set.append((op, av)) + elif op is IN and av[0][0] is not NEGATE: + set.extend(av) + else: + break + else: + # we can store this as a character set instead of a + # branch (the compiler may optimize this even more) + subpattern.append((IN, _uniq(set))) + return subpattern + + subpattern.append((BRANCH, (None, items))) + return subpattern + +def _parse(source, state, verbose, nested, first=False): + # parse a simple pattern + subpattern = SubPattern(state) + + # precompute constants into local variables + subpatternappend = subpattern.append + sourceget = source.get + sourcematch = source.match + _len = len + _ord = ord + + while True: + + this = source.next + if this is None: + break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() + + if verbose: + # skip whitespace and comments + if this in WHITESPACE: + continue + if this == "#": + while True: + this = sourceget() + if this is None or this == "\n": + break + continue + + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) + + elif this == "[": + here = source.tell() - 1 + # character set + set = [] + setappend = set.append +## if sourcematch(":"): +## pass # handle character classes + if source.next == '[': + import warnings + warnings.warn( + 'Possible nested set at position %d' % source.tell(), + FutureWarning, stacklevel=nested + 6 + ) + negate = sourcematch("^") + # check remaining characters + while True: + this = sourceget() + if this is None: + raise source.error("unterminated character set", + source.tell() - here) + if this == "]" and set: + break + elif this[0] == "\\": + code1 = _class_escape(source, this) + else: + if set and this in '-&~|' and source.next == this: + import warnings + warnings.warn( + 'Possible set %s at position %d' % ( + 'difference' if this == '-' else + 'intersection' if this == '&' else + 'symmetric difference' if this == '~' else + 'union', + source.tell() - 1), + FutureWarning, stacklevel=nested + 6 + ) + code1 = LITERAL, _ord(this) + if sourcematch("-"): + # potential range + that = sourceget() + if that is None: + raise source.error("unterminated character set", + source.tell() - here) + if that == "]": + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + setappend((LITERAL, _ord("-"))) + break + if that[0] == "\\": + code2 = _class_escape(source, that) + else: + if that == '-': + import warnings + warnings.warn( + 'Possible set difference at position %d' % ( + source.tell() - 2), + FutureWarning, stacklevel=nested + 6 + ) + code2 = LITERAL, _ord(that) + if code1[0] != LITERAL or code2[0] != LITERAL: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + lo = code1[1] + hi = code2[1] + if hi < lo: + msg = "bad character range %s-%s" % (this, that) + raise source.error(msg, len(this) + 1 + len(that)) + setappend((RANGE, (lo, hi))) + else: + if code1[0] is IN: + code1 = code1[1][0] + setappend(code1) + + set = _uniq(set) + # XXX: should move set optimization to compiler! 
+ if _len(set) == 1 and set[0][0] is LITERAL: + # optimization + if negate: + subpatternappend((NOT_LITERAL, set[0][1])) + else: + subpatternappend(set[0]) + else: + if negate: + set.insert(0, (NEGATE, None)) + # charmap optimization can't be added here because + # global flags still are not known + subpatternappend((IN, set)) + + elif this in REPEAT_CHARS: + # repeat previous item + here = source.tell() + if this == "?": + min, max = 0, 1 + elif this == "*": + min, max = 0, MAXREPEAT + + elif this == "+": + min, max = 1, MAXREPEAT + elif this == "{": + if source.next == "}": + subpatternappend((LITERAL, _ord(this))) + continue + + min, max = 0, MAXREPEAT + lo = hi = "" + while source.next in DIGITS: + lo += sourceget() + if sourcematch(","): + while source.next in DIGITS: + hi += sourceget() + else: + hi = lo + if not sourcematch("}"): + subpatternappend((LITERAL, _ord(this))) + source.seek(here) + continue + + if lo: + min = int(lo) + if min >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if hi: + max = int(hi) + if max >= MAXREPEAT: + raise OverflowError("the repetition number is too large") + if max < min: + raise source.error("min repeat greater than max repeat", + source.tell() - here) + else: + raise AssertionError("unsupported quantifier %r" % (char,)) + # figure out which item to repeat + if subpattern: + item = subpattern[-1:] + else: + item = None + if not item or item[0][0] is AT: + raise source.error("nothing to repeat", + source.tell() - here + len(this)) + if item[0][0] in _REPEATCODES: + raise source.error("multiple repeat", + source.tell() - here + len(this)) + if item[0][0] is SUBPATTERN: + group, add_flags, del_flags, p = item[0][1] + if group is None and not add_flags and not del_flags: + item = p + if sourcematch("?"): + # Non-Greedy Match + subpattern[-1] = (MIN_REPEAT, (min, max, item)) + elif sourcematch("+"): + # Possessive Match (Always Greedy) + subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item)) + else: + # Greedy Match + subpattern[-1] = (MAX_REPEAT, (min, max, item)) + + elif this == ".": + subpatternappend((ANY, None)) + + elif this == "(": + start = source.tell() - 1 + capture = True + atomic = False + name = None + add_flags = 0 + del_flags = 0 + if sourcematch("?"): + # options + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + if char == "P": + # python extensions + if sourcematch("<"): + # named group: skip forward to end of name + name = source.getuntil(">", "group name") + source.checkgroupname(name, 1) + elif sourcematch("="): + # named backreference + name = source.getuntil(")", "group name") + source.checkgroupname(name, 1) + gid = state.groupdict.get(name) + if gid is None: + msg = "unknown group name %r" % name + raise source.error(msg, len(name) + 1) + if not state.checkgroup(gid): + raise source.error("cannot refer to an open group", + len(name) + 1) + state.checklookbehindgroup(gid, source) + subpatternappend((GROUPREF, gid)) + continue + + else: + char = sourceget() + if char is None: + raise source.error("unexpected end of pattern") + raise source.error("unknown extension ?P" + char, + len(char) + 2) + elif char == ":": + # non-capturing group + capture = False + elif char == "#": + # comment + while True: + if source.next is None: + raise source.error("missing ), unterminated comment", + source.tell() - start) + if sourceget() == ")": + break + continue + + elif char in "=!<": + # lookahead assertions + dir = 1 + if char == "<": + char = sourceget() + if char is None: + 
raise source.error("unexpected end of pattern") + if char not in "=!": + raise source.error("unknown extension ?<" + char, + len(char) + 2) + dir = -1 # lookbehind + lookbehindgroups = state.lookbehindgroups + if lookbehindgroups is None: + state.lookbehindgroups = state.groups + p = _parse_sub(source, state, verbose, nested + 1) + if dir < 0: + if lookbehindgroups is None: + state.lookbehindgroups = None + if not sourcematch(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if char == "=": + subpatternappend((ASSERT, (dir, p))) + elif p: + subpatternappend((ASSERT_NOT, (dir, p))) + else: + subpatternappend((FAILURE, ())) + continue + + elif char == "(": + # conditional backreference group + condname = source.getuntil(")", "group name") + if not (condname.isdecimal() and condname.isascii()): + source.checkgroupname(condname, 1) + condgroup = state.groupdict.get(condname) + if condgroup is None: + msg = "unknown group name %r" % condname + raise source.error(msg, len(condname) + 1) + else: + condgroup = int(condname) + if not condgroup: + raise source.error("bad group number", + len(condname) + 1) + if condgroup >= MAXGROUPS: + msg = "invalid group reference %d" % condgroup + raise source.error(msg, len(condname) + 1) + if condgroup not in state.grouprefpos: + state.grouprefpos[condgroup] = ( + source.tell() - len(condname) - 1 + ) + if not (condname.isdecimal() and condname.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(condname) if source.istext else ascii(condname), + source.tell() - len(condname) - 1), + DeprecationWarning, stacklevel=nested + 6 + ) + state.checklookbehindgroup(condgroup, source) + item_yes = _parse(source, state, verbose, nested + 1) + if source.match("|"): + item_no = _parse(source, state, verbose, nested + 1) + if source.next == "|": + raise source.error("conditional backref with more than two branches") + else: + item_no = None + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) + continue + + elif char == ">": + # non-capturing, atomic group + capture = False + atomic = True + elif char in FLAGS or char == "-": + # flags + flags = _parse_flags(source, state, char) + if flags is None: # global flags + if not first or subpattern: + raise source.error('global flags not at the start ' + 'of the expression', + source.tell() - start) + verbose = state.flags & SRE_FLAG_VERBOSE + continue + + add_flags, del_flags = flags + capture = False + else: + raise source.error("unknown extension ?" 
+ char, + len(char) + 1) + + # parse group contents + if capture: + try: + group = state.opengroup(name) + except error as err: + raise source.error(err.msg, len(name) + 1) from None + else: + group = None + sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and + not (del_flags & SRE_FLAG_VERBOSE)) + p = _parse_sub(source, state, sub_verbose, nested + 1) + if not source.match(")"): + raise source.error("missing ), unterminated subpattern", + source.tell() - start) + if group is not None: + state.closegroup(group, p) + if atomic: + assert group is None + subpatternappend((ATOMIC_GROUP, p)) + else: + subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p))) + + elif this == "^": + subpatternappend((AT, AT_BEGINNING)) + + elif this == "$": + subpatternappend((AT, AT_END)) + + else: + raise AssertionError("unsupported special character %r" % (char,)) + + return subpattern + +def _parse_flags(source, state, char): + sourceget = source.get + add_flags = 0 + del_flags = 0 + if char != "-": + while True: + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) + char = sourceget() + if char is None: + raise source.error("missing -, : or )") + if char in ")-:": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing -, : or )" + raise source.error(msg, len(char)) + if char == ")": + state.flags |= add_flags + return None + if add_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn on global flag", 1) + if char == "-": + char = sourceget() + if char is None: + raise source.error("missing flag") + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing flag" + raise source.error(msg, len(char)) + while True: + flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag + char = sourceget() + if char is None: + raise source.error("missing :") + if char == ":": + break + if char not in FLAGS: + msg = "unknown flag" if char.isalpha() else "missing :" + raise source.error(msg, len(char)) + assert char == ":" + if del_flags & GLOBAL_FLAGS: + raise source.error("bad inline flags: cannot turn off global flag", 1) + if add_flags & del_flags: + raise source.error("bad inline flags: flag turned on and off", 1) + return add_flags, del_flags + +def fix_flags(src, flags): + # Check and fix flags according to the type of pattern (str or bytes) + if isinstance(src, str): + if flags & SRE_FLAG_LOCALE: + raise ValueError("cannot use LOCALE flag with a str pattern") + if not flags & SRE_FLAG_ASCII: + flags |= SRE_FLAG_UNICODE + elif flags & SRE_FLAG_UNICODE: + raise ValueError("ASCII and UNICODE flags are incompatible") + else: + if flags & SRE_FLAG_UNICODE: + raise ValueError("cannot use UNICODE flag with a bytes pattern") + if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII: + raise ValueError("ASCII and LOCALE flags are incompatible") + return flags + +def parse(str, flags=0, state=None): + # parse 're' pattern into list of (opcode, argument) tuples + + source = Tokenizer(str) + + if state is None: + state = State() + state.flags = flags + 
state.str = str + + p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0) + p.state.flags = fix_flags(str, p.state.flags) + + if source.next is not None: + assert source.next == ")" + raise source.error("unbalanced parenthesis") + + for g in p.state.grouprefpos: + if g >= p.state.groups: + msg = "invalid group reference %d" % g + raise error(msg, str, p.state.grouprefpos[g]) + + if flags & SRE_FLAG_DEBUG: + p.dump() + + return p + +def parse_template(source, pattern): + # parse 're' replacement string into list of literals and + # group references + s = Tokenizer(source) + sget = s.get + result = [] + literal = [] + lappend = literal.append + def addliteral(): + if s.istext: + result.append(''.join(literal)) + else: + # The tokenizer implicitly decodes bytes objects as latin-1, we must + # therefore re-encode the final representation. + result.append(''.join(literal).encode('latin-1')) + del literal[:] + def addgroup(index, pos): + if index > pattern.groups: + raise s.error("invalid group reference %d" % index, pos) + addliteral() + result.append(index) + groupindex = pattern.groupindex + while True: + this = sget() + if this is None: + break # end of replacement string + if this[0] == "\\": + # group + c = this[1] + if c == "g": + if not s.match("<"): + raise s.error("missing <") + name = s.getuntil(">", "group name") + if not (name.isdecimal() and name.isascii()): + s.checkgroupname(name, 1) + try: + index = groupindex[name] + except KeyError: + raise IndexError("unknown group name %r" % name) from None + else: + index = int(name) + if index >= MAXGROUPS: + raise s.error("invalid group reference %d" % index, + len(name) + 1) + if not (name.isdecimal() and name.isascii()): + import warnings + warnings.warn( + "bad character in group name %s at position %d" % + (repr(name) if s.istext else ascii(name), + s.tell() - len(name) - 1), + DeprecationWarning, stacklevel=5 + ) + addgroup(index, len(name) + 1) + elif c == "0": + if s.next in OCTDIGITS: + this += sget() + if s.next in OCTDIGITS: + this += sget() + lappend(chr(int(this[1:], 8) & 0xff)) + elif c in DIGITS: + isoctal = False + if s.next in DIGITS: + this += sget() + if (c in OCTDIGITS and this[2] in OCTDIGITS and + s.next in OCTDIGITS): + this += sget() + isoctal = True + c = int(this[1:], 8) + if c > 0o377: + raise s.error('octal escape value %s outside of ' + 'range 0-0o377' % this, len(this)) + lappend(chr(c)) + if not isoctal: + addgroup(int(this[1:]), len(this) - 1) + else: + try: + this = chr(ESCAPES[this][1]) + except KeyError: + if c in ASCIILETTERS: + raise s.error('bad escape %s' % this, len(this)) from None + lappend(this) + else: + lappend(this) + addliteral() + return result diff --git a/gixy/core/sre_parse/sre_constants.py b/gixy/core/sre_parse/sre_constants.py deleted file mode 100644 index 670deeb..0000000 --- a/gixy/core/sre_parse/sre_constants.py +++ /dev/null @@ -1,226 +0,0 @@ -# flake8: noqa - -# -# Secret Labs' Regular Expression Engine -# -# various symbols used by the regular expression engine. -# run this script to update the _sre include files! -# -# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# - -"""Internal support module for sre""" - -# update when constants are added or removed - -MAGIC = 20031017 - -try: - from _sre import MAXREPEAT -except ImportError: - import _sre - - MAXREPEAT = _sre.MAXREPEAT = 65535 - - -# SRE standard exception (access as sre.error) -# should this really be here? 
- -class error(Exception): - pass - - -# operators - -FAILURE = "failure" -SUCCESS = "success" - -ANY = "any" -ANY_ALL = "any_all" -ASSERT = "assert" -ASSERT_NOT = "assert_not" -AT = "at" -BIGCHARSET = "bigcharset" -BRANCH = "branch" -CALL = "call" -CATEGORY = "category" -CHARSET = "charset" -GROUPREF = "groupref" -GROUPREF_IGNORE = "groupref_ignore" -GROUPREF_EXISTS = "groupref_exists" -IN = "in" -IN_IGNORE = "in_ignore" -INFO = "info" -JUMP = "jump" -LITERAL = "literal" -LITERAL_IGNORE = "literal_ignore" -MARK = "mark" -MAX_REPEAT = "max_repeat" -MAX_UNTIL = "max_until" -MIN_REPEAT = "min_repeat" -MIN_UNTIL = "min_until" -NEGATE = "negate" -NOT_LITERAL = "not_literal" -NOT_LITERAL_IGNORE = "not_literal_ignore" -RANGE = "range" -REPEAT = "repeat" -REPEAT_ONE = "repeat_one" -SUBPATTERN = "subpattern" -MIN_REPEAT_ONE = "min_repeat_one" - -# positions -AT_BEGINNING = "at_beginning" -AT_BEGINNING_LINE = "at_beginning_line" -AT_BEGINNING_STRING = "at_beginning_string" -AT_BOUNDARY = "at_boundary" -AT_NON_BOUNDARY = "at_non_boundary" -AT_END = "at_end" -AT_END_LINE = "at_end_line" -AT_END_STRING = "at_end_string" -AT_LOC_BOUNDARY = "at_loc_boundary" -AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" -AT_UNI_BOUNDARY = "at_uni_boundary" -AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" - -# categories -CATEGORY_DIGIT = "category_digit" -CATEGORY_NOT_DIGIT = "category_not_digit" -CATEGORY_SPACE = "category_space" -CATEGORY_NOT_SPACE = "category_not_space" -CATEGORY_WORD = "category_word" -CATEGORY_NOT_WORD = "category_not_word" -CATEGORY_LINEBREAK = "category_linebreak" -CATEGORY_NOT_LINEBREAK = "category_not_linebreak" -CATEGORY_LOC_WORD = "category_loc_word" -CATEGORY_LOC_NOT_WORD = "category_loc_not_word" -CATEGORY_UNI_DIGIT = "category_uni_digit" -CATEGORY_UNI_NOT_DIGIT = "category_uni_not_digit" -CATEGORY_UNI_SPACE = "category_uni_space" -CATEGORY_UNI_NOT_SPACE = "category_uni_not_space" -CATEGORY_UNI_WORD = "category_uni_word" -CATEGORY_UNI_NOT_WORD = "category_uni_not_word" -CATEGORY_UNI_LINEBREAK = "category_uni_linebreak" -CATEGORY_UNI_NOT_LINEBREAK = "category_uni_not_linebreak" - -OPCODES = [ - - # failure=0 success=1 (just because it looks better that way :-) - FAILURE, SUCCESS, - - ANY, ANY_ALL, - ASSERT, ASSERT_NOT, - AT, - BRANCH, - CALL, - CATEGORY, - CHARSET, BIGCHARSET, - GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE, - IN, IN_IGNORE, - INFO, - JUMP, - LITERAL, LITERAL_IGNORE, - MARK, - MAX_UNTIL, - MIN_UNTIL, - NOT_LITERAL, NOT_LITERAL_IGNORE, - NEGATE, - RANGE, - REPEAT, - REPEAT_ONE, - SUBPATTERN, - MIN_REPEAT_ONE - -] - -ATCODES = [ - AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, - AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, - AT_UNI_NON_BOUNDARY -] - -CHCODES = [ - CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE, - CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD, - CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_WORD, - CATEGORY_LOC_NOT_WORD, CATEGORY_UNI_DIGIT, CATEGORY_UNI_NOT_DIGIT, - CATEGORY_UNI_SPACE, CATEGORY_UNI_NOT_SPACE, CATEGORY_UNI_WORD, - CATEGORY_UNI_NOT_WORD, CATEGORY_UNI_LINEBREAK, - CATEGORY_UNI_NOT_LINEBREAK -] - - -def makedict(list): - d = {} - i = 0 - for item in list: - d[item] = i - i = i + 1 - return d - - -OPCODES = makedict(OPCODES) -ATCODES = makedict(ATCODES) -CHCODES = makedict(CHCODES) - -# replacement operations for "ignore case" mode -OP_IGNORE = { - GROUPREF: GROUPREF_IGNORE, - IN: IN_IGNORE, - LITERAL: LITERAL_IGNORE, - NOT_LITERAL: NOT_LITERAL_IGNORE 
-} - -AT_MULTILINE = { - AT_BEGINNING: AT_BEGINNING_LINE, - AT_END: AT_END_LINE -} - -AT_LOCALE = { - AT_BOUNDARY: AT_LOC_BOUNDARY, - AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY -} - -AT_UNICODE = { - AT_BOUNDARY: AT_UNI_BOUNDARY, - AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY -} - -CH_LOCALE = { - CATEGORY_DIGIT: CATEGORY_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE, - CATEGORY_WORD: CATEGORY_LOC_WORD, - CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK -} - -CH_UNICODE = { - CATEGORY_DIGIT: CATEGORY_UNI_DIGIT, - CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT, - CATEGORY_SPACE: CATEGORY_UNI_SPACE, - CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE, - CATEGORY_WORD: CATEGORY_UNI_WORD, - CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD, - CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK, - CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK -} - -# flags -SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) -SRE_FLAG_IGNORECASE = 2 # case insensitive -SRE_FLAG_LOCALE = 4 # honour system locale -SRE_FLAG_MULTILINE = 8 # treat target as multiline string -SRE_FLAG_DOTALL = 16 # treat target as a single string -SRE_FLAG_UNICODE = 32 # use unicode locale -SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments -SRE_FLAG_DEBUG = 128 # debugging - -# flags for INFO primitive -SRE_INFO_PREFIX = 1 # has prefix -SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix) -SRE_INFO_CHARSET = 4 # pattern starts with character from given set diff --git a/gixy/core/sre_parse/sre_parse.py b/gixy/core/sre_parse/sre_parse.py deleted file mode 100644 index dedd5fe..0000000 --- a/gixy/core/sre_parse/sre_parse.py +++ /dev/null @@ -1,827 +0,0 @@ -# flake8: noqa - -# -# Secret Labs' Regular Expression Engine -# -# convert re-style regular expression to sre pattern -# -# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. -# -# See the sre.py file for information on usage and redistribution. -# - -"""Internal support module for sre""" - -from .sre_constants import * - -SPECIAL_CHARS = ".\\[{()*+?^$|" -REPEAT_CHARS = "*+?{" - -DIGITS = set("0123456789") - -OCTDIGITS = set("01234567") -HEXDIGITS = set("0123456789abcdefABCDEF") - -WHITESPACE = set(" \t\n\r\v\f") - -ESCAPES = { - r"\a": (LITERAL, ord("\a")), - r"\b": (LITERAL, ord("\b")), - r"\f": (LITERAL, ord("\f")), - r"\n": (LITERAL, ord("\n")), - r"\r": (LITERAL, ord("\r")), - r"\t": (LITERAL, ord("\t")), - r"\v": (LITERAL, ord("\v")), - r"\\": (LITERAL, ord("\\")) -} - -CATEGORIES = { - r"\A": (AT, AT_BEGINNING_STRING), # start of string - r"\b": (AT, AT_BOUNDARY), - r"\B": (AT, AT_NON_BOUNDARY), - r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), - r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), - r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), - r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), - r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), - r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), - r"\Z": (AT, AT_END_STRING), # end of string -} - -FLAGS = { - # standard flags - "i": SRE_FLAG_IGNORECASE, - "L": SRE_FLAG_LOCALE, - "m": SRE_FLAG_MULTILINE, - "s": SRE_FLAG_DOTALL, - "x": SRE_FLAG_VERBOSE, - # extensions - "t": SRE_FLAG_TEMPLATE, - "u": SRE_FLAG_UNICODE, -} - - -class Pattern: - # master pattern object. 
keeps track of global attributes - def __init__(self): - self.flags = 0 - self.open = [] - self.groups = 1 - self.groupdict = {} - self.lookbehind = 0 - - def opengroup(self, name=None): - gid = self.groups - self.groups = gid + 1 - if name is not None: - ogid = self.groupdict.get(name, None) - if ogid is not None: - raise error(("redefinition of group name %s as group %d; " - "was group %d" % (repr(name), gid, ogid))) - self.groupdict[name] = gid - self.open.append(gid) - return gid - - def closegroup(self, gid): - self.open.remove(gid) - - def checkgroup(self, gid): - return gid < self.groups and gid not in self.open - - -class SubPattern: - # a subpattern, in intermediate form - def __init__(self, pattern, data=None): - self.pattern = pattern - if data is None: - data = [] - self.data = data - self.width = None - - def __repr__(self): - return repr(self.data) - - def __len__(self): - return len(self.data) - - def __delitem__(self, index): - del self.data[index] - - def __getitem__(self, index): - if isinstance(index, slice): - return SubPattern(self.pattern, self.data[index]) - return self.data[index] - - def __setitem__(self, index, code): - self.data[index] = code - - def insert(self, index, code): - self.data.insert(index, code) - - def append(self, code): - self.data.append(code) - - def getwidth(self): - # determine the width (min, max) for this subpattern - if self.width: - return self.width - lo = hi = 0 - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) - for op, av in self.data: - if op is BRANCH: - i = MAXREPEAT - 1 - j = 0 - for av in av[1]: - l, h = av.getwidth() - i = min(i, l) - j = max(j, h) - lo = lo + i - hi = hi + j - elif op is CALL: - i, j = av.getwidth() - lo = lo + i - hi = hi + j - elif op is SUBPATTERN: - i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in REPEATCODES: - i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] - elif op in UNITCODES: - lo = lo + 1 - hi = hi + 1 - elif op == SUCCESS: - break - self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) - return self.width - - -class Tokenizer: - def __init__(self, string): - self.string = string - self.index = 0 - self.__next() - - def __next(self): - if self.index >= len(self.string): - self.next = None - return - char = self.string[self.index] - if char[0] == "\\": - try: - c = self.string[self.index + 1] - except IndexError: - raise error("bogus escape (end of line)") - char = char + c - self.index = self.index + len(char) - self.next = char - - def match(self, char, skip=1): - if char == self.next: - if skip: - self.__next() - return 1 - return 0 - - def get(self): - this = self.next - self.__next() - return this - - def tell(self): - return self.index, self.next - - def seek(self, index): - self.index, self.next = index - - -def isident(char): - return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" - - -def isdigit(char): - return "0" <= char <= "9" - - -def isname(name): - # check that group name is a valid string - if not isident(name[0]): - return False - for char in name[1:]: - if not isident(char) and not isdigit(char): - return False - return True - - -def _class_escape(source, escape): - # handle escape code inside character class - code = ESCAPES.get(escape) - if code: - return code - code = CATEGORIES.get(escape) - if code and code[0] == IN: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape (exactly two digits) - while source.next in HEXDIGITS and len(escape) < 4: - escape = 
escape + source.get() - escape = escape[2:] - if len(escape) != 2: - raise error("bogus escape: %s" % repr("\\" + escape)) - return LITERAL, int(escape, 16) & 0xff - elif c in OCTDIGITS: - # octal escape (up to three digits) - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[1:] - return LITERAL, int(escape, 8) & 0xff - elif c in DIGITS: - raise error("bogus escape: %s" % repr(escape)) - if len(escape) == 2: - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise error("bogus escape: %s" % repr(escape)) - - -def _escape(source, escape, state): - # handle escape code in expression - code = CATEGORIES.get(escape) - if code: - return code - code = ESCAPES.get(escape) - if code: - return code - try: - c = escape[1:2] - if c == "x": - # hexadecimal escape - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - if len(escape) != 4: - raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff - elif c == "0": - # octal escape - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff - elif c in DIGITS: - # octal escape *or* decimal group reference (sigh) - if source.next in DIGITS: - escape = escape + source.get() - if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and - source.next in OCTDIGITS): - # got three octal digits; this is an octal escape - escape = escape + source.get() - return LITERAL, int(escape[1:], 8) & 0xff - # not an octal escape, so this is a group reference - group = int(escape[1:]) - if group < state.groups: - if not state.checkgroup(group): - raise error("cannot refer to open group") - if state.lookbehind: - import warnings - warnings.warn('group references in lookbehind ' - 'assertions are not supported', - RuntimeWarning) - return GROUPREF, group - raise ValueError - if len(escape) == 2: - return LITERAL, ord(escape[1]) - except ValueError: - pass - raise error("bogus escape: %s" % repr(escape)) - - -def _parse_sub(source, state, nested=1): - # parse an alternation: a|b|c - - items = [] - itemsappend = items.append - sourcematch = source.match - while 1: - itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: - break - if not source.next or sourcematch(")", 0): - break - else: - raise error("pattern not properly closed") - - if len(items) == 1: - return items[0] - - subpattern = SubPattern(state) - subpatternappend = subpattern.append - - # check if all items share a common prefix - while 1: - prefix = None - for item in items: - if not item: - break - if prefix is None: - prefix = item[0] - elif item[0] != prefix: - break - else: - # all subitems start with a common "prefix". 
- # move it out of the branch - for item in items: - del item[0] - subpatternappend(prefix) - continue # check next one - break - - # check if the branch can be replaced by a character set - for item in items: - if len(item) != 1 or item[0][0] != LITERAL: - break - else: - # we can store this as a character set instead of a - # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) - return subpattern - - subpattern.append((BRANCH, (None, items))) - return subpattern - - -def _parse_sub_cond(source, state, condgroup): - item_yes = _parse(source, state) - if source.match("|"): - item_no = _parse(source, state) - if source.match("|"): - raise error("conditional backref with more than two branches") - else: - item_no = None - if source.next and not source.match(")", 0): - raise error("pattern not properly closed") - subpattern = SubPattern(state) - subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) - return subpattern - - -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) - - -def _parse(source, state): - # parse a simple pattern - subpattern = SubPattern(state) - - # precompute constants into local variables - subpatternappend = subpattern.append - sourceget = source.get - sourcematch = source.match - _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES - - while 1: - - if source.next in PATTERNENDERS: - break # end of subpattern - this = sourceget() - if this is None: - break # end of pattern - - if state.flags & SRE_FLAG_VERBOSE: - # skip whitespace and comments - if this in WHITESPACE: - continue - if this == "#": - while 1: - this = sourceget() - if this in (None, "\n"): - break - continue - - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) - - elif this == "[": - # character set - set = [] - setappend = set.append - ## if sourcematch(":"): - ## pass # handle character classes - if sourcematch("^"): - setappend((NEGATE, None)) - # check remaining characters - start = set[:] - while 1: - this = sourceget() - if this == "]" and set != start: - break - elif this and this[0] == "\\": - code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) - else: - raise error("unexpected end of regular expression") - if sourcematch("-"): - # potential range - this = sourceget() - if this == "]": - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - setappend((LITERAL, ord("-"))) - break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error("bad character range") - setappend((RANGE, (lo, hi))) - else: - raise error("unexpected end of regular expression") - else: - if code1[0] is IN: - code1 = code1[1][0] - setappend(code1) - - # XXX: should move set optimization to compiler! 
- if _len(set) == 1 and set[0][0] is LITERAL: - subpatternappend(set[0]) # optimization - elif _len(set) == 2 and set[0][0] is NEGATE and set[1][0] is LITERAL: - subpatternappend((NOT_LITERAL, set[1][1])) # optimization - else: - # XXX: should add charmap optimization here - subpatternappend((IN, set)) - - elif this and this[0] in REPEAT_CHARS: - # repeat previous item - if this == "?": - min, max = 0, 1 - elif this == "*": - min, max = 0, MAXREPEAT - - elif this == "+": - min, max = 1, MAXREPEAT - elif this == "{": - if source.next == "}": - subpatternappend((LITERAL, ord(this))) - continue - here = source.tell() - min, max = 0, MAXREPEAT - lo = hi = "" - while source.next in DIGITS: - lo = lo + source.get() - if sourcematch(","): - while source.next in DIGITS: - hi = hi + sourceget() - else: - hi = lo - if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) - source.seek(here) - continue - if lo: - min = int(lo) - if min >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if hi: - max = int(hi) - if max >= MAXREPEAT: - raise OverflowError("the repetition number is too large") - if max < min: - raise error("bad repeat interval") - else: - raise error("not supported") - # figure out which item to repeat - if subpattern: - item = subpattern[-1:] - else: - item = None - if not item or (_len(item) == 1 and item[0][0] == AT): - raise error("nothing to repeat") - if item[0][0] in REPEATCODES: - raise error("multiple repeat") - if sourcematch("?"): - subpattern[-1] = (MIN_REPEAT, (min, max, item)) - else: - subpattern[-1] = (MAX_REPEAT, (min, max, item)) - - elif this == ".": - subpatternappend((ANY, None)) - - elif this == "(": - group = 1 - name = None - condgroup = None - if sourcematch("?"): - group = 0 - # options - if sourcematch("P"): - # python extensions - if sourcematch("<"): - # named group: skip forward to end of name - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ">": - break - name = name + char - group = 1 - if not name: - raise error("missing group name") - if not isname(name): - raise error("bad character in group name %r" % - name) - elif sourcematch("="): - # named backreference - name = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - name = name + char - if not name: - raise error("missing group name") - if not isname(name): - raise error("bad character in backref group name " - "%r" % name) - gid = state.groupdict.get(name) - if gid is None: - msg = "unknown group name: {0!r}".format(name) - raise error(msg) - if state.lookbehind: - import warnings - warnings.warn('group references in lookbehind ' - 'assertions are not supported', - RuntimeWarning) - subpatternappend((GROUPREF, gid)) - continue - else: - char = sourceget() - if char is None: - raise error("unexpected end of pattern") - raise error("unknown specifier: ?P%s" % char) - elif sourcematch(":"): - # non-capturing group - group = 2 - elif sourcematch("#"): - # comment - while 1: - if source.next is None or source.next == ")": - break - sourceget() - if not sourcematch(")"): - raise error("unbalanced parenthesis") - continue - elif source.next in ASSERTCHARS: - # lookahead assertions - char = sourceget() - dir = 1 - if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: - raise error("syntax error") - dir = -1 # lookbehind - char = sourceget() - state.lookbehind += 1 - p = _parse_sub(source, state) - if dir < 0: - state.lookbehind -= 1 - if not 
sourcematch(")"): - raise error("unbalanced parenthesis") - if char == "=": - subpatternappend((ASSERT, (dir, p))) - else: - subpatternappend((ASSERT_NOT, (dir, p))) - continue - elif sourcematch("("): - # conditional backreference group - condname = "" - while 1: - char = sourceget() - if char is None: - raise error("unterminated name") - if char == ")": - break - condname = condname + char - group = 2 - if not condname: - raise error("missing group name") - if isname(condname): - condgroup = state.groupdict.get(condname) - if condgroup is None: - msg = "unknown group name: {0!r}".format(condname) - raise error(msg) - else: - try: - condgroup = int(condname) - except ValueError: - raise error("bad character in group name") - if state.lookbehind: - import warnings - warnings.warn('group references in lookbehind ' - 'assertions are not supported', - RuntimeWarning) - else: - # flags - if not source.next in FLAGS: - raise error("unexpected end of pattern") - while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] - if group: - # parse group contents - if group == 2: - # anonymous group - group = None - else: - group = state.opengroup(name) - if condgroup: - p = _parse_sub_cond(source, state, condgroup) - else: - p = _parse_sub(source, state) - if not sourcematch(")"): - raise error("unbalanced parenthesis") - if group is not None: - state.closegroup(group) - subpatternappend((SUBPATTERN, (group, p))) - else: - while 1: - char = sourceget() - if char is None: - raise error("unexpected end of pattern") - if char == ")": - break - raise error("unknown extension") - - elif this == "^": - subpatternappend((AT, AT_BEGINNING)) - - elif this == "$": - subpattern.append((AT, AT_END)) - - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - - else: - raise error("parser error") - - return subpattern - - -def parse(str, flags=0, pattern=None): - # parse 're' pattern into list of (opcode, argument) tuples - - source = Tokenizer(str) - - if pattern is None: - pattern = Pattern() - pattern.flags = flags - pattern.str = str - - p = _parse_sub(source, pattern, 0) - - tail = source.get() - if tail == ")": - raise error("unbalanced parenthesis") - elif tail: - raise error("bogus characters at end of regular expression") - - if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: - # the VERBOSE flag was switched on inside the pattern. to be - # on the safe side, we'll parse the whole thing again... 
- return parse(str, p.pattern.flags) - - if flags & SRE_FLAG_DEBUG: - p.dump() - - return p - - -def parse_template(source, pattern): - # parse 're' replacement string into list of literals and - # group references - s = Tokenizer(source) - sget = s.get - p = [] - a = p.append - - def literal(literal, p=p, pappend=a): - if p and p[-1][0] is LITERAL: - p[-1] = LITERAL, p[-1][1] + literal - else: - pappend((LITERAL, literal)) - - sep = source[:0] - if type(sep) is type(""): - makechar = chr - else: - makechar = unichr - while 1: - this = sget() - if this is None: - break # end of replacement string - if this and this[0] == "\\": - # group - c = this[1:2] - if c == "g": - name = "" - if s.match("<"): - while 1: - char = sget() - if char is None: - raise error("unterminated group name") - if char == ">": - break - name = name + char - if not name: - raise error("missing group name") - try: - index = int(name) - if index < 0: - raise error("negative group number") - except ValueError: - if not isname(name): - raise error("bad character in group name") - try: - index = pattern.groupindex[name] - except KeyError: - msg = "unknown group name: {0!r}".format(name) - raise IndexError(msg) - a((MARK, index)) - elif c == "0": - if s.next in OCTDIGITS: - this = this + sget() - if s.next in OCTDIGITS: - this = this + sget() - literal(makechar(int(this[1:], 8) & 0xff)) - elif c in DIGITS: - isoctal = False - if s.next in DIGITS: - this = this + sget() - if (c in OCTDIGITS and this[2] in OCTDIGITS and - s.next in OCTDIGITS): - this = this + sget() - isoctal = True - literal(makechar(int(this[1:], 8) & 0xff)) - if not isoctal: - a((MARK, int(this[1:]))) - else: - try: - this = makechar(ESCAPES[this][1]) - except KeyError: - pass - literal(this) - else: - literal(this) - # convert template to groups and literals lists - i = 0 - groups = [] - groupsappend = groups.append - literals = [None] * len(p) - for c, s in p: - if c is MARK: - groupsappend((i, s)) - # literal[i] is already None - else: - literals[i] = s - i = i + 1 - return groups, literals - - -def expand_template(template, match): - g = match.group - sep = match.string[:0] - groups, literals = template - literals = literals[:] - try: - for index, group in groups: - literals[index] = s = g(group) - if s is None: - raise error("unmatched group") - except IndexError: - raise error("invalid group reference") - return sep.join(literals) diff --git a/tests/core/test_regexp.py b/tests/core/test_regexp.py index ffc1979..d3d6481 100644 --- a/tests/core/test_regexp.py +++ b/tests/core/test_regexp.py @@ -97,6 +97,7 @@ def test_to_string(): (r'1*', '1*'), (r'1*?', '1*?'), (r'1+', '1+'), + (r'(?>abc)def', '(?>abc)def'), ) for case in cases: regexp, string = case