Default to non-unicode, case-sensitive matching (pattern templates automatically add `(?iu)` for "ignore case" and "unicode" when the pattern requires it)

pull/1583/head
sebres 8 years ago
parent ab0ac2111c
commit e735f8f568

@ -254,27 +254,32 @@ class DateDetector(object):
if i < len(self.__templates): if i < len(self.__templates):
ddtempl = self.__templates[i] ddtempl = self.__templates[i]
template = ddtempl.template template = ddtempl.template
distance, endpos = self.__lastPos[0], self.__lastEndPos[0] if template.flags & DateTemplate.LINE_BEGIN:
if logSys.getEffectiveLevel() <= logLevel-1: if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...", logSys.log(logLevel-1, " try to match last anchored template #%02i ...", i)
i, distance, endpos, match = template.matchDate(line)
line[distance-1:distance], self.__lastPos[1], else:
line[distance:endpos], distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
line[endpos:endpos+1], self.__lastEndPos[1]) if logSys.getEffectiveLevel() <= logLevel-1:
# check same boundaries left/right, otherwise possible collision/pattern switch: logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",
if (line[distance-1:distance] == self.__lastPos[1] and i, distance, endpos,
line[endpos:endpos+1] == self.__lastEndPos[1] line[distance-1:distance], self.__lastPos[1],
): line[distance:endpos],
match = template.matchDate(line, distance, endpos) line[endpos:endpos+1], self.__lastEndPos[1])
if match: # check same boundaries left/right, otherwise possible collision/pattern switch:
distance = match.start() if (line[distance-1:distance] == self.__lastPos[1] and
endpos = match.end() line[endpos:endpos+1] == self.__lastEndPos[1]
# if different position, possible collision/pattern switch: ):
if distance == self.__lastPos[0] and endpos == self.__lastEndPos[0]: match = template.matchDate(line, distance, endpos)
logSys.log(logLevel, " matched last time template #%02i", i) if match:
else: distance = match.start()
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...") endpos = match.end()
match = None # if different position, possible collision/pattern switch:
if distance == self.__lastPos[0] and endpos == self.__lastEndPos[0]:
logSys.log(logLevel, " matched last time template #%02i", i)
else:
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...")
match = None
# search template and better match: # search template and better match:
if not match: if not match:
self.__lastTemplIdx = 0x7fffffff self.__lastTemplIdx = 0x7fffffff

@ -32,13 +32,18 @@ from ..helpers import getLogger
logSys = getLogger(__name__) logSys = getLogger(__name__)
RE_NO_WRD_BOUND_BEG = re.compile(r'^(?:\^|\*\*|\(\?:\^)') RE_NO_WRD_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\*\*|\(\?:\^)')
RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\*\*)$') RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\*\*)$')
RE_DEL_WRD_BOUNDS = re.compile(r'^\*\*|(?<!\\)\*\*$') RE_DEL_WRD_BOUNDS = ( re.compile(r'^(?:\(\?\w+\))?\*\*|(?<!\\)\*\*()$'),
lambda m: m.group().replace('**', '') )
RE_LINE_BOUND_BEG = re.compile(r'^(?:\^|\(\?:\^(?!\|))') RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\(\?:\^(?!\|))')
RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$') RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$')
RE_ALPHA_PATTERN = re.compile(r'(?<!\%)\%[aAbBpc]')
_Templ_RECache = {}
class DateTemplate(object): class DateTemplate(object):
"""A template which searches for and returns a date from a log line. """A template which searches for and returns a date from a log line.
@ -104,7 +109,7 @@ class DateTemplate(object):
if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN
if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END
# remove possible special pattern "**" in front and end of regex: # remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS.sub('', regex) regex = RE_DEL_WRD_BOUNDS[0].sub(RE_DEL_WRD_BOUNDS[1], regex)
self._regex = regex self._regex = regex
self._cRegex = None self._cRegex = None
@ -116,7 +121,11 @@ class DateTemplate(object):
"""Compile regex by first usage. """Compile regex by first usage.
""" """
if not self._cRegex: if not self._cRegex:
self._cRegex = re.compile(self.regex, re.UNICODE | re.IGNORECASE) try:
self._cRegex = re.compile(self.regex)
except Exception as e:
logSys.error('Compile %r failed, expression %r', self.name, self.regex)
raise e
def matchDate(self, line, *args): def matchDate(self, line, *args):
"""Check if regex for date matches on a log line. """Check if regex for date matches on a log line.
@ -235,7 +244,11 @@ class DatePatternRegex(DateTemplate):
self._pattern = pattern self._pattern = pattern
fmt = self._patternRE.sub(r'%(\1)s', pattern) fmt = self._patternRE.sub(r'%(\1)s', pattern)
self.name = fmt % self._patternName self.name = fmt % self._patternName
super(DatePatternRegex, self).setRegex(fmt % timeRE, wordBegin, wordEnd) regex = fmt % timeRE
# if expected add (?iu) for "ignore case" and "unicode":
if RE_ALPHA_PATTERN.search(pattern):
regex = r'(?iu)' + regex
super(DatePatternRegex, self).setRegex(regex, wordBegin, wordEnd)
def getDate(self, line, dateMatch=None): def getDate(self, line, dateMatch=None):
"""Method to return the date for a log line. """Method to return the date for a log line.

@ -195,11 +195,46 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17') self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
def testDateTemplate(self): def testDateTemplate(self):
t = DateTemplate() t = DateTemplate()
t.setRegex('^a{3,5}b?c*$') t.setRegex('^a{3,5}b?c*$')
self.assertEqual(t.getRegex(), '^a{3,5}b?c*$') self.assertEqual(t.regex, '^a{3,5}b?c*$')
self.assertRaises(Exception, t.getDate, '') self.assertRaises(Exception, t.getDate, '')
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac') self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
## no word boundaries left and right:
t = DatePatternRegex()
t.pattern = '(?iu)**time:%ExY%Exm%ExdT%ExH%ExM%ExS**'
# ** was removed from end-regex:
self.assertFalse('**' in t.regex)
# match date:
dt = 'TIME:20050102T010203'
self.assertEqual(t.matchDate('X' + dt + 'X').group(), dt)
self.assertEqual(t.matchDate(dt).group(), dt)
# wrong year (for exact %ExY):
dt = 'TIME:50050102T010203'
self.assertFalse(t.matchDate(dt))
## start boundary left and word boundary right:
t = DatePatternRegex()
t.pattern = '%ExLBtime:%ExY%Exm%ExdT%ExH%ExM%ExS'
self.assertTrue('^' in t.regex)
# try match date:
dt = 'time:20050102T010203'
self.assertFalse(t.matchDate('X' + dt))
self.assertFalse(t.matchDate(dt + 'X'))
self.assertEqual(t.matchDate('##' + dt + '...').group(), dt)
self.assertEqual(t.matchDate(dt).group(), dt)
# case sensitive:
dt = 'TIME:20050102T010203'
self.assertFalse(t.matchDate(dt))
## auto-switching "ignore case" and "unicode"
t = DatePatternRegex()
t.pattern = '^%Y %b %d'
self.assertTrue('(?iu)' in t.regex)
dt = '2005 jun 03'; self.assertEqual(t.matchDate(dt).group(), dt)
dt = '2005 Jun 03'; self.assertEqual(t.matchDate(dt).group(), dt)
dt = '2005 JUN 03'; self.assertEqual(t.matchDate(dt).group(), dt)
def testAmbiguousInOrderedTemplates(self): def testAmbiguousInOrderedTemplates(self):
dd = self.datedetector dd = self.datedetector

Loading…
Cancel
Save