Default to non-unicode, case-sensitive matching (pattern templates automatically add `(?iu)` for "ignore case" and "unicode" where needed)

pull/1583/head
sebres 8 years ago
parent ab0ac2111c
commit e735f8f568

@ -254,6 +254,11 @@ class DateDetector(object):
if i < len(self.__templates): if i < len(self.__templates):
ddtempl = self.__templates[i] ddtempl = self.__templates[i]
template = ddtempl.template template = ddtempl.template
if template.flags & DateTemplate.LINE_BEGIN:
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last anchored template #%02i ...", i)
match = template.matchDate(line)
else:
distance, endpos = self.__lastPos[0], self.__lastEndPos[0] distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
if logSys.getEffectiveLevel() <= logLevel-1: if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...", logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",

@ -32,13 +32,18 @@ from ..helpers import getLogger
logSys = getLogger(__name__) logSys = getLogger(__name__)
RE_NO_WRD_BOUND_BEG = re.compile(r'^(?:\^|\*\*|\(\?:\^)') RE_NO_WRD_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\*\*|\(\?:\^)')
RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\*\*)$') RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\*\*)$')
RE_DEL_WRD_BOUNDS = re.compile(r'^\*\*|(?<!\\)\*\*$') RE_DEL_WRD_BOUNDS = ( re.compile(r'^(?:\(\?\w+\))?\*\*|(?<!\\)\*\*()$'),
lambda m: m.group().replace('**', '') )
RE_LINE_BOUND_BEG = re.compile(r'^(?:\^|\(\?:\^(?!\|))') RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\(\?:\^(?!\|))')
RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$') RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$')
RE_ALPHA_PATTERN = re.compile(r'(?<!\%)\%[aAbBpc]')
_Templ_RECache = {}
class DateTemplate(object): class DateTemplate(object):
"""A template which searches for and returns a date from a log line. """A template which searches for and returns a date from a log line.
@ -104,7 +109,7 @@ class DateTemplate(object):
if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN
if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END
# remove possible special pattern "**" in front and end of regex: # remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS.sub('', regex) regex = RE_DEL_WRD_BOUNDS[0].sub(RE_DEL_WRD_BOUNDS[1], regex)
self._regex = regex self._regex = regex
self._cRegex = None self._cRegex = None
@ -116,7 +121,11 @@ class DateTemplate(object):
"""Compile regex by first usage. """Compile regex by first usage.
""" """
if not self._cRegex: if not self._cRegex:
self._cRegex = re.compile(self.regex, re.UNICODE | re.IGNORECASE) try:
self._cRegex = re.compile(self.regex)
except Exception as e:
logSys.error('Compile %r failed, expression %r', self.name, self.regex)
raise e
def matchDate(self, line, *args): def matchDate(self, line, *args):
"""Check if regex for date matches on a log line. """Check if regex for date matches on a log line.
@ -235,7 +244,11 @@ class DatePatternRegex(DateTemplate):
self._pattern = pattern self._pattern = pattern
fmt = self._patternRE.sub(r'%(\1)s', pattern) fmt = self._patternRE.sub(r'%(\1)s', pattern)
self.name = fmt % self._patternName self.name = fmt % self._patternName
super(DatePatternRegex, self).setRegex(fmt % timeRE, wordBegin, wordEnd) regex = fmt % timeRE
# add (?iu) for "ignore case" and "unicode" if the pattern contains an alphabetic directive (%a, %A, %b, %B, %p or %c):
if RE_ALPHA_PATTERN.search(pattern):
regex = r'(?iu)' + regex
super(DatePatternRegex, self).setRegex(regex, wordBegin, wordEnd)
def getDate(self, line, dateMatch=None): def getDate(self, line, dateMatch=None):
"""Method to return the date for a log line. """Method to return the date for a log line.

@ -197,10 +197,45 @@ class DateDetectorTest(LogCaptureTestCase):
def testDateTemplate(self): def testDateTemplate(self):
t = DateTemplate() t = DateTemplate()
t.setRegex('^a{3,5}b?c*$') t.setRegex('^a{3,5}b?c*$')
self.assertEqual(t.getRegex(), '^a{3,5}b?c*$') self.assertEqual(t.regex, '^a{3,5}b?c*$')
self.assertRaises(Exception, t.getDate, '') self.assertRaises(Exception, t.getDate, '')
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac') self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
## no word boundaries left and right:
t = DatePatternRegex()
t.pattern = '(?iu)**time:%ExY%Exm%ExdT%ExH%ExM%ExS**'
# ** was removed from the resulting regex:
self.assertFalse('**' in t.regex)
# match date:
dt = 'TIME:20050102T010203'
self.assertEqual(t.matchDate('X' + dt + 'X').group(), dt)
self.assertEqual(t.matchDate(dt).group(), dt)
# wrong year (for exact %ExY):
dt = 'TIME:50050102T010203'
self.assertFalse(t.matchDate(dt))
## start boundary left and word boundary right:
t = DatePatternRegex()
t.pattern = '%ExLBtime:%ExY%Exm%ExdT%ExH%ExM%ExS'
self.assertTrue('^' in t.regex)
# try match date:
dt = 'time:20050102T010203'
self.assertFalse(t.matchDate('X' + dt))
self.assertFalse(t.matchDate(dt + 'X'))
self.assertEqual(t.matchDate('##' + dt + '...').group(), dt)
self.assertEqual(t.matchDate(dt).group(), dt)
# case sensitive:
dt = 'TIME:20050102T010203'
self.assertFalse(t.matchDate(dt))
## auto-switching "ignore case" and "unicode"
t = DatePatternRegex()
t.pattern = '^%Y %b %d'
self.assertTrue('(?iu)' in t.regex)
dt = '2005 jun 03'; self.assertEqual(t.matchDate(dt).group(), dt)
dt = '2005 Jun 03'; self.assertEqual(t.matchDate(dt).group(), dt)
dt = '2005 JUN 03'; self.assertEqual(t.matchDate(dt).group(), dt)
def testAmbiguousInOrderedTemplates(self): def testAmbiguousInOrderedTemplates(self):
dd = self.datedetector dd = self.datedetector
for (debit, line, cnt) in ( for (debit, line, cnt) in (

Loading…
Cancel
Save