extends date detector template with distance (position of match in log-line), to prevent grave collision using (re)ordered template list (e.g. find-spot of wrong date-match inside foreign input, misleading date patterns by ambiguous formats, etc.);

By change of the distance (e.g. another format found), the pattern with smallest distance will be always preferred now.
To speedup (template lookup) resp. minimize of list reorder counts, the distance will be used as divider factor of the template weight by the templates comparison.
pull/1583/head
sebres 8 years ago
parent 84fe55b99b
commit 75a5440acf

@ -129,11 +129,13 @@ class DateDetectorTemplate(object):
Prevents collectively usage of hits/lastUsed in cached templates
"""
__slots__ = ('template', 'hits', 'lastUsed')
__slots__ = ('template', 'hits', 'lastUsed', 'distance')
def __init__(self, template):
self.template = template
self.hits = 0
self.lastUsed = 0
# the last distance to date-match within the log file:
self.distance = 0x7fffffff
def __getattr__(self, name):
""" Returns attribute of template (called for parameters not in slots)
@ -156,6 +158,8 @@ class DateDetector(object):
self.__known_names = set()
# time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300
# first free place:
self.__firstUnused = 0
def _appendTemplate(self, template):
name = template.name
@ -215,22 +219,52 @@ class DateDetector(object):
The regex match returned from the first successfully matched
template.
"""
match = None
i = 0
found = None, 0x7fffffff, -1
with self.__lock:
for ddtempl in self.__templates:
template = ddtempl.template
match = template.matchDate(line)
if match is not None:
distance = max(1, match.start() + 1)
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Matched time template %s", template.name)
logSys.log(logLevel, " matched time template #%r (at %r <= %r) %s",
i, distance, ddtempl.distance, template.name)
## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further:
if distance > ddtempl.distance and len(self.__templates) > 1:
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both:
if distance < found[1]:
found = match, distance, i
## search further:
match = None
i += 1
continue
## winner - stop search:
break
i += 1
# check other template was found (use this one with shortest distance):
if match is None and found[0]:
match, distance, i = found
ddtempl = self.__templates[i]
template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc:
if match is not None:
ddtempl.hits += 1
ddtempl.distance = distance
ddtempl.lastUsed = time.time()
if self.__firstUnused == i:
self.__firstUnused += 1
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i:
logSys.log(logLevel, " -> reorder template #%r, hits: %r", i, ddtempl.hits)
self._reorderTemplate(i)
# return tuple with match and template reference used for parsing:
return (match, template)
i += 1
# not found:
return (None, None)
@ -263,7 +297,7 @@ class DateDetector(object):
date = template.getDate(line, timeMatch[0])
if date is not None:
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Got time %f for %r using template %s",
logSys.log(logLevel, " got time %f for %r using template %s",
date[0], date[1].group(), template.name)
return date
except ValueError:
@ -282,23 +316,24 @@ class DateDetector(object):
ddtempl = templates[num]
## current hits and time the template was long unused:
untime = ddtempl.lastUsed - self.__unusedTime
hits = ddtempl.hits * ddtempl.template.weight
## try to move faster (first 2 if it still unused, or half of part to current template position):
for pos in (0, 1, num // 2):
phits = templates[pos].hits
if not phits: # if we've found an unused
break
phits *= templates[pos].template.weight
weight = ddtempl.hits * ddtempl.template.weight / ddtempl.distance
## try to move faster (first if unused available, or half of part to current template position):
pos = self.__firstUnused if self.__firstUnused < num else num // 2
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also :
if not phits or hits > phits + 5 or templates[pos].lastUsed < untime:
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
if not pweight or weight > pweight + 5 or templates[pos].lastUsed < untime:
## if not larger (and target position recently used) - move slow (exact 1 position):
if hits <= phits and templates[pos].lastUsed > untime:
if weight <= pweight and templates[pos].lastUsed > untime:
pos = num-1
## if still smaller and template at position used, don't move:
phits = templates[pos].hits * templates[pos].template.weight
if hits < phits and templates[pos].lastUsed > untime:
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
if weight < pweight and templates[pos].lastUsed > untime:
return
templates[pos], templates[num] = ddtempl, templates[pos]
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
## correct first unused:
if pos == self.__firstUnused:
self.__firstUnused += 1

@ -198,6 +198,29 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertRaises(Exception, t.getDate, '')
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
def testAmbiguousInOrderedTemplates(self):
dd = DateDetector()
dd.addDefaultTemplate()
for (debit, line, cnt) in (
("2003-03-07 17:05:01", "some free text 2003-03-07 17:05:01 test ...", 15),
# distance collision detection (date from foreign input should not be found):
("030324 0:04:00", "server mysqld[1000]: 030324 0:04:00 [Warning] Access denied ..."
" foreign-input just some free text 2003-03-07 17:05:01 test", 10),
# distance collision detection (first date should be found):
("Sep 16 21:30:26", "server mysqld[1020]: Sep 16 21:30:26 server mysqld: 030916 21:30:26 [Warning] Access denied", 10),
# just to test sorting:
("2005-10-07 06:09:42", "server mysqld[5906]: 2005-10-07 06:09:42 5907 [Warning] Access denied", 20),
("2005-10-08T15:26:18.237955", "server mysqld[5906]: 2005-10-08T15:26:18.237955 6 [Note] Access denied", 20),
# date format changed again:
("051009 10:05:30", "server mysqld[1000]: 051009 10:05:30 [Warning] Access denied ...", 20),
):
logSys.debug('== test: %r', (debit, line, cnt))
for i in range(cnt):
logSys.debug('Line: %s', line)
match, template = dd.matchTime(line)
self.assertTrue(match)
self.assertEqual(match.group(), debit)
iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z")
@ -309,30 +332,6 @@ class CustomDateFormatsTest(unittest.TestCase):
else:
self.assertEqual(date, None)
# def testAmbiguousUsingOrderedTemplates(self):
# defDD = DateDetector()
# defDD.addDefaultTemplate()
# for (matched, dp, line) in (
# # wrong date recognized short month/day (unbounded date pattern without separator),
# # in the 2nd and 3th tests (with precise month and day) it should find correct the 2nd date:
# ('200333 010203', r'%Y%m%d %H%M%S', "text:200333 010203 | date:20031230 010203"),
# ('20031230 010203', r'%ExY%Exm%Exd %ExH%ExM%ExS', "text:200333 010203 | date:20031230 010203"),
# ('20031230 010203', None, "text:200333 010203 | date:20031230 010203"),
# ):
# logSys.debug('== test: %r', (matched, dp, line))
# if dp is None:
# dd = defDD
# else:
# dp = DatePatternRegex(dp)
# dd = DateDetector()
# dd.appendTemplate(dp)
# date = dd.getTime(line)
# if matched:
# self.assertTrue(date)
# self.assertEqual(matched, date[1].group())
# else:
# self.assertEqual(date, None)
# def testDefaultTempate(self):
# self.__datedetector.setDefaultRegex("^\S{3}\s{1,2}\d{1,2} \d{2}:\d{2}:\d{2}")

Loading…
Cancel
Save