extends date detector template with distance (position of match in log-line), to prevent grave collision using (re)ordered template list (e.g. find-spot of wrong date-match inside foreign input, misleading date patterns by ambiguous formats, etc.);

By change of the distance (e.g. another format found), the pattern with smallest distance will be always preferred now.
To speedup (template lookup) resp. minimize of list reorder counts, the distance will be used as divider factor of the template weight by the templates comparison.
pull/1583/head
sebres 2016-09-30 20:37:02 +02:00
parent 84fe55b99b
commit 75a5440acf
2 changed files with 81 additions and 47 deletions

View File

@ -129,11 +129,13 @@ class DateDetectorTemplate(object):
Prevents collectively usage of hits/lastUsed in cached templates Prevents collectively usage of hits/lastUsed in cached templates
""" """
__slots__ = ('template', 'hits', 'lastUsed') __slots__ = ('template', 'hits', 'lastUsed', 'distance')
def __init__(self, template): def __init__(self, template):
self.template = template self.template = template
self.hits = 0 self.hits = 0
self.lastUsed = 0 self.lastUsed = 0
# the last distance to date-match within the log file:
self.distance = 0x7fffffff
def __getattr__(self, name): def __getattr__(self, name):
""" Returns attribute of template (called for parameters not in slots) """ Returns attribute of template (called for parameters not in slots)
@ -156,6 +158,8 @@ class DateDetector(object):
self.__known_names = set() self.__known_names = set()
# time the template was long unused (currently 300 == 5m): # time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300 self.__unusedTime = 300
# first free place:
self.__firstUnused = 0
def _appendTemplate(self, template): def _appendTemplate(self, template):
name = template.name name = template.name
@ -215,22 +219,52 @@ class DateDetector(object):
The regex match returned from the first successfully matched The regex match returned from the first successfully matched
template. template.
""" """
match = None
i = 0 i = 0
found = None, 0x7fffffff, -1
with self.__lock: with self.__lock:
for ddtempl in self.__templates: for ddtempl in self.__templates:
template = ddtempl.template template = ddtempl.template
match = template.matchDate(line) match = template.matchDate(line)
if match is not None: if match is not None:
distance = max(1, match.start() + 1)
if logSys.getEffectiveLevel() <= logLevel: if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Matched time template %s", template.name) logSys.log(logLevel, " matched time template #%r (at %r <= %r) %s",
i, distance, ddtempl.distance, template.name)
## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further:
if distance > ddtempl.distance and len(self.__templates) > 1:
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both:
if distance < found[1]:
found = match, distance, i
## search further:
match = None
i += 1
continue
## winner - stop search:
break
i += 1
# check other template was found (use this one with shortest distance):
if match is None and found[0]:
match, distance, i = found
ddtempl = self.__templates[i]
template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc:
if match is not None:
ddtempl.hits += 1 ddtempl.hits += 1
ddtempl.distance = distance
ddtempl.lastUsed = time.time() ddtempl.lastUsed = time.time()
if self.__firstUnused == i:
self.__firstUnused += 1
# if not first - try to reorder current template (bubble up), they will be not sorted anymore: # if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i: if i:
logSys.log(logLevel, " -> reorder template #%r, hits: %r", i, ddtempl.hits)
self._reorderTemplate(i) self._reorderTemplate(i)
# return tuple with match and template reference used for parsing: # return tuple with match and template reference used for parsing:
return (match, template) return (match, template)
i += 1
# not found: # not found:
return (None, None) return (None, None)
@ -263,7 +297,7 @@ class DateDetector(object):
date = template.getDate(line, timeMatch[0]) date = template.getDate(line, timeMatch[0])
if date is not None: if date is not None:
if logSys.getEffectiveLevel() <= logLevel: if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Got time %f for %r using template %s", logSys.log(logLevel, " got time %f for %r using template %s",
date[0], date[1].group(), template.name) date[0], date[1].group(), template.name)
return date return date
except ValueError: except ValueError:
@ -282,23 +316,24 @@ class DateDetector(object):
ddtempl = templates[num] ddtempl = templates[num]
## current hits and time the template was long unused: ## current hits and time the template was long unused:
untime = ddtempl.lastUsed - self.__unusedTime untime = ddtempl.lastUsed - self.__unusedTime
hits = ddtempl.hits * ddtempl.template.weight weight = ddtempl.hits * ddtempl.template.weight / ddtempl.distance
## try to move faster (first 2 if it still unused, or half of part to current template position): ## try to move faster (first if unused available, or half of part to current template position):
for pos in (0, 1, num // 2): pos = self.__firstUnused if self.__firstUnused < num else num // 2
phits = templates[pos].hits pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
if not phits: # if we've found an unused
break
phits *= templates[pos].template.weight
## don't move too often (multiline logs resp. log's with different date patterns), ## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also : ## if template not used too long, replace it also :
if not phits or hits > phits + 5 or templates[pos].lastUsed < untime: logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
if not pweight or weight > pweight + 5 or templates[pos].lastUsed < untime:
## if not larger (and target position recently used) - move slow (exact 1 position): ## if not larger (and target position recently used) - move slow (exact 1 position):
if hits <= phits and templates[pos].lastUsed > untime: if weight <= pweight and templates[pos].lastUsed > untime:
pos = num-1 pos = num-1
## if still smaller and template at position used, don't move: ## if still smaller and template at position used, don't move:
phits = templates[pos].hits * templates[pos].template.weight pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
if hits < phits and templates[pos].lastUsed > untime: logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
if weight < pweight and templates[pos].lastUsed > untime:
return return
templates[pos], templates[num] = ddtempl, templates[pos] templates[pos], templates[num] = ddtempl, templates[pos]
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
## correct first unused:
if pos == self.__firstUnused:
self.__firstUnused += 1

View File

@ -198,6 +198,29 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertRaises(Exception, t.getDate, '') self.assertRaises(Exception, t.getDate, '')
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac') self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
def testAmbiguousInOrderedTemplates(self):
dd = DateDetector()
dd.addDefaultTemplate()
for (debit, line, cnt) in (
("2003-03-07 17:05:01", "some free text 2003-03-07 17:05:01 test ...", 15),
# distance collision detection (date from foreign input should not be found):
("030324 0:04:00", "server mysqld[1000]: 030324 0:04:00 [Warning] Access denied ..."
" foreign-input just some free text 2003-03-07 17:05:01 test", 10),
# distance collision detection (first date should be found):
("Sep 16 21:30:26", "server mysqld[1020]: Sep 16 21:30:26 server mysqld: 030916 21:30:26 [Warning] Access denied", 10),
# just to test sorting:
("2005-10-07 06:09:42", "server mysqld[5906]: 2005-10-07 06:09:42 5907 [Warning] Access denied", 20),
("2005-10-08T15:26:18.237955", "server mysqld[5906]: 2005-10-08T15:26:18.237955 6 [Note] Access denied", 20),
# date format changed again:
("051009 10:05:30", "server mysqld[1000]: 051009 10:05:30 [Warning] Access denied ...", 20),
):
logSys.debug('== test: %r', (debit, line, cnt))
for i in range(cnt):
logSys.debug('Line: %s', line)
match, template = dd.matchTime(line)
self.assertTrue(match)
self.assertEqual(match.group(), debit)
iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z") iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z")
@ -309,30 +332,6 @@ class CustomDateFormatsTest(unittest.TestCase):
else: else:
self.assertEqual(date, None) self.assertEqual(date, None)
# def testAmbiguousUsingOrderedTemplates(self):
# defDD = DateDetector()
# defDD.addDefaultTemplate()
# for (matched, dp, line) in (
# # wrong date recognized short month/day (unbounded date pattern without separator),
# # in the 2nd and 3th tests (with precise month and day) it should find correct the 2nd date:
# ('200333 010203', r'%Y%m%d %H%M%S', "text:200333 010203 | date:20031230 010203"),
# ('20031230 010203', r'%ExY%Exm%Exd %ExH%ExM%ExS', "text:200333 010203 | date:20031230 010203"),
# ('20031230 010203', None, "text:200333 010203 | date:20031230 010203"),
# ):
# logSys.debug('== test: %r', (matched, dp, line))
# if dp is None:
# dd = defDD
# else:
# dp = DatePatternRegex(dp)
# dd = DateDetector()
# dd.appendTemplate(dp)
# date = dd.getTime(line)
# if matched:
# self.assertTrue(date)
# self.assertEqual(matched, date[1].group())
# else:
# self.assertEqual(date, None)
# def testDefaultTempate(self): # def testDefaultTempate(self):
# self.__datedetector.setDefaultRegex("^\S{3}\s{1,2}\d{1,2} \d{2}:\d{2}:\d{2}") # self.__datedetector.setDefaultRegex("^\S{3}\s{1,2}\d{1,2} \d{2}:\d{2}:\d{2}")