optimized to better usage of the last time template (parse part of line at the same place as last time, if enclosed in the same boundaries)

thereby follow rule "shortest distance to datetime should win", so possible collision causes search though all templates;
speedup it a little bit (not possible collision if distance <= 1 or if line-begin anchoring, so break search if such template found)
pull/1583/head
sebres 2016-10-05 12:11:06 +02:00
parent b9033d004e
commit f56ff5f48b
3 changed files with 145 additions and 65 deletions

View File

@ -26,7 +26,7 @@ import time
from threading import Lock from threading import Lock
from .datetemplate import DatePatternRegex, DateTai64n, DateEpoch from .datetemplate import DateTemplate, DatePatternRegex, DateTai64n, DateEpoch
from ..helpers import getLogger from ..helpers import getLogger
# Gets the instance of the logger. # Gets the instance of the logger.
@ -142,6 +142,10 @@ class DateDetectorTemplate(object):
# the last distance to date-match within the log file: # the last distance to date-match within the log file:
self.distance = 0x7fffffff self.distance = 0x7fffffff
@property
def weight(self):
return self.hits * self.template.weight / max(1, self.distance)
def __getattr__(self, name): def __getattr__(self, name):
""" Returns attribute of template (called for parameters not in slots) """ Returns attribute of template (called for parameters not in slots)
""" """
@ -158,13 +162,14 @@ class DateDetector(object):
_defCache = DateDetectorCache() _defCache = DateDetectorCache()
def __init__(self): def __init__(self):
self.__lock = Lock()
self.__templates = list() self.__templates = list()
self.__known_names = set() self.__known_names = set()
# time the template was long unused (currently 300 == 5m): # time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300 self.__unusedTime = 300
# last known distance: # last known distance (bypass one char collision) and end position:
self.__lastDistance = 0 self.__lastPos = 1, None
self.__lastEndPos = 0x7fffffff, None
self.__lastTemplIdx = 0x7fffffff
# first free place: # first free place:
self.__firstUnused = 0 self.__firstUnused = 0
@ -198,9 +203,8 @@ class DateDetector(object):
def addDefaultTemplate(self): def addDefaultTemplate(self):
"""Add Fail2Ban's default set of date templates. """Add Fail2Ban's default set of date templates.
""" """
with self.__lock: for template in DateDetector._defCache.templates:
for template in DateDetector._defCache.templates: self._appendTemplate(template)
self._appendTemplate(template)
@property @property
def templates(self): def templates(self):
@ -226,29 +230,59 @@ class DateDetector(object):
The regex match returned from the first successfully matched The regex match returned from the first successfully matched
template. template.
""" """
#logSys.log(logLevel, "try to match time for line: %.250s", line)
match = None match = None
i = 0 # first try to use last template with same start/end position:
found = None, 0x7fffffff, -1 i = self.__lastTemplIdx
with self.__lock: if i < len(self.__templates):
ddtempl = self.__templates[i]
template = ddtempl.template
distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
# check same boundaries left/right, otherwise possible collision/pattern switch:
if (line[distance-1:distance] == self.__lastPos[1] and
line[endpos:endpos+1] == self.__lastEndPos[1]
):
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ... %s ...",
i, distance, endpos, line[distance:endpos])
match = template.matchDate(line, distance, endpos)
if match:
distance = match.start()
endpos = match.end()
# if different position, possible collision/pattern switch:
if distance == self.__lastPos[0] and endpos == self.__lastEndPos[0]:
logSys.log(logLevel, " matched last time template #%02i", i)
else:
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...")
match = None
# search template and better match:
if not match:
self.__lastTemplIdx = 0x7fffffff
logSys.log(logLevel, " search template ...")
found = None, 0x7fffffff, -1
i = 0
for ddtempl in self.__templates: for ddtempl in self.__templates:
template = ddtempl.template template = ddtempl.template
match = template.matchDate(line) match = template.matchDate(line)
if match is not None: if match:
distance = max(1, match.start() + 1) distance = match.start()
endpos = match.end()
if logSys.getEffectiveLevel() <= logLevel: if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " matched time template #%r (at %r <= %r, %r) %s", logSys.log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastDistance, template.name) i, distance, ddtempl.distance, self.__lastPos[0], template.name)
## if line-begin anchored - stop searching:
if template.flags & DateTemplate.LINE_BEGIN:
break
## [grave] if distance changed, possible date-match was found somewhere ## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further: ## in body of message, so save this template, and search further:
if ( if (
(distance > ddtempl.distance or distance > self.__lastDistance) and (distance > ddtempl.distance or distance > self.__lastPos[0]) and
len(self.__templates) > 1 len(self.__templates) > 1
): ):
if logSys.getEffectiveLevel() <= logLevel: logSys.log(logLevel, " ** distance collision - pattern change, reserve")
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both: ## shortest of both:
if distance < found[1]: if distance < found[1]:
found = match, distance, i found = match, distance, endpos, i
## search further: ## search further:
match = None match = None
i += 1 i += 1
@ -257,25 +291,29 @@ class DateDetector(object):
break break
i += 1 i += 1
# check other template was found (use this one with shortest distance): # check other template was found (use this one with shortest distance):
if match is None and found[0]: if not match and found[0]:
match, distance, i = found match, distance, endpos, i = found
logSys.log(logLevel, " use best time template #%02i", i)
ddtempl = self.__templates[i] ddtempl = self.__templates[i]
template = ddtempl.template template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc: # we've winner, incr hits, set distance, usage, reorder, etc:
if match is not None: if match:
ddtempl.hits += 1 ddtempl.hits += 1
self.__lastDistance = ddtempl.distance = distance ddtempl.lastUsed = time.time()
ddtempl.lastUsed = time.time() ddtempl.distance = distance
if self.__firstUnused == i: if self.__firstUnused == i:
self.__firstUnused += 1 self.__firstUnused += 1
# if not first - try to reorder current template (bubble up), they will be not sorted anymore: self.__lastPos = distance, line[distance-1:distance]
if i: self.__lastEndPos = endpos, line[endpos:endpos+1]
logSys.log(logLevel, " -> reorder template #%r, hits: %r", i, ddtempl.hits) # if not first - try to reorder current template (bubble up), they will be not sorted anymore:
self._reorderTemplate(i) if i:
# return tuple with match and template reference used for parsing: i = self._reorderTemplate(i)
return (match, template) self.__lastTemplIdx = i
# return tuple with match and template reference used for parsing:
return (match, template)
# not found: # not found:
logSys.log(logLevel, " no template.")
return (None, None) return (None, None)
def getTime(self, line, timeMatch=None): def getTime(self, line, timeMatch=None):
@ -311,7 +349,8 @@ class DateDetector(object):
date[0], date[1].group(), template.name) date[0], date[1].group(), template.name)
return date return date
except ValueError: except ValueError:
return None pass
return None
def _reorderTemplate(self, num): def _reorderTemplate(self, num):
"""Reorder template (bubble up) in template list if hits grows enough. """Reorder template (bubble up) in template list if hits grows enough.
@ -324,29 +363,36 @@ class DateDetector(object):
if num: if num:
templates = self.__templates templates = self.__templates
ddtempl = templates[num] ddtempl = templates[num]
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> reorder template #%02i, hits: %r", num, ddtempl.hits)
## current hits and time the template was long unused: ## current hits and time the template was long unused:
untime = ddtempl.lastUsed - self.__unusedTime untime = ddtempl.lastUsed - self.__unusedTime
weight = ddtempl.hits * ddtempl.template.weight / ddtempl.distance weight = ddtempl.weight
## try to move faster (first if unused available, or half of part to current template position): ## try to move faster (first if unused available, or half of part to current template position):
pos = self.__firstUnused if self.__firstUnused < num else num // 2 pos = self.__firstUnused if self.__firstUnused < num else num // 2
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance pweight = templates[pos].weight
## don't move too often (multiline logs resp. log's with different date patterns), ## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also : ## if template not used too long, replace it also :
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r", if logSys.getEffectiveLevel() <= logLevel:
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits) logSys.log(logLevel, " -> compare template #%02i & #%02i, weight %.3f > %.3f, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if not pweight or weight > pweight or templates[pos].lastUsed < untime: if not pweight or weight > pweight or templates[pos].lastUsed < untime:
## if not larger (and target position recently used) - move slow (exact 1 position): ## if not larger (and target position recently used) - move slow (exact 1 position):
if weight <= pweight and templates[pos].lastUsed > untime: if weight <= pweight and templates[pos].lastUsed > untime:
pos = num-1 pos = num-1
## if still smaller and template at position used, don't move: ## if still smaller and template at position used, don't move:
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance pweight = templates[pos].weight
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r", if logSys.getEffectiveLevel() <= logLevel:
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits) logSys.log(logLevel, " -> compare template #%02i & #%02i, weight %.3f > %.3f, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if weight < pweight and templates[pos].lastUsed > untime: if weight < pweight and templates[pos].lastUsed > untime:
return return
del templates[num] del templates[num]
templates[pos:0] = [ddtempl] templates[pos:0] = [ddtempl]
## correct first unused: ## correct first unused:
if pos == self.__firstUnused: while self.__firstUnused < len(templates) and templates[self.__firstUnused].hits:
self.__firstUnused += 1 self.__firstUnused += 1
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos) if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> moved template #%02i -> #%02i", num, pos)
return pos
return num

View File

@ -48,9 +48,14 @@ class DateTemplate(object):
regex regex
""" """
LINE_BEGIN = 8
WORD_BEGIN = 2
WORD_END = 1
def __init__(self): def __init__(self):
self.name = "" self.name = ""
self.weight = 1.0 self.weight = 1.0
self.flags = 0
self._regex = "" self._regex = ""
self._cRegex = None self._cRegex = None
@ -83,12 +88,14 @@ class DateTemplate(object):
regex = regex.strip() regex = regex.strip()
# if word or line start boundary: # if word or line start boundary:
if wordBegin and not RE_NO_WRD_BOUND_BEG.search(regex): if wordBegin and not RE_NO_WRD_BOUND_BEG.search(regex):
self.flags |= DateTemplate.WORD_BEGIN if wordBegin != 'start' else DateTemplate.LINE_BEGIN
regex = (r'(?=^|\b|\W)' if wordBegin != 'start' else r"(?:^|(?<=^\W)|(?<=^\W{2}))") + regex regex = (r'(?=^|\b|\W)' if wordBegin != 'start' else r"(?:^|(?<=^\W)|(?<=^\W{2}))") + regex
self.name = ('{*WD-BEG}' if wordBegin != 'start' else '{^LN-BEG}') + self.name self.name = ('{*WD-BEG}' if wordBegin != 'start' else '{^LN-BEG}') + self.name
# if word end boundary: # if word end boundary:
if wordEnd and not RE_NO_WRD_BOUND_END.search(regex): if wordEnd and not RE_NO_WRD_BOUND_END.search(regex):
self.flags |= DateTemplate.WORD_END
regex += r'(?=\b|\W|$)' regex += r'(?=\b|\W|$)'
self.name += ('{*WD-END}' if wordEnd else '') self.name += '{*WD-END}'
# remove possible special pattern "**" in front and end of regex: # remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS.sub('', regex) regex = RE_DEL_WRD_BOUNDS.sub('', regex)
self._regex = regex self._regex = regex
@ -97,12 +104,18 @@ class DateTemplate(object):
"""Regex used to search for date. """Regex used to search for date.
""") """)
def matchDate(self, line): def _compileRegex(self):
"""Check if regex for date matches on a log line. """Compile regex by first usage.
""" """
if not self._cRegex: if not self._cRegex:
self._cRegex = re.compile(self.regex, re.UNICODE | re.IGNORECASE) self._cRegex = re.compile(self.regex, re.UNICODE | re.IGNORECASE)
dateMatch = self._cRegex.search(line)
def matchDate(self, line, *args):
"""Check if regex for date matches on a log line.
"""
if not self._cRegex:
self._compileRegex()
dateMatch = self._cRegex.search(line, *args); # pos, endpos
return dateMatch return dateMatch
@abstractmethod @abstractmethod

View File

@ -42,17 +42,20 @@ class DateDetectorTest(LogCaptureTestCase):
def setUp(self): def setUp(self):
"""Call before every test case.""" """Call before every test case."""
LogCaptureTestCase.setUp(self) LogCaptureTestCase.setUp(self)
self.__old_eff_level = datedetector.logLevel
datedetector.logLevel = logSys.getEffectiveLevel()
setUpMyTime() setUpMyTime()
self.__datedetector = DateDetector() self.__datedetector = None
self.__datedetector.addDefaultTemplate()
def tearDown(self): def tearDown(self):
"""Call after every test case.""" """Call after every test case."""
LogCaptureTestCase.tearDown(self) LogCaptureTestCase.tearDown(self)
datedetector.logLevel = self.__old_eff_level
tearDownMyTime() tearDownMyTime()
@property
def datedetector(self):
if self.__datedetector is None:
self.__datedetector = DateDetector()
self.__datedetector.addDefaultTemplate()
return self.__datedetector
def testGetEpochTime(self): def testGetEpochTime(self):
# correct epoch time, using all variants: # correct epoch time, using all variants:
@ -60,7 +63,7 @@ class DateDetectorTest(LogCaptureTestCase):
for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"): for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"):
date = date % dateUnix date = date % dateUnix
log = date + " [sshd] error: PAM: Authentication failure" log = date + " [sshd] error: PAM: Authentication failure"
datelog = self.__datedetector.getTime(log) datelog = self.datedetector.getTime(log)
self.assertTrue(datelog, "Parse epoch time for %s failed" % (date,)) self.assertTrue(datelog, "Parse epoch time for %s failed" % (date,))
( datelog, matchlog ) = datelog ( datelog, matchlog ) = datelog
self.assertEqual(int(datelog), dateUnix) self.assertEqual(int(datelog), dateUnix)
@ -70,7 +73,7 @@ class DateDetectorTest(LogCaptureTestCase):
for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"): for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"):
date = date % dateUnix date = date % dateUnix
log = date + " [sshd] error: PAM: Authentication failure" log = date + " [sshd] error: PAM: Authentication failure"
datelog = self.__datedetector.getTime(log) datelog = self.datedetector.getTime(log)
self.assertFalse(datelog) self.assertFalse(datelog)
def testGetTime(self): def testGetTime(self):
@ -80,7 +83,7 @@ class DateDetectorTest(LogCaptureTestCase):
# is not correctly determined atm, since year is not present # is not correctly determined atm, since year is not present
# in the log entry. Since this doesn't effect the operation # in the log entry. Since this doesn't effect the operation
# of fail2ban -- we just ignore incorrect day of the week # of fail2ban -- we just ignore incorrect day of the week
( datelog, matchlog ) = self.__datedetector.getTime(log) ( datelog, matchlog ) = self.datedetector.getTime(log)
self.assertEqual(datelog, dateUnix) self.assertEqual(datelog, dateUnix)
self.assertEqual(matchlog.group(), 'Jan 23 21:59:59') self.assertEqual(matchlog.group(), 'Jan 23 21:59:59')
@ -140,7 +143,7 @@ class DateDetectorTest(LogCaptureTestCase):
if not bound and prefix == "word-boundary": continue if not bound and prefix == "word-boundary": continue
logSys.debug(' -- test %-5s for %r', should_match, log) logSys.debug(' -- test %-5s for %r', should_match, log)
# with getTime: # with getTime:
logtime = self.__datedetector.getTime(log) logtime = self.datedetector.getTime(log)
if should_match: if should_match:
self.assertNotEqual(logtime, None, self.assertNotEqual(logtime, None,
"getTime retrieved nothing: failure for %s by prefix %r, anchored: %r, log: %s" % ( sdate, prefix, anchored, log)) "getTime retrieved nothing: failure for %s by prefix %r, anchored: %r, log: %s" % ( sdate, prefix, anchored, log))
@ -152,8 +155,8 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertEqual(logtime, None, self.assertEqual(logtime, None,
"getTime should have not matched for %r by prefix %r Got: %s" % (sdate, prefix, logtime)) "getTime should have not matched for %r by prefix %r Got: %s" % (sdate, prefix, logtime))
# with getTime(matchTime) - this combination used in filter: # with getTime(matchTime) - this combination used in filter:
(timeMatch, template) = matchTime = self.__datedetector.matchTime(log) (timeMatch, template) = matchTime = self.datedetector.matchTime(log)
logtime = self.__datedetector.getTime(log, matchTime) logtime = self.datedetector.getTime(log, matchTime)
logSys.debug(' -- found - %r', template.name if timeMatch else False) logSys.debug(' -- found - %r', template.name if timeMatch else False)
if should_match: if should_match:
self.assertNotEqual(logtime, None, self.assertNotEqual(logtime, None,
@ -168,26 +171,26 @@ class DateDetectorTest(LogCaptureTestCase):
logSys.debug(' -- OK') logSys.debug(' -- OK')
def testAllUniqueTemplateNames(self): def testAllUniqueTemplateNames(self):
self.assertRaises(ValueError, self.__datedetector.appendTemplate, self.assertRaises(ValueError, self.datedetector.appendTemplate,
self.__datedetector.templates[0]) self.datedetector.templates[0])
def testFullYearMatch_gh130(self): def testFullYearMatch_gh130(self):
# see https://github.com/fail2ban/fail2ban/pull/130 # see https://github.com/fail2ban/fail2ban/pull/130
# yoh: unfortunately this test is not really effective to reproduce the # yoh: unfortunately this test is not really effective to reproduce the
# situation but left in place to assure consistent behavior # situation but left in place to assure consistent behavior
mu = time.mktime(datetime.datetime(2012, 10, 11, 2, 37, 17).timetuple()) mu = time.mktime(datetime.datetime(2012, 10, 11, 2, 37, 17).timetuple())
logdate = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0') logdate = self.datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertNotEqual(logdate, None) self.assertNotEqual(logdate, None)
( logTime, logMatch ) = logdate ( logTime, logMatch ) = logdate
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17') self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
# confuse it with year being at the end # confuse it with year being at the end
for i in xrange(10): for i in xrange(10):
( logTime, logMatch ) = self.__datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0') ( logTime, logMatch ) = self.datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '11/10/2012 02:37:17') self.assertEqual(logMatch.group(), '11/10/2012 02:37:17')
# and now back to the original # and now back to the original
( logTime, logMatch ) = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0') ( logTime, logMatch ) = self.datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17') self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
@ -199,8 +202,7 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac') self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
def testAmbiguousInOrderedTemplates(self): def testAmbiguousInOrderedTemplates(self):
dd = DateDetector() dd = self.datedetector
dd.addDefaultTemplate()
for (debit, line, cnt) in ( for (debit, line, cnt) in (
# shortest distance to datetime should win: # shortest distance to datetime should win:
("030324 0:03:59", "some free text 030324 0:03:59 -- 2003-03-07 17:05:01 ...", 1), ("030324 0:03:59", "some free text 030324 0:03:59 -- 2003-03-07 17:05:01 ...", 1),
@ -224,6 +226,25 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertTrue(match) self.assertTrue(match)
self.assertEqual(match.group(), debit) self.assertEqual(match.group(), debit)
def testLowLevelLogging(self):
# test coverage for the deep (heavy) debug messages:
try:
self.__old_eff_level = datedetector.logLevel
if datedetector.logLevel < logSys.getEffectiveLevel()+1:
datedetector.logLevel = logSys.getEffectiveLevel()+1
dd = self.datedetector
i = 0
for (line, cnt) in (
("server mysqld[5906]: 2005-10-07 06:09:%02i 5907 [Warning] Access denied", 2),
("server mysqld[5906]: 051007 06:10:%02i 5907 [Warning] Access denied", 5),
("server mysqld[5906]: 2005-10-07 06:09:%02i 5907 [Warning] Access denied", 10),
):
for i in range(i, i+cnt+1):
logSys.debug('== test: %r', (line % i, cnt))
match, template = dd.matchTime(line % i)
self.assertTrue(match)
finally:
datedetector.logLevel = self.__old_eff_level
iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z") iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z")