optimized to better usage of the last time template (parse part of line at the same place as last time, if enclosed in the same boundaries)

thereby follow rule "shortest distance to datetime should win", so possible collision causes search though all templates;
speedup it a little bit (not possible collision if distance <= 1 or if line-begin anchoring, so break search if such template found)
pull/1583/head
sebres 2016-10-05 12:11:06 +02:00
parent b9033d004e
commit f56ff5f48b
3 changed files with 145 additions and 65 deletions

View File

@ -26,7 +26,7 @@ import time
from threading import Lock
from .datetemplate import DatePatternRegex, DateTai64n, DateEpoch
from .datetemplate import DateTemplate, DatePatternRegex, DateTai64n, DateEpoch
from ..helpers import getLogger
# Gets the instance of the logger.
@ -142,6 +142,10 @@ class DateDetectorTemplate(object):
# the last distance to date-match within the log file:
self.distance = 0x7fffffff
@property
def weight(self):
return self.hits * self.template.weight / max(1, self.distance)
def __getattr__(self, name):
""" Returns attribute of template (called for parameters not in slots)
"""
@ -158,13 +162,14 @@ class DateDetector(object):
_defCache = DateDetectorCache()
def __init__(self):
self.__lock = Lock()
self.__templates = list()
self.__known_names = set()
# time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300
# last known distance:
self.__lastDistance = 0
# last known distance (bypass one char collision) and end position:
self.__lastPos = 1, None
self.__lastEndPos = 0x7fffffff, None
self.__lastTemplIdx = 0x7fffffff
# first free place:
self.__firstUnused = 0
@ -198,9 +203,8 @@ class DateDetector(object):
def addDefaultTemplate(self):
"""Add Fail2Ban's default set of date templates.
"""
with self.__lock:
for template in DateDetector._defCache.templates:
self._appendTemplate(template)
for template in DateDetector._defCache.templates:
self._appendTemplate(template)
@property
def templates(self):
@ -226,29 +230,59 @@ class DateDetector(object):
The regex match returned from the first successfully matched
template.
"""
#logSys.log(logLevel, "try to match time for line: %.250s", line)
match = None
i = 0
found = None, 0x7fffffff, -1
with self.__lock:
# first try to use last template with same start/end position:
i = self.__lastTemplIdx
if i < len(self.__templates):
ddtempl = self.__templates[i]
template = ddtempl.template
distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
# check same boundaries left/right, otherwise possible collision/pattern switch:
if (line[distance-1:distance] == self.__lastPos[1] and
line[endpos:endpos+1] == self.__lastEndPos[1]
):
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ... %s ...",
i, distance, endpos, line[distance:endpos])
match = template.matchDate(line, distance, endpos)
if match:
distance = match.start()
endpos = match.end()
# if different position, possible collision/pattern switch:
if distance == self.__lastPos[0] and endpos == self.__lastEndPos[0]:
logSys.log(logLevel, " matched last time template #%02i", i)
else:
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...")
match = None
# search template and better match:
if not match:
self.__lastTemplIdx = 0x7fffffff
logSys.log(logLevel, " search template ...")
found = None, 0x7fffffff, -1
i = 0
for ddtempl in self.__templates:
template = ddtempl.template
match = template.matchDate(line)
if match is not None:
distance = max(1, match.start() + 1)
if match:
distance = match.start()
endpos = match.end()
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " matched time template #%r (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastDistance, template.name)
logSys.log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastPos[0], template.name)
## if line-begin anchored - stop searching:
if template.flags & DateTemplate.LINE_BEGIN:
break
## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further:
if (
(distance > ddtempl.distance or distance > self.__lastDistance) and
(distance > ddtempl.distance or distance > self.__lastPos[0]) and
len(self.__templates) > 1
):
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both:
if distance < found[1]:
found = match, distance, i
found = match, distance, endpos, i
## search further:
match = None
i += 1
@ -257,25 +291,29 @@ class DateDetector(object):
break
i += 1
# check other template was found (use this one with shortest distance):
if match is None and found[0]:
match, distance, i = found
if not match and found[0]:
match, distance, endpos, i = found
logSys.log(logLevel, " use best time template #%02i", i)
ddtempl = self.__templates[i]
template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc:
if match is not None:
ddtempl.hits += 1
self.__lastDistance = ddtempl.distance = distance
ddtempl.lastUsed = time.time()
if self.__firstUnused == i:
self.__firstUnused += 1
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i:
logSys.log(logLevel, " -> reorder template #%r, hits: %r", i, ddtempl.hits)
self._reorderTemplate(i)
# return tuple with match and template reference used for parsing:
return (match, template)
# we've winner, incr hits, set distance, usage, reorder, etc:
if match:
ddtempl.hits += 1
ddtempl.lastUsed = time.time()
ddtempl.distance = distance
if self.__firstUnused == i:
self.__firstUnused += 1
self.__lastPos = distance, line[distance-1:distance]
self.__lastEndPos = endpos, line[endpos:endpos+1]
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i:
i = self._reorderTemplate(i)
self.__lastTemplIdx = i
# return tuple with match and template reference used for parsing:
return (match, template)
# not found:
logSys.log(logLevel, " no template.")
return (None, None)
def getTime(self, line, timeMatch=None):
@ -311,7 +349,8 @@ class DateDetector(object):
date[0], date[1].group(), template.name)
return date
except ValueError:
return None
pass
return None
def _reorderTemplate(self, num):
"""Reorder template (bubble up) in template list if hits grows enough.
@ -324,29 +363,36 @@ class DateDetector(object):
if num:
templates = self.__templates
ddtempl = templates[num]
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> reorder template #%02i, hits: %r", num, ddtempl.hits)
## current hits and time the template was long unused:
untime = ddtempl.lastUsed - self.__unusedTime
weight = ddtempl.hits * ddtempl.template.weight / ddtempl.distance
weight = ddtempl.weight
## try to move faster (first if unused available, or half of part to current template position):
pos = self.__firstUnused if self.__firstUnused < num else num // 2
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
pweight = templates[pos].weight
## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also :
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> compare template #%02i & #%02i, weight %.3f > %.3f, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if not pweight or weight > pweight or templates[pos].lastUsed < untime:
## if not larger (and target position recently used) - move slow (exact 1 position):
if weight <= pweight and templates[pos].lastUsed > untime:
pos = num-1
## if still smaller and template at position used, don't move:
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
pweight = templates[pos].weight
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> compare template #%02i & #%02i, weight %.3f > %.3f, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if weight < pweight and templates[pos].lastUsed > untime:
return
del templates[num]
templates[pos:0] = [ddtempl]
## correct first unused:
if pos == self.__firstUnused:
while self.__firstUnused < len(templates) and templates[self.__firstUnused].hits:
self.__firstUnused += 1
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " -> moved template #%02i -> #%02i", num, pos)
return pos
return num

View File

@ -48,9 +48,14 @@ class DateTemplate(object):
regex
"""
LINE_BEGIN = 8
WORD_BEGIN = 2
WORD_END = 1
def __init__(self):
self.name = ""
self.weight = 1.0
self.flags = 0
self._regex = ""
self._cRegex = None
@ -83,12 +88,14 @@ class DateTemplate(object):
regex = regex.strip()
# if word or line start boundary:
if wordBegin and not RE_NO_WRD_BOUND_BEG.search(regex):
self.flags |= DateTemplate.WORD_BEGIN if wordBegin != 'start' else DateTemplate.LINE_BEGIN
regex = (r'(?=^|\b|\W)' if wordBegin != 'start' else r"(?:^|(?<=^\W)|(?<=^\W{2}))") + regex
self.name = ('{*WD-BEG}' if wordBegin != 'start' else '{^LN-BEG}') + self.name
# if word end boundary:
if wordEnd and not RE_NO_WRD_BOUND_END.search(regex):
self.flags |= DateTemplate.WORD_END
regex += r'(?=\b|\W|$)'
self.name += ('{*WD-END}' if wordEnd else '')
self.name += '{*WD-END}'
# remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS.sub('', regex)
self._regex = regex
@ -97,12 +104,18 @@ class DateTemplate(object):
"""Regex used to search for date.
""")
def matchDate(self, line):
"""Check if regex for date matches on a log line.
def _compileRegex(self):
"""Compile regex by first usage.
"""
if not self._cRegex:
self._cRegex = re.compile(self.regex, re.UNICODE | re.IGNORECASE)
dateMatch = self._cRegex.search(line)
def matchDate(self, line, *args):
"""Check if regex for date matches on a log line.
"""
if not self._cRegex:
self._compileRegex()
dateMatch = self._cRegex.search(line, *args); # pos, endpos
return dateMatch
@abstractmethod

View File

@ -42,17 +42,20 @@ class DateDetectorTest(LogCaptureTestCase):
def setUp(self):
"""Call before every test case."""
LogCaptureTestCase.setUp(self)
self.__old_eff_level = datedetector.logLevel
datedetector.logLevel = logSys.getEffectiveLevel()
setUpMyTime()
self.__datedetector = DateDetector()
self.__datedetector.addDefaultTemplate()
self.__datedetector = None
def tearDown(self):
"""Call after every test case."""
LogCaptureTestCase.tearDown(self)
datedetector.logLevel = self.__old_eff_level
tearDownMyTime()
@property
def datedetector(self):
if self.__datedetector is None:
self.__datedetector = DateDetector()
self.__datedetector.addDefaultTemplate()
return self.__datedetector
def testGetEpochTime(self):
# correct epoch time, using all variants:
@ -60,7 +63,7 @@ class DateDetectorTest(LogCaptureTestCase):
for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"):
date = date % dateUnix
log = date + " [sshd] error: PAM: Authentication failure"
datelog = self.__datedetector.getTime(log)
datelog = self.datedetector.getTime(log)
self.assertTrue(datelog, "Parse epoch time for %s failed" % (date,))
( datelog, matchlog ) = datelog
self.assertEqual(int(datelog), dateUnix)
@ -70,7 +73,7 @@ class DateDetectorTest(LogCaptureTestCase):
for date in ("%s", "[%s]", "[%s.555]", "audit(%s.555:101)"):
date = date % dateUnix
log = date + " [sshd] error: PAM: Authentication failure"
datelog = self.__datedetector.getTime(log)
datelog = self.datedetector.getTime(log)
self.assertFalse(datelog)
def testGetTime(self):
@ -80,7 +83,7 @@ class DateDetectorTest(LogCaptureTestCase):
# is not correctly determined atm, since year is not present
# in the log entry. Since this doesn't effect the operation
# of fail2ban -- we just ignore incorrect day of the week
( datelog, matchlog ) = self.__datedetector.getTime(log)
( datelog, matchlog ) = self.datedetector.getTime(log)
self.assertEqual(datelog, dateUnix)
self.assertEqual(matchlog.group(), 'Jan 23 21:59:59')
@ -140,7 +143,7 @@ class DateDetectorTest(LogCaptureTestCase):
if not bound and prefix == "word-boundary": continue
logSys.debug(' -- test %-5s for %r', should_match, log)
# with getTime:
logtime = self.__datedetector.getTime(log)
logtime = self.datedetector.getTime(log)
if should_match:
self.assertNotEqual(logtime, None,
"getTime retrieved nothing: failure for %s by prefix %r, anchored: %r, log: %s" % ( sdate, prefix, anchored, log))
@ -152,8 +155,8 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertEqual(logtime, None,
"getTime should have not matched for %r by prefix %r Got: %s" % (sdate, prefix, logtime))
# with getTime(matchTime) - this combination used in filter:
(timeMatch, template) = matchTime = self.__datedetector.matchTime(log)
logtime = self.__datedetector.getTime(log, matchTime)
(timeMatch, template) = matchTime = self.datedetector.matchTime(log)
logtime = self.datedetector.getTime(log, matchTime)
logSys.debug(' -- found - %r', template.name if timeMatch else False)
if should_match:
self.assertNotEqual(logtime, None,
@ -168,26 +171,26 @@ class DateDetectorTest(LogCaptureTestCase):
logSys.debug(' -- OK')
def testAllUniqueTemplateNames(self):
self.assertRaises(ValueError, self.__datedetector.appendTemplate,
self.__datedetector.templates[0])
self.assertRaises(ValueError, self.datedetector.appendTemplate,
self.datedetector.templates[0])
def testFullYearMatch_gh130(self):
# see https://github.com/fail2ban/fail2ban/pull/130
# yoh: unfortunately this test is not really effective to reproduce the
# situation but left in place to assure consistent behavior
mu = time.mktime(datetime.datetime(2012, 10, 11, 2, 37, 17).timetuple())
logdate = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
logdate = self.datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertNotEqual(logdate, None)
( logTime, logMatch ) = logdate
self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
# confuse it with year being at the end
for i in xrange(10):
( logTime, logMatch ) = self.__datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0')
( logTime, logMatch ) = self.datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '11/10/2012 02:37:17')
# and now back to the original
( logTime, logMatch ) = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
( logTime, logMatch ) = self.datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
@ -199,8 +202,7 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertEqual(t.matchDate('aaaac').group(), 'aaaac')
def testAmbiguousInOrderedTemplates(self):
dd = DateDetector()
dd.addDefaultTemplate()
dd = self.datedetector
for (debit, line, cnt) in (
# shortest distance to datetime should win:
("030324 0:03:59", "some free text 030324 0:03:59 -- 2003-03-07 17:05:01 ...", 1),
@ -224,6 +226,25 @@ class DateDetectorTest(LogCaptureTestCase):
self.assertTrue(match)
self.assertEqual(match.group(), debit)
def testLowLevelLogging(self):
# test coverage for the deep (heavy) debug messages:
try:
self.__old_eff_level = datedetector.logLevel
if datedetector.logLevel < logSys.getEffectiveLevel()+1:
datedetector.logLevel = logSys.getEffectiveLevel()+1
dd = self.datedetector
i = 0
for (line, cnt) in (
("server mysqld[5906]: 2005-10-07 06:09:%02i 5907 [Warning] Access denied", 2),
("server mysqld[5906]: 051007 06:10:%02i 5907 [Warning] Access denied", 5),
("server mysqld[5906]: 2005-10-07 06:09:%02i 5907 [Warning] Access denied", 10),
):
for i in range(i, i+cnt+1):
logSys.debug('== test: %r', (line % i, cnt))
match, template = dd.matchTime(line % i)
self.assertTrue(match)
finally:
datedetector.logLevel = self.__old_eff_level
iso8601 = DatePatternRegex("%Y-%m-%d[T ]%H:%M:%S(?:\.%f)?%z")