datedetector: matchTime, template list etc. rewritten because of performance degradation (without sorting templates now) - in-place reordering using hits and last used time;

todo: rewrite "reGroupDictStrptime" because may be too slow;
pull/1346/head
sebres 2015-12-23 18:49:30 +01:00
parent e065941ac5
commit 3f2b58e973
8 changed files with 75 additions and 42 deletions

View File

@ -366,7 +366,7 @@ class Fail2banRegex(object):
def process(self, test_lines): def process(self, test_lines):
t0 = time.time() t0 = time.time()
for line_no, line in enumerate(test_lines): for line in test_lines:
if isinstance(line, tuple): if isinstance(line, tuple):
line_datetimestripped, ret = self.testRegex( line_datetimestripped, ret = self.testRegex(
line[0], line[1]) line[0], line[1])
@ -398,8 +398,6 @@ class Fail2banRegex(object):
self._line_stats.missed_lines_timeextracted.append(line_datetimestripped) self._line_stats.missed_lines_timeextracted.append(line_datetimestripped)
self._line_stats.tested += 1 self._line_stats.tested += 1
if line_no % 10 == 0 and self._filter.dateDetector is not None:
self._filter.dateDetector.sortTemplate()
self._time_elapsed = time.time() - t0 self._time_elapsed = time.time() - t0
def printLines(self, ltype): def printLines(self, ltype):

View File

@ -21,6 +21,8 @@ __author__ = "Cyril Jaquier and Fail2Ban Contributors"
__copyright__ = "Copyright (c) 2004 Cyril Jaquier" __copyright__ = "Copyright (c) 2004 Cyril Jaquier"
__license__ = "GPL" __license__ = "GPL"
import time
from threading import Lock from threading import Lock
from .datetemplate import DatePatternRegex, DateTai64n, DateEpoch from .datetemplate import DatePatternRegex, DateTai64n, DateEpoch
@ -44,6 +46,8 @@ class DateDetector(object):
self.__lock = Lock() self.__lock = Lock()
self.__templates = list() self.__templates = list()
self.__known_names = set() self.__known_names = set()
# time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300
def _appendTemplate(self, template): def _appendTemplate(self, template):
name = template.name name = template.name
@ -153,22 +157,29 @@ class DateDetector(object):
The regex match returned from the first successfully matched The regex match returned from the first successfully matched
template. template.
""" """
self.__lock.acquire() i = 0
try: with self.__lock:
for template in self.__templates: for template in self.__templates:
match = template.matchDate(line) match = template.matchDate(line)
if not match is None: if not match is None:
if logSys.getEffectiveLevel() <= logLevel: if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Matched time template %s", template.name) logSys.log(logLevel, "Matched time template %s", template.name)
template.hits += 1 template.hits += 1
template.lastUsed = time.time()
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i:
self._reorderTemplate(i)
# return tuple with match and template reference used for parsing:
return (match, template) return (match, template)
return (None, None) i += 1
finally: # not found:
self.__lock.release() return (None, None)
def getTime(self, line): def getTime(self, line):
"""Attempts to return the date on a log line using templates. """Attempts to return the date on a log line using templates.
Obsolete: Use "getTime2" instead.
This uses the templates' `getDate` method in an attempt to find This uses the templates' `getDate` method in an attempt to find
a date. a date.
@ -183,8 +194,7 @@ class DateDetector(object):
The Unix timestamp returned from the first successfully matched The Unix timestamp returned from the first successfully matched
template or None if not found. template or None if not found.
""" """
self.__lock.acquire() with self.__lock:
try:
for template in self.__templates: for template in self.__templates:
try: try:
date = template.getDate(line) date = template.getDate(line)
@ -197,8 +207,6 @@ class DateDetector(object):
except ValueError: # pragma: no cover except ValueError: # pragma: no cover
pass pass
return None return None
finally:
self.__lock.release()
def getTime2(self, line, timeMatch = None): def getTime2(self, line, timeMatch = None):
"""Attempts to return the date on a log line using given template. """Attempts to return the date on a log line using given template.
@ -232,21 +240,28 @@ class DateDetector(object):
return date return date
return self.getTime(line) return self.getTime(line)
def sortTemplate(self): def _reorderTemplate(self, num):
"""Sort the date templates by number of hits """Reorder template (bubble up) in template list if hits grows enough.
Sort the template lists using the hits score. This method is not Parameters
called in this object and thus should be called from time to time. ----------
This ensures the most commonly matched templates are checked first, num : int
improving performance of matchTime and getTime. Index of template should be moved.
""" """
self.__lock.acquire() if num:
try: templates = self.__templates
if logSys.getEffectiveLevel() <= logLevel: template = templates[num]
logSys.log(logLevel, "Sorting the template list") ## current hits and time the template was long unused:
self.__templates.sort(key=lambda x: x.hits, reverse=True) untime = template.lastUsed - self.__unusedTime
t = self.__templates[0] hits = template.hits
if logSys.getEffectiveLevel() <= logLevel: ## don't move too often (multiline logs resp. log's with different date patterns),
logSys.log(logLevel, "Winning template: %s with %d hits", t.name, t.hits) ## if template not used too long, replace it also :
finally: if hits > templates[num-1].hits + 5 or templates[num-1].lastUsed < untime:
self.__lock.release() ## try to move faster (half of part to current template):
pos = num // 2
## if not larger - move slow (exact 1 position):
if hits <= templates[pos].hits or templates[pos].lastUsed < untime:
pos = num-1
templates[pos], templates[num] = template, templates[pos]

View File

@ -50,6 +50,7 @@ class DateTemplate(object):
self._regex = "" self._regex = ""
self._cRegex = None self._cRegex = None
self.hits = 0 self.hits = 0
self.lastUsed = 0
@property @property
def name(self): def name(self):

View File

@ -84,7 +84,6 @@ class FilterGamin(FileFilter):
self.jail.putFailTicket(ticket) self.jail.putFailTicket(ticket)
except FailManagerEmpty: except FailManagerEmpty:
self.failManager.cleanup(MyTime.time()) self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False self.__modified = False
## ##

View File

@ -118,7 +118,6 @@ class FilterPoll(FileFilter):
self.jail.putFailTicket(ticket) self.jail.putFailTicket(ticket)
except FailManagerEmpty: except FailManagerEmpty:
self.failManager.cleanup(MyTime.time()) self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False self.__modified = False
logSys.debug( logSys.debug(
(self.jail is not None and self.jail.name or "jailless") + (self.jail is not None and self.jail.name or "jailless") +

View File

@ -108,7 +108,6 @@ class FilterPyinotify(FileFilter):
self.jail.putFailTicket(ticket) self.jail.putFailTicket(ticket)
except FailManagerEmpty: except FailManagerEmpty:
self.failManager.cleanup(MyTime.time()) self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False self.__modified = False
def _addFileWatcher(self, path): def _addFileWatcher(self, path):

View File

@ -143,13 +143,6 @@ class DateDetectorTest(LogCaptureTestCase):
else: else:
self.assertEqual(logtime, None, "getTime should have not matched for %r Got: %s" % (sdate, logtime)) self.assertEqual(logtime, None, "getTime should have not matched for %r Got: %s" % (sdate, logtime))
def testStableSortTemplate(self):
old_names = [x.name for x in self.__datedetector.templates]
self.__datedetector.sortTemplate()
# If there were no hits -- sorting should not change the order
for old_name, n in zip(old_names, self.__datedetector.templates):
self.assertEqual(old_name, n.name) # "Sort must be stable"
def testAllUniqueTemplateNames(self): def testAllUniqueTemplateNames(self):
self.assertRaises(ValueError, self.__datedetector.appendTemplate, self.assertRaises(ValueError, self.__datedetector.appendTemplate,
self.__datedetector.templates[0]) self.__datedetector.templates[0])
@ -164,13 +157,11 @@ class DateDetectorTest(LogCaptureTestCase):
( logTime, logMatch ) = logdate ( logTime, logMatch ) = logdate
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17') self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
self.__datedetector.sortTemplate()
# confuse it with year being at the end # confuse it with year being at the end
for i in xrange(10): for i in xrange(10):
( logTime, logMatch ) = self.__datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0') ( logTime, logMatch ) = self.__datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '11/10/2012 02:37:17') self.assertEqual(logMatch.group(), '11/10/2012 02:37:17')
self.__datedetector.sortTemplate()
# and now back to the original # and now back to the original
( logTime, logMatch ) = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0') ( logTime, logMatch ) = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu) self.assertEqual(logTime, mu)

View File

@ -87,8 +87,31 @@ def _maxWaitTime(wtime):
return wtime return wtime
def _tm(time): class _tmSerial():
return datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S") _last_s = -0x7fffffff
_last_m = -0x7fffffff
_str_s = ""
_str_m = ""
@staticmethod
def _tm(time):
# ## strftime it too slow for large time serializer :
# return datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")
c = _tmSerial
sec = (time % 60)
if c._last_s == time - sec:
return "%s%02u" % (c._str_s, sec)
mt = (time % 3600)
if c._last_m == time - mt:
c._last_s = time - sec
c._str_s = "%s%02u:" % (c._str_m, mt // 60)
return "%s%02u" % (c._str_s, sec)
c._last_m = time - mt
c._str_m = datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:")
c._last_s = time - sec
c._str_s = "%s%02u:" % (c._str_m, mt // 60)
return "%s%02u" % (c._str_s, sec)
_tm = _tmSerial._tm
def _assert_equal_entries(utest, found, output, count=None): def _assert_equal_entries(utest, found, output, count=None):
@ -245,6 +268,14 @@ class BasicFilter(unittest.TestCase):
1) 1)
) )
def testTest_tm(self):
unittest.F2B.SkipIfFast()
## test function "_tm" works correct (returns the same as slow strftime):
for i in xrange(1417512352, (1417512352 // 3600 + 3) * 3600):
tm = datetime.datetime.fromtimestamp(i).strftime("%Y-%m-%d %H:%M:%S")
if _tm(i) != tm:
self.assertEqual((_tm(i), i), (tm, i))
class IgnoreIP(LogCaptureTestCase): class IgnoreIP(LogCaptureTestCase):