datedetector: matchTime, template list etc. rewritten because of performance degradation (without sorting templates now) - in-place reordering using hits and last used time;

todo: rewrite "reGroupDictStrptime" because may be too slow;
pull/1346/head
sebres 2015-12-23 18:49:30 +01:00
parent e065941ac5
commit 3f2b58e973
8 changed files with 75 additions and 42 deletions

View File

@ -366,7 +366,7 @@ class Fail2banRegex(object):
def process(self, test_lines):
t0 = time.time()
for line_no, line in enumerate(test_lines):
for line in test_lines:
if isinstance(line, tuple):
line_datetimestripped, ret = self.testRegex(
line[0], line[1])
@ -398,8 +398,6 @@ class Fail2banRegex(object):
self._line_stats.missed_lines_timeextracted.append(line_datetimestripped)
self._line_stats.tested += 1
if line_no % 10 == 0 and self._filter.dateDetector is not None:
self._filter.dateDetector.sortTemplate()
self._time_elapsed = time.time() - t0
def printLines(self, ltype):

View File

@ -21,6 +21,8 @@ __author__ = "Cyril Jaquier and Fail2Ban Contributors"
__copyright__ = "Copyright (c) 2004 Cyril Jaquier"
__license__ = "GPL"
import time
from threading import Lock
from .datetemplate import DatePatternRegex, DateTai64n, DateEpoch
@ -44,6 +46,8 @@ class DateDetector(object):
self.__lock = Lock()
self.__templates = list()
self.__known_names = set()
# time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300
def _appendTemplate(self, template):
name = template.name
@ -153,22 +157,29 @@ class DateDetector(object):
The regex match returned from the first successfully matched
template.
"""
self.__lock.acquire()
try:
i = 0
with self.__lock:
for template in self.__templates:
match = template.matchDate(line)
if not match is None:
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Matched time template %s", template.name)
template.hits += 1
template.lastUsed = time.time()
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i:
self._reorderTemplate(i)
# return tuple with match and template reference used for parsing:
return (match, template)
return (None, None)
finally:
self.__lock.release()
i += 1
# not found:
return (None, None)
def getTime(self, line):
"""Attempts to return the date on a log line using templates.
Obsolete: Use "getTime2" instead.
This uses the templates' `getDate` method in an attempt to find
a date.
@ -183,8 +194,7 @@ class DateDetector(object):
The Unix timestamp returned from the first successfully matched
template or None if not found.
"""
self.__lock.acquire()
try:
with self.__lock:
for template in self.__templates:
try:
date = template.getDate(line)
@ -197,8 +207,6 @@ class DateDetector(object):
except ValueError: # pragma: no cover
pass
return None
finally:
self.__lock.release()
def getTime2(self, line, timeMatch = None):
"""Attempts to return the date on a log line using given template.
@ -232,21 +240,28 @@ class DateDetector(object):
return date
return self.getTime(line)
def sortTemplate(self):
"""Sort the date templates by number of hits
def _reorderTemplate(self, num):
"""Reorder template (bubble up) in template list if hits grows enough.
Sort the template lists using the hits score. This method is not
called in this object and thus should be called from time to time.
This ensures the most commonly matched templates are checked first,
improving performance of matchTime and getTime.
Parameters
----------
num : int
Index of template should be moved.
"""
self.__lock.acquire()
try:
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Sorting the template list")
self.__templates.sort(key=lambda x: x.hits, reverse=True)
t = self.__templates[0]
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, "Winning template: %s with %d hits", t.name, t.hits)
finally:
self.__lock.release()
if num:
templates = self.__templates
template = templates[num]
## current hits and time the template was long unused:
untime = template.lastUsed - self.__unusedTime
hits = template.hits
## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also :
if hits > templates[num-1].hits + 5 or templates[num-1].lastUsed < untime:
## try to move faster (half of part to current template):
pos = num // 2
## if not larger - move slow (exact 1 position):
if hits <= templates[pos].hits or templates[pos].lastUsed < untime:
pos = num-1
templates[pos], templates[num] = template, templates[pos]

View File

@ -50,6 +50,7 @@ class DateTemplate(object):
self._regex = ""
self._cRegex = None
self.hits = 0
self.lastUsed = 0
@property
def name(self):

View File

@ -84,7 +84,6 @@ class FilterGamin(FileFilter):
self.jail.putFailTicket(ticket)
except FailManagerEmpty:
self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False
##

View File

@ -118,7 +118,6 @@ class FilterPoll(FileFilter):
self.jail.putFailTicket(ticket)
except FailManagerEmpty:
self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False
logSys.debug(
(self.jail is not None and self.jail.name or "jailless") +

View File

@ -108,7 +108,6 @@ class FilterPyinotify(FileFilter):
self.jail.putFailTicket(ticket)
except FailManagerEmpty:
self.failManager.cleanup(MyTime.time())
self.dateDetector.sortTemplate()
self.__modified = False
def _addFileWatcher(self, path):

View File

@ -143,13 +143,6 @@ class DateDetectorTest(LogCaptureTestCase):
else:
self.assertEqual(logtime, None, "getTime should have not matched for %r Got: %s" % (sdate, logtime))
def testStableSortTemplate(self):
old_names = [x.name for x in self.__datedetector.templates]
self.__datedetector.sortTemplate()
# If there were no hits -- sorting should not change the order
for old_name, n in zip(old_names, self.__datedetector.templates):
self.assertEqual(old_name, n.name) # "Sort must be stable"
def testAllUniqueTemplateNames(self):
self.assertRaises(ValueError, self.__datedetector.appendTemplate,
self.__datedetector.templates[0])
@ -164,13 +157,11 @@ class DateDetectorTest(LogCaptureTestCase):
( logTime, logMatch ) = logdate
self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '2012/10/11 02:37:17')
self.__datedetector.sortTemplate()
# confuse it with year being at the end
for i in xrange(10):
( logTime, logMatch ) = self.__datedetector.getTime('11/10/2012 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu)
self.assertEqual(logMatch.group(), '11/10/2012 02:37:17')
self.__datedetector.sortTemplate()
# and now back to the original
( logTime, logMatch ) = self.__datedetector.getTime('2012/10/11 02:37:17 [error] 18434#0')
self.assertEqual(logTime, mu)

View File

@ -87,8 +87,31 @@ def _maxWaitTime(wtime):
return wtime
def _tm(time):
return datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")
class _tmSerial():
_last_s = -0x7fffffff
_last_m = -0x7fffffff
_str_s = ""
_str_m = ""
@staticmethod
def _tm(time):
# ## strftime it too slow for large time serializer :
# return datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:%M:%S")
c = _tmSerial
sec = (time % 60)
if c._last_s == time - sec:
return "%s%02u" % (c._str_s, sec)
mt = (time % 3600)
if c._last_m == time - mt:
c._last_s = time - sec
c._str_s = "%s%02u:" % (c._str_m, mt // 60)
return "%s%02u" % (c._str_s, sec)
c._last_m = time - mt
c._str_m = datetime.datetime.fromtimestamp(time).strftime("%Y-%m-%d %H:")
c._last_s = time - sec
c._str_s = "%s%02u:" % (c._str_m, mt // 60)
return "%s%02u" % (c._str_s, sec)
_tm = _tmSerial._tm
def _assert_equal_entries(utest, found, output, count=None):
@ -245,6 +268,14 @@ class BasicFilter(unittest.TestCase):
1)
)
def testTest_tm(self):
unittest.F2B.SkipIfFast()
## test function "_tm" works correct (returns the same as slow strftime):
for i in xrange(1417512352, (1417512352 // 3600 + 3) * 3600):
tm = datetime.datetime.fromtimestamp(i).strftime("%Y-%m-%d %H:%M:%S")
if _tm(i) != tm:
self.assertEqual((_tm(i), i), (tm, i))
class IgnoreIP(LogCaptureTestCase):