amend distance collision check - always find the template with the shortest distance, also on the first match (test case extended);

datedetector: templates property without lock; start-anchored templates initially placed at the beginning of the template list; small optimization of strptime
pull/1583/head
sebres 2016-10-04 17:25:49 +02:00
parent 75a5440acf
commit b9033d004e
4 changed files with 79 additions and 63 deletions

View File

@@ -46,6 +46,8 @@ class DateDetectorCache(object):
     def templates(self):
         """List of template instances managed by the detector.
         """
+        if self.__templates:
+            return self.__templates
         with self.__lock:
             if self.__templates:
                 return self.__templates
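The new fast path is the familiar double-checked caching idiom: once the template list has been built, subsequent readers return it without taking the lock. A minimal standalone sketch of that idiom (illustrative names, not the actual Fail2Ban classes):

import threading

class TemplateCache(object):
    def __init__(self):
        self.__lock = threading.Lock()
        self.__templates = []            # built once, then only read

    @property
    def templates(self):
        # fast path: skip the lock once the list exists; safe because the
        # list is assigned only after it is completely built and is never
        # mutated afterwards
        if self.__templates:
            return self.__templates
        with self.__lock:
            if self.__templates:         # another thread built it meanwhile
                return self.__templates
            self.__templates = ["template-1", "template-2"]
            return self.__templates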
@@ -60,21 +62,22 @@ class DateDetectorCache(object):
             # exact given template with word begin-end boundary:
             template = DatePatternRegex(template)
         # additional template, that prefers datetime at start of a line (safety+performance feature):
-        if 0 and hasattr(template, 'regex'):
+        if hasattr(template, 'regex'):
             template2 = copy.copy(template)
             regex = getattr(template, 'pattern', template.regex)
             template2.setRegex(regex, wordBegin='start', wordEnd=True)
             if template2.name != template.name:
                 # increase weight of such templates, because they should be always
                 # preferred in template sorting process (bubble up):
-                template2.weight = 100
-                self.__templates.append(template2)
+                template2.weight = 100.0
+                self.__tmpcache[0].append(template2)
         # add template:
-        self.__templates.append(template)
+        self.__tmpcache[1].append(template)

     def _addDefaultTemplate(self):
         """Add resp. cache Fail2Ban's default set of date templates.
         """
+        self.__tmpcache = [], []
         # ISO 8601, simple date, optional subsecond and timezone:
         # 2005-01-23T21:59:59.981746, 2005-01-23 21:59:59
         # simple date: 2005/01/23 21:59:59
@@ -122,6 +125,8 @@ class DateDetectorCache(object):
         self._cacheTemplate("%b %d, %ExY %I:%M:%S %p")
         # ASSP: Apr-27-13 02:33:06
         self._cacheTemplate("^%b-%d-%Exy %H:%M:%S")
+        self.__templates = self.__tmpcache[0] + self.__tmpcache[1]
+        del self.__tmpcache


 class DateDetectorTemplate(object):
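The two temporary buckets only control ordering: start-anchored variants collected in the first bucket end up ahead of the plain templates from the second when the final list is assembled. Roughly, as a sketch with placeholder pattern strings rather than real DateTemplate objects:

# bucket 0: start-anchored copies, bucket 1: ordinary templates
tmpcache = [], []

def cache(template, anchored_copy=None):
    if anchored_copy is not None:
        tmpcache[0].append(anchored_copy)   # preferred, kept at the front
    tmpcache[1].append(template)

cache("%Y-%m-%d %H:%M:%S", anchored_copy="^%Y-%m-%d %H:%M:%S")
cache("%b %d %H:%M:%S",    anchored_copy="^%b %d %H:%M:%S")

# final list: all start-anchored variants first, then the rest
templates = tmpcache[0] + tmpcache[1]
print(templates)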
@@ -158,6 +163,8 @@ class DateDetector(object):
         self.__known_names = set()
         # time the template was long unused (currently 300 == 5m):
         self.__unusedTime = 300
+        # last known distance:
+        self.__lastDistance = 0
         # first free place:
         self.__firstUnused = 0
@@ -229,11 +236,14 @@
             if match is not None:
                 distance = max(1, match.start() + 1)
                 if logSys.getEffectiveLevel() <= logLevel:
-                    logSys.log(logLevel, " matched time template #%r (at %r <= %r) %s",
-                        i, distance, ddtempl.distance, template.name)
+                    logSys.log(logLevel, " matched time template #%r (at %r <= %r, %r) %s",
+                        i, distance, ddtempl.distance, self.__lastDistance, template.name)
                 ## [grave] if distance changed, possible date-match was found somewhere
                 ## in body of message, so save this template, and search further:
-                if distance > ddtempl.distance and len(self.__templates) > 1:
+                if (
+                    (distance > ddtempl.distance or distance > self.__lastDistance) and
+                    len(self.__templates) > 1
+                ):
                     if logSys.getEffectiveLevel() <= logLevel:
                         logSys.log(logLevel, " ** distance collision - pattern change, reserve")
                     ## shortest of both:
@@ -254,7 +264,7 @@
         # we've winner, incr hits, set distance, usage, reorder, etc:
         if match is not None:
             ddtempl.hits += 1
-            ddtempl.distance = distance
+            self.__lastDistance = ddtempl.distance = distance
             ddtempl.lastUsed = time.time()
             if self.__firstUnused == i:
                 self.__firstUnused += 1
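The distance is simply the 1-based offset of the match within the line; by also comparing against the last known distance, the search now continues until the template whose match starts earliest is selected, even for the very first line seen. A rough illustration of the idea, using plain regular expressions instead of date templates:

import re

line = "some free text 030324  0:03:59 -- 2003-03-07 17:05:01 ..."
patterns = [
    r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}",   # ISO-like date, further into the line
    r"\d{6}\s+\d{1,2}:\d{2}:\d{2}",           # mysqld-like date, right after the free text
]

best = None
for pat in patterns:
    m = re.search(pat, line)
    if m is None:
        continue
    distance = max(1, m.start() + 1)          # 1-based offset of the match
    if best is None or distance < best[0]:
        best = (distance, m.group())

print(best)   # (16, '030324  0:03:59') -- the earlier (shorter-distance) match wins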
@@ -322,18 +332,21 @@
             pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
             ## don't move too often (multiline logs resp. log's with different date patterns),
             ## if template not used too long, replace it also :
-            logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
-            if not pweight or weight > pweight + 5 or templates[pos].lastUsed < untime:
+            logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
+                num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
+            if not pweight or weight > pweight or templates[pos].lastUsed < untime:
                 ## if not larger (and target position recently used) - move slow (exact 1 position):
                 if weight <= pweight and templates[pos].lastUsed > untime:
                     pos = num-1
                     ## if still smaller and template at position used, don't move:
                     pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
-                    logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
+                    logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
+                        num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
                     if weight < pweight and templates[pos].lastUsed > untime:
                         return
-                templates[pos], templates[num] = ddtempl, templates[pos]
-                logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
+                del templates[num]
+                templates[pos:0] = [ddtempl]
                 ## correct first unused:
                 if pos == self.__firstUnused:
                     self.__firstUnused += 1
+                logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
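The swap is replaced by a remove-and-insert: the promoted template lands exactly at the target slot and everything between the two positions slides one place toward the end, instead of the two entries trading places. The slice trick on its own, with throwaway data:

templates = ["A", "B", "C", "D", "E"]
num, pos = 4, 1                 # move entry #4 ("E") up to position #1

ddtempl = templates[num]
del templates[num]              # take it out of its old slot
templates[pos:0] = [ddtempl]    # lst[pos:0] = [x] inserts x at index pos

print(templates)                # ['A', 'E', 'B', 'C', 'D']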

View File

@@ -50,7 +50,7 @@ class DateTemplate(object):
     def __init__(self):
         self.name = ""
-        self.weight = 1
+        self.weight = 1.0
         self._regex = ""
         self._cRegex = None
@@ -232,11 +232,7 @@ class DatePatternRegex(DateTemplate):
         if not dateMatch:
             dateMatch = self.matchDate(line)
         if dateMatch:
-            groupdict = dict(
-                (key, value)
-                for key, value in dateMatch.groupdict().iteritems()
-                if value is not None)
-            return reGroupDictStrptime(groupdict), dateMatch
+            return reGroupDictStrptime(dateMatch.groupdict()), dateMatch


 class DateTai64n(DateTemplate):

View File

@@ -80,7 +80,7 @@ def getTimePatternRE():
         names[key] = "%%%s" % key
     return (patt, names)

-def reGroupDictStrptime(found_dict):
+def reGroupDictStrptime(found_dict, msec=False):
     """Return time from dictionary of strptime fields

     This is tweaked from python built-in _strptime.
@@ -109,14 +109,15 @@
     # weekday and julian defaulted to -1 so as to signal need to calculate
     # values
     weekday = julian = -1
-    for group_key in found_dict.keys():
+    for key, val in found_dict.iteritems():
+        if val is None: continue
         # Directives not explicitly handled below:
         #   c, x, X
         #      handled by making out of other directives
         #   U, W
         #      worthless without day of the week
-        if group_key == 'y':
-            year = int(found_dict['y'])
+        if key == 'y':
+            year = int(val)
             # Open Group specification for strptime() states that a %y
             #value in the range of [00, 68] is in the century 2000, while
             #[69,99] is in the century 1900
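Instead of pre-filtering the match groups in the caller and then looking each value up again by key, the loop now walks the items once and skips groups that did not participate in the match. A small before/after sketch, assuming the dictionary comes straight from re.Match.groupdict():

found_dict = {'Y': '2005', 'm': '01', 'd': '23',
              'H': '21', 'M': '59', 'S': '59', 'f': None}

# before: build a filtered copy, then index it again per directive
groupdict = dict((k, v) for k, v in found_dict.items() if v is not None)
year = int(groupdict['Y'])

# after: a single pass over the items, skipping unmatched groups
for key, val in found_dict.items():
    if val is None:
        continue
    if key == 'Y':
        year = int(val)
    # ... the remaining directives are handled the same way ...

print(year)   # 2005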
@@ -124,20 +125,20 @@
                 year += 2000
             else:
                 year += 1900
-        elif group_key == 'Y':
-            year = int(found_dict['Y'])
-        elif group_key == 'm':
-            month = int(found_dict['m'])
-        elif group_key == 'B':
-            month = locale_time.f_month.index(found_dict['B'].lower())
-        elif group_key == 'b':
-            month = locale_time.a_month.index(found_dict['b'].lower())
-        elif group_key == 'd':
-            day = int(found_dict['d'])
-        elif group_key == 'H':
-            hour = int(found_dict['H'])
-        elif group_key == 'I':
-            hour = int(found_dict['I'])
+        elif key == 'Y':
+            year = int(val)
+        elif key == 'm':
+            month = int(val)
+        elif key == 'B':
+            month = locale_time.f_month.index(val.lower())
+        elif key == 'b':
+            month = locale_time.a_month.index(val.lower())
+        elif key == 'd':
+            day = int(val)
+        elif key == 'H':
+            hour = int(val)
+        elif key == 'I':
+            hour = int(val)
             ampm = found_dict.get('p', '').lower()
             # If there was no AM/PM indicator, we'll treat this like AM
             if ampm in ('', locale_time.am_pm[0]):
@@ -152,37 +153,38 @@
                 # 12 noon == 12 PM == hour 12
                 if hour != 12:
                     hour += 12
-        elif group_key == 'M':
-            minute = int(found_dict['M'])
-        elif group_key == 'S':
-            second = int(found_dict['S'])
-        elif group_key == 'f':
-            s = found_dict['f']
-            # Pad to always return microseconds.
-            s += "0" * (6 - len(s))
-            fraction = int(s)
-        elif group_key == 'A':
-            weekday = locale_time.f_weekday.index(found_dict['A'].lower())
-        elif group_key == 'a':
-            weekday = locale_time.a_weekday.index(found_dict['a'].lower())
-        elif group_key == 'w':
-            weekday = int(found_dict['w'])
+        elif key == 'M':
+            minute = int(val)
+        elif key == 'S':
+            second = int(val)
+        elif key == 'f':
+            if msec:
+                s = val
+                # Pad to always return microseconds.
+                s += "0" * (6 - len(s))
+                fraction = int(s)
+        elif key == 'A':
+            weekday = locale_time.f_weekday.index(val.lower())
+        elif key == 'a':
+            weekday = locale_time.a_weekday.index(val.lower())
+        elif key == 'w':
+            weekday = int(val)
             if weekday == 0:
                 weekday = 6
             else:
                 weekday -= 1
-        elif group_key == 'j':
-            julian = int(found_dict['j'])
-        elif group_key in ('U', 'W'):
-            week_of_year = int(found_dict[group_key])
-            if group_key == 'U':
+        elif key == 'j':
+            julian = int(val)
+        elif key in ('U', 'W'):
+            week_of_year = int(val)
+            if key == 'U':
                 # U starts week on Sunday.
                 week_of_year_start = 6
             else:
                 # W starts week on Monday.
                 week_of_year_start = 0
-        elif group_key == 'z':
-            z = found_dict['z']
+        elif key == 'z':
+            z = val
             if z == "Z":
                 tzoffset = 0
             else:
@@ -227,7 +229,7 @@
     # Actully create date
     date_result = datetime.datetime(
         year, month, day, hour, minute, second, fraction)
-    if gmtoff:
+    if gmtoff is not None:
         date_result = date_result - datetime.timedelta(seconds=gmtoff)
     if date_result > now and assume_today:
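The explicit None check matters because a parsed offset of exactly zero seconds (UTC, e.g. "Z" or "+0000") is falsy: the old truthiness test could not tell "offset known and zero" apart from "no timezone information at all". A quick illustration:

def describe(gmtoff):
    # gmtoff is None -> the log line carried no timezone information
    # gmtoff == 0    -> the log line said UTC explicitly
    if gmtoff is not None:
        return "known offset of %d seconds" % gmtoff
    return "no timezone information"

print(describe(None))   # no timezone information
print(describe(0))      # known offset of 0 seconds (missed by a bare truthiness check)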
@@ -240,7 +242,9 @@
                 year=year-1, month=month, day=day)
     if gmtoff is not None:
-        return calendar.timegm(date_result.utctimetuple())
+        tm = calendar.timegm(date_result.utctimetuple())
     else:
-        return time.mktime(date_result.timetuple())
+        tm = time.mktime(date_result.timetuple())
+    if msec:
+        tm += fraction/1000000.0
+    return tm
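With the new msec flag the helper can hand back a fractional epoch instead of whole seconds. A sketch of the amended return path in isolation (illustrative wrapper, not Fail2Ban's actual API):

import calendar
import datetime
import time

def to_epoch(date_result, fraction=0, gmtoff=None, msec=False):
    # whole seconds first, then the sub-second part only when requested
    if gmtoff is not None:
        tm = calendar.timegm(date_result.utctimetuple())
    else:
        tm = time.mktime(date_result.timetuple())
    if msec:
        tm += fraction / 1000000.0
    return tm

dt = datetime.datetime(2005, 1, 23, 21, 59, 59)
print(to_epoch(dt, fraction=981746, gmtoff=0, msec=True))   # 1106517599.981746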

View File

@@ -202,6 +202,9 @@ class DateDetectorTest(LogCaptureTestCase):
         dd = DateDetector()
         dd.addDefaultTemplate()
         for (debit, line, cnt) in (
+            # shortest distance to datetime should win:
+            ("030324 0:03:59", "some free text 030324 0:03:59 -- 2003-03-07 17:05:01 ...", 1),
+            # some free text with datetime:
             ("2003-03-07 17:05:01", "some free text 2003-03-07 17:05:01 test ...", 15),
             # distance collision detection (date from foreign input should not be found):
             ("030324 0:04:00", "server mysqld[1000]: 030324 0:04:00 [Warning] Access denied ..."