amend distance collision check - always find template with shortest distance, also first time (test-case extended);

datedetector property template without lock, initially placed start-anchored templates at begin of template list, small optimization of strptime
pull/1583/head
sebres 2016-10-04 17:25:49 +02:00
parent 75a5440acf
commit b9033d004e
4 changed files with 79 additions and 63 deletions

View File

@ -46,6 +46,8 @@ class DateDetectorCache(object):
def templates(self):
"""List of template instances managed by the detector.
"""
if self.__templates:
return self.__templates
with self.__lock:
if self.__templates:
return self.__templates
@ -60,21 +62,22 @@ class DateDetectorCache(object):
# exact given template with word benin-end boundary:
template = DatePatternRegex(template)
# additional template, that prefers datetime at start of a line (safety+performance feature):
if 0 and hasattr(template, 'regex'):
if hasattr(template, 'regex'):
template2 = copy.copy(template)
regex = getattr(template, 'pattern', template.regex)
template2.setRegex(regex, wordBegin='start', wordEnd=True)
if template2.name != template.name:
# increase weight of such templates, because they should be always
# preferred in template sorting process (bubble up):
template2.weight = 100
self.__templates.append(template2)
template2.weight = 100.0
self.__tmpcache[0].append(template2)
# add template:
self.__templates.append(template)
self.__tmpcache[1].append(template)
def _addDefaultTemplate(self):
"""Add resp. cache Fail2Ban's default set of date templates.
"""
self.__tmpcache = [], []
# ISO 8601, simple date, optional subsecond and timezone:
# 2005-01-23T21:59:59.981746, 2005-01-23 21:59:59
# simple date: 2005/01/23 21:59:59
@ -122,6 +125,8 @@ class DateDetectorCache(object):
self._cacheTemplate("%b %d, %ExY %I:%M:%S %p")
# ASSP: Apr-27-13 02:33:06
self._cacheTemplate("^%b-%d-%Exy %H:%M:%S")
self.__templates = self.__tmpcache[0] + self.__tmpcache[1]
del self.__tmpcache
class DateDetectorTemplate(object):
@ -158,6 +163,8 @@ class DateDetector(object):
self.__known_names = set()
# time the template was long unused (currently 300 == 5m):
self.__unusedTime = 300
# last known distance:
self.__lastDistance = 0
# first free place:
self.__firstUnused = 0
@ -229,11 +236,14 @@ class DateDetector(object):
if match is not None:
distance = max(1, match.start() + 1)
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " matched time template #%r (at %r <= %r) %s",
i, distance, ddtempl.distance, template.name)
logSys.log(logLevel, " matched time template #%r (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastDistance, template.name)
## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further:
if distance > ddtempl.distance and len(self.__templates) > 1:
if (
(distance > ddtempl.distance or distance > self.__lastDistance) and
len(self.__templates) > 1
):
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both:
@ -254,7 +264,7 @@ class DateDetector(object):
# we've winner, incr hits, set distance, usage, reorder, etc:
if match is not None:
ddtempl.hits += 1
ddtempl.distance = distance
self.__lastDistance = ddtempl.distance = distance
ddtempl.lastUsed = time.time()
if self.__firstUnused == i:
self.__firstUnused += 1
@ -322,18 +332,21 @@ class DateDetector(object):
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
## don't move too often (multiline logs resp. log's with different date patterns),
## if template not used too long, replace it also :
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
if not pweight or weight > pweight + 5 or templates[pos].lastUsed < untime:
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if not pweight or weight > pweight or templates[pos].lastUsed < untime:
## if not larger (and target position recently used) - move slow (exact 1 position):
if weight <= pweight and templates[pos].lastUsed > untime:
pos = num-1
## if still smaller and template at position used, don't move:
pweight = templates[pos].hits * templates[pos].template.weight / templates[pos].distance
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r", num, pos, weight, pweight)
logSys.log(logLevel, " -> compare template #%r & #%r, weight %r > %r, hits %r > %r",
num, pos, weight, pweight, ddtempl.hits, templates[pos].hits)
if weight < pweight and templates[pos].lastUsed > untime:
return
templates[pos], templates[num] = ddtempl, templates[pos]
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)
del templates[num]
templates[pos:0] = [ddtempl]
## correct first unused:
if pos == self.__firstUnused:
self.__firstUnused += 1
logSys.log(logLevel, " -> moved template #%r -> #%r", num, pos)

View File

@ -50,7 +50,7 @@ class DateTemplate(object):
def __init__(self):
self.name = ""
self.weight = 1
self.weight = 1.0
self._regex = ""
self._cRegex = None
@ -232,11 +232,7 @@ class DatePatternRegex(DateTemplate):
if not dateMatch:
dateMatch = self.matchDate(line)
if dateMatch:
groupdict = dict(
(key, value)
for key, value in dateMatch.groupdict().iteritems()
if value is not None)
return reGroupDictStrptime(groupdict), dateMatch
return reGroupDictStrptime(dateMatch.groupdict()), dateMatch
class DateTai64n(DateTemplate):

View File

@ -80,7 +80,7 @@ def getTimePatternRE():
names[key] = "%%%s" % key
return (patt, names)
def reGroupDictStrptime(found_dict):
def reGroupDictStrptime(found_dict, msec=False):
"""Return time from dictionary of strptime fields
This is tweaked from python built-in _strptime.
@ -109,14 +109,15 @@ def reGroupDictStrptime(found_dict):
# weekday and julian defaulted to -1 so as to signal need to calculate
# values
weekday = julian = -1
for group_key in found_dict.keys():
for key, val in found_dict.iteritems():
if val is None: continue
# Directives not explicitly handled below:
# c, x, X
# handled by making out of other directives
# U, W
# worthless without day of the week
if group_key == 'y':
year = int(found_dict['y'])
if key == 'y':
year = int(val)
# Open Group specification for strptime() states that a %y
#value in the range of [00, 68] is in the century 2000, while
#[69,99] is in the century 1900
@ -124,20 +125,20 @@ def reGroupDictStrptime(found_dict):
year += 2000
else:
year += 1900
elif group_key == 'Y':
year = int(found_dict['Y'])
elif group_key == 'm':
month = int(found_dict['m'])
elif group_key == 'B':
month = locale_time.f_month.index(found_dict['B'].lower())
elif group_key == 'b':
month = locale_time.a_month.index(found_dict['b'].lower())
elif group_key == 'd':
day = int(found_dict['d'])
elif group_key == 'H':
hour = int(found_dict['H'])
elif group_key == 'I':
hour = int(found_dict['I'])
elif key == 'Y':
year = int(val)
elif key == 'm':
month = int(val)
elif key == 'B':
month = locale_time.f_month.index(val.lower())
elif key == 'b':
month = locale_time.a_month.index(val.lower())
elif key == 'd':
day = int(val)
elif key == 'H':
hour = int(val)
elif key == 'I':
hour = int(val)
ampm = found_dict.get('p', '').lower()
# If there was no AM/PM indicator, we'll treat this like AM
if ampm in ('', locale_time.am_pm[0]):
@ -152,37 +153,38 @@ def reGroupDictStrptime(found_dict):
# 12 noon == 12 PM == hour 12
if hour != 12:
hour += 12
elif group_key == 'M':
minute = int(found_dict['M'])
elif group_key == 'S':
second = int(found_dict['S'])
elif group_key == 'f':
s = found_dict['f']
elif key == 'M':
minute = int(val)
elif key == 'S':
second = int(val)
elif key == 'f':
if msec:
s = val
# Pad to always return microseconds.
s += "0" * (6 - len(s))
fraction = int(s)
elif group_key == 'A':
weekday = locale_time.f_weekday.index(found_dict['A'].lower())
elif group_key == 'a':
weekday = locale_time.a_weekday.index(found_dict['a'].lower())
elif group_key == 'w':
weekday = int(found_dict['w'])
elif key == 'A':
weekday = locale_time.f_weekday.index(val.lower())
elif key == 'a':
weekday = locale_time.a_weekday.index(val.lower())
elif key == 'w':
weekday = int(val)
if weekday == 0:
weekday = 6
else:
weekday -= 1
elif group_key == 'j':
julian = int(found_dict['j'])
elif group_key in ('U', 'W'):
week_of_year = int(found_dict[group_key])
if group_key == 'U':
elif key == 'j':
julian = int(val)
elif key in ('U', 'W'):
week_of_year = int(val)
if key == 'U':
# U starts week on Sunday.
week_of_year_start = 6
else:
# W starts week on Monday.
week_of_year_start = 0
elif group_key == 'z':
z = found_dict['z']
elif key == 'z':
z = val
if z == "Z":
tzoffset = 0
else:
@ -227,7 +229,7 @@ def reGroupDictStrptime(found_dict):
# Actully create date
date_result = datetime.datetime(
year, month, day, hour, minute, second, fraction)
if gmtoff:
if gmtoff is not None:
date_result = date_result - datetime.timedelta(seconds=gmtoff)
if date_result > now and assume_today:
@ -240,7 +242,9 @@ def reGroupDictStrptime(found_dict):
year=year-1, month=month, day=day)
if gmtoff is not None:
return calendar.timegm(date_result.utctimetuple())
tm = calendar.timegm(date_result.utctimetuple())
else:
return time.mktime(date_result.timetuple())
tm = time.mktime(date_result.timetuple())
if msec:
tm += fraction/1000000.0
return tm

View File

@ -202,6 +202,9 @@ class DateDetectorTest(LogCaptureTestCase):
dd = DateDetector()
dd.addDefaultTemplate()
for (debit, line, cnt) in (
# shortest distance to datetime should win:
("030324 0:03:59", "some free text 030324 0:03:59 -- 2003-03-07 17:05:01 ...", 1),
# some free text with datetime:
("2003-03-07 17:05:01", "some free text 2003-03-07 17:05:01 test ...", 15),
# distance collision detection (date from foreign input should not be found):
("030324 0:04:00", "server mysqld[1000]: 030324 0:04:00 [Warning] Access denied ..."