Merge branch '0.10' into 0.11

pull/2656/head
sebres 2020-03-02 17:10:48 +01:00
commit f088e7bf76
7 changed files with 118 additions and 72 deletions

View File

@ -43,6 +43,9 @@ ver. 0.11.2-dev (20??/??/??) - development edition
### New Features ### New Features
### Enhancements ### Enhancements
* introduced new prefix `{UNB}` for `datepattern` to disable word boundaries in regex;
* datetemplate: improved anchor detection for capturing groups `(^...)`;
* performance optimization of `datepattern` (better search algorithm in datedetector, especially for single template);
ver. 0.11.1 (2020/01/11) - this-is-the-way ver. 0.11.1 (2020/01/11) - this-is-the-way

View File

@ -337,64 +337,75 @@ class DateDetector(object):
# if no templates specified - default templates should be used: # if no templates specified - default templates should be used:
if not len(self.__templates): if not len(self.__templates):
self.addDefaultTemplate() self.addDefaultTemplate()
logSys.log(logLevel-1, "try to match time for line: %.120s", line) log = logSys.log if logSys.getEffectiveLevel() <= logLevel else lambda *args: None
match = None log(logLevel-1, "try to match time for line: %.120s", line)
# first try to use last template with same start/end position: # first try to use last template with same start/end position:
match = None
found = None, 0x7fffffff, 0x7fffffff, -1
ignoreBySearch = 0x7fffffff ignoreBySearch = 0x7fffffff
i = self.__lastTemplIdx i = self.__lastTemplIdx
if i < len(self.__templates): if i < len(self.__templates):
ddtempl = self.__templates[i] ddtempl = self.__templates[i]
template = ddtempl.template template = ddtempl.template
if template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END): if template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END):
if logSys.getEffectiveLevel() <= logLevel-1: # pragma: no cover - very-heavy debug log(logLevel-1, " try to match last anchored template #%02i ...", i)
logSys.log(logLevel-1, " try to match last anchored template #%02i ...", i)
match = template.matchDate(line) match = template.matchDate(line)
ignoreBySearch = i ignoreBySearch = i
else: else:
distance, endpos = self.__lastPos[0], self.__lastEndPos[0] distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
if logSys.getEffectiveLevel() <= logLevel-1: log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",
i, distance, endpos, i, distance, endpos,
line[distance-1:distance], self.__lastPos[1], line[distance-1:distance], self.__lastPos[1],
line[distance:endpos], line[distance:endpos],
line[endpos:endpos+1], self.__lastEndPos[1]) line[endpos:endpos+1], self.__lastEndPos[2])
# check same boundaries left/right, otherwise possible collision/pattern switch: # check same boundaries left/right, outside fully equal, inside only if not alnum (e. g. bound RE
if (line[distance-1:distance] == self.__lastPos[1] and # with space or some special char), otherwise possible collision/pattern switch:
line[endpos:endpos+1] == self.__lastEndPos[1] if ((
): line[distance-1:distance] == self.__lastPos[1] or
(line[distance] == self.__lastPos[2] and not self.__lastPos[2].isalnum())
) and (
line[endpos:endpos+1] == self.__lastEndPos[2] or
(line[endpos-1] == self.__lastEndPos[1] and not self.__lastEndPos[1].isalnum())
)):
# search in line part only:
log(logLevel-1, " boundaries are correct, search in part %r", line[distance:endpos])
match = template.matchDate(line, distance, endpos) match = template.matchDate(line, distance, endpos)
else:
log(logLevel-1, " boundaries show conflict, try whole search")
match = template.matchDate(line)
ignoreBySearch = i
if match: if match:
distance = match.start() distance = match.start()
endpos = match.end() endpos = match.end()
# if different position, possible collision/pattern switch: # if different position, possible collision/pattern switch:
if ( if (
len(self.__templates) == 1 or # single template:
template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END) or template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END) or
(distance == self.__lastPos[0] and endpos == self.__lastEndPos[0]) (distance == self.__lastPos[0] and endpos == self.__lastEndPos[0])
): ):
logSys.log(logLevel, " matched last time template #%02i", i) log(logLevel, " matched last time template #%02i", i)
else: else:
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...") log(logLevel, " ** last pattern collision - pattern change, reserve & search ...")
found = match, distance, endpos, i; # save current best alternative
match = None match = None
else: else:
logSys.log(logLevel, " ** last pattern not found - pattern change, search ...") log(logLevel, " ** last pattern not found - pattern change, search ...")
# search template and better match: # search template and better match:
if not match: if not match:
logSys.log(logLevel, " search template (%i) ...", len(self.__templates)) log(logLevel, " search template (%i) ...", len(self.__templates))
found = None, 0x7fffffff, 0x7fffffff, -1
i = 0 i = 0
for ddtempl in self.__templates: for ddtempl in self.__templates:
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try template #%02i: %s", i, ddtempl.name)
if i == ignoreBySearch: if i == ignoreBySearch:
i += 1 i += 1
continue continue
log(logLevel-1, " try template #%02i: %s", i, ddtempl.name)
template = ddtempl.template template = ddtempl.template
match = template.matchDate(line) match = template.matchDate(line)
if match: if match:
distance = match.start() distance = match.start()
endpos = match.end() endpos = match.end()
if logSys.getEffectiveLevel() <= logLevel: log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
logSys.log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastPos[0], template.name) i, distance, ddtempl.distance, self.__lastPos[0], template.name)
## last (or single) template - fast stop: ## last (or single) template - fast stop:
if i+1 >= len(self.__templates): if i+1 >= len(self.__templates):
@ -408,7 +419,7 @@ class DateDetector(object):
## [grave] if distance changed, possible date-match was found somewhere ## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further: ## in body of message, so save this template, and search further:
if distance > ddtempl.distance or distance > self.__lastPos[0]: if distance > ddtempl.distance or distance > self.__lastPos[0]:
logSys.log(logLevel, " ** distance collision - pattern change, reserve") log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both: ## shortest of both:
if distance < found[1]: if distance < found[1]:
found = match, distance, endpos, i found = match, distance, endpos, i
@ -422,7 +433,7 @@ class DateDetector(object):
# check other template was found (use this one with shortest distance): # check other template was found (use this one with shortest distance):
if not match and found[0]: if not match and found[0]:
match, distance, endpos, i = found match, distance, endpos, i = found
logSys.log(logLevel, " use best time template #%02i", i) log(logLevel, " use best time template #%02i", i)
ddtempl = self.__templates[i] ddtempl = self.__templates[i]
template = ddtempl.template template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc: # we've winner, incr hits, set distance, usage, reorder, etc:
@ -432,8 +443,8 @@ class DateDetector(object):
ddtempl.distance = distance ddtempl.distance = distance
if self.__firstUnused == i: if self.__firstUnused == i:
self.__firstUnused += 1 self.__firstUnused += 1
self.__lastPos = distance, line[distance-1:distance] self.__lastPos = distance, line[distance-1:distance], line[distance]
self.__lastEndPos = endpos, line[endpos:endpos+1] self.__lastEndPos = endpos, line[endpos-1], line[endpos:endpos+1]
# if not first - try to reorder current template (bubble up), they will be not sorted anymore: # if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i and i != self.__lastTemplIdx: if i and i != self.__lastTemplIdx:
i = self._reorderTemplate(i) i = self._reorderTemplate(i)
@ -442,7 +453,7 @@ class DateDetector(object):
return (match, template) return (match, template)
# not found: # not found:
logSys.log(logLevel, " no template.") log(logLevel, " no template.")
return (None, None) return (None, None)
@property @property

View File

@ -36,15 +36,16 @@ logSys = getLogger(__name__)
RE_GROUPED = re.compile(r'(?<!(?:\(\?))(?<!\\)\((?!\?)') RE_GROUPED = re.compile(r'(?<!(?:\(\?))(?<!\\)\((?!\?)')
RE_GROUP = ( re.compile(r'^((?:\(\?\w+\))?\^?(?:\(\?\w+\))?)(.*?)(\$?)$'), r"\1(\2)\3" ) RE_GROUP = ( re.compile(r'^((?:\(\?\w+\))?\^?(?:\(\?\w+\))?)(.*?)(\$?)$'), r"\1(\2)\3" )
RE_EXLINE_NO_BOUNDS = re.compile(r'^\{UNB\}')
RE_EXLINE_BOUND_BEG = re.compile(r'^\{\^LN-BEG\}') RE_EXLINE_BOUND_BEG = re.compile(r'^\{\^LN-BEG\}')
RE_EXSANC_BOUND_BEG = re.compile(r'^\(\?:\^\|\\b\|\\W\)') RE_EXSANC_BOUND_BEG = re.compile(r'^\((?:\?:)?\^\|\\b\|\\W\)')
RE_EXEANC_BOUND_BEG = re.compile(r'\(\?=\\b\|\\W\|\$\)$') RE_EXEANC_BOUND_BEG = re.compile(r'\(\?=\\b\|\\W\|\$\)$')
RE_NO_WRD_BOUND_BEG = re.compile(r'^\(*(?:\(\?\w+\))?(?:\^|\(*\*\*|\(\?:\^)') RE_NO_WRD_BOUND_BEG = re.compile(r'^\(*(?:\(\?\w+\))?(?:\^|\(*\*\*|\((?:\?:)?\^)')
RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\\b|\\s|\*\*\)*)$') RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\\b|\\s|\*\*\)*)$')
RE_DEL_WRD_BOUNDS = ( re.compile(r'^\(*(?:\(\?\w+\))?\(*\*\*|(?<!\\)\*\*\)*$'), RE_DEL_WRD_BOUNDS = ( re.compile(r'^\(*(?:\(\?\w+\))?\(*\*\*|(?<!\\)\*\*\)*$'),
lambda m: m.group().replace('**', '') ) lambda m: m.group().replace('**', '') )
RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\(\?:\^(?!\|))') RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\((?:\?:)?\^(?!\|))')
RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$') RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$')
RE_ALPHA_PATTERN = re.compile(r'(?<!\%)\%[aAbBpc]') RE_ALPHA_PATTERN = re.compile(r'(?<!\%)\%[aAbBpc]')
@ -119,7 +120,7 @@ class DateTemplate(object):
if boundBegin: if boundBegin:
self.flags |= DateTemplate.WORD_BEGIN if wordBegin != 'start' else DateTemplate.LINE_BEGIN self.flags |= DateTemplate.WORD_BEGIN if wordBegin != 'start' else DateTemplate.LINE_BEGIN
if wordBegin != 'start': if wordBegin != 'start':
regex = r'(?:^|\b|\W)' + regex regex = r'(?=^|\b|\W)' + regex
else: else:
regex = r"^(?:\W{0,2})?" + regex regex = r"^(?:\W{0,2})?" + regex
if not self.name.startswith('{^LN-BEG}'): if not self.name.startswith('{^LN-BEG}'):
@ -128,8 +129,10 @@ class DateTemplate(object):
if boundEnd: if boundEnd:
self.flags |= DateTemplate.WORD_END self.flags |= DateTemplate.WORD_END
regex += r'(?=\b|\W|$)' regex += r'(?=\b|\W|$)'
if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN if not (self.flags & DateTemplate.LINE_BEGIN) and RE_LINE_BOUND_BEG.search(regex):
if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END self.flags |= DateTemplate.LINE_BEGIN
if not (self.flags & DateTemplate.LINE_END) and RE_LINE_BOUND_END.search(regex):
self.flags |= DateTemplate.LINE_END
# remove possible special pattern "**" in front and end of regex: # remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS[0].sub(RE_DEL_WRD_BOUNDS[1], regex) regex = RE_DEL_WRD_BOUNDS[0].sub(RE_DEL_WRD_BOUNDS[1], regex)
self._regex = regex self._regex = regex
@ -188,7 +191,7 @@ class DateTemplate(object):
def unboundPattern(pattern): def unboundPattern(pattern):
return RE_EXEANC_BOUND_BEG.sub('', return RE_EXEANC_BOUND_BEG.sub('',
RE_EXSANC_BOUND_BEG.sub('', RE_EXSANC_BOUND_BEG.sub('',
RE_EXLINE_BOUND_BEG.sub('', pattern) RE_EXLINE_BOUND_BEG.sub('', RE_EXLINE_NO_BOUNDS.sub('', pattern))
) )
) )
@ -297,6 +300,10 @@ class DatePatternRegex(DateTemplate):
def setRegex(self, pattern, wordBegin=True, wordEnd=True): def setRegex(self, pattern, wordBegin=True, wordEnd=True):
# original pattern: # original pattern:
self._pattern = pattern self._pattern = pattern
# if unbound signalled - reset boundaries left and right:
if RE_EXLINE_NO_BOUNDS.search(pattern):
pattern = RE_EXLINE_NO_BOUNDS.sub('', pattern)
wordBegin = wordEnd = False
# if explicit given {^LN-BEG} - remove it from pattern and set 'start' in wordBegin: # if explicit given {^LN-BEG} - remove it from pattern and set 'start' in wordBegin:
if wordBegin and RE_EXLINE_BOUND_BEG.search(pattern): if wordBegin and RE_EXLINE_BOUND_BEG.search(pattern):
pattern = RE_EXLINE_BOUND_BEG.sub('', pattern) pattern = RE_EXLINE_BOUND_BEG.sub('', pattern)

View File

@ -92,10 +92,7 @@ class FailManager:
if attempt <= 0: if attempt <= 0:
attempt += 1 attempt += 1
unixTime = ticket.getTime() unixTime = ticket.getTime()
fData.setLastTime(unixTime) fData.adjustTime(unixTime, self.__maxTime)
if fData.getLastReset() < unixTime - self.__maxTime:
fData.setLastReset(unixTime)
fData.setRetry(0)
fData.inc(matches, attempt, count) fData.inc(matches, attempt, count)
# truncate to maxMatches: # truncate to maxMatches:
if self.maxMatches: if self.maxMatches:
@ -139,7 +136,7 @@ class FailManager:
def cleanup(self, time): def cleanup(self, time):
with self.__lock: with self.__lock:
todelete = [fid for fid,item in self.__failList.iteritems() \ todelete = [fid for fid,item in self.__failList.iteritems() \
if item.getLastTime() + self.__maxTime <= time] if item.getTime() + self.__maxTime <= time]
if len(todelete) == len(self.__failList): if len(todelete) == len(self.__failList):
# remove all: # remove all:
self.__failList = dict() self.__failList = dict()
@ -153,7 +150,7 @@ class FailManager:
else: else:
# create new dictionary without items to be deleted: # create new dictionary without items to be deleted:
self.__failList = dict((fid,item) for fid,item in self.__failList.iteritems() \ self.__failList = dict((fid,item) for fid,item in self.__failList.iteritems() \
if item.getLastTime() + self.__maxTime > time) if item.getTime() + self.__maxTime > time)
self.__bgSvc.service() self.__bgSvc.service()
def delFailure(self, fid): def delFailure(self, fid):

View File

@ -227,15 +227,14 @@ class FailTicket(Ticket):
def __init__(self, ip=None, time=None, matches=None, data={}, ticket=None): def __init__(self, ip=None, time=None, matches=None, data={}, ticket=None):
# this class variables: # this class variables:
self._retry = 0 self._firstTime = None
self._lastReset = None self._retry = 1
# create/copy using default ticket constructor: # create/copy using default ticket constructor:
Ticket.__init__(self, ip, time, matches, data, ticket) Ticket.__init__(self, ip, time, matches, data, ticket)
# init: # init:
if ticket is None: if not isinstance(ticket, FailTicket):
self._lastReset = time if time is not None else self.getTime() self._firstTime = time if time is not None else self.getTime()
if not self._retry: self._retry = self._data.get('failures', 1)
self._retry = self._data['failures'];
def setRetry(self, value): def setRetry(self, value):
""" Set artificial retry count, normally equal failures / attempt, """ Set artificial retry count, normally equal failures / attempt,
@ -252,7 +251,20 @@ class FailTicket(Ticket):
""" Returns failures / attempt count or """ Returns failures / attempt count or
artificial retry count increased for bad IPs artificial retry count increased for bad IPs
""" """
return max(self._retry, self._data['failures']) return self._retry
def adjustTime(self, time, maxTime):
""" Adjust time of ticket and current attempts count considering given maxTime
as estimation from rate by previous known interval (if it exceeds the findTime)
"""
if time > self._time:
# expand current interval and attemps count (considering maxTime):
if self._firstTime < time - maxTime:
# adjust retry calculated as estimation from rate by previous known interval:
self._retry = int(round(self._retry / float(time - self._firstTime) * maxTime))
self._firstTime = time - maxTime
# last time of failure:
self._time = time
def inc(self, matches=None, attempt=1, count=1): def inc(self, matches=None, attempt=1, count=1):
self._retry += count self._retry += count
@ -264,19 +276,6 @@ class FailTicket(Ticket):
else: else:
self._data['matches'] = matches self._data['matches'] = matches
def setLastTime(self, value):
if value > self._time:
self._time = value
def getLastTime(self):
return self._time
def getLastReset(self):
return self._lastReset
def setLastReset(self, value):
self._lastReset = value
@staticmethod @staticmethod
def wrap(o): def wrap(o):
o.__class__ = FailTicket o.__class__ = FailTicket

View File

@ -330,6 +330,27 @@ class DateDetectorTest(LogCaptureTestCase):
dt = '2005 Jun 03'; self.assertEqual(t.matchDate(dt).group(1), dt) dt = '2005 Jun 03'; self.assertEqual(t.matchDate(dt).group(1), dt)
dt = '2005 JUN 03'; self.assertEqual(t.matchDate(dt).group(1), dt) dt = '2005 JUN 03'; self.assertEqual(t.matchDate(dt).group(1), dt)
def testNotAnchoredCollision(self):
# try for patterns with and without word boundaries:
for dp in (r'%H:%M:%S', r'{UNB}%H:%M:%S'):
dd = DateDetector()
dd.appendTemplate(dp)
# boundary of timestamp changes right and left (and time is left and right in line):
for fmt in ('%s test', '%8s test', 'test %s', 'test %8s'):
for dt in (
'00:01:02',
'00:01:2',
'00:1:2',
'0:1:2',
'00:1:2',
'00:01:2',
'00:01:02',
'0:1:2',
'00:01:02',
):
t = dd.getTime(fmt % dt)
self.assertEqual((t[0], t[1].group()), (1123970462.0, dt))
def testAmbiguousInOrderedTemplates(self): def testAmbiguousInOrderedTemplates(self):
dd = self.datedetector dd = self.datedetector
for (debit, line, cnt) in ( for (debit, line, cnt) in (

View File

@ -69,10 +69,10 @@ class TicketTests(unittest.TestCase):
self.assertEqual(ft.getTime(), tm) self.assertEqual(ft.getTime(), tm)
self.assertEqual(ft.getMatches(), matches2) self.assertEqual(ft.getMatches(), matches2)
ft.setAttempt(2) ft.setAttempt(2)
self.assertEqual(ft.getAttempt(), 2)
# retry is max of set retry and failures:
self.assertEqual(ft.getRetry(), 2)
ft.setRetry(1) ft.setRetry(1)
self.assertEqual(ft.getAttempt(), 2)
self.assertEqual(ft.getRetry(), 1)
ft.setRetry(2)
self.assertEqual(ft.getRetry(), 2) self.assertEqual(ft.getRetry(), 2)
ft.setRetry(3) ft.setRetry(3)
self.assertEqual(ft.getRetry(), 3) self.assertEqual(ft.getRetry(), 3)
@ -86,13 +86,21 @@ class TicketTests(unittest.TestCase):
self.assertEqual(ft.getRetry(), 14) self.assertEqual(ft.getRetry(), 14)
self.assertEqual(ft.getMatches(), matches3) self.assertEqual(ft.getMatches(), matches3)
# last time (ignore if smaller as time): # last time (ignore if smaller as time):
self.assertEqual(ft.getLastTime(), tm)
ft.setLastTime(tm-60)
self.assertEqual(ft.getTime(), tm) self.assertEqual(ft.getTime(), tm)
self.assertEqual(ft.getLastTime(), tm) ft.adjustTime(tm-60, 3600)
ft.setLastTime(tm+60) self.assertEqual(ft.getTime(), tm)
self.assertEqual(ft.getRetry(), 14)
ft.adjustTime(tm+60, 3600)
self.assertEqual(ft.getTime(), tm+60) self.assertEqual(ft.getTime(), tm+60)
self.assertEqual(ft.getLastTime(), tm+60) self.assertEqual(ft.getRetry(), 14)
ft.adjustTime(tm+3600, 3600)
self.assertEqual(ft.getTime(), tm+3600)
self.assertEqual(ft.getRetry(), 14)
# adjust time so interval is larger than find time (3600), so reset retry count:
ft.adjustTime(tm+7200, 3600)
self.assertEqual(ft.getTime(), tm+7200)
self.assertEqual(ft.getRetry(), 7); # estimated attempts count
self.assertEqual(ft.getAttempt(), 4); # real known failure count
ft.setData('country', 'DE') ft.setData('country', 'DE')
self.assertEqual(ft.getData(), self.assertEqual(ft.getData(),
{'matches': ['first', 'second', 'third'], 'failures': 4, 'country': 'DE'}) {'matches': ['first', 'second', 'third'], 'failures': 4, 'country': 'DE'})
@ -102,10 +110,10 @@ class TicketTests(unittest.TestCase):
self.assertEqual(ft, ft2) self.assertEqual(ft, ft2)
self.assertEqual(ft.getData(), ft2.getData()) self.assertEqual(ft.getData(), ft2.getData())
self.assertEqual(ft2.getAttempt(), 4) self.assertEqual(ft2.getAttempt(), 4)
self.assertEqual(ft2.getRetry(), 14) self.assertEqual(ft2.getRetry(), 7)
self.assertEqual(ft2.getMatches(), matches3) self.assertEqual(ft2.getMatches(), matches3)
self.assertEqual(ft2.getTime(), ft.getTime()) self.assertEqual(ft2.getTime(), ft.getTime())
self.assertEqual(ft2.getLastTime(), ft.getLastTime()) self.assertEqual(ft2.getTime(), ft.getTime())
self.assertEqual(ft2.getBanTime(), ft.getBanTime()) self.assertEqual(ft2.getBanTime(), ft.getBanTime())
def testTicketFlags(self): def testTicketFlags(self):