performance optimization of `datepattern` (better search algorithm);

datetemplate: improved anchor detection for capturing groups `(^...)`; introduced new prefix `{UNB}` for `datepattern` to disable word boundaries in regex;
datedetector: speedup special case if only one template is defined (every match wins - no collision, no sorting, no other best match possible)
pull/2651/head
sebres 2020-02-28 11:49:43 +01:00
parent 2e42b98cd3
commit 4766547e1f
4 changed files with 78 additions and 36 deletions

View File

@ -37,6 +37,9 @@ ver. 0.10.6-dev (20??/??/??) - development edition
### New Features
### Enhancements
* introduced new prefix `{UNB}` for `datepattern` to disable word boundaries in regex;
* datetemplate: improved anchor detection for capturing groups `(^...)`;
* performance optimization of `datepattern` (better search algorithm in datedetector, especially for single template);
ver. 0.10.5 (2020/01/10) - deserve-more-respect-a-jedis-weapon-must

View File

@ -337,65 +337,76 @@ class DateDetector(object):
# if no templates specified - default templates should be used:
if not len(self.__templates):
self.addDefaultTemplate()
logSys.log(logLevel-1, "try to match time for line: %.120s", line)
match = None
log = logSys.log if logSys.getEffectiveLevel() <= logLevel else lambda *args: None
log(logLevel-1, "try to match time for line: %.120s", line)
# first try to use last template with same start/end position:
match = None
found = None, 0x7fffffff, 0x7fffffff, -1
ignoreBySearch = 0x7fffffff
i = self.__lastTemplIdx
if i < len(self.__templates):
ddtempl = self.__templates[i]
template = ddtempl.template
if template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END):
if logSys.getEffectiveLevel() <= logLevel-1: # pragma: no cover - very-heavy debug
logSys.log(logLevel-1, " try to match last anchored template #%02i ...", i)
log(logLevel-1, " try to match last anchored template #%02i ...", i)
match = template.matchDate(line)
ignoreBySearch = i
else:
distance, endpos = self.__lastPos[0], self.__lastEndPos[0]
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",
i, distance, endpos,
line[distance-1:distance], self.__lastPos[1],
line[distance:endpos],
line[endpos:endpos+1], self.__lastEndPos[1])
# check same boundaries left/right, otherwise possible collision/pattern switch:
if (line[distance-1:distance] == self.__lastPos[1] and
line[endpos:endpos+1] == self.__lastEndPos[1]
):
log(logLevel-1, " try to match last template #%02i (from %r to %r): ...%r==%r %s %r==%r...",
i, distance, endpos,
line[distance-1:distance], self.__lastPos[1],
line[distance:endpos],
line[endpos:endpos+1], self.__lastEndPos[2])
# check same boundaries left/right, outside fully equal, inside only if not alnum (e. g. bound RE
# with space or some special char), otherwise possible collision/pattern switch:
if ((
line[distance-1:distance] == self.__lastPos[1] or
(line[distance] == self.__lastPos[2] and not self.__lastPos[2].isalnum())
) and (
line[endpos:endpos+1] == self.__lastEndPos[2] or
(line[endpos-1] == self.__lastEndPos[1] and not self.__lastEndPos[1].isalnum())
)):
# search in line part only:
log(logLevel-1, " boundaries are correct, search in part %r", line[distance:endpos])
match = template.matchDate(line, distance, endpos)
else:
log(logLevel-1, " boundaries show conflict, try whole search")
match = template.matchDate(line)
ignoreBySearch = i
if match:
distance = match.start()
endpos = match.end()
# if different position, possible collision/pattern switch:
if (
len(self.__templates) == 1 or # single template:
template.flags & (DateTemplate.LINE_BEGIN|DateTemplate.LINE_END) or
(distance == self.__lastPos[0] and endpos == self.__lastEndPos[0])
):
logSys.log(logLevel, " matched last time template #%02i", i)
log(logLevel, " matched last time template #%02i", i)
else:
logSys.log(logLevel, " ** last pattern collision - pattern change, search ...")
log(logLevel, " ** last pattern collision - pattern change, reserve & search ...")
found = match, distance, endpos, i; # save current best alternative
match = None
else:
logSys.log(logLevel, " ** last pattern not found - pattern change, search ...")
log(logLevel, " ** last pattern not found - pattern change, search ...")
# search template and better match:
if not match:
logSys.log(logLevel, " search template (%i) ...", len(self.__templates))
found = None, 0x7fffffff, 0x7fffffff, -1
log(logLevel, " search template (%i) ...", len(self.__templates))
i = 0
for ddtempl in self.__templates:
if logSys.getEffectiveLevel() <= logLevel-1:
logSys.log(logLevel-1, " try template #%02i: %s", i, ddtempl.name)
if i == ignoreBySearch:
i += 1
continue
log(logLevel-1, " try template #%02i: %s", i, ddtempl.name)
template = ddtempl.template
match = template.matchDate(line)
if match:
distance = match.start()
endpos = match.end()
if logSys.getEffectiveLevel() <= logLevel:
logSys.log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastPos[0], template.name)
log(logLevel, " matched time template #%02i (at %r <= %r, %r) %s",
i, distance, ddtempl.distance, self.__lastPos[0], template.name)
## last (or single) template - fast stop:
if i+1 >= len(self.__templates):
break
@ -408,7 +419,7 @@ class DateDetector(object):
## [grave] if distance changed, possible date-match was found somewhere
## in body of message, so save this template, and search further:
if distance > ddtempl.distance or distance > self.__lastPos[0]:
logSys.log(logLevel, " ** distance collision - pattern change, reserve")
log(logLevel, " ** distance collision - pattern change, reserve")
## shortest of both:
if distance < found[1]:
found = match, distance, endpos, i
@ -422,7 +433,7 @@ class DateDetector(object):
# check other template was found (use this one with shortest distance):
if not match and found[0]:
match, distance, endpos, i = found
logSys.log(logLevel, " use best time template #%02i", i)
log(logLevel, " use best time template #%02i", i)
ddtempl = self.__templates[i]
template = ddtempl.template
# we've winner, incr hits, set distance, usage, reorder, etc:
@ -432,8 +443,8 @@ class DateDetector(object):
ddtempl.distance = distance
if self.__firstUnused == i:
self.__firstUnused += 1
self.__lastPos = distance, line[distance-1:distance]
self.__lastEndPos = endpos, line[endpos:endpos+1]
self.__lastPos = distance, line[distance-1:distance], line[distance]
self.__lastEndPos = endpos, line[endpos-1], line[endpos:endpos+1]
# if not first - try to reorder current template (bubble up), they will be not sorted anymore:
if i and i != self.__lastTemplIdx:
i = self._reorderTemplate(i)
@ -442,7 +453,7 @@ class DateDetector(object):
return (match, template)
# not found:
logSys.log(logLevel, " no template.")
log(logLevel, " no template.")
return (None, None)
@property

View File

@ -36,15 +36,16 @@ logSys = getLogger(__name__)
RE_GROUPED = re.compile(r'(?<!(?:\(\?))(?<!\\)\((?!\?)')
RE_GROUP = ( re.compile(r'^((?:\(\?\w+\))?\^?(?:\(\?\w+\))?)(.*?)(\$?)$'), r"\1(\2)\3" )
RE_EXLINE_NO_BOUNDS = re.compile(r'^\{UNB\}')
RE_EXLINE_BOUND_BEG = re.compile(r'^\{\^LN-BEG\}')
RE_EXSANC_BOUND_BEG = re.compile(r'^\(\?:\^\|\\b\|\\W\)')
RE_EXSANC_BOUND_BEG = re.compile(r'^\((?:\?:)?\^\|\\b\|\\W\)')
RE_EXEANC_BOUND_BEG = re.compile(r'\(\?=\\b\|\\W\|\$\)$')
RE_NO_WRD_BOUND_BEG = re.compile(r'^\(*(?:\(\?\w+\))?(?:\^|\(*\*\*|\(\?:\^)')
RE_NO_WRD_BOUND_BEG = re.compile(r'^\(*(?:\(\?\w+\))?(?:\^|\(*\*\*|\((?:\?:)?\^)')
RE_NO_WRD_BOUND_END = re.compile(r'(?<!\\)(?:\$\)?|\\b|\\s|\*\*\)*)$')
RE_DEL_WRD_BOUNDS = ( re.compile(r'^\(*(?:\(\?\w+\))?\(*\*\*|(?<!\\)\*\*\)*$'),
lambda m: m.group().replace('**', '') )
RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\(\?:\^(?!\|))')
RE_LINE_BOUND_BEG = re.compile(r'^(?:\(\?\w+\))?(?:\^|\((?:\?:)?\^(?!\|))')
RE_LINE_BOUND_END = re.compile(r'(?<![\\\|])(?:\$\)?)$')
RE_ALPHA_PATTERN = re.compile(r'(?<!\%)\%[aAbBpc]')
@ -119,7 +120,7 @@ class DateTemplate(object):
if boundBegin:
self.flags |= DateTemplate.WORD_BEGIN if wordBegin != 'start' else DateTemplate.LINE_BEGIN
if wordBegin != 'start':
regex = r'(?:^|\b|\W)' + regex
regex = r'(?=^|\b|\W)' + regex
else:
regex = r"^(?:\W{0,2})?" + regex
if not self.name.startswith('{^LN-BEG}'):
@ -128,8 +129,10 @@ class DateTemplate(object):
if boundEnd:
self.flags |= DateTemplate.WORD_END
regex += r'(?=\b|\W|$)'
if RE_LINE_BOUND_BEG.search(regex): self.flags |= DateTemplate.LINE_BEGIN
if RE_LINE_BOUND_END.search(regex): self.flags |= DateTemplate.LINE_END
if not (self.flags & DateTemplate.LINE_BEGIN) and RE_LINE_BOUND_BEG.search(regex):
self.flags |= DateTemplate.LINE_BEGIN
if not (self.flags & DateTemplate.LINE_END) and RE_LINE_BOUND_END.search(regex):
self.flags |= DateTemplate.LINE_END
# remove possible special pattern "**" in front and end of regex:
regex = RE_DEL_WRD_BOUNDS[0].sub(RE_DEL_WRD_BOUNDS[1], regex)
self._regex = regex
@ -188,7 +191,7 @@ class DateTemplate(object):
def unboundPattern(pattern):
return RE_EXEANC_BOUND_BEG.sub('',
RE_EXSANC_BOUND_BEG.sub('',
RE_EXLINE_BOUND_BEG.sub('', pattern)
RE_EXLINE_BOUND_BEG.sub('', RE_EXLINE_NO_BOUNDS.sub('', pattern))
)
)
@ -297,6 +300,10 @@ class DatePatternRegex(DateTemplate):
def setRegex(self, pattern, wordBegin=True, wordEnd=True):
# original pattern:
self._pattern = pattern
# if unbound signalled - reset boundaries left and right:
if RE_EXLINE_NO_BOUNDS.search(pattern):
pattern = RE_EXLINE_NO_BOUNDS.sub('', pattern)
wordBegin = wordEnd = False
# if explicit given {^LN-BEG} - remove it from pattern and set 'start' in wordBegin:
if wordBegin and RE_EXLINE_BOUND_BEG.search(pattern):
pattern = RE_EXLINE_BOUND_BEG.sub('', pattern)

View File

@ -330,6 +330,27 @@ class DateDetectorTest(LogCaptureTestCase):
dt = '2005 Jun 03'; self.assertEqual(t.matchDate(dt).group(1), dt)
dt = '2005 JUN 03'; self.assertEqual(t.matchDate(dt).group(1), dt)
def testNotAnchoredCollision(self):
# try for patterns with and without word boundaries:
for dp in (r'%H:%M:%S', r'{UNB}%H:%M:%S'):
dd = DateDetector()
dd.appendTemplate(dp)
# boundary of timestamp changes right and left (and time is left and right in line):
for fmt in ('%s test', '%8s test', 'test %s', 'test %8s'):
for dt in (
'00:01:02',
'00:01:2',
'00:1:2',
'0:1:2',
'00:1:2',
'00:01:2',
'00:01:02',
'0:1:2',
'00:01:02',
):
t = dd.getTime(fmt % dt)
self.assertEqual((t[0], t[1].group()), (1123970462.0, dt))
def testAmbiguousInOrderedTemplates(self):
dd = self.datedetector
for (debit, line, cnt) in (