Prevent reading big files completely the first time (after service start): perform an initial seek to the start time using a half-interval search algorithm (see issue #795):

now polling backend only (currently not implemented for gamin and pyinotify backends);
python3/pypy compatibility fix + removing obsolete code
pull/1346/head
sebres 2014-09-16 13:50:32 +02:00
parent 3cfdc5ecb3
commit 42199957d0
3 changed files with 104 additions and 2 deletions

View File

@ -24,6 +24,7 @@ __license__ = "GPL"
import codecs
import fcntl
import locale
import logging
import os
import re
import sys
@ -190,6 +191,7 @@ class Filter(JailThread):
# @param value the time
def setFindTime(self, value):
value = MyTime.str2seconds(value)
self.__findTime = value
self.failManager.setMaxTime(value)
logSys.info("Set findtime = %s" % value)
@ -651,7 +653,7 @@ class FileFilter(Filter):
# MyTime.time()-self.findTime. When a failure is detected, a FailTicket
# is created and is added to the FailManager.
def getFailures(self, filename):
def getFailures(self, filename, startTime=None):
log = self.getLog(filename)
if log is None:
logSys.error("Unable to get failures in " + filename)
@ -673,6 +675,11 @@ class FileFilter(Filter):
logSys.exception(e)
return False
# Avoid reading a big file completely the first time (after service start): initially seek to the start time using a half-interval search algorithm:
if log.getPos() == 0 and startTime is not None:
# startTime = MyTime.time() - self.getFindTime()
self.seekToTime(log, startTime)
# yoh: has_content is just a bool, so do not expect it to
# change -- loop is exited upon break, and is not entered at
# all if upon container opening that one was empty. If we
@ -690,6 +697,74 @@ class FileFilter(Filter):
db.updateLog(self.jail, log)
return True
##
# Seeks to the line with the given date (using a half-interval search), so
# that polling can start from there instead of from the start of the file.
#
# On return the container is positioned at the chosen offset; nothing is
# returned to the caller.
#
# @param container the log container; must provide getFileSize(),
#        seek(offs, endLine) and readline()
# @param date the time to seek to (compared against the first element of
#        the value returned by dateDetector.getTime)
def seekToTime(self, container, date):
    # Imported locally: the debug messages below need it, and it is not among
    # the module-level imports visible next to this method.
    import datetime

    fs = container.getFileSize()
    if logSys.getEffectiveLevel() <= logging.DEBUG:
        logSys.debug("Seek to find time %s (%s), file size %s", date,
            datetime.datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S"), fs)
    # NOTE(review): presumably a small tolerance so that entries stamped
    # exactly at `date` are not skipped - confirm.
    date -= 0.009
    minp = 0
    maxp = fs
    lastpos = 0
    # position and time of the last probe whose time was lower than `date`
    lastFew = 0
    lastTime = None
    cntr = 0
    unixTime = None
    lasti = 0
    movecntr = 3
    while maxp > minp:
        # probe the middle of the remaining range; use the position that
        # container.seek() reports back as the start of the probe
        pos = container.seek(int(minp + (maxp - minp) / 2))
        cntr += 1
        # try to find any legal datetime in the lines following the probe
        # (lines without a date are skipped while lncntr is not exhausted):
        lncntr = 5
        dateTimeMatch = None
        llen = 0
        i = pos
        while True:
            line = container.readline()
            if not line:
                break
            llen += len(line)
            text = line.rstrip('\r\n')
            timeMatch = self.dateDetector.matchTime(text)
            if timeMatch:
                dateTimeMatch = self.dateDetector.getTime(text[timeMatch.start():timeMatch.end()])
            if not dateTimeMatch and lncntr:
                lncntr -= 1
                continue
            break
        # if we can't move (position not changed), give up after a few tries
        if i + llen == lasti:
            movecntr -= 1
            if movecntr <= 0:
                break
        lasti = i + llen
        # not found at this step - stop searching
        if not dateTimeMatch:
            break
        unixTime = dateTimeMatch[0]
        if unixTime >= date:
            # probe is at or after the wanted time - continue in the lower half
            maxp = i
        else:
            # probe is before the wanted time - continue behind the lines read
            minp = i + llen
            lastFew = pos
            lastTime = unixTime
        lastpos = pos
    # if the found position has a time greater than the given one - use the
    # smallest time we have found
    if unixTime is None or unixTime > date:
        unixTime = lastTime
        lastpos = container.seek(lastFew, False)
    else:
        lastpos = container.seek(lastpos, False)
    if logSys.getEffectiveLevel() <= logging.DEBUG:
        logSys.debug("Position %s from %s, found time %s (%s) within %s seeks", lastpos, fs, unixTime,
            (datetime.datetime.fromtimestamp(unixTime).strftime("%Y-%m-%d %H:%M:%S") if unixTime is not None else ''), cntr)
def status(self, flavor="basic"):
"""Status of Filter plus files being monitored.
"""
@ -742,6 +817,9 @@ class FileContainer:
def getFileName(self):
    """Return the file name stored on this container."""
    name = self.__filename
    return name
def getFileSize(self):
    """Return the size of the container's file as reported by os.path.getsize."""
    path = self.__filename
    return os.path.getsize(path)
def setEncoding(self, encoding):
codecs.lookup(encoding) # Raises LookupError if invalid
self.__encoding = encoding
@ -788,6 +866,16 @@ class FileContainer:
self.__handler.seek(self.__pos)
return True
def seek(self, offs, endLine=True):
    """Move the file handler to `offs` and return the resulting position.

    @param offs absolute offset (from the start of the file) to seek to
    @param endLine if true, one readline() is issued after seeking, so the
           handler ends up just behind the line that contains `offs`
    @return the handler position (tell()) after the operation
    """
    fh = self.__handler
    # absolute seek to the requested offset
    fh.seek(offs, 0)
    if not endLine:
        return fh.tell()
    # consume the remainder of the line we landed in
    fh.readline()
    return fh.tell()
@staticmethod
def decode_line(filename, enc, line):
try:

View File

@ -57,6 +57,7 @@ class FilterPoll(FileFilter):
## The time of the last modification of the file.
self.__prevStats = dict()
self.__file404Cnt = dict()
self.__initial = dict()
logSys.debug("Created FilterPoll")
##
@ -94,7 +95,11 @@ class FilterPoll(FileFilter):
for container in self.getLogs():
filename = container.getFileName()
if self.isModified(filename):
self.getFailures(filename)
# set start time as now - find time for first usage only (prevent performance bug with polling of big files)
self.getFailures(filename,
(MyTime.time() - self.getFindTime()) if not self.__initial.get(filename) else None
)
self.__initial[filename] = True
self.__modified = True
if self.__modified:

View File

@ -912,6 +912,15 @@ class GetFailures(LogCaptureTestCase):
self.filter.getFailures(GetFailures.FILENAME_03)
_assert_correct_last_attempt(self, self.filter, output)
def testGetFailures03_seek(self):
    """Run getFailures on FILENAME_03 with an explicit start time.

    Per the original note this is the same scenario as the preceding
    FILENAME_03 test, but with a seek to 'Aug 14 11:55:04', so a different
    last attempt is expected.
    """
    expected = ('203.162.223.135', 5, 1124013544.0)
    logPath = GetFailures.FILENAME_03
    self.filter.addLogPath(logPath)
    self.filter.addFailRegex("error,relay=<HOST>,.*550 User unknown")
    # start time passed to getFailures: expected time minus 4 minutes, plus 1
    startTime = expected[2] - 4*60 + 1
    self.filter.getFailures(logPath, startTime)
    _assert_correct_last_attempt(self, self.filter, expected)
def testGetFailures04(self):
output = [('212.41.96.186', 4, 1124013600.0),
('212.41.96.185', 4, 1124017198.0)]