readline fixed to consider interim new-line character as part of code point in multi-byte logs (e. g. unicode: utf-16be, utf-16le);

suppress warning "Error decoding line" for incomplete line (produced by not fully read multi-byte new-line character at end of data);
added test coverage for such logs
pull/2337/merge
sebres 2021-03-23 12:57:42 +01:00
parent 996920cdaa
commit cbac7c176a
2 changed files with 100 additions and 23 deletions

View File

@ -1398,6 +1398,9 @@ class FileContainer:
try:
return line.decode(enc, 'strict')
except (UnicodeDecodeError, UnicodeEncodeError) as e:
# avoid warning if got incomplete end of line (e. g. '\n' in "...[0A" followed by "00]..." for utf-16le:
if (e.end == len(line) and line[e.start] in b'\r\n'):
return line[0:e.start].decode(enc, 'replace')
global _decode_line_warn
lev = 7
if not _decode_line_warn.get(filename, 0):
@ -1406,9 +1409,9 @@ class FileContainer:
logSys.log(lev,
"Error decoding line from '%s' with '%s'.", filename, enc)
if logSys.getEffectiveLevel() <= lev:
logSys.log(lev, "Consider setting logencoding=utf-8 (or another appropriate"
" encoding) for this jail. Continuing"
" to process line ignoring invalid characters: %r",
logSys.log(lev,
"Consider setting logencoding to appropriate encoding for this jail. "
"Continuing to process line ignoring invalid characters: %r",
line)
# decode with replacing error chars:
line = line.decode(enc, 'replace')
@ -1422,23 +1425,51 @@ class FileContainer:
if line is not complete (and complete=True) or there is no content to read.
If line is complete (and complete is True), it also shift current known
position to begin of next line.
Also it is safe against interim new-line bytes (e. g. part of multi-byte char)
in given encoding.
"""
if self.__handler is None:
return ""
rl = self.__handler.readline()
if rl == b'':
# read raw bytes up to \n char:
b = self.__handler.readline()
if not b:
return None
# trim new-line here and check the line was written complete (contains a new-line):
l = rl.rstrip(b'\r\n')
if self.inOperation and complete:
if l == rl:
bl = len(b)
# convert to log-encoding (new-line char could disappear if it is part of multi-byte sequence):
r = FileContainer.decode_line(
self.getFileName(), self.getEncoding(), b)
# trim new-line at end and check the line was written complete (contains a new-line):
l = r.rstrip('\r\n')
if complete:
if l == r:
# try to fill buffer in order to find line-end in log encoding:
fnd = 0
while 1:
r = self.__handler.readline()
if not r:
break
b += r
bl += len(r)
# convert to log-encoding:
r = FileContainer.decode_line(
self.getFileName(), self.getEncoding(), b)
# ensure new-line is not in the middle (buffered 2 strings, e. g. in utf-16le it is "...[0A"+"00]..."):
e = r.find('\n')
if e >= 0 and e != len(r)-1:
l, r = r[0:e], r[0:e+1]
# back to bytes and get offset to seek after NL:
r = r.encode(self.getEncoding(), 'replace')
self.__handler.seek(-bl+len(r), 1)
return l
# trim new-line at end and check the line was written complete (contains a new-line):
l = r.rstrip('\r\n')
if l != r:
return l
# not fulfilled - seek back and return:
self.__handler.seek(self.__pos, 0)
self.__handler.seek(-bl, 1)
return None
# shift position (to be able to seek back above):
self.__pos += len(rl)
return FileContainer.decode_line(
self.getFileName(), self.getEncoding(), l)
return l
def close(self):
if self.__handler is not None:

View File

@ -644,6 +644,19 @@ class LogFile(LogCaptureTestCase):
self.filter = FilterPoll(None)
self.assertRaises(IOError, self.filter.addLogPath, LogFile.MISSING)
def testDecodeLineWarn(self):
# incomplete line (missing byte at end), warning is suppressed:
l = u"correct line\n"
r = l.encode('utf-16le')
self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r), l)
self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r[0:-1]), l[0:-1])
self.assertNotLogged('Error decoding line')
# complete line (incorrect surrogate in the middle), warning is there:
r = b"incorrect \xc8\x0a line\n"
l = r.decode('utf-8', 'replace')
self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-8', r), l)
self.assertLogged('Error decoding line')
class LogFileFilterPoll(unittest.TestCase):
@ -1633,16 +1646,49 @@ class GetFailures(LogCaptureTestCase):
def testCRLFFailures01(self):
# We first adjust logfile/failures to end with CR+LF
fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
# poor man unix2dos:
fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
for l in fin.read().splitlines():
fout.write(l + b'\r\n')
fin.close()
fout.close()
try:
# poor man unix2dos:
fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
for l in fin.read().splitlines():
fout.write(l + b'\r\n')
fin.close()
fout.close()
# now see if we should be getting the "same" failures
self.testGetFailures01(filename=fname)
_killfile(fout, fname)
# now see if we should be getting the "same" failures
self.testGetFailures01(filename=fname)
finally:
_killfile(fout, fname)
def testNLCharAsPartOfUniChar(self):
fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
# test two multi-byte encodings (both contains `\x0A` in either \x02\x0A or \x0A\x02):
for enc in ('utf-16be', 'utf-16le'):
self.pruneLog("[test-phase encoding=%s]" % enc)
try:
fout = open(fname, 'wb')
tm = int(time.time())
# test on unicode string containing \x0A as part of uni-char,
# it must produce exactly 2 lines (both are failures):
for l in (
u'%s \u20AC Failed auth: invalid user Test\u020A from 192.0.2.1\n' % tm,
u'%s \u20AC Failed auth: invalid user TestI from 192.0.2.2\n' % tm
):
fout.write(l.encode(enc))
fout.close()
self.filter.setLogEncoding(enc)
self.filter.addLogPath(fname, autoSeek=0)
self.filter.setDatePattern((r'^EPOCH',))
self.filter.addFailRegex(r"Failed .* from <HOST>")
self.filter.getFailures(fname)
self.assertLogged(
"[DummyJail] Found 192.0.2.1",
"[DummyJail] Found 192.0.2.2", all=True, wait=True)
finally:
_killfile(fout, fname)
self.filter.delLogPath(fname)
# must find 4 failures and generate 2 tickets (2 IPs with each 2 failures):
self.assertEqual(self.filter.failManager.getFailCount(), (2, 4))
def testGetFailures02(self):
output = ('141.3.81.106', 4, 1124013539.0,