readline fixed to consider an interim new-line character as part of a code point in multi-byte logs (e.g. unicode: utf-16be, utf-16le);

suppress the "Error decoding line" warning for an incomplete line (produced by a multi-byte new-line character that was not fully read at the end of the data);
added test coverage for such logs
pull/2337/merge
sebres 2021-03-23 12:57:42 +01:00
parent 996920cdaa
commit cbac7c176a
2 changed files with 100 additions and 23 deletions
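
For context (illustration only, not part of the commit): in UTF-16 the byte 0x0A can occur inside an ordinary character, so a byte-oriented readline() that stops at the first 0x0A splits the code point and garbles the remainder of the line. A minimal Python 3 sketch:

	import io

	# U+020A encodes to b'\x0a\x02' in utf-16le, so the 0x0A byte that a raw
	# readline() stops at sits in the middle of a code point.
	data = u'Failed auth: invalid user Test\u020a from 192.0.2.1\n'.encode('utf-16le')
	f = io.BytesIO(data)
	chunk = f.readline()                               # stops at the low byte of U+020A
	print(chunk.decode('utf-16le', 'replace'))         # '... invalid user Test' plus a replacement char
	print(f.readline().decode('utf-16le', 'replace'))  # rest of the line, decoded one byte out of phase -- garbage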


@@ -1398,6 +1398,9 @@ class FileContainer:
 		try:
 			return line.decode(enc, 'strict')
 		except (UnicodeDecodeError, UnicodeEncodeError) as e:
+			# avoid warning if got incomplete end of line (e. g. '\n' in "...[0A" followed by "00]..." for utf-16le:
+			if (e.end == len(line) and line[e.start] in b'\r\n'):
+				return line[0:e.start].decode(enc, 'replace')
 			global _decode_line_warn
 			lev = 7
 			if not _decode_line_warn.get(filename, 0):
@@ -1406,9 +1409,9 @@ class FileContainer:
 			logSys.log(lev,
 				"Error decoding line from '%s' with '%s'.", filename, enc)
 			if logSys.getEffectiveLevel() <= lev:
-				logSys.log(lev, "Consider setting logencoding=utf-8 (or another appropriate"
-					" encoding) for this jail. Continuing"
-					" to process line ignoring invalid characters: %r",
+				logSys.log(lev,
+					"Consider setting logencoding to appropriate encoding for this jail. "
+					"Continuing to process line ignoring invalid characters: %r",
 					line)
 			# decode with replacing error chars:
 			line = line.decode(enc, 'replace')
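
A standalone illustration (not part of the diff) of the condition the new early return checks: when only the trailing multi-byte new-line is cut off, the decode error ends exactly at the last byte and that byte belongs to b'\r\n', so the chunk can be decoded without emitting the warning. Sketch assuming Python 3:

	raw = u"correct line\n".encode('utf-16le')[:-1]      # high byte of the trailing '\n' not yet written
	try:
		raw.decode('utf-16le', 'strict')
	except UnicodeDecodeError as e:
		print(e.end == len(raw), raw[e.start] in b'\r\n')    # True True -> warning can be suppressed
		print(raw[0:e.start].decode('utf-16le', 'replace'))  # 'correct line'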
@@ -1422,23 +1425,51 @@ class FileContainer:
 		if line is not complete (and complete=True) or there is no content to read.
 		If line is complete (and complete is True), it also shift current known
 		position to begin of next line.
+
+		Also it is safe against interim new-line bytes (e. g. part of multi-byte char)
+		in given encoding.
 		"""
 		if self.__handler is None:
 			return ""
-		rl = self.__handler.readline()
-		if rl == b'':
+		# read raw bytes up to \n char:
+		b = self.__handler.readline()
+		if not b:
 			return None
-		# trim new-line here and check the line was written complete (contains a new-line):
-		l = rl.rstrip(b'\r\n')
-		if self.inOperation and complete:
-			if l == rl:
+		bl = len(b)
+		# convert to log-encoding (new-line char could disappear if it is part of multi-byte sequence):
+		r = FileContainer.decode_line(
+			self.getFileName(), self.getEncoding(), b)
+		# trim new-line at end and check the line was written complete (contains a new-line):
+		l = r.rstrip('\r\n')
+		if complete:
+			if l == r:
+				# try to fill buffer in order to find line-end in log encoding:
+				fnd = 0
+				while 1:
+					r = self.__handler.readline()
+					if not r:
+						break
+					b += r
+					bl += len(r)
+					# convert to log-encoding:
+					r = FileContainer.decode_line(
+						self.getFileName(), self.getEncoding(), b)
+					# ensure new-line is not in the middle (buffered 2 strings, e. g. in utf-16le it is "...[0A"+"00]..."):
+					e = r.find('\n')
+					if e >= 0 and e != len(r)-1:
+						l, r = r[0:e], r[0:e+1]
+						# back to bytes and get offset to seek after NL:
+						r = r.encode(self.getEncoding(), 'replace')
+						self.__handler.seek(-bl+len(r), 1)
+						return l
+					# trim new-line at end and check the line was written complete (contains a new-line):
+					l = r.rstrip('\r\n')
+					if l != r:
+						return l
 				# not fulfilled - seek back and return:
-				self.__handler.seek(self.__pos, 0)
+				self.__handler.seek(-bl, 1)
 				return None
-			# shift position (to be able to seek back above):
-			self.__pos += len(rl)
-		return FileContainer.decode_line(
-			self.getFileName(), self.getEncoding(), l)
+		return l
 
 	def close(self):
 		if self.__handler is not None:
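
The reworked readline() above over-reads raw bytes until the decoded text contains a real new-line, then re-encodes the consumed part to compute how far to seek back. A simplified, self-contained sketch of that idea (illustration only, not the fail2ban implementation; io.BytesIO stands in for the log file handle, Python 3 assumed):

	import io

	def read_log_line(f, enc):
		# read raw bytes up to the first 0x0A byte (may be the middle of a code point):
		b = f.readline()
		if not b:
			return None
		bl = len(b)
		r = b.decode(enc, 'replace')
		l = r.rstrip('\r\n')
		if l != r:
			return l                       # decoded text already ends with a real new-line
		while True:                        # keep buffering until a real '\n' shows up after decoding
			nxt = f.readline()
			if not nxt:
				f.seek(-bl, 1)             # line still incomplete: rewind and report nothing yet
				return None
			b += nxt
			bl += len(nxt)
			r = b.decode(enc, 'replace')
			e = r.find('\n')
			if e >= 0 and e != len(r) - 1:                # new-line ended up in the middle of the buffer
				consumed = r[:e+1].encode(enc, 'replace')
				f.seek(-bl + len(consumed), 1)            # reposition to the first byte after that new-line
				return r[:e]
			l = r.rstrip('\r\n')
			if l != r:
				return l

	f = io.BytesIO(u'Test\u020a line one\nline two\n'.encode('utf-16le'))
	print(read_log_line(f, 'utf-16le'))   # first line, with U+020A intact
	print(read_log_line(f, 'utf-16le'))   # 'line two'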


@@ -644,6 +644,19 @@ class LogFile(LogCaptureTestCase):
 		self.filter = FilterPoll(None)
 		self.assertRaises(IOError, self.filter.addLogPath, LogFile.MISSING)
 
+	def testDecodeLineWarn(self):
+		# incomplete line (missing byte at end), warning is suppressed:
+		l = u"correct line\n"
+		r = l.encode('utf-16le')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r), l)
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r[0:-1]), l[0:-1])
+		self.assertNotLogged('Error decoding line')
+		# complete line (incorrect surrogate in the middle), warning is there:
+		r = b"incorrect \xc8\x0a line\n"
+		l = r.decode('utf-8', 'replace')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-8', r), l)
+		self.assertLogged('Error decoding line')
+
 
 class LogFileFilterPoll(unittest.TestCase):
@@ -1633,6 +1646,7 @@ class GetFailures(LogCaptureTestCase):
 	def testCRLFFailures01(self):
 		# We first adjust logfile/failures to end with CR+LF
 		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		try:
 			# poor man unix2dos:
 			fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
 			for l in fin.read().splitlines():
@@ -1642,8 +1656,40 @@ class GetFailures(LogCaptureTestCase):
 			# now see if we should be getting the "same" failures
 			self.testGetFailures01(filename=fname)
+		finally:
 			_killfile(fout, fname)
 
+	def testNLCharAsPartOfUniChar(self):
+		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		# test two multi-byte encodings (both contains `\x0A` in either \x02\x0A or \x0A\x02):
+		for enc in ('utf-16be', 'utf-16le'):
+			self.pruneLog("[test-phase encoding=%s]" % enc)
+			try:
+				fout = open(fname, 'wb')
+				tm = int(time.time())
+				# test on unicode string containing \x0A as part of uni-char,
+				# it must produce exactly 2 lines (both are failures):
+				for l in (
+					u'%s \u20AC Failed auth: invalid user Test\u020A from 192.0.2.1\n' % tm,
+					u'%s \u20AC Failed auth: invalid user TestI from 192.0.2.2\n' % tm
+				):
+					fout.write(l.encode(enc))
+				fout.close()
+
+				self.filter.setLogEncoding(enc)
+				self.filter.addLogPath(fname, autoSeek=0)
+				self.filter.setDatePattern((r'^EPOCH',))
+				self.filter.addFailRegex(r"Failed .* from <HOST>")
+				self.filter.getFailures(fname)
+				self.assertLogged(
+					"[DummyJail] Found 192.0.2.1",
+					"[DummyJail] Found 192.0.2.2", all=True, wait=True)
+			finally:
+				_killfile(fout, fname)
+				self.filter.delLogPath(fname)
+		# must find 4 failures and generate 2 tickets (2 IPs with each 2 failures):
+		self.assertEqual(self.filter.failManager.getFailCount(), (2, 4))
+
 	def testGetFailures02(self):
 		output = ('141.3.81.106', 4, 1124013539.0,
 			[u'Aug 14 11:%d:59 i60p295 sshd[12365]: Failed publickey for roehl from ::ffff:141.3.81.106 port 51332 ssh2'