From cbac7c176a93207027a53aa4591342f7268a1547 Mon Sep 17 00:00:00 2001
From: sebres
Date: Tue, 23 Mar 2021 12:57:42 +0100
Subject: [PATCH] readline fixed to consider interim new-line character as
 part of code point in multi-byte logs (e. g. unicode: utf-16be, utf-16le);
 suppress warning "Error decoding line" for incomplete line (produced by not
 fully read multi-byte new-line character at end of data); added test
 coverage for such logs

---
 fail2ban/server/filter.py        | 59 ++++++++++++++++++++++-------
 fail2ban/tests/filtertestcase.py | 64 +++++++++++++++++++++++++++-----
 2 files changed, 100 insertions(+), 23 deletions(-)

diff --git a/fail2ban/server/filter.py b/fail2ban/server/filter.py
index 845b069d..16279627 100644
--- a/fail2ban/server/filter.py
+++ b/fail2ban/server/filter.py
@@ -1398,6 +1398,9 @@ class FileContainer:
 		try:
 			return line.decode(enc, 'strict')
 		except (UnicodeDecodeError, UnicodeEncodeError) as e:
+			# avoid warning if got incomplete end of line (e. g. '\n' in "...[0A" followed by "00]..." for utf-16le:
+			if (e.end == len(line) and line[e.start] in b'\r\n'):
+				return line[0:e.start].decode(enc, 'replace')
 			global _decode_line_warn
 			lev = 7
 			if not _decode_line_warn.get(filename, 0):
@@ -1406,9 +1409,9 @@ class FileContainer:
 			logSys.log(lev,
 				"Error decoding line from '%s' with '%s'.", filename, enc)
 			if logSys.getEffectiveLevel() <= lev:
-				logSys.log(lev, "Consider setting logencoding=utf-8 (or another appropriate"
-					" encoding) for this jail. Continuing"
-					" to process line ignoring invalid characters: %r",
+				logSys.log(lev,
+					"Consider setting logencoding to appropriate encoding for this jail. "
+					"Continuing to process line ignoring invalid characters: %r",
 					line)
 			# decode with replacing error chars:
 			line = line.decode(enc, 'replace')
@@ -1422,23 +1425,51 @@ class FileContainer:
 		if line is not complete (and complete=True) or there is no content to read.
 		If line is complete (and complete is True), it also shift current known
 		position to begin of next line.
+
+		Also it is safe against interim new-line bytes (e. g. part of multi-byte char)
+		in given encoding.
 		"""
 		if self.__handler is None:
 			return ""
-		rl = self.__handler.readline()
-		if rl == b'':
+		# read raw bytes up to \n char:
+		b = self.__handler.readline()
+		if not b:
 			return None
-		# trim new-line here and check the line was written complete (contains a new-line):
-		l = rl.rstrip(b'\r\n')
-		if self.inOperation and complete:
-			if l == rl:
+		bl = len(b)
+		# convert to log-encoding (new-line char could disappear if it is part of multi-byte sequence):
+		r = FileContainer.decode_line(
+			self.getFileName(), self.getEncoding(), b)
+		# trim new-line at end and check the line was written complete (contains a new-line):
+		l = r.rstrip('\r\n')
+		if complete:
+			if l == r:
+				# try to fill buffer in order to find line-end in log encoding:
+				fnd = 0
+				while 1:
+					r = self.__handler.readline()
+					if not r:
+						break
+					b += r
+					bl += len(r)
+					# convert to log-encoding:
+					r = FileContainer.decode_line(
+						self.getFileName(), self.getEncoding(), b)
+					# ensure new-line is not in the middle (buffered 2 strings, e. g. in utf-16le it is "...[0A"+"00]..."):
+					e = r.find('\n')
+					if e >= 0 and e != len(r)-1:
+						l, r = r[0:e], r[0:e+1]
+						# back to bytes and get offset to seek after NL:
+						r = r.encode(self.getEncoding(), 'replace')
+						self.__handler.seek(-bl+len(r), 1)
+						return l
+					# trim new-line at end and check the line was written complete (contains a new-line):
+					l = r.rstrip('\r\n')
+					if l != r:
+						return l
 				# not fulfilled - seek back and return:
-				self.__handler.seek(self.__pos, 0)
+				self.__handler.seek(-bl, 1)
 				return None
-		# shift position (to be able to seek back above):
-		self.__pos += len(rl)
-		return FileContainer.decode_line(
-			self.getFileName(), self.getEncoding(), l)
+		return l
 
 	def close(self):
 		if self.__handler is not None:
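
Illustration (not part of the patch, stdlib only): why a raw byte-level readline() can stop in the middle of a code point, which is the situation the buffering loop in readLine() above compensates for. The sample log text is made up for demonstration.

# Sketch of the underlying problem: in UTF-16LE, U+020A encodes to b'\x0a\x02' and
# '\n' to b'\x0a\x00', so a 0x0A byte may sit inside a character or inside the EOL.
import io

data = u'invalid user Test\u020A from 192.0.2.1\n'.encode('utf-16le')
f = io.BytesIO(data)
chunk = f.readline()           # byte-level readline() stops at the first 0x0A byte
print(len(chunk), len(data))   # 35 vs. 68 -- the "line" ends inside U+020A
try:
	chunk.decode('utf-16le', 'strict')
except UnicodeDecodeError as e:
	# e.end == len(chunk) and the offending byte is b'\x0a', i.e. exactly the
	# "incomplete end of line" case that decode_line() now handles without a warning:
	print(e.start, e.end, len(chunk), chunk[e.start:e.end])
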
diff --git a/fail2ban/tests/filtertestcase.py b/fail2ban/tests/filtertestcase.py
index 799adfd3..f8621f29 100644
--- a/fail2ban/tests/filtertestcase.py
+++ b/fail2ban/tests/filtertestcase.py
@@ -644,6 +644,19 @@ class LogFile(LogCaptureTestCase):
 		self.filter = FilterPoll(None)
 		self.assertRaises(IOError, self.filter.addLogPath, LogFile.MISSING)
 
+	def testDecodeLineWarn(self):
+		# incomplete line (missing byte at end), warning is suppressed:
+		l = u"correct line\n"
+		r = l.encode('utf-16le')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r), l)
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r[0:-1]), l[0:-1])
+		self.assertNotLogged('Error decoding line')
+		# complete line (incorrect surrogate in the middle), warning is there:
+		r = b"incorrect \xc8\x0a line\n"
+		l = r.decode('utf-8', 'replace')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-8', r), l)
+		self.assertLogged('Error decoding line')
+
 
 class LogFileFilterPoll(unittest.TestCase):
 
@@ -1633,16 +1646,49 @@ class GetFailures(LogCaptureTestCase):
 	def testCRLFFailures01(self):
 		# We first adjust logfile/failures to end with CR+LF
 		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
-		# poor man unix2dos:
-		fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
-		for l in fin.read().splitlines():
-			fout.write(l + b'\r\n')
-		fin.close()
-		fout.close()
+		try:
+			# poor man unix2dos:
+			fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
+			for l in fin.read().splitlines():
+				fout.write(l + b'\r\n')
+			fin.close()
+			fout.close()
 
-		# now see if we should be getting the "same" failures
-		self.testGetFailures01(filename=fname)
-		_killfile(fout, fname)
+			# now see if we should be getting the "same" failures
+			self.testGetFailures01(filename=fname)
+		finally:
+			_killfile(fout, fname)
+
+	def testNLCharAsPartOfUniChar(self):
+		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		# test two multi-byte encodings (both contain `\x0A` in either \x02\x0A or \x0A\x02):
+		for enc in ('utf-16be', 'utf-16le'):
+			self.pruneLog("[test-phase encoding=%s]" % enc)
+			try:
+				fout = open(fname, 'wb')
+				tm = int(time.time())
+				# test on unicode string containing \x0A as part of uni-char,
+				# it must produce exactly 2 lines (both are failures):
+				for l in (
+					u'%s \u20AC Failed auth: invalid user Test\u020A from 192.0.2.1\n' % tm,
+					u'%s \u20AC Failed auth: invalid user TestI from 192.0.2.2\n' % tm
+				):
+					fout.write(l.encode(enc))
+				fout.close()
+
+				self.filter.setLogEncoding(enc)
+				self.filter.addLogPath(fname, autoSeek=0)
+				self.filter.setDatePattern((r'^EPOCH',))
+				self.filter.addFailRegex(r"Failed .* from <HOST>")
+				self.filter.getFailures(fname)
+				self.assertLogged(
+					"[DummyJail] Found 192.0.2.1",
+					"[DummyJail] Found 192.0.2.2", all=True, wait=True)
+			finally:
+				_killfile(fout, fname)
+				self.filter.delLogPath(fname)
+		# must find 4 failures and generate 2 tickets (2 IPs with each 2 failures):
+		self.assertEqual(self.filter.failManager.getFailCount(), (2, 4))
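
For reference (illustration only, not part of the patch): the byte patterns the new test relies on can be checked directly with the stdlib; both encodings place a 0x0A byte inside the encoded code point and inside the encoded '\n', just on different sides:

# Sketch of the `\x02\x0A` / `\x0A\x02` remark in the test comment above.
for enc in ('utf-16be', 'utf-16le'):
	print(enc, u'\u020A'.encode(enc), u'\n'.encode(enc))
# expected output:
#   utf-16be b'\x02\n' b'\x00\n'
#   utf-16le b'\n\x02' b'\n\x00'
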
 
 	def testGetFailures02(self):
 		output = ('141.3.81.106', 4, 1124013539.0,
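
A quick stdlib-only check of the suppression condition added to decode_line() (a sketch, not part of the patch; it mirrors testDecodeLineWarn without importing fail2ban):

# The last byte of the 2-byte '\n' is assumed to be still unread, as when tailing a log.
line = u"correct line\n".encode('utf-16le')[:-1]
try:
	line.decode('utf-16le', 'strict')
except UnicodeDecodeError as e:
	# this is the `e.end == len(line) and line[e.start] in b'\r\n'` case from the patch:
	print(e.end == len(line), line[e.start:e.start+1] in (b'\r', b'\n'))   # True True
	print(line[0:e.start].decode('utf-16le', 'replace'))                   # correct line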