readline fixed to consider an interim new-line character as part of a code point in multi-byte logs (e.g. unicode: utf-16be, utf-16le);

suppress the "Error decoding line" warning for an incomplete line (produced by a multi-byte new-line character that was not fully read at the end of the data);
added test coverage for such logs
pull/2337/merge
sebres 2021-03-23 12:57:42 +01:00
parent 996920cdaa
commit cbac7c176a
2 changed files with 100 additions and 23 deletions
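
For context (illustration only, not part of the commit): in UTF-16 the byte 0x0A can occur inside an ordinary character, so a byte-oriented readline() that stops at the first 0x0A splits the code point and garbles the remainder of the line. A minimal Python 3 sketch:

	import io

	# U+020A encodes to b'\x0a\x02' in utf-16le, so the 0x0A byte that a raw
	# readline() stops at sits in the middle of a code point.
	data = u'Failed auth: invalid user Test\u020a from 192.0.2.1\n'.encode('utf-16le')
	f = io.BytesIO(data)
	chunk = f.readline()                               # stops at the low byte of U+020A
	print(chunk.decode('utf-16le', 'replace'))         # '... invalid user Test' plus a replacement char
	print(f.readline().decode('utf-16le', 'replace'))  # rest of the line, decoded one byte out of phase -- garbage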


@@ -1398,6 +1398,9 @@ class FileContainer:
 		try:
 			return line.decode(enc, 'strict')
 		except (UnicodeDecodeError, UnicodeEncodeError) as e:
+			# avoid warning if got incomplete end of line (e. g. '\n' in "...[0A" followed by "00]..." for utf-16le:
+			if (e.end == len(line) and line[e.start] in b'\r\n'):
+				return line[0:e.start].decode(enc, 'replace')
 			global _decode_line_warn
 			lev = 7
 			if not _decode_line_warn.get(filename, 0):
@@ -1406,9 +1409,9 @@ class FileContainer:
 			logSys.log(lev,
 				"Error decoding line from '%s' with '%s'.", filename, enc)
 			if logSys.getEffectiveLevel() <= lev:
-				logSys.log(lev, "Consider setting logencoding=utf-8 (or another appropriate"
-					" encoding) for this jail. Continuing"
-					" to process line ignoring invalid characters: %r",
+				logSys.log(lev,
+					"Consider setting logencoding to appropriate encoding for this jail. "
+					"Continuing to process line ignoring invalid characters: %r",
 					line)
 			# decode with replacing error chars:
 			line = line.decode(enc, 'replace')
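
A standalone illustration (not part of the diff) of the condition the new early return checks: when only the trailing multi-byte new-line is cut off, the decode error ends exactly at the last byte and that byte belongs to b'\r\n', so the chunk can be decoded without emitting the warning. Sketch assuming Python 3:

	raw = u"correct line\n".encode('utf-16le')[:-1]      # high byte of the trailing '\n' not yet written
	try:
		raw.decode('utf-16le', 'strict')
	except UnicodeDecodeError as e:
		print(e.end == len(raw), raw[e.start] in b'\r\n')    # True True -> warning can be suppressed
		print(raw[0:e.start].decode('utf-16le', 'replace'))  # 'correct line'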
@@ -1422,23 +1425,51 @@ class FileContainer:
 		if line is not complete (and complete=True) or there is no content to read.
 		If line is complete (and complete is True), it also shift current known
 		position to begin of next line.
+
+		Also it is safe against interim new-line bytes (e. g. part of multi-byte char)
+		in given encoding.
 		"""
 		if self.__handler is None:
 			return ""
-		rl = self.__handler.readline()
-		if rl == b'':
+		# read raw bytes up to \n char:
+		b = self.__handler.readline()
+		if not b:
 			return None
-		# trim new-line here and check the line was written complete (contains a new-line):
-		l = rl.rstrip(b'\r\n')
-		if self.inOperation and complete:
-			if l == rl:
+		bl = len(b)
+		# convert to log-encoding (new-line char could disappear if it is part of multi-byte sequence):
+		r = FileContainer.decode_line(
+			self.getFileName(), self.getEncoding(), b)
+		# trim new-line at end and check the line was written complete (contains a new-line):
+		l = r.rstrip('\r\n')
+		if complete:
+			if l == r:
+				# try to fill buffer in order to find line-end in log encoding:
+				fnd = 0
+				while 1:
+					r = self.__handler.readline()
+					if not r:
+						break
+					b += r
+					bl += len(r)
+					# convert to log-encoding:
+					r = FileContainer.decode_line(
+						self.getFileName(), self.getEncoding(), b)
+					# ensure new-line is not in the middle (buffered 2 strings, e. g. in utf-16le it is "...[0A"+"00]..."):
+					e = r.find('\n')
+					if e >= 0 and e != len(r)-1:
+						l, r = r[0:e], r[0:e+1]
+						# back to bytes and get offset to seek after NL:
+						r = r.encode(self.getEncoding(), 'replace')
+						self.__handler.seek(-bl+len(r), 1)
+						return l
+					# trim new-line at end and check the line was written complete (contains a new-line):
+					l = r.rstrip('\r\n')
+					if l != r:
+						return l
 				# not fulfilled - seek back and return:
-				self.__handler.seek(self.__pos, 0)
+				self.__handler.seek(-bl, 1)
 				return None
-			# shift position (to be able to seek back above):
-			self.__pos += len(rl)
-		return FileContainer.decode_line(
-			self.getFileName(), self.getEncoding(), l)
+		return l
 
 	def close(self):
 		if self.__handler is not None:
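
The reworked readline() above over-reads raw bytes until the decoded text contains a real new-line, then re-encodes the consumed part to compute how far to seek back. A simplified, self-contained sketch of that idea (illustration only, not the fail2ban implementation; io.BytesIO stands in for the log file handle, Python 3 assumed):

	import io

	def read_log_line(f, enc):
		# read raw bytes up to the first 0x0A byte (may be the middle of a code point):
		b = f.readline()
		if not b:
			return None
		bl = len(b)
		r = b.decode(enc, 'replace')
		l = r.rstrip('\r\n')
		if l != r:
			return l                       # decoded text already ends with a real new-line
		while True:                        # keep buffering until a real '\n' shows up after decoding
			nxt = f.readline()
			if not nxt:
				f.seek(-bl, 1)             # line still incomplete: rewind and report nothing yet
				return None
			b += nxt
			bl += len(nxt)
			r = b.decode(enc, 'replace')
			e = r.find('\n')
			if e >= 0 and e != len(r) - 1:                # new-line ended up in the middle of the buffer
				consumed = r[:e+1].encode(enc, 'replace')
				f.seek(-bl + len(consumed), 1)            # reposition to the first byte after that new-line
				return r[:e]
			l = r.rstrip('\r\n')
			if l != r:
				return l

	f = io.BytesIO(u'Test\u020a line one\nline two\n'.encode('utf-16le'))
	print(read_log_line(f, 'utf-16le'))   # first line, with U+020A intact
	print(read_log_line(f, 'utf-16le'))   # 'line two'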


@@ -644,6 +644,19 @@ class LogFile(LogCaptureTestCase):
 		self.filter = FilterPoll(None)
 		self.assertRaises(IOError, self.filter.addLogPath, LogFile.MISSING)
 
+	def testDecodeLineWarn(self):
+		# incomplete line (missing byte at end), warning is suppressed:
+		l = u"correct line\n"
+		r = l.encode('utf-16le')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r), l)
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r[0:-1]), l[0:-1])
+		self.assertNotLogged('Error decoding line')
+		# complete line (incorrect surrogate in the middle), warning is there:
+		r = b"incorrect \xc8\x0a line\n"
+		l = r.decode('utf-8', 'replace')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-8', r), l)
+		self.assertLogged('Error decoding line')
+
 
 class LogFileFilterPoll(unittest.TestCase):
@@ -1633,6 +1646,7 @@ class GetFailures(LogCaptureTestCase):
 	def testCRLFFailures01(self):
 		# We first adjust logfile/failures to end with CR+LF
 		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		try:
 			# poor man unix2dos:
 			fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
 			for l in fin.read().splitlines():
@@ -1642,8 +1656,40 @@ class GetFailures(LogCaptureTestCase):
 			# now see if we should be getting the "same" failures
 			self.testGetFailures01(filename=fname)
+		finally:
 			_killfile(fout, fname)
 
+	def testNLCharAsPartOfUniChar(self):
+		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		# test two multi-byte encodings (both contains `\x0A` in either \x02\x0A or \x0A\x02):
+		for enc in ('utf-16be', 'utf-16le'):
+			self.pruneLog("[test-phase encoding=%s]" % enc)
+			try:
+				fout = open(fname, 'wb')
+				tm = int(time.time())
+				# test on unicode string containing \x0A as part of uni-char,
+				# it must produce exactly 2 lines (both are failures):
+				for l in (
+					u'%s \u20AC Failed auth: invalid user Test\u020A from 192.0.2.1\n' % tm,
+					u'%s \u20AC Failed auth: invalid user TestI from 192.0.2.2\n' % tm
+				):
+					fout.write(l.encode(enc))
+				fout.close()
+
+				self.filter.setLogEncoding(enc)
+				self.filter.addLogPath(fname, autoSeek=0)
+				self.filter.setDatePattern((r'^EPOCH',))
+				self.filter.addFailRegex(r"Failed .* from <HOST>")
+				self.filter.getFailures(fname)
+				self.assertLogged(
+					"[DummyJail] Found 192.0.2.1",
+					"[DummyJail] Found 192.0.2.2", all=True, wait=True)
+			finally:
+				_killfile(fout, fname)
+				self.filter.delLogPath(fname)
+		# must find 4 failures and generate 2 tickets (2 IPs with each 2 failures):
+		self.assertEqual(self.filter.failManager.getFailCount(), (2, 4))
+
 	def testGetFailures02(self):
 		output = ('141.3.81.106', 4, 1124013539.0,
 			[u'Aug 14 11:%d:59 i60p295 sshd[12365]: Failed publickey for roehl from ::ffff:141.3.81.106 port 51332 ssh2'