readline fixed to consider interim new-line character as part of code point in multi-byte logs (e. g. unicode: utf-16be, utf-16le);

suppress warning "Error decoding line" for incomplete line (produced by not fully read multi-byte new-line character at end of data); added test coverage for such logs
2021-03-23 12:57:42 +01:00 · 2021-03-23 12:57:42 +01:00 · cbac7c176a
parent 996920cdaa
commit cbac7c176a
2 changed files with 100 additions and 23 deletions
--- a/fail2ban/server/filter.py
+++ b/fail2ban/server/filter.py
@ -1398,6 +1398,9 @@ class FileContainer:
 		try:
 			return line.decode(enc, 'strict')
 		except (UnicodeDecodeError, UnicodeEncodeError) as e:
+			# avoid warning if got incomplete end of line (e. g. '\n' in "...[0A" followed by "00]..." for utf-16le:
+			if (e.end == len(line) and line[e.start] in b'\r\n'):
+				return line[0:e.start].decode(enc, 'replace')
 			global _decode_line_warn
 			lev = 7
 			if not _decode_line_warn.get(filename, 0):
@ -1406,9 +1409,9 @@ class FileContainer:
 			logSys.log(lev,
 				"Error decoding line from '%s' with '%s'.", filename, enc)
 			if logSys.getEffectiveLevel() <= lev:
-				logSys.log(lev, "Consider setting logencoding=utf-8 (or another appropriate"
-					" encoding) for this jail. Continuing"
-					" to process line ignoring invalid characters: %r",
+				logSys.log(lev,
+					"Consider setting logencoding to appropriate encoding for this jail. "
+					"Continuing to process line ignoring invalid characters: %r",
 					line)
 			# decode with replacing error chars:
 			line = line.decode(enc, 'replace')
@ -1422,23 +1425,51 @@ class FileContainer:
 		if line is not complete (and complete=True) or there is no content to read.
 		If line is complete (and complete is True), it also shift current known 
 		position to begin of next line.
+
+		Also it is safe against interim new-line bytes (e. g. part of multi-byte char)
+		in given encoding.
 		"""
 		if self.__handler is None:
 			return ""
-		rl = self.__handler.readline()
-		if rl == b'':
+		# read raw bytes up to \n char:
+		b = self.__handler.readline()
+		if not b:
 			return None
-		# trim new-line here and check the line was written complete (contains a new-line):
-		l = rl.rstrip(b'\r\n')
-		if self.inOperation and complete:
-			if l == rl:
+		bl = len(b)
+		# convert to log-encoding (new-line char could disappear if it is part of multi-byte sequence):
+		r = FileContainer.decode_line(
+			self.getFileName(), self.getEncoding(), b)
+		# trim new-line at end and check the line was written complete (contains a new-line):
+		l = r.rstrip('\r\n')
+		if complete:
+			if l == r:
+				# try to fill buffer in order to find line-end in log encoding:
+				fnd = 0
+				while 1:
+					r = self.__handler.readline()
+					if not r:
+						break
+					b += r
+					bl += len(r)
+					# convert to log-encoding:
+					r = FileContainer.decode_line(
+						self.getFileName(), self.getEncoding(), b)
+					# ensure new-line is not in the middle (buffered 2 strings, e. g. in utf-16le it is "...[0A"+"00]..."):
+					e = r.find('\n')
+					if e >= 0 and e != len(r)-1:
+						l, r = r[0:e], r[0:e+1]
+						# back to bytes and get offset to seek after NL:
+						r = r.encode(self.getEncoding(), 'replace')
+						self.__handler.seek(-bl+len(r), 1)
+						return l
+					# trim new-line at end and check the line was written complete (contains a new-line):
+					l = r.rstrip('\r\n')
+					if l != r:
+						return l
 				# not fulfilled - seek back and return:
-				self.__handler.seek(self.__pos, 0)
+				self.__handler.seek(-bl, 1)
 				return None
-			# shift position (to be able to seek back above):
-			self.__pos += len(rl)
-		return FileContainer.decode_line(
-			self.getFileName(), self.getEncoding(), l)
+		return l

 	def close(self):
 		if self.__handler is not None:
--- a/fail2ban/tests/filtertestcase.py
+++ b/fail2ban/tests/filtertestcase.py
@ -644,6 +644,19 @@ class LogFile(LogCaptureTestCase):
 		self.filter = FilterPoll(None)
 		self.assertRaises(IOError, self.filter.addLogPath, LogFile.MISSING)

+	def testDecodeLineWarn(self):
+		# incomplete line (missing byte at end), warning is suppressed:
+		l = u"correct line\n"
+		r = l.encode('utf-16le')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r), l)
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-16le', r[0:-1]), l[0:-1])
+		self.assertNotLogged('Error decoding line')
+		# complete line (incorrect surrogate in the middle), warning is there:
+		r = b"incorrect \xc8\x0a line\n"
+		l = r.decode('utf-8', 'replace')
+		self.assertEqual(FileContainer.decode_line('TESTFILE', 'utf-8', r), l)
+		self.assertLogged('Error decoding line')
+

 class LogFileFilterPoll(unittest.TestCase):

@ -1633,16 +1646,49 @@ class GetFailures(LogCaptureTestCase):
 	def testCRLFFailures01(self):
 		# We first adjust logfile/failures to end with CR+LF
 		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
-		# poor man unix2dos:
-		fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
-		for l in fin.read().splitlines():
-			fout.write(l + b'\r\n')
-		fin.close()
-		fout.close()
+		try:
+			# poor man unix2dos:
+			fin, fout = open(GetFailures.FILENAME_01, 'rb'), open(fname, 'wb')
+			for l in fin.read().splitlines():
+				fout.write(l + b'\r\n')
+			fin.close()
+			fout.close()

-		# now see if we should be getting the "same" failures
-		self.testGetFailures01(filename=fname)
-		_killfile(fout, fname)
+			# now see if we should be getting the "same" failures
+			self.testGetFailures01(filename=fname)
+		finally:
+			_killfile(fout, fname)
+
+	def testNLCharAsPartOfUniChar(self):
+		fname = tempfile.mktemp(prefix='tmp_fail2ban', suffix='crlf')
+		# test two multi-byte encodings (both contains `\x0A` in either \x02\x0A or \x0A\x02):
+		for enc in ('utf-16be', 'utf-16le'):
+			self.pruneLog("[test-phase encoding=%s]" % enc)
+			try:
+				fout = open(fname, 'wb')
+				tm = int(time.time())
+				# test on unicode string containing \x0A as part of uni-char,
+				# it must produce exactly 2 lines (both are failures):
+				for l in (
+					u'%s \u20AC Failed auth: invalid user Test\u020A from 192.0.2.1\n' % tm,
+					u'%s \u20AC Failed auth: invalid user TestI from 192.0.2.2\n' % tm
+				):
+					fout.write(l.encode(enc))
+				fout.close()
+
+				self.filter.setLogEncoding(enc)
+				self.filter.addLogPath(fname, autoSeek=0)
+				self.filter.setDatePattern((r'^EPOCH',))
+				self.filter.addFailRegex(r"Failed .* from <HOST>")
+				self.filter.getFailures(fname)
+				self.assertLogged(
+					"[DummyJail] Found 192.0.2.1",
+					"[DummyJail] Found 192.0.2.2", all=True, wait=True)
+			finally:
+				_killfile(fout, fname)
+				self.filter.delLogPath(fname)
+		# must find 4 failures and generate 2 tickets (2 IPs with each 2 failures):
+		self.assertEqual(self.filter.failManager.getFailCount(), (2, 4))

 	def testGetFailures02(self):
 		output = ('141.3.81.106', 4, 1124013539.0,