From cea880bc40cfafe9621e6e1ab7b9e9d762d1a202 Mon Sep 17 00:00:00 2001 From: Andrew Krasichkov Date: Sun, 14 May 2017 16:05:17 +0300 Subject: [PATCH] Try to autodetect encoding for binary string while parsing. Deal with UTF-8 BOM --- gixy/cli/main.py | 4 ++-- gixy/parser/nginx_parser.py | 2 +- gixy/parser/raw_parser.py | 14 ++++++++++++-- tests/parser/test_raw_parser.py | 25 ++++++++++++++++++++++--- 4 files changed, 37 insertions(+), 8 deletions(-) diff --git a/gixy/cli/main.py b/gixy/cli/main.py index 8d5b487..c0b4c65 100644 --- a/gixy/cli/main.py +++ b/gixy/cli/main.py @@ -151,10 +151,10 @@ def main(): with Gixy(config=config) as yoda: if path == '-': - with os.fdopen(sys.stdin.fileno(), 'r') as fdata: + with os.fdopen(sys.stdin.fileno(), 'rb') as fdata: yoda.audit('', fdata, is_stdin=True) else: - with open(path, mode='r') as fdata: + with open(path, mode='rb') as fdata: yoda.audit(path, fdata, is_stdin=False) formatted = formatters()[config.output_format]().format(yoda) diff --git a/gixy/parser/nginx_parser.py b/gixy/parser/nginx_parser.py index ec9506b..ef71378 100644 --- a/gixy/parser/nginx_parser.py +++ b/gixy/parser/nginx_parser.py @@ -23,7 +23,7 @@ class NginxParser(object): def parse_file(self, path, root=None): LOG.debug("Parse file: {}".format(path)) - content = open(path).read() + content = open(path, mode='rb').read() return self.parse(content=content, root=root, path_info=path) def parse(self, content, root=None, path_info=None): diff --git a/gixy/parser/raw_parser.py b/gixy/parser/raw_parser.py index 96cd992..71908f4 100644 --- a/gixy/parser/raw_parser.py +++ b/gixy/parser/raw_parser.py @@ -1,4 +1,6 @@ import logging +import codecs +import six from cached_property import cached_property from pyparsing import ( @@ -27,11 +29,19 @@ class RawParser(object): """ Returns the parsed tree. 
""" - content = data.strip() + if isinstance(data, six.binary_type): + if data[:3] == codecs.BOM_UTF8: + encoding = 'utf-8-sig' + else: + encoding = 'latin1' + content = data.decode(encoding).strip() + else: + content = data.strip() + if not content: return ParseResults() - return self.script.parseString(data, parseAll=True) + return self.script.parseString(content, parseAll=True) @cached_property def script(self): diff --git a/tests/parser/test_raw_parser.py b/tests/parser/test_raw_parser.py index 4a328c6..03421f4 100644 --- a/tests/parser/test_raw_parser.py +++ b/tests/parser/test_raw_parser.py @@ -1,7 +1,4 @@ from nose.tools import assert_equals -import mock -from six import StringIO -from six.moves import builtins from gixy.parser.raw_parser import * @@ -527,6 +524,28 @@ def test_empty_config(): assert_config(config, expected) +def test_utfbom_decoding(): + config = b'''\xef\xbb\xbf +add_header X-Test "Windows-1251"; + ''' + + expected = [ + ['add_header', 'X-Test', 'Windows-1251'] + ] + + assert_config(config, expected) + + +def test_national_comment_decoding(): + config = b''' +# \xeb\xff-\xeb\xff-\xeb\xff = Lya-lya-lya +add_header X-Test "Windows-1251"; + ''' + + actual = RawParser().parse(config) + assert_equals(len(actual.asList()), 2) + + def assert_config(config, expected): actual = RawParser().parse(config) assert_equals(actual.asList(), expected)