2019-05-21 08:24:01 +00:00
|
|
|
# ~*~ coding: utf-8 ~*~
|
|
|
|
#
|
|
|
|
|
2019-05-31 09:40:57 +00:00
|
|
|
import chardet
|
2019-05-21 08:24:01 +00:00
|
|
|
import unicodecsv
|
|
|
|
|
2024-04-28 09:51:00 +00:00
|
|
|
from common.utils import lazyproperty
|
2020-12-07 07:23:05 +00:00
|
|
|
from .base import BaseFileParser
|
2024-04-28 09:51:00 +00:00
|
|
|
from ..const import CSV_FILE_ESCAPE_CHARS
|
2019-05-21 08:24:01 +00:00
|
|
|
|
|
|
|
|
2020-12-07 07:23:05 +00:00
|
|
|
class CSVFileParser(BaseFileParser):
    """Parser that turns raw CSV content into rows of cell values."""

    media_type = 'text/csv'

    @lazyproperty
    def match_escape_chars(self):
        """Return the quote-prefixed forms of every CSV escape character.

        For each entry of ``CSV_FILE_ESCAPE_CHARS`` the result contains the
        entry preceded by a double quote and then by a single quote.  A tuple
        is returned so it can be handed directly to ``str.startswith``.
        """
        return tuple(
            '{}{}'.format(quote, c)
            for c in CSV_FILE_ESCAPE_CHARS
            for quote in ('"', "'")
        )

    @staticmethod
    def _universal_newlines(stream):
        """Yield the content of ``stream`` line by line.

        ``splitlines()`` is used so that every line-ending convention is
        treated as a line boundary, i.e. the data is read as if the file had
        been opened in universal-newlines mode.  Line terminators are not
        included in the yielded items.
        """
        for line in stream.splitlines():
            yield line

    def __parse_row(self, row):
        """Return a copy of ``row`` with escape quotes removed from its cells.

        A string cell whose whitespace-stripped text starts with one of
        ``match_escape_chars`` has its leading single quotes, then its leading
        double quotes, removed.  All other cells are passed through untouched.
        """
        row_escape = []
        for d in row:
            # NOTE(review): the prefix test ignores surrounding whitespace but
            # the quote removal works on the unstripped value, so a cell with
            # whitespace in front of the quote is left as-is.
            if isinstance(d, str) and d.strip().startswith(self.match_escape_chars):
                d = d.lstrip("'").lstrip('"')
            row_escape.append(d)
        return row_escape

    def generate_rows(self, stream_data):
        """Yield the parsed rows contained in ``stream_data``.

        :param stream_data: raw file content; it is passed to
            ``chardet.detect`` and split with ``splitlines()``
            (presumably ``bytes`` -- confirm against the caller).
        :return: generator of rows, each a list of cell values that went
            through ``__parse_row``.
        """
        detect_result = chardet.detect(stream_data)
        # chardet always includes the "encoding" key and sets it to None when
        # it cannot make a guess (e.g. empty input), so a ``dict.get`` default
        # would never be used; fall back explicitly instead.
        encoding = detect_result.get("encoding") or "utf-8"
        lines = self._universal_newlines(stream_data)
        csv_reader = unicodecsv.reader(lines, encoding=encoding)
        for row in csv_reader:
            yield self.__parse_row(row)
|