|
|
|
@ -27,11 +27,11 @@
|
|
|
|
|
import re |
|
|
|
|
import six |
|
|
|
|
from markdown import markdown |
|
|
|
|
from os import linesep |
|
|
|
|
from .common import NotifyFormat |
|
|
|
|
|
|
|
|
|
if six.PY2: |
|
|
|
|
from HTMLParser import HTMLParser |
|
|
|
|
|
|
|
|
|
else: |
|
|
|
|
from html.parser import HTMLParser |
|
|
|
|
|
|
|
|
@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body):
|
|
|
|
|
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown, |
|
|
|
|
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html, |
|
|
|
|
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text, |
|
|
|
|
# For now; use same converter for Markdown support |
|
|
|
|
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
convert = converters.get((from_format, to_format)) |
|
|
|
@ -85,7 +87,7 @@ def text_to_html(body):
|
|
|
|
|
# Execute our map against our body in addition to |
|
|
|
|
# swapping out new lines and replacing them with <br/> |
|
|
|
|
return re.sub( |
|
|
|
|
r'\r*\n', '<br/>\r\n', re_table.sub(lambda x: re_map[x.group()], body)) |
|
|
|
|
r'\r*\n', '<br/>\n', re_table.sub(lambda x: re_map[x.group()], body)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def html_to_text(body): |
|
|
|
@ -94,37 +96,134 @@ def html_to_text(body):
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
|
parser = HTMLConverter() |
|
|
|
|
if six.PY2: |
|
|
|
|
# Python 2.7 requires an additional parsing to un-escape characters |
|
|
|
|
body = parser.unescape(body) |
|
|
|
|
|
|
|
|
|
parser.feed(body) |
|
|
|
|
parser.close() |
|
|
|
|
return parser.converted |
|
|
|
|
result = parser.converted |
|
|
|
|
|
|
|
|
|
return result |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HTMLConverter(HTMLParser, object):
    """An HTML to plain text converter tuned for email messages.

    Usage: feed the markup with feed(), then call close(). The plain text
    result is then available through the `converted` attribute.
    """

    # The following tags must start on a new line
    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)

    # The following tags ignore any internal text
    IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')

    # Condense Whitespace
    WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)

    # Sentinel value for block tag boundaries, which may be consolidated into a
    # single line break. It is always compared by identity.
    BLOCK_END = {}

    def __init__(self, **kwargs):
        """
        Initialize our parser; all keyword arguments are passed along to
        HTMLParser unchanged.
        """
        super(HTMLConverter, self).__init__(**kwargs)

        # Should we store the text content or not?
        self._do_store = True

        # Initialize internal result list; it holds text fragments
        # interleaved with BLOCK_END sentinels
        self._result = []

        # Initialize public result field (not populated until close() is
        # called)
        self.converted = ""

    def close(self):
        """
        Finish parsing and populate `converted` with the plain text result
        """
        # Let the base parser flush whatever it is still buffering first
        # (for example trailing text holding an unterminated '&'); without
        # this call that text never reaches handle_data() and is lost.
        super(HTMLConverter, self).close()

        string = ''.join(self._finalize(self._result))
        self.converted = string.strip()

        # See https://stackoverflow.com/questions/10993612/\
        #       how-to-remove-xa0-from-string-in-python
        #
        # This is required since unescape() replaces nbsp; with \xa0 when
        # using Python 2.7. Under Python 3 the whitespace condensing done in
        # handle_data() has already swapped these out, so this is a no-op.
        self.converted = self.converted.replace(u'\xa0', u' ')

    def _finalize(self, result):
        """
        Combines and strips consecutive strings, then converts consecutive
        block ends into singleton newlines.

        [ {be} " Hello " {be} {be} " World!" ] -> "\nHello\nWorld!"

        This is a generator; it yields the text pieces in order.
        """

        # None means the last visited item was a block end.
        accum = None

        for item in result:
            if item is self.BLOCK_END:
                # Multiple consecutive block ends; do nothing.
                if accum is None:
                    continue

                # First block end; yield the current string, plus a newline.
                yield accum.strip() + '\n'
                accum = None

            # Multiple consecutive strings; combine them.
            elif accum is not None:
                accum += item

            # First consecutive string; store it.
            else:
                accum = item

        # Yield the last string if we have not already done so.
        if accum is not None:
            yield accum.strip()

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
        """

        if self._do_store:
            # Tidy our whitespace; every run of whitespace becomes 1 space
            content = self.WS_TRIM.sub(' ', data)
            self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag
        """
        # Toggle initial states; text that follows is only stored if this
        # tag is not one we ignore
        self._do_store = tag not in self.IGNORE_TAGS

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag == 'li':
            self._result.append('- ')

        elif tag == 'br':
            self._result.append('\n')

        elif tag == 'hr':
            # Drop trailing spaces from the text ahead of the rule. The
            # last entry can also be the BLOCK_END sentinel (which is not a
            # string and has no rstrip()), so leave that one alone.
            if self._result and self._result[-1] is not self.BLOCK_END:
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'blockquote':
            self._result.append(' >')

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        # Any closing tag turns storing back on; nesting is not tracked
        self._do_store = True

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)
|
|
|
|