|
|
|
@ -27,11 +27,11 @@
|
|
|
|
|
import re
|
|
|
|
|
import six
|
|
|
|
|
from markdown import markdown
|
|
|
|
|
from os import linesep
|
|
|
|
|
from .common import NotifyFormat
|
|
|
|
|
|
|
|
|
|
if six.PY2:
|
|
|
|
|
from HTMLParser import HTMLParser
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
|
|
|
@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body):
|
|
|
|
|
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown,
|
|
|
|
|
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
|
|
|
|
|
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
|
|
|
|
|
# For now; use same converter for Markdown support
|
|
|
|
|
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
convert = converters.get((from_format, to_format))
|
|
|
|
@ -85,7 +87,7 @@ def text_to_html(body):
|
|
|
|
|
# Execute our map against our body in addition to
|
|
|
|
|
# swapping out new lines and replacing them with <br/>
|
|
|
|
|
return re.sub(
|
|
|
|
|
r'\r*\n', '<br/>\r\n', re_table.sub(lambda x: re_map[x.group()], body))
|
|
|
|
|
r'\r*\n', '<br/>\n', re_table.sub(lambda x: re_map[x.group()], body))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def html_to_text(body):
|
|
|
|
@ -94,37 +96,134 @@ def html_to_text(body):
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
parser = HTMLConverter()
|
|
|
|
|
if six.PY2:
|
|
|
|
|
# Python 2.7 requires an additional parsing to un-escape characters
|
|
|
|
|
body = parser.unescape(body)
|
|
|
|
|
|
|
|
|
|
parser.feed(body)
|
|
|
|
|
parser.close()
|
|
|
|
|
return parser.converted
|
|
|
|
|
result = parser.converted
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class HTMLConverter(HTMLParser, object):
    """An HTML to plain text converter tuned for email messages.

    Text is collected into an internal list while the document is fed to
    the parser.  Block-level tag boundaries are recorded with the
    ``BLOCK_END`` sentinel so that any run of adjacent boundaries collapses
    into a single newline.  The final text is only available through the
    ``converted`` attribute once ``close()`` has been called.
    """

    # The following tags must start on a new line
    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)

    # The following tags ignore any internal text
    IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')

    # Condense Whitespace
    WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)

    # Sentinel value for block tag boundaries, which may be consolidated into a
    # single line break.  It is only ever compared by identity.
    BLOCK_END = {}

    def __init__(self, **kwargs):
        """
        Prepare an empty converter; keyword arguments are forwarded to
        HTMLParser unchanged.
        """
        super(HTMLConverter, self).__init__(**kwargs)

        # Should we store the text content or not?  This is re-evaluated on
        # every start tag and reset to True on every end tag.
        self._do_store = True

        # Initialize internal result list; it holds strings and BLOCK_END
        # sentinels in document order
        self._result = []

        # Initialize public result field (not populated until close() is
        # called)
        self.converted = ""

    def close(self):
        """
        Flush the parser and populate ``converted`` with the final text.
        """
        # The HTMLParser documentation requires overrides of close() to call
        # the base implementation; without it any input still buffered by
        # the parser (e.g. text ending in a dangling '&') is silently lost.
        super(HTMLConverter, self).close()

        string = ''.join(self._finalize(self._result))
        self.converted = string.strip()

        if six.PY2:
            # See https://stackoverflow.com/questions/10993612/\
            #       how-to-remove-xa0-from-string-in-python
            #
            # This is required since unescape() replaces &nbsp; with \xa0
            # when using Python 2.7
            self.converted = self.converted.replace(u'\xa0', u' ')

    def _finalize(self, result):
        """
        Combines and strips consecutive strings, then converts consecutive
        block ends into singleton newlines.

        Leading block ends produce no output.  With ``{be}`` standing for
        ``BLOCK_END``:

            [ {be} " Hello " {be} {be} " World!" ] -> "Hello\\nWorld!"

        This is a generator; it yields one stripped chunk per run of
        consecutive strings.
        """

        # None means the last visited item was a block end.
        accum = None

        for item in result:
            # The sentinel is matched by identity so that no string content
            # can ever be mistaken for a block boundary.
            if item is self.BLOCK_END:
                # Multiple consecutive block ends; do nothing.
                if accum is None:
                    continue

                # First block end; yield the current string, plus a newline.
                yield accum.strip() + '\n'
                accum = None

            # Multiple consecutive strings; combine them.
            elif accum is not None:
                accum += item

            # First consecutive string; store it.
            else:
                accum = item

        # Yield the last string if we have not already done so.
        if accum is not None:
            yield accum.strip()

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
        """

        # Content found directly inside an ignored tag is dropped
        if self._do_store:

            # Tidy our whitespace
            content = self.WS_TRIM.sub(' ', data)
            self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag
        """
        # Toggle initial states
        self._do_store = tag not in self.IGNORE_TAGS

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag == 'li':
            self._result.append('- ')

        elif tag == 'br':
            self._result.append('\n')

        elif tag == 'hr':
            # Trim trailing spaces from the preceding text.  The previous
            # entry may be the BLOCK_END sentinel (e.g. "<p>x</p><hr>"),
            # which is not a string and must be left untouched.
            if self._result and self._result[-1] is not self.BLOCK_END:
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'blockquote':
            self._result.append(' >')

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        # Any closing tag re-enables text storage
        self._do_store = True

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)
|
|
|
|
|