From 405e26e22d2d7ede6a9dbda56e7e30f1e98d4b76 Mon Sep 17 00:00:00 2001
From: Chris Caron And a paragraph too. Some obnoxious text here. line 1 line 2 line 3 line 1 line 2 line 3 line 1>"
+ " line 2"
+ " line 3>") == "line 1\nline 2\nline 3>"
+
+ # Make sure we ignore fields that aren't important to us
+ assert to_html(
+ ""
+ " line 1
return re.sub(
- r'\r*\n', '
\r\n', re_table.sub(lambda x: re_map[x.group()], body))
+ r'\r*\n', '
\n', re_table.sub(lambda x: re_map[x.group()], body))
def html_to_text(body):
@@ -94,37 +96,134 @@ def html_to_text(body):
"""
parser = HTMLConverter()
+ if six.PY2:
+ # Python 2.7 requires an additional parsing to un-escape characters
+ body = parser.unescape(body)
+
parser.feed(body)
parser.close()
- return parser.converted
+ result = parser.converted
+
+ return result
class HTMLConverter(HTMLParser, object):
"""An HTML to plain text converter tuned for email messages."""
+ # The following tags must start on a new line
+ BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+ 'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+
+ # the following tags ignore any internal text
+ IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
+
+ # Condense Whitespace
+ WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)
+
+ # Sentinel value for block tag boundaries, which may be consolidated into a
+ # single line break.
+ BLOCK_END = {}
+
def __init__(self, **kwargs):
super(HTMLConverter, self).__init__(**kwargs)
+ # Should we store the text content or not?
+ self._do_store = True
+
+ # Initialize internal result list
+ self._result = []
+
+ # Initialize public result field (not populated until close() is
+ # called)
self.converted = ""
def close(self):
- # Removes all html before the last "}". Some HTML can return additional
- # style information with text output.
- self.converted = str(self.converted).split('}')[-1].strip()
-
- def handle_data(self, data):
- self.converted += data.strip()
+ string = ''.join(self._finalize(self._result))
+ self.converted = string.strip()
+
+ if six.PY2:
+ # See https://stackoverflow.com/questions/10993612/\
+ # how-to-remove-xa0-from-string-in-python
+ #
+ # This is required since unescape() replaces &nbsp; with \xa0 when
+ # using Python 2.7
+ self.converted = self.converted.replace(u'\xa0', u' ')
+
+ def _finalize(self, result):
+ """
+ Combines and strips consecutive strings, then converts consecutive
+ block ends into singleton newlines.
+
+ [ {be} " Hello " {be} {be} " World!" ] -> "\nHello\nWorld!"
+ """
+
+ # None means the last visited item was a block end.
+ accum = None
+
+ for item in result:
+ if item == self.BLOCK_END:
+ # Multiple consecutive block ends; do nothing.
+ if accum is None:
+ continue
+
+ # First block end; yield the current string, plus a newline.
+ yield accum.strip() + '\n'
+ accum = None
+
+ # Multiple consecutive strings; combine them.
+ elif accum is not None:
+ accum += item
+
+ # First consecutive string; store it.
+ else:
+ accum = item
+
+ # Yield the last string if we have not already done so.
+ if accum is not None:
+ yield accum.strip()
+
+ def handle_data(self, data, *args, **kwargs):
+ """
+ Store our data if it is not on the ignore list
+ """
+
+ # Skip text that follows an ignored tag (see IGNORE_TAGS)
+ if self._do_store:
+
+ # Tidy our whitespace
+ content = self.WS_TRIM.sub(' ', data)
+ self._result.append(content)
def handle_starttag(self, tag, attrs):
+ """
+ Process our starting HTML Tag
+ """
+ # Toggle initial states
+ self._do_store = tag not in self.IGNORE_TAGS
+
+ if tag in self.BLOCK_TAGS:
+ self._result.append(self.BLOCK_END)
+
if tag == 'li':
- self.converted += linesep + '- '
- elif tag == 'blockquote':
- self.converted += linesep + linesep + '\t'
- elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'):
- self.converted += linesep + '\n'
+ self._result.append('- ')
+
elif tag == 'br':
- self.converted += linesep
+ self._result.append('\n')
+
+ elif tag == 'hr':
+ if self._result:
+ self._result[-1] = self._result[-1].rstrip(' ')
+
+ self._result.append('\n---\n')
+
+ elif tag == 'blockquote':
+ self._result.append(' >')
def handle_endtag(self, tag):
- if tag == 'blockquote':
- self.converted += linesep + linesep
+ """
+ Edge case handling of open/close tags
+ """
+ self._do_store = True
+
+ if tag in self.BLOCK_TAGS:
+ self._result.append(self.BLOCK_END)
diff --git a/test/test_conversion.py b/test/test_conversion.py
index 506fb806..ddaed179 100644
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@@ -25,6 +25,7 @@
from apprise import NotifyFormat
from apprise.conversion import convert_between
+import pytest
# Disable logging for a cleaner testing output
import logging
@@ -35,24 +36,101 @@ def test_html_to_text():
"""conversion: Test HTML to plain text
"""
- def convert(body):
+ def to_html(body):
+ """
+ A function to simplify html conversion tests
+ """
return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body)
- assert convert("No HTML code here.") == "No HTML code here."
+ assert to_html("No HTML code here.") == "No HTML code here."
- clist = convert("
")
+ clist = to_html("
")
assert "Lots and lots" in clist
assert "of lists." in clist
- assert "To be or not to be." in convert(
+ assert "To be or not to be." in to_html(
"To be or not to be.
")
- cspace = convert(
+ cspace = to_html(
"Fancy heading
"
"
Plus line break.
and )
+ assert to_html(
+ "some information
and more information") == \
+ "some information\n\nand more information"
+
+ #
+ # Test bad tags
+ #
+
+ # first 2 entries are okay, but last will do as best as it can
+ assert to_html(
+ "
3rd line") == \ + "line 1 bold\nmy link\n3rd line" + + #