From 405e26e22d2d7ede6a9dbda56e7e30f1e98d4b76 Mon Sep 17 00:00:00 2001 From: Chris Caron Date: Tue, 1 Feb 2022 22:23:42 -0500 Subject: [PATCH] HTML to TEXT/MARKDOWN cleanup and refactoring (#530) --- apprise/Apprise.py | 66 ++++++++++---------- apprise/conversion.py | 133 +++++++++++++++++++++++++++++++++++----- test/test_conversion.py | 90 +++++++++++++++++++++++++-- 3 files changed, 234 insertions(+), 55 deletions(-) diff --git a/apprise/Apprise.py b/apprise/Apprise.py index 77d8a895..f535786a 100644 --- a/apprise/Apprise.py +++ b/apprise/Apprise.py @@ -514,39 +514,19 @@ class Apprise(object): # was set to None), or we did define a tag and the logic above # determined we need to notify the service it's associated with if server.notify_format not in conversion_map: - conversion_map[server.notify_format] = \ - convert_between(body_format, server.notify_format, body) - - if interpret_escapes: - # - # Escape our content - # - - try: - # Added overhead required due to Python 3 Encoding Bug - # identified here: https://bugs.python.org/issue21331 - conversion_map[server.notify_format] = \ - conversion_map[server.notify_format]\ - .encode('ascii', 'backslashreplace')\ - .decode('unicode-escape') - - except UnicodeDecodeError: # pragma: no cover - # This occurs using a very old verion of Python 2.7 such - # as the one that ships with CentOS/RedHat 7.x (v2.7.5). 
- conversion_map[server.notify_format] = \ - conversion_map[server.notify_format] \ - .decode('string_escape') - - except AttributeError: - # Must be of string type - logger.error('Failed to escape message body') - raise TypeError - - if title: + conversion_map[server.notify_format] = convert_between( + body_format, server.notify_format, body) + + if interpret_escapes: + # + # Escape our content + # + try: # Added overhead required due to Python 3 Encoding Bug # identified here: https://bugs.python.org/issue21331 - title = title\ + conversion_map[server.notify_format] = \ + conversion_map[server.notify_format]\ .encode('ascii', 'backslashreplace')\ .decode('unicode-escape') @@ -554,13 +534,35 @@ class Apprise(object): # This occurs using a very old verion of Python 2.7 # such as the one that ships with CentOS/RedHat 7.x # (v2.7.5). - title = title.decode('string_escape') + conversion_map[server.notify_format] = \ + conversion_map[server.notify_format] \ + .decode('string_escape') except AttributeError: # Must be of string type - logger.error('Failed to escape message title') + logger.error('Failed to escape message body') raise TypeError + if title: + try: + # Added overhead required due to Python 3 Encoding + # Bug identified here: + # https://bugs.python.org/issue21331 + title = title\ + .encode('ascii', 'backslashreplace')\ + .decode('unicode-escape') + + except UnicodeDecodeError: # pragma: no cover + # This occurs using a very old verion of Python 2.7 + # such as the one that ships with CentOS/RedHat 7.x + # (v2.7.5). 
+ title = title.decode('string_escape') + + except AttributeError: + # Must be of string type + logger.error('Failed to escape message title') + raise TypeError + yield handler( server, body=conversion_map[server.notify_format], diff --git a/apprise/conversion.py b/apprise/conversion.py index 560a5b9c..fea87bf8 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -27,11 +27,11 @@ import re import six from markdown import markdown -from os import linesep from .common import NotifyFormat if six.PY2: from HTMLParser import HTMLParser + else: from html.parser import HTMLParser @@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body): (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown, (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html, (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text, + # For now; use same converter for Markdown support + (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text, } convert = converters.get((from_format, to_format)) @@ -85,7 +87,7 @@ def text_to_html(body): # Execute our map against our body in addition to # swapping out new lines and replacing them with
return re.sub( - r'\r*\n', '
\r\n', re_table.sub(lambda x: re_map[x.group()], body)) + r'\r*\n', '
\n', re_table.sub(lambda x: re_map[x.group()], body)) def html_to_text(body): @@ -94,37 +96,134 @@ def html_to_text(body): """ parser = HTMLConverter() + if six.PY2: + # Python 2.7 requires an additional parsing to un-escape characters + body = parser.unescape(body) + parser.feed(body) parser.close() - return parser.converted + result = parser.converted + + return result class HTMLConverter(HTMLParser, object): """An HTML to plain text converter tuned for email messages.""" + # The following tags must start on a new line + BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'div', 'td', 'th', 'code', 'pre', 'label', 'li',) + + # the following tags ignore any internal text + IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script') + + # Condense Whitespace + WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE) + + # Sentinel value for block tag boundaries, which may be consolidated into a + # single line break. + BLOCK_END = {} + def __init__(self, **kwargs): super(HTMLConverter, self).__init__(**kwargs) + # Should we store the text content or not? + self._do_store = True + + # Initialize internal result list + self._result = [] + + # Initialize public result field (not populated until close() is + # called) self.converted = "" def close(self): - # Removes all html before the last "}". Some HTML can return additional - # style information with text output. 
- self.converted = str(self.converted).split('}')[-1].strip() - - def handle_data(self, data): - self.converted += data.strip() + string = ''.join(self._finalize(self._result)) + self.converted = string.strip() + + if six.PY2: + # See https://stackoverflow.com/questions/10993612/\ + # how-to-remove-xa0-from-string-in-python + # + # This is required since the unescape() nbsp; with \xa0 when + # using Python 2.7 + self.converted = self.converted.replace(u'\xa0', u' ') + + def _finalize(self, result): + """ + Combines and strips consecutive strings, then converts consecutive + block ends into singleton newlines. + + [ {be} " Hello " {be} {be} " World!" ] -> "\nHello\nWorld!" + """ + + # None means the last visited item was a block end. + accum = None + + for item in result: + if item == self.BLOCK_END: + # Multiple consecutive block ends; do nothing. + if accum is None: + continue + + # First block end; yield the current string, plus a newline. + yield accum.strip() + '\n' + accum = None + + # Multiple consecutive strings; combine them. + elif accum is not None: + accum += item + + # First consecutive string; store it. + else: + accum = item + + # Yield the last string if we have not already done so. 
+ if accum is not None: + yield accum.strip() + + def handle_data(self, data, *args, **kwargs): + """ + Store our data if it is not on the ignore list + """ + + # initialize our previous flag + if self._do_store: + + # Tidy our whitespace + content = self.WS_TRIM.sub(' ', data) + self._result.append(content) def handle_starttag(self, tag, attrs): + """ + Process our starting HTML Tag + """ + # Toggle initial states + self._do_store = tag not in self.IGNORE_TAGS + + if tag in self.BLOCK_TAGS: + self._result.append(self.BLOCK_END) + if tag == 'li': - self.converted += linesep + '- ' - elif tag == 'blockquote': - self.converted += linesep + linesep + '\t' - elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'): - self.converted += linesep + '\n' + self._result.append('- ') + elif tag == 'br': - self.converted += linesep + self._result.append('\n') + + elif tag == 'hr': + if self._result: + self._result[-1] = self._result[-1].rstrip(' ') + + self._result.append('\n---\n') + + elif tag == 'blockquote': + self._result.append(' >') def handle_endtag(self, tag): - if tag == 'blockquote': - self.converted += linesep + linesep + """ + Edge case handling of open/close tags + """ + self._do_store = True + + if tag in self.BLOCK_TAGS: + self._result.append(self.BLOCK_END) diff --git a/test/test_conversion.py b/test/test_conversion.py index 506fb806..ddaed179 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -25,6 +25,7 @@ from apprise import NotifyFormat from apprise.conversion import convert_between +import pytest # Disable logging for a cleaner testing output import logging @@ -35,24 +36,101 @@ def test_html_to_text(): """conversion: Test HTML to plain text """ - def convert(body): + def to_html(body): + """ + A function to simplify html conversion tests + """ return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body) - assert convert("No HTML code here.") == "No HTML code here." 
- clist = convert("") + clist = to_html("") assert "Lots and lots" in clist assert "of lists." in clist - assert "To be or not to be." in convert( + assert "To be or not to be." in to_html( "
To be or not to be.
") - cspace = convert( + cspace = to_html( "

Fancy heading

" "

And a paragraph too.
Plus line break.

") assert "Fancy heading" in cspace assert "And a paragraph too.\nPlus line break." in cspace - assert convert( + assert to_html( "" "

Some obnoxious text here.

") == "Some obnoxious text here." + + assert to_html( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # Case sensitivity + assert to_html( + "

line 1

" + "

line 2

" + "

line 3

") == "line 1\nline 2\nline 3" + + # double new lines (testing
and
) + assert to_html( + "some information

and more information") == \ + "some information\n\nand more information" + + # + # Test bad tags + # + + # first 2 entries are okay, but last will do as best as it can + assert to_html( + "

line 1" + "

line 2" + "

line 3>") == "line 1\nline 2\nline 3>" + + # Make sure we ignore fields that aren't important to us + assert to_html( + "" + "

line 1

" + "Another line without being enclosed") == \ + "line 1\nAnother line without being enclosed" + + # Test cases when there are no new lines (we're dealing with just inline + # entries); an empty entry as well + assert to_html("test " + "my link") == \ + "test my link" + + #

missing + assert to_html("
line 1 bold
" + " my link" + "

3rd line") == \ + "line 1 bold\nmy link\n3rd line" + + #


on it's own + assert to_html("
") == "---" + assert to_html("
") == "---" + + # We need to handle HTML Encodings + assert to_html(""" + + ignore this entry + + Let's handle special html encoding +
+ + """) == "Let's handle special html encoding\n---" + + # If you give nothing, you get nothing in return + assert to_html("") == "" + + with pytest.raises(TypeError): + # Invalid input + assert to_html(None) + + with pytest.raises(TypeError): + # Invalid input + assert to_html(42) + + with pytest.raises(TypeError): + # Invalid input + assert to_html(object)