HTML to TEXT/MARKDOWN cleanup and refactoring (#530)

2022-02-01 22:23:42 -05:00 · 2022-02-01 22:23:42 -05:00 · 405e26e22d
parent 8fa146685f
commit 405e26e22d
3 changed files with 231 additions and 52 deletions
--- a/apprise/Apprise.py
+++ b/apprise/Apprise.py
@ -514,8 +514,8 @@ class Apprise(object):
            # was set to None), or we did define a tag and the logic above
            # determined we need to notify the service it's associated with
            if server.notify_format not in conversion_map:
-                conversion_map[server.notify_format] = \
-                    convert_between(body_format, server.notify_format, body)
+                conversion_map[server.notify_format] = convert_between(
+                    body_format, server.notify_format, body)

                if interpret_escapes:
                    #
@ -531,8 +531,9 @@ class Apprise(object):
                            .decode('unicode-escape')

                    except UnicodeDecodeError:  # pragma: no cover
-                    # This occurs using a very old verion of Python 2.7 such
-                    # as the one that ships with CentOS/RedHat 7.x (v2.7.5).
+                        # This occurs using a very old version of Python 2.7
+                        # such as the one that ships with CentOS/RedHat 7.x
+                        # (v2.7.5).
                        conversion_map[server.notify_format] = \
                            conversion_map[server.notify_format] \
                            .decode('string_escape')
@ -544,8 +545,9 @@ class Apprise(object):

                    if title:
                        try:
-                        # Added overhead required due to Python 3 Encoding Bug
-                        # identified here: https://bugs.python.org/issue21331
+                            # Added overhead required due to Python 3 Encoding
+                            # Bug identified here:
+                            #  https://bugs.python.org/issue21331
                            title = title\
                                .encode('ascii', 'backslashreplace')\
                                .decode('unicode-escape')
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@ -27,11 +27,11 @@
 import re
 import six
 from markdown import markdown
-from os import linesep
 from .common import NotifyFormat

 if six.PY2:
    from HTMLParser import HTMLParser
+
 else:
    from html.parser import HTMLParser

@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body):
        (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown,
        (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
        (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
+        # For now; use same converter for Markdown support
+        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
    }

    convert = converters.get((from_format, to_format))
@ -85,7 +87,7 @@ def text_to_html(body):
    # Execute our map against our body in addition to
    # swapping out new lines and replacing them with <br/>
    return re.sub(
-        r'\r*\n', '<br/>\r\n', re_table.sub(lambda x: re_map[x.group()], body))
+        r'\r*\n', '<br/>\n', re_table.sub(lambda x: re_map[x.group()], body))


 def html_to_text(body):
@ -94,37 +96,134 @@ def html_to_text(body):
    """

    parser = HTMLConverter()
+    if six.PY2:
+        # Python 2.7 requires an additional parsing to un-escape characters
+        body = parser.unescape(body)
+
    parser.feed(body)
    parser.close()
-    return parser.converted
+    result = parser.converted
+
+    return result


 class HTMLConverter(HTMLParser, object):
    """An HTML to plain text converter tuned for email messages."""

+    # The following tags must start on a new line
+    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+
+    # The following tags ignore any internal text
+    IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
+
+    # Condense Whitespace
+    WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)
+
+    # Sentinel value for block tag boundaries, which may be consolidated into a
+    # single line break.
+    BLOCK_END = {}
+
    def __init__(self, **kwargs):
        super(HTMLConverter, self).__init__(**kwargs)

+        # Should we store the text content or not?
+        self._do_store = True
+
+        # Initialize internal result list
+        self._result = []
+
+        # Initialize public result field (not populated until close() is
+        # called)
        self.converted = ""

    def close(self):
-        # Removes all html before the last "}". Some HTML can return additional
-        # style information with text output.
-        self.converted = str(self.converted).split('}')[-1].strip()
+        string = ''.join(self._finalize(self._result))
+        self.converted = string.strip()

-    def handle_data(self, data):
-        self.converted += data.strip()
+        if six.PY2:
+            # See https://stackoverflow.com/questions/10993612/\
+            #       how-to-remove-xa0-from-string-in-python
+            #
+            # This is required since unescape() replaces &nbsp; with \xa0
+            # when using Python 2.7
+            self.converted = self.converted.replace(u'\xa0', u' ')
+
+    def _finalize(self, result):
+        """
+        Combines and strips consecutive strings, then converts consecutive
+        block ends into singleton newlines.
+
+        [ {be} " Hello " {be} {be} " World!" ] -> "Hello\nWorld!"
+        """
+
+        # None means the last visited item was a block end.
+        accum = None
+
+        for item in result:
+            if item == self.BLOCK_END:
+                # Multiple consecutive block ends; do nothing.
+                if accum is None:
+                    continue
+
+                # First block end; yield the current string, plus a newline.
+                yield accum.strip() + '\n'
+                accum = None
+
+            # Multiple consecutive strings; combine them.
+            elif accum is not None:
+                accum += item
+
+            # First consecutive string; store it.
+            else:
+                accum = item
+
+        # Yield the last string if we have not already done so.
+        if accum is not None:
+            yield accum.strip()
+
+    def handle_data(self, data, *args, **kwargs):
+        """
+        Store our data if it is not on the ignore list
+        """
+
+        # Skip content that follows a start tag on the ignore list
+        if self._do_store:
+
+            # Tidy our whitespace
+            content = self.WS_TRIM.sub(' ', data)
+            self._result.append(content)

    def handle_starttag(self, tag, attrs):
+        """
+        Process our starting HTML Tag
+        """
+        # Toggle initial states
+        self._do_store = tag not in self.IGNORE_TAGS
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
+
        if tag == 'li':
-            self.converted += linesep + '- '
-        elif tag == 'blockquote':
-            self.converted += linesep + linesep + '\t'
-        elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'):
-            self.converted += linesep + '\n'
+            self._result.append('- ')
+
        elif tag == 'br':
-            self.converted += linesep
+            self._result.append('\n')
+
+        elif tag == 'hr':
+            if self._result:
+                self._result[-1] = self._result[-1].rstrip(' ')
+
+            self._result.append('\n---\n')
+
+        elif tag == 'blockquote':
+            self._result.append(' >')

    def handle_endtag(self, tag):
-        if tag == 'blockquote':
-            self.converted += linesep + linesep
+        """
+        Edge case handling of open/close tags
+        """
+        self._do_store = True
+
+        if tag in self.BLOCK_TAGS:
+            self._result.append(self.BLOCK_END)
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@ -25,6 +25,7 @@

 from apprise import NotifyFormat
 from apprise.conversion import convert_between
+import pytest

 # Disable logging for a cleaner testing output
 import logging
@ -35,24 +36,101 @@ def test_html_to_text():
    """conversion: Test HTML to plain text
    """

-    def convert(body):
+    def to_html(body):
+        """
+        A function to simplify HTML to text conversion tests
+        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body)

-    assert convert("No HTML code here.") == "No HTML code here."
+    assert to_html("No HTML code here.") == "No HTML code here."

-    clist = convert("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
+    clist = to_html("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "Lots and lots" in clist
    assert "of lists." in clist

-    assert "To be or not to be." in convert(
+    assert "To be or not to be." in to_html(
        "<blockquote>To be or not to be.</blockquote>")

-    cspace = convert(
+    cspace = to_html(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    assert "Fancy heading" in cspace
    assert "And a paragraph too.\nPlus line break." in cspace

-    assert convert(
+    assert to_html(
        "<style>body { font: 200%; }</style>"
        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."
+
+    assert to_html(
+        "<p>line 1</p>"
+        "<p>line 2</p>"
+        "<p>line 3</p>") == "line 1\nline 2\nline 3"
+
+    # Case sensitivity
+    assert to_html(
+        "<p>line 1</P>"
+        "<P>line 2</P>"
+        "<P>line 3</P>") == "line 1\nline 2\nline 3"
+
+    # double new lines (testing <br/> and <br>)
+    assert to_html(
+        "some information<br/><br>and more information") == \
+        "some information\n\nand more information"
+
+    #
+    # Test bad tags
+    #
+
+    # first 2 entries are okay, but last will do as best as it can
+    assert to_html(
+        "<p>line 1</>"
+        "<p>line 2</gar>"
+        "<p>line 3>") == "line 1\nline 2\nline 3>"
+
+    # Make sure we ignore fields that aren't important to us
+    assert to_html(
+        "<script>ignore this</script>"
+        "<p>line 1</p>"
+        "Another line without being enclosed") == \
+        "line 1\nAnother line without being enclosed"
+
+    # Test cases when there are no new lines (we're dealing with just inline
+    # entries); an empty entry as well
+    assert to_html("<span></span<<span>test</span> "
+                   "<a href='#'>my link</a>") == \
+        "test my link"
+
+    # </p> missing
+    assert to_html("<body><div>line 1 <b>bold</b></div>  "
+                   " <a href='#'>my link</a>"
+                   "<p>3rd line</body>") == \
+        "line 1 bold\nmy link\n3rd line"
+
+    # <hr/> on its own
+    assert to_html("<hr/>") == "---"
+    assert to_html("<hr>") == "---"
+
+    # We need to handle HTML Encodings
+    assert to_html("""
+        <html>
+            <title>ignore this entry</title>
+        <body>
+          Let&apos;s handle&nbsp;special html encoding
+          <hr/>
+        </body>
+        """) == "Let's handle special html encoding\n---"
+
+    # If you give nothing, you get nothing in return
+    assert to_html("") == ""
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_html(None)
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_html(42)
+
+    with pytest.raises(TypeError):
+        # Invalid input
+        assert to_html(object)