code & test improvements, added more coverage

2023-10-06 18:08:10 -04:00 · 2023-10-06 18:08:10 -04:00 · f725b3ac75
parent 8d543a5eb3
commit f725b3ac75
2 changed files with 51 additions and 15 deletions
--- a/apprise/conversion.py
+++ b/apprise/conversion.py
@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object):

    # The following tags must start on a new line
    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
-                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
+                  'div', 'td', 'th', 'pre', 'samp', 'label', 'li',)

    # the following tags ignore any internal text
    IGNORE_TAGS = (
@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter):
    """An HTML to markdown converter tuned for email messages."""

    # Escape markdown characters
-    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])',
-                                 re.DOTALL | re.MULTILINE)
+    MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE)
+
+    # Detect runs of CR/LF (NOTE: '*' inside [] matches a literal asterisk)
+    HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
@ -225,6 +227,8 @@ class HTMLMarkDownConverter(HTMLConverter):
        # Store href value
        self._link = ""

+        self._preserver_cr = False
+
    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
@ -234,7 +238,8 @@ class HTMLMarkDownConverter(HTMLConverter):
        if self._do_store:

            # Tidy our whitespace
-            content = self.WS_TRIM.sub(' ', data)
+            content = self.WS_TRIM.sub(' ', data) \
+                if not self._preserver_cr else data
            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

            # Add hyperlink
@ -287,19 +292,28 @@ class HTMLMarkDownConverter(HTMLConverter):
        elif tag == 'h6':
            self._result.append('###### ')

-        elif tag in ['strong', 'b']:
+        elif tag in ('strong', 'b'):
            self._result.append('**')

-        elif tag in ['em', 'i']:
+        elif tag in ('em', 'i'):
            self._result.append('*')

        elif tag == 'code':
            self._result.append('`')
+            self._preserver_cr = True
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = True

        elif tag == 'a':
            for name, link in attrs:
                if name == 'href':
                    self._link = '(' + link + ')'
+                    # Take an early exit for speed (in case there are more
+                    # parameters - no need to waste time looking at them)
+                    break

    def handle_endtag(self, tag):
        """
@ -311,11 +325,17 @@ class HTMLMarkDownConverter(HTMLConverter):
        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

-        if tag in ['strong', 'b']:
+        if tag in ('strong', 'b'):
            self._result.append('**')

-        elif tag in ['em', 'i']:
+        elif tag in ('em', 'i'):
            self._result.append('*')

        elif tag == 'code':
            self._result.append('`')
+            self._preserver_cr = False
+
+        elif tag in ('pre', 'samp'):
+            self._result.append('```')
+            self._result.append(self.BLOCK_END)
+            self._preserver_cr = False
--- a/test/test_conversion.py
+++ b/test/test_conversion.py
@ -153,24 +153,24 @@ def test_conversion_html_to_markdown():
        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

-    assert to_markdown("No HTML code here.") == "No HTML code here\."
+    assert to_markdown("No HTML code here.") == "No HTML code here."

    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "- Lots and lots" in clist
-    assert "- of lists\." in clist
+    assert "- of lists." in clist

-    assert "> To be or not to be\." == to_markdown(
+    assert "> To be or not to be." == to_markdown(
        "<blockquote>To be or not to be.</blockquote>")

    cspace = to_markdown(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    assert "# Fancy heading" in cspace
-    assert "And a paragraph too\.\nPlus line break\." in cspace
+    assert "And a paragraph too.\nPlus line break." in cspace

    assert to_markdown(
        "<style>body { font: 200%; }</style>"
-        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here\."
+        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."

    assert to_markdown(
        "<p>line 1</p>"
@ -194,9 +194,18 @@ def test_conversion_html_to_markdown():

    # first 2 entries are okay, but last will do as best as it can
    assert to_markdown(
+        "<h1>Heading 1</h1>"
+        "<h2>Heading 2</h2>"
+        "<h3>Heading 3</h3>"
+        "<h4>Heading 4</h4>"
+        "<h5>Heading 5</h5>"
+        "<h6>Heading 6</h6>"
        "<p>line 1</>"
-        "<p>line 2</gar>"
-        "<p>line 3>") == "line 1\nline 2\nline 3\>"
+        "<p><em>line 2</em></gar>"
+        "<p>line 3>") == \
+        "# Heading 1\n## Heading 2\n### Heading 3\n" \
+        "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
+        "line 1\n*line 2*\nline 3>"

    # Make sure we ignore fields that aren't important to us
    assert to_markdown(
@ -205,6 +214,13 @@ def test_conversion_html_to_markdown():
        "Another line without being enclosed") == \
        "line 1\nAnother line without being enclosed"

+    # Test <code> and <pre>
+    assert to_markdown(
+        "<code>multi-line 1\nmulti-line 2</code>more content"
+        "<pre>multi-line 1\nmulti-line 2</pre>more content") == \
+        '`multi-line 1\nmulti-line 2`more content' \
+        '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'
+
    # Test cases when there are no new lines (we're dealing with just inline
    # entries); an empty entry as well
    assert to_markdown("<span></span<<span>test</span> "