diff --git a/apprise/conversion.py b/apprise/conversion.py index 689697f0..89171433 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -101,7 +101,7 @@ class HTMLConverter(HTMLParser, object): # The following tags must start on a new line BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', - 'div', 'td', 'th', 'code', 'pre', 'label', 'li',) + 'div', 'td', 'th', 'pre', 'samp', 'label', 'li',) # the folowing tags ignore any internal text IGNORE_TAGS = ( @@ -216,8 +216,10 @@ class HTMLMarkDownConverter(HTMLConverter): """An HTML to markdown converter tuned for email messages.""" # Escape markdown characters - MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])', - re.DOTALL | re.MULTILINE) + MARKDOWN_ESCAPE = re.compile(r'([`*#])', re.DOTALL | re.MULTILINE) + + # Detect Carriage Return + HAS_CR = re.compile(r'[\r*\n]+', re.DOTALL | re.MULTILINE) def __init__(self, **kwargs): super().__init__(**kwargs) @@ -225,6 +227,8 @@ class HTMLMarkDownConverter(HTMLConverter): # Store href value self._link = "" + self._preserver_cr = False + def handle_data(self, data, *args, **kwargs): """ Store our data if it is not on the ignore list @@ -234,7 +238,8 @@ class HTMLMarkDownConverter(HTMLConverter): if self._do_store: # Tidy our whitespace - content = self.WS_TRIM.sub(' ', data) + content = self.WS_TRIM.sub(' ', data) \ + if not self._preserver_cr else data content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content) # Add hyperlink @@ -287,19 +292,28 @@ class HTMLMarkDownConverter(HTMLConverter): elif tag == 'h6': self._result.append('###### ') - elif tag in ['strong', 'b']: + elif tag in ('strong', 'b'): self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = True + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = True elif tag == 'a': for name, link in attrs: if name == 'href': 
self._link = '(' + link + ')' + # Take an early exit for speed (in case there are more + # parameters - no need to waste time looking at them) + break def handle_endtag(self, tag): """ @@ -311,11 +325,17 @@ class HTMLMarkDownConverter(HTMLConverter): if tag in self.BLOCK_TAGS: self._result.append(self.BLOCK_END) - if tag in ['strong', 'b']: + if tag in ('strong', 'b'): self._result.append('**') - elif tag in ['em', 'i']: + elif tag in ('em', 'i'): self._result.append('*') elif tag == 'code': self._result.append('`') + self._preserver_cr = False + + elif tag in ('pre', 'samp'): + self._result.append('```') + self._result.append(self.BLOCK_END) + self._preserver_cr = False diff --git a/test/test_conversion.py b/test/test_conversion.py index 0a8230d5..c09eb86e 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -153,24 +153,24 @@ def test_conversion_html_to_markdown(): """ return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body) - assert to_markdown("No HTML code here.") == "No HTML code here\." + assert to_markdown("No HTML code here.") == "No HTML code here." clist = to_markdown("
To be or not to be.") cspace = to_markdown( "
And a paragraph too.
Plus line break.
Some obnoxious text here.
") == "Some obnoxious text here\." + "Some obnoxious text here.
") == "Some obnoxious text here." assert to_markdown( "line 1
" @@ -194,9 +194,18 @@ def test_conversion_html_to_markdown(): # first 2 entries are okay, but last will do as best as it can assert to_markdown( + "line 1>" - "
line 2" - "
line 3>") == "line 1\nline 2\nline 3\>" + "
line 2" + "
line 3>") == \
+ "# Heading 1\n## Heading 2\n### Heading 3\n" \
+ "#### Heading 4\n##### Heading 5\n###### Heading 6\n" \
+ "line 1\n*line 2*\nline 3>"
# Make sure we ignore fields that aren't important to us
assert to_markdown(
@@ -205,6 +214,13 @@ def test_conversion_html_to_markdown():
"Another line without being enclosed") == \
"line 1\nAnother line without being enclosed"
+ # Test <code> and <pre>
+ assert to_markdown(
+ "
multi-line 1\nmulti-line 2
more content"
+ "multi-line 1\nmulti-line 2
more content") == \
+ '`multi-line 1\nmulti-line 2`more content' \
+ '\n```\nmulti-line 1\nmulti-line 2\n```\nmore content'
+
# Test cases when there are no new lines (we're dealing with just inline
# entries); an empty entry as well
assert to_markdown("test "