mirror of https://github.com/caronc/apprise
Add conversion for HTML to markdown
parent
31caff1ac9
commit
8d543a5eb3
|
@ -50,8 +50,7 @@ def convert_between(from_format, to_format, content):
|
||||||
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
|
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
|
||||||
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
|
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
|
||||||
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
|
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
|
||||||
# For now; use same converter for Markdown support
|
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_markdown,
|
||||||
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
convert = converters.get((from_format, to_format))
|
convert = converters.get((from_format, to_format))
|
||||||
|
@ -86,6 +85,17 @@ def html_to_text(content):
|
||||||
return parser.converted
|
return parser.converted
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_markdown(content):
    """
    Render the supplied HTML content as markdown.

    The content is fed through an HTMLMarkDownConverter; once the converter
    has been closed, its `converted` value is handed back to the caller.
    """
    converter = HTMLMarkDownConverter()

    # Parse everything we were given, then flush the parser
    converter.feed(content)
    converter.close()

    return converter.converted
|
||||||
|
|
||||||
|
|
||||||
class HTMLConverter(HTMLParser, object):
|
class HTMLConverter(HTMLParser, object):
|
||||||
"""An HTML to plain text converter tuned for email messages."""
|
"""An HTML to plain text converter tuned for email messages."""
|
||||||
|
|
||||||
|
@ -200,3 +210,112 @@ class HTMLConverter(HTMLParser, object):
|
||||||
|
|
||||||
if tag in self.BLOCK_TAGS:
|
if tag in self.BLOCK_TAGS:
|
||||||
self._result.append(self.BLOCK_END)
|
self._result.append(self.BLOCK_END)
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLMarkDownConverter(HTMLConverter):
    """An HTML to markdown converter tuned for email messages.

    Builds on HTMLConverter by emitting markdown syntax for list items,
    line breaks, horizontal rules, block quotes, headings (h1-h6),
    bold/italic text, inline code and hyperlinks, and by backslash-escaping
    markdown control characters found in the text content.
    """

    # Escape markdown characters (a plain character class; no regex flags
    # are needed since the pattern uses neither '.', '^' nor '$')
    MARKDOWN_ESCAPE = re.compile(r'([\\`*_{}[\]<>()#+\-.!|])')

    # Markdown written to the result when one of these tags is opened
    _START_MARKUP = {
        'li': '- ',
        'br': '\n',
        'blockquote': '> ',
        'h1': '# ',
        'h2': '## ',
        'h3': '### ',
        'h4': '#### ',
        'h5': '##### ',
        'h6': '###### ',
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'code': '`',
    }

    # Markdown written to the result when one of these tags is closed
    _END_MARKUP = {
        'strong': '**',
        'b': '**',
        'em': '*',
        'i': '*',
        'code': '`',
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Store href value; holds '(<href>)' while inside an anchor that
        # carries an href, and an empty string otherwise
        self._link = ""

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
        """

        # Only keep text that is not inside an ignored tag
        if self._do_store:

            # Tidy our whitespace
            content = self.WS_TRIM.sub(' ', data)

            # Backslash-escape every markdown control character
            content = self.MARKDOWN_ESCAPE.sub(r'\\\1', content)

            # Add hyperlink
            if self._link == "":
                self._result.append(content)
            else:
                self._result.append("[" + content + "]" + self._link)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag
        """
        # Toggle initial states
        self._do_store = tag not in self.IGNORE_TAGS

        # NOTE: the stored link is cleared on every opening tag, so text
        # wrapped in a further tag inside an anchor is not emitted as a link
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag == 'hr':
            # Trim trailing spaces from the previous entry; it is only
            # touched when it is text, as the last entry may be a block
            # end marker rather than a string
            if self._result and isinstance(self._result[-1], str):
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'a':
            for name, link in attrs:
                # A valueless attribute (<a href>) is reported with a value
                # of None; there is nothing to link to in that case
                if name == 'href' and link is not None:
                    self._link = '(' + link + ')'

        elif tag in self._START_MARKUP:
            self._result.append(self._START_MARKUP[tag])

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        # Any closing tag re-enables storing and ends the current link
        self._do_store = True
        self._link = ""

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        # Close bold, italic and inline code markup
        if tag in self._END_MARKUP:
            self._result.append(self._END_MARKUP[tag])
|
||||||
|
|
|
@ -143,6 +143,110 @@ def test_conversion_html_to_text():
|
||||||
assert to_html(object)
|
assert to_html(object)
|
||||||
|
|
||||||
|
|
||||||
|
def test_conversion_html_to_markdown():
    """conversion: Test HTML to markdown
    """

    def to_markdown(body):
        """
        A function to simplify html conversion tests
        """
        return convert_between(NotifyFormat.HTML, NotifyFormat.MARKDOWN, body)

    # Markdown control characters in the text (such as '.') come back
    # backslash-escaped; the expected values spell the backslash as '\\'
    # so that no invalid string escape sequences are used
    assert to_markdown("No HTML code here.") == "No HTML code here\\."

    clist = to_markdown("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
    assert "- Lots and lots" in clist
    assert "- of lists\\." in clist

    assert "> To be or not to be\\." == to_markdown(
        "<blockquote>To be or not to be.</blockquote>")

    cspace = to_markdown(
        "<h2>Fancy heading</h2>"
        "<p>And a paragraph too.<br>Plus line break.</p>")
    # An <h2> must be rendered as a second level heading
    assert "## Fancy heading" in cspace
    assert "And a paragraph too\\.\nPlus line break\\." in cspace

    assert to_markdown(
        "<style>body { font: 200%; }</style>"
        "<p>Some obnoxious text here.</p>") == "Some obnoxious text here\\."

    assert to_markdown(
        "<p>line 1</p>"
        "<p>line 2</p>"
        "<p>line 3</p>") == "line 1\nline 2\nline 3"

    # Case sensitivity
    assert to_markdown(
        "<p>line 1</P>"
        "<P>line 2</P>"
        "<P>line 3</P>") == "line 1\nline 2\nline 3"

    # double new lines (testing <br/> and <br>)
    assert to_markdown(
        "some information<br/><br>and more information") == \
        "some information\n\nand more information"

    #
    # Test bad tags
    #

    # first 2 entries are okay, but last will do as best as it can
    assert to_markdown(
        "<p>line 1</>"
        "<p>line 2</gar>"
        "<p>line 3>") == "line 1\nline 2\nline 3\\>"

    # Make sure we ignore fields that aren't important to us
    assert to_markdown(
        "<script>ignore this</script>"
        "<p>line 1</p>"
        "Another line without being enclosed") == \
        "line 1\nAnother line without being enclosed"

    # Test cases when there are no new lines (we're dealing with just inline
    # entries); an empty entry as well
    assert to_markdown("<span></span<<span>test</span> "
                       "<a href='#'>my link</a>") == \
        "test [my link](#)"

    # </p> missing
    assert to_markdown("<body><div>line 1 <b>bold</b></div>  "
                       " <a href='/link'>my link</a>"
                       "<p>3rd line</body>") == \
        "line 1 **bold**\n[my link](/link)\n3rd line"

    # <hr/> on its own
    assert to_markdown("<hr/>") == "---"
    assert to_markdown("<hr>") == "---"

    # We need to handle HTML Encodings
    assert to_markdown("""
        <html>
            <title>ignore this entry</title>
        <body>
          Let's handle special html encoding
          <hr/>
        </body>
        """) == "Let's handle special html encoding\n---"

    # If you give nothing, you get nothing in return
    assert to_markdown("") == ""

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(None)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(42)

    with pytest.raises(TypeError):
        # Invalid input
        assert to_markdown(object)
|
||||||
|
|
||||||
|
|
||||||
def test_conversion_text_to():
|
def test_conversion_text_to():
|
||||||
"""conversion: Test Text to all types
|
"""conversion: Test Text to all types
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue