HTML to TEXT/MARKDOWN cleanup and refactoring (#530)

pull/539/head
Chris Caron 2022-02-01 22:23:42 -05:00 committed by GitHub
parent 8fa146685f
commit 405e26e22d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 231 additions and 52 deletions

View File

@ -514,8 +514,8 @@ class Apprise(object):
# was set to None), or we did define a tag and the logic above # was set to None), or we did define a tag and the logic above
# determined we need to notify the service it's associated with # determined we need to notify the service it's associated with
if server.notify_format not in conversion_map: if server.notify_format not in conversion_map:
conversion_map[server.notify_format] = \ conversion_map[server.notify_format] = convert_between(
convert_between(body_format, server.notify_format, body) body_format, server.notify_format, body)
if interpret_escapes: if interpret_escapes:
# #
@ -531,8 +531,9 @@ class Apprise(object):
.decode('unicode-escape') .decode('unicode-escape')
except UnicodeDecodeError: # pragma: no cover except UnicodeDecodeError: # pragma: no cover
# This occurs using a very old version of Python 2.7 such # This occurs using a very old version of Python 2.7
# as the one that ships with CentOS/RedHat 7.x (v2.7.5). # such as the one that ships with CentOS/RedHat 7.x
# (v2.7.5).
conversion_map[server.notify_format] = \ conversion_map[server.notify_format] = \
conversion_map[server.notify_format] \ conversion_map[server.notify_format] \
.decode('string_escape') .decode('string_escape')
@ -544,8 +545,9 @@ class Apprise(object):
if title: if title:
try: try:
# Added overhead required due to Python 3 Encoding Bug # Added overhead required due to Python 3 Encoding
# identified here: https://bugs.python.org/issue21331 # Bug identified here:
# https://bugs.python.org/issue21331
title = title\ title = title\
.encode('ascii', 'backslashreplace')\ .encode('ascii', 'backslashreplace')\
.decode('unicode-escape') .decode('unicode-escape')

View File

@ -27,11 +27,11 @@
import re import re
import six import six
from markdown import markdown from markdown import markdown
from os import linesep
from .common import NotifyFormat from .common import NotifyFormat
if six.PY2: if six.PY2:
from HTMLParser import HTMLParser from HTMLParser import HTMLParser
else: else:
from html.parser import HTMLParser from html.parser import HTMLParser
@ -46,6 +46,8 @@ def convert_between(from_format, to_format, body):
(NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown, (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown,
(NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html, (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
(NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text, (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
# For now; use same converter for Markdown support
(NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
} }
convert = converters.get((from_format, to_format)) convert = converters.get((from_format, to_format))
@ -85,7 +87,7 @@ def text_to_html(body):
# Execute our map against our body in addition to # Execute our map against our body in addition to
# swapping out new lines and replacing them with <br/> # swapping out new lines and replacing them with <br/>
return re.sub( return re.sub(
r'\r*\n', '<br/>\r\n', re_table.sub(lambda x: re_map[x.group()], body)) r'\r*\n', '<br/>\n', re_table.sub(lambda x: re_map[x.group()], body))
def html_to_text(body): def html_to_text(body):
@ -94,37 +96,134 @@ def html_to_text(body):
""" """
parser = HTMLConverter() parser = HTMLConverter()
if six.PY2:
# Python 2.7 requires an additional parsing to un-escape characters
body = parser.unescape(body)
parser.feed(body) parser.feed(body)
parser.close() parser.close()
return parser.converted result = parser.converted
return result
class HTMLConverter(HTMLParser, object): class HTMLConverter(HTMLParser, object):
"""An HTML to plain text converter tuned for email messages.""" """An HTML to plain text converter tuned for email messages."""
# The following tags must start on a new line
BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'div', 'td', 'th', 'code', 'pre', 'label', 'li',)
# The following tags ignore any internal text
IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script')
# Condense Whitespace
WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)
# Sentinel value for block tag boundaries, which may be consolidated into a
# single line break.
BLOCK_END = {}
def __init__(self, **kwargs): def __init__(self, **kwargs):
super(HTMLConverter, self).__init__(**kwargs) super(HTMLConverter, self).__init__(**kwargs)
# Should we store the text content or not?
self._do_store = True
# Initialize internal result list
self._result = []
# Initialize public result field (not populated until close() is
# called)
self.converted = "" self.converted = ""
def close(self): def close(self):
# Removes all html before the last "}". Some HTML can return additional string = ''.join(self._finalize(self._result))
# style information with text output. self.converted = string.strip()
self.converted = str(self.converted).split('}')[-1].strip()
def handle_data(self, data): if six.PY2:
self.converted += data.strip() # See https://stackoverflow.com/questions/10993612/\
# how-to-remove-xa0-from-string-in-python
#
# This is required since unescape() replaces &nbsp; with \xa0 when
# using Python 2.7
self.converted = self.converted.replace(u'\xa0', u' ')
def _finalize(self, result):
"""
Combines and strips consecutive strings, then converts consecutive
block ends into singleton newlines.
[ {be} " Hello " {be} {be} " World!" ] -> "Hello\nWorld!"
"""
# None means the last visited item was a block end.
accum = None
for item in result:
if item == self.BLOCK_END:
# Multiple consecutive block ends; do nothing.
if accum is None:
continue
# First block end; yield the current string, plus a newline.
yield accum.strip() + '\n'
accum = None
# Multiple consecutive strings; combine them.
elif accum is not None:
accum += item
# First consecutive string; store it.
else:
accum = item
# Yield the last string if we have not already done so.
if accum is not None:
yield accum.strip()
def handle_data(self, data, *args, **kwargs):
"""
Store our data if it is not on the ignore list
"""
# initialize our previous flag
if self._do_store:
# Tidy our whitespace
content = self.WS_TRIM.sub(' ', data)
self._result.append(content)
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
"""
Process our starting HTML Tag
"""
# Toggle initial states
self._do_store = tag not in self.IGNORE_TAGS
if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)
if tag == 'li': if tag == 'li':
self.converted += linesep + '- ' self._result.append('- ')
elif tag == 'blockquote':
self.converted += linesep + linesep + '\t'
elif tag in ('p', 'h1', 'h2', 'h3', 'h4', 'tr', 'th'):
self.converted += linesep + '\n'
elif tag == 'br': elif tag == 'br':
self.converted += linesep self._result.append('\n')
elif tag == 'hr':
if self._result:
self._result[-1] = self._result[-1].rstrip(' ')
self._result.append('\n---\n')
elif tag == 'blockquote':
self._result.append(' >')
def handle_endtag(self, tag): def handle_endtag(self, tag):
if tag == 'blockquote': """
self.converted += linesep + linesep Edge case handling of open/close tags
"""
self._do_store = True
if tag in self.BLOCK_TAGS:
self._result.append(self.BLOCK_END)

View File

@ -25,6 +25,7 @@
from apprise import NotifyFormat from apprise import NotifyFormat
from apprise.conversion import convert_between from apprise.conversion import convert_between
import pytest
# Disable logging for a cleaner testing output # Disable logging for a cleaner testing output
import logging import logging
@ -35,24 +36,101 @@ def test_html_to_text():
"""conversion: Test HTML to plain text """conversion: Test HTML to plain text
""" """
def convert(body): def to_html(body):
"""
A function to simplify HTML to text conversion tests
"""
return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body) return convert_between(NotifyFormat.HTML, NotifyFormat.TEXT, body)
assert convert("No HTML code here.") == "No HTML code here." assert to_html("No HTML code here.") == "No HTML code here."
clist = convert("<ul><li>Lots and lots</li><li>of lists.</li></ul>") clist = to_html("<ul><li>Lots and lots</li><li>of lists.</li></ul>")
assert "Lots and lots" in clist assert "Lots and lots" in clist
assert "of lists." in clist assert "of lists." in clist
assert "To be or not to be." in convert( assert "To be or not to be." in to_html(
"<blockquote>To be or not to be.</blockquote>") "<blockquote>To be or not to be.</blockquote>")
cspace = convert( cspace = to_html(
"<h2>Fancy heading</h2>" "<h2>Fancy heading</h2>"
"<p>And a paragraph too.<br>Plus line break.</p>") "<p>And a paragraph too.<br>Plus line break.</p>")
assert "Fancy heading" in cspace assert "Fancy heading" in cspace
assert "And a paragraph too.\nPlus line break." in cspace assert "And a paragraph too.\nPlus line break." in cspace
assert convert( assert to_html(
"<style>body { font: 200%; }</style>" "<style>body { font: 200%; }</style>"
"<p>Some obnoxious text here.</p>") == "Some obnoxious text here." "<p>Some obnoxious text here.</p>") == "Some obnoxious text here."
assert to_html(
"<p>line 1</p>"
"<p>line 2</p>"
"<p>line 3</p>") == "line 1\nline 2\nline 3"
# Case sensitivity
assert to_html(
"<p>line 1</P>"
"<P>line 2</P>"
"<P>line 3</P>") == "line 1\nline 2\nline 3"
# double new lines (testing <br/> and <br>)
assert to_html(
"some information<br/><br>and more information") == \
"some information\n\nand more information"
#
# Test bad tags
#
# first 2 entries are okay, but last will do as best as it can
assert to_html(
"<p>line 1</>"
"<p>line 2</gar>"
"<p>line 3>") == "line 1\nline 2\nline 3>"
# Make sure we ignore fields that aren't important to us
assert to_html(
"<script>ignore this</script>"
"<p>line 1</p>"
"Another line without being enclosed") == \
"line 1\nAnother line without being enclosed"
# Test cases when there are no new lines (we're dealing with just inline
# entries); an empty entry as well
assert to_html("<span></span<<span>test</span> "
"<a href='#'>my link</a>") == \
"test my link"
# </p> missing
assert to_html("<body><div>line 1 <b>bold</b></div> "
" <a href='#'>my link</a>"
"<p>3rd line</body>") == \
"line 1 bold\nmy link\n3rd line"
# <hr/> on its own
assert to_html("<hr/>") == "---"
assert to_html("<hr>") == "---"
# We need to handle HTML Encodings
assert to_html("""
<html>
<title>ignore this entry</title>
<body>
Let&apos;s handle&nbsp;special html encoding
<hr/>
</body>
""") == "Let's handle special html encoding\n---"
# If you give nothing, you get nothing in return
assert to_html("") == ""
with pytest.raises(TypeError):
# Invalid input
assert to_html(None)
with pytest.raises(TypeError):
# Invalid input
assert to_html(42)
with pytest.raises(TypeError):
# Invalid input
assert to_html(object)