From fd0cb3ffcc192bc132ac341cc8c555fd571b5345 Mon Sep 17 00:00:00 2001 From: Chris Caron Date: Sun, 1 May 2022 14:43:55 -0400 Subject: [PATCH] Re-worked Telegram HTML/Markdown -> HTML Conversion (#579) --- apprise/conversion.py | 4 +- apprise/plugins/NotifyTelegram.py | 128 +++++++++++++++++------------- test/test_conversion.py | 1 - test/test_plugin_telegram.py | 63 ++++++++++----- test/var/01_test_example.html | 66 +++++++++++++++ 5 files changed, 186 insertions(+), 76 deletions(-) create mode 100644 test/var/01_test_example.html diff --git a/apprise/conversion.py b/apprise/conversion.py index bfd9a644..bb192e7e 100644 --- a/apprise/conversion.py +++ b/apprise/conversion.py @@ -96,7 +96,9 @@ class HTMLConverter(HTMLParser, object): 'div', 'td', 'th', 'code', 'pre', 'label', 'li',) # the folowing tags ignore any internal text - IGNORE_TAGS = ('style', 'link', 'meta', 'title', 'html', 'head', 'script') + IGNORE_TAGS = ( + 'form', 'input', 'textarea', 'select', 'ul', 'ol', 'style', 'link', + 'meta', 'title', 'html', 'head', 'script') # Condense Whitespace WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE) diff --git a/apprise/plugins/NotifyTelegram.py b/apprise/plugins/NotifyTelegram.py index 23552eb6..1317d7ca 100644 --- a/apprise/plugins/NotifyTelegram.py +++ b/apprise/plugins/NotifyTelegram.py @@ -177,44 +177,85 @@ class NotifyTelegram(NotifyBase): # characters passed into it. to handle this situation, we need to # search the body for these sequences and convert them to the # output the user expected - __telegram_escape_html_dict = { - # New Lines - re.compile(r'<\s*/?br\s*/?>\r*\n?', re.I): '\r\n', - re.compile(r'<\s*/(br|p|div|li)[^>]*>\r*\n?', re.I): '\r\n', - - # The following characters can be altered to become supported - re.compile(r'<\s*pre[^>]*>', re.I): '', - re.compile(r'<\s*/pre[^>]*>', re.I): '', + __telegram_escape_html_entries = ( + # Comments + (re.compile( + r'\s*\s*', + (re.I | re.M | re.S)), '', {}), # the following tags are not supported - re.compile( - r'<\s*(br|p|div|span|body|script|meta|html|font' - r'|label|iframe|li|ol|ul|source|script)[^>]*>', re.I): '', + (re.compile( + r'\s*<\s*(!?DOCTYPE|p|div|span|body|script|link|' + r'meta|html|font|head|label|form|input|textarea|select|iframe|' + r'source|script)([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), '', {}), - re.compile( - r'<\s*/(span|body|script|meta|html|font' - r'|label|iframe|ol|ul|source|script)[^>]*>', re.I): '', - - # Italic - re.compile(r'<\s*(caption|em)[^>]*>', re.I): '', - re.compile(r'<\s*/(caption|em)[^>]*>', re.I): '', + # All closing tags to be removed are put here + (re.compile( + r'\s*<\s*/(span|body|script|meta|html|font|head|' + r'label|form|input|textarea|select|ol|ul|link|' + r'iframe|source|script)([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), '', {}), # Bold - re.compile(r'<\s*(h[1-6]|title|strong)[^>]*>', re.I): '', - re.compile(r'<\s*/(h[1-6]|title|strong)[^>]*>', re.I): '', + (re.compile( + r'<\s*(strong)([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '', {}), + (re.compile( + r'<\s*/\s*(strong)([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '', {}), + (re.compile( + r'\s*<\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), '{}', {'html': '\r\n'}), + (re.compile( + r'\s*<\s*/\s*(h[1-6]|title)([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), + '{}', {'html': '
'}), + + # Italic + (re.compile( + r'<\s*(caption|em)([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '', {}), + (re.compile( + r'<\s*/\s*(caption|em)([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '', {}), + + # Bullet Lists + (re.compile( + r'<\s*li([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), ' -', {}), + + # convert pre tags to code (supported by Telegram) + (re.compile( + r'<\s*pre([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '{}', {'html': '\r\n'}), + (re.compile( + r'<\s*/\s*pre([^a-z0-9>][^>]*)?>', + (re.I | re.M | re.S)), '{}', {'html': '\r\n'}), + + # New Lines + (re.compile( + r'\s*<\s*/?\s*(ol|ul|br|hr)\s*/?>\s*', + (re.I | re.M | re.S)), '\r\n', {}), + (re.compile( + r'\s*<\s*/\s*(br|p|hr|li|div)([^a-z0-9>][^>]*)?>\s*', + (re.I | re.M | re.S)), '\r\n', {}), # HTML Spaces ( ) and tabs ( ) aren't supported # See https://core.telegram.org/bots/api#html-style - re.compile(r'\ ?', re.I): ' ', + (re.compile(r'\ ?', re.I), ' ', {}), # Tabs become 3 spaces - re.compile(r'\ ?', re.I): ' ', + (re.compile(r'\ ?', re.I), ' ', {}), # Some characters get re-escaped by the Telegram upstream # service so we need to convert these back, - re.compile(r'\'?', re.I): '\'', - re.compile(r'\"?', re.I): '"', - } + (re.compile(r'\'?', re.I), '\'', {}), + (re.compile(r'\"?', re.I), '"', {}), + + # New line cleanup + (re.compile(r'\r*\n[\r\n]+', re.I), '\r\n', {}), + ) # Define our template tokens template_tokens = dict(NotifyBase.template_tokens, **{ @@ -597,38 +638,19 @@ class NotifyTelegram(NotifyBase): # Use Telegram's HTML mode payload['parse_mode'] = 'HTML' - for r, v in self.__telegram_escape_html_dict.items(): - body = r.sub(v, body, re.I) + for r, v, m in self.__telegram_escape_html_entries: + + if 'html' in m: + # Handle special cases where we need to alter new lines + # for presentation purposes + v = v.format(m['html'] if body_format in ( + NotifyFormat.HTML, NotifyFormat.MARKDOWN) else '') + + body = r.sub(v, body) # Prepare our payload based on HTML or TEXT payload['text'] = body - # else: # self.notify_format == NotifyFormat.TEXT: - # # Use Telegram's HTML mode - # payload['parse_mode'] = 'HTML' - - # # Further html escaping required... - # telegram_escape_text_dict = { - # # We need to escape characters that conflict with html - # # entity blocks (< and >) when displaying text - # r'>': '>', - # r'<': '<', - # r'\&': '&', - # } - - # # Create a regular expression from the dictionary keys - # text_regex = re.compile("(%s)" % "|".join( - # map(re.escape, telegram_escape_text_dict.keys())).lower(), - # re.I) - - # # For each match, look-up corresponding value in dictionary - # body = text_regex.sub( # pragma: no branch - # lambda mo: telegram_escape_text_dict[ - # mo.string[mo.start():mo.end()]], body) - - # # prepare our payload based on HTML or TEXT - # payload['text'] = body - # Create a copy of the chat_ids list targets = list(self.targets) while len(targets): diff --git a/test/test_conversion.py b/test/test_conversion.py index c6ab6d8a..0908f232 100644 --- a/test/test_conversion.py +++ b/test/test_conversion.py @@ -22,7 +22,6 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. - from apprise import NotifyFormat from apprise.conversion import convert_between import pytest diff --git a/test/test_plugin_telegram.py b/test/test_plugin_telegram.py index a9f191b6..d2fa3c0c 100644 --- a/test/test_plugin_telegram.py +++ b/test/test_plugin_telegram.py @@ -625,11 +625,10 @@ def test_plugin_telegram_formating_py3(mock_post): # Test that everything is escaped properly in a TEXT mode assert payload['text'] == \ - '🚨 Change detected for <i>Apprise ' \ - 'Test Title</i>\r\n<a href=' \ - '"http://localhost"><i>Apprise Body Title<' \ - '/i></a> had <a href="http://' \ - '127.0.0.1">a change</a>' + '🚨 Change detected for <i>Apprise Test Title</i>' \ + '\r\n<a href="http://localhost"><i>' \ + 'Apprise Body Title</i></a> had <' \ + 'a href="http://127.0.0.1">a change</a>' # Reset our values mock_post.reset_mock() @@ -718,8 +717,9 @@ def test_plugin_telegram_formating_py3(mock_post): # Test that everything is escaped properly in a HTML mode assert payload['text'] == \ - '🚨 Another Change detected for Apprise Test Title' \ - '\r\nApprise Body Title' \ + '\r\n🚨 Another Change detected for ' \ + 'Apprise Test Title\r\n\r\n' \ + 'Apprise Body Title' \ ' had a change\r\n' # Now we'll test an edge case where a title was defined, but after @@ -881,11 +881,11 @@ def test_plugin_telegram_formating_py2(mock_post): # Test that everything is escaped properly in a TEXT mode assert payload['text'].encode('utf-8') == \ - '\xf0\x9f\x9a\xa8 Change detected for <i>' \ - 'Apprise Test Title</i>\r\n<a ' \ - 'href="http://localhost"><i>Apprise Body ' \ - 'Title</i></a> had <a href="' \ - 'http://127.0.0.1">a change</a>' + '\xf0\x9f\x9a\xa8 Change detected for <i>' \ + 'Apprise Test Title</i>\r\n<' \ + 'a href="http://localhost"><i>Apprise Body Title' \ + '</i></a> had <a href="http://127.0.0.1"' \ + '>a change</a>' # Reset our values mock_post.reset_mock() @@ -969,9 +969,9 @@ def test_plugin_telegram_formating_py2(mock_post): # Test that everything is escaped properly in a HTML mode assert payload['text'].encode('utf-8') == \ - '\xf0\x9f\x9a\xa8 Change detected for ' \ - 'Apprise Test Title\r\n' \ - 'Apprise Body Title'\ + '\r\n\xf0\x9f\x9a\xa8 Change detected for ' \ + 'Apprise Test Title\r\n\r\n' \ + 'Apprise Body Title' \ ' had a change\r\n' # Reset our values @@ -1163,8 +1163,8 @@ def test_plugin_telegram_html_formatting(mock_post): # Test that everything is escaped properly in a HTML mode assert payload['text'] == \ - '\'information\'\r\n"This is in Italic"' \ - '\r\n Headings are dropped and converted to bold' + '\r\n\'information\'\r\n\r\n"This is in Italic"' \ + '\r\n Headings are dropped and converted to bold\r\n' mock_post.reset_mock() @@ -1177,7 +1177,28 @@ def test_plugin_telegram_html_formatting(mock_post): assert payload['text'] == \ '<title>&apos;information&apos</title>' \ - '\r\n<em>&quot;This is in Italic&quot</em' \ - '><br/><h5>&emsp;&emspHeadings&nbsp;' \ - 'are dropped and&nbspconverted to bold<' \ - '/h5>' + '\r\n<em>&quot;This is in Italic&quot</em><' \ + 'br/><h5>&emsp;&emspHeadings&nbsp;are ' \ + 'dropped and&nbspconverted to bold</h5>' + + # Lest test more complex HTML examples now + mock_post.reset_mock() + + test_file_01 = os.path.join( + TEST_VAR_DIR, '01_test_example.html') + with open(test_file_01) as html_file: + assert aobj.notify( + body=html_file.read(), body_format=NotifyFormat.HTML) + + # owner has already been looked up, so only one call is made + assert mock_post.call_count == 1 + + payload = loads(mock_post.call_args_list[0][1]['data']) + assert payload['text'] == \ + '\r\nBootstrap 101 Template\r\nMy Title\r\n' \ + 'Heading 1\r\n-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\n' \ + '-Bullet 1\r\n-Bullet 2\r\n-Bullet 3\r\nHeading 2\r\n' \ + 'A div entry\r\nA div entry\r\nA pre entry\r\n' \ + 'Heading 3\r\nHeading 4\r\nHeading 5\r\n' \ + 'Heading 6\r\nA set of text\r\n' \ + 'Another line after the set of text\r\nMore text\r\nlabel' diff --git a/test/var/01_test_example.html b/test/var/01_test_example.html new file mode 100644 index 00000000..07891255 --- /dev/null +++ b/test/var/01_test_example.html @@ -0,0 +1,66 @@ + + + + + + + + Bootstrap 101 Template + + + + + + + + + +

My Title

+ + + + + + +

Heading 1

+

+

    +
  • Bullet 1
  • +
  • Bullet 2
  • +
  • Bullet 3
  • +
+ +
    +
  1. Bullet 1
  2. +
  3. Bullet 2
  4. +
  5. Bullet 3
  6. +
+

+ +

Heading 2

+
A div entry
+

+ A div entry +

A pre entry
+

+ +

Heading 3

+

Heading 4

+
Heading 5
+
Heading 6
+ +

+ A set of text
Another line after the set of text +


+ More text +

+
+ + +