Telegram escaping completely refactored (#386)

2021-05-15 16:08:53 -04:00 · 2021-05-15 16:08:53 -04:00 · 59aa5f5d10
parent 7f7ee043d9
commit 59aa5f5d10
2 changed files with 75 additions and 25 deletions
--- a/apprise/plugins/NotifyTelegram.py
+++ b/apprise/plugins/NotifyTelegram.py
@ -524,39 +524,73 @@ class NotifyTelegram(NotifyBase):
                body,
            )
-        elif self.notify_format == NotifyFormat.HTML:
+        else:  # HTML or TEXT
            # Use Telegram's HTML mode
            payload['parse_mode'] = 'HTML'
-            # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
+            # Telegram's HTML support doesn't like having HTML escaped
-            # See https://core.telegram.org/bots/api#html-style
+            # characters passed into it.  To handle this situation, we need to
-            body = re.sub('&nbsp;?', ' ', body, re.I)
+            # search the body for these sequences and convert them to the
-
+            # output the user expected
-            # Tabs become 3 spaces
+            telegram_escape_html_dict = {
            body = re.sub('&emsp;?', '   ', body, re.I)
            if title:
                # HTML Spaces (&nbsp;) and tabs (&emsp;) aren't supported
                # See https://core.telegram.org/bots/api#html-style
-                title = re.sub('&nbsp;?', ' ', title, re.I)
+                r'nbsp': ' ',
                # Tabs become 3 spaces
-                title = re.sub('&emsp;?', '   ', title, re.I)
+                r'emsp': '   ',
-            payload['text'] = '{}{}'.format(
+                # Some characters get re-escaped by the Telegram upstream
-                '<b>{}</b>\r\n'.format(title) if title else '',
+                # service so we need to convert these back,
-                body,
+                r'apos': '\'',
-            )
+                r'quot': '"',
            }
-        else:  # pass directly as is...
+            # Create a regular expression from the dictionary keys
-            payload['parse_mode'] = 'HTML'
+            html_regex = re.compile("&(%s);?" % "|".join(
                map(re.escape, telegram_escape_html_dict.keys())).lower(),
                re.I)
-            # Telegram strangely escapes all HTML characters for us already
+            # For each match, look-up corresponding value in dictionary
-            # but to avoid causing issues with HTML, we escape the < and >
+            # we look +1 to ignore the & that does not appear in the index
-            # characters
+            # we only look at the first 4 characters because we don't want to
-            title = re.sub('>', '&gt;', title, re.I)
+            # fail on &apos; as it's accepted (along with &apos - no
-            title = re.sub('<', '&lt;', title, re.I)
+            # semi-colon)
-            body = re.sub('>', '&gt;', body, re.I)
+            body = html_regex.sub(  # pragma: no branch
-            body = re.sub('<', '&lt;', body, re.I)
+                lambda mo: telegram_escape_html_dict[
                    mo.string[mo.start():mo.end()][1:5]], body)
            if title:
                # For each match, look-up corresponding value in dictionary
                # Indexing is explained above (for how the body is parsed)
                title = html_regex.sub(  # pragma: no branch
                    lambda mo: telegram_escape_html_dict[
                        mo.string[mo.start():mo.end()][1:5]], title)
            if self.notify_format == NotifyFormat.TEXT:
                telegram_escape_text_dict = {
                    # We need to escape characters that conflict with html
                    # entity blocks (< and >) when displaying text
                    r'>': '&gt;',
                    r'<': '&lt;',
                }
                # Create a regular expression from the dictionary keys
                text_regex = re.compile("(%s)" % "|".join(
                    map(re.escape, telegram_escape_text_dict.keys())).lower(),
                    re.I)
                # For each match, look-up corresponding value in dictionary
                body = text_regex.sub(  # pragma: no branch
                    lambda mo: telegram_escape_text_dict[
                        mo.string[mo.start():mo.end()]], body)
                if title:
                    # For each match, look-up corresponding value in dictionary
                    title = text_regex.sub(  # pragma: no branch
                        lambda mo: telegram_escape_text_dict[
                            mo.string[mo.start():mo.end()]], title)
            payload['text'] = '{}{}'.format(
                '<b>{}</b>\r\n'.format(title) if title else '',
--- a/test/test_telegram.py
+++ b/test/test_telegram.py
@ -29,6 +29,7 @@ import pytest
 import mock
 import requests
 from json import dumps
 from json import loads
 from apprise import Apprise
 from apprise import AppriseAttachment
 from apprise import AppriseAsset
@ -202,11 +203,26 @@ def test_notify_telegram_plugin(mock_post, mock_get):
    })
    mock_post.return_value.status_code = requests.codes.ok
    # Instantiate our object with a single target
    obj = plugins.NotifyTelegram(bot_token=bot_token, targets='12345')
    assert len(obj.targets) == 1
    assert obj.targets[0] == '12345'
    # Test the escaping of special characters; Telegram escapes some of them
    # for us, which we need to take into account
    mock_post.reset_mock()
    body = "<p>\'\"This can't\t\r\nfail&nbsp;us\"\'</p>"
    assert obj.notify(
        body=body, title='special characters',
        notify_type=NotifyType.INFO) is True
    assert mock_post.call_count == 1
    payload = loads(mock_post.call_args_list[0][1]['data'])
    # Our special characters are escaped properly
    assert payload['text'] == \
        '<b>special characters</b>\r\n&lt;p&gt;'\
        '\'"This can\'t\t\r\nfail us"\'&lt;/p&gt;'
    # Test sending attachments
    attach = AppriseAttachment(os.path.join(TEST_VAR_DIR, 'apprise-test.gif'))
    assert obj.notify(
        body='body', title='title', notify_type=NotifyType.INFO,