Files
apprise/tests/test_utils_format.py

373 lines
13 KiB
Python

# BSD 2-Clause License
#
# Apprise - Push Notification Library.
# Copyright (c) 2025, Chris Caron <lead2gold@gmail.com>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
from apprise import NotifyFormat
from apprise.utils.format import html_adjust, markdown_adjust, smart_split
def test_smart_split_prefers_newlines_over_spaces_and_punctuation() -> None:
    """
    Newlines should win even if there are spaces and punctuation before the
    limit.

    The first 12 characters of the sample text contain both the newline and
    a later space; the first chunk is expected to stop right after the
    newline rather than at that space.
    """
    text = "line1\nline2 line3. line4"

    # Long enough to include the newline and some of the next line
    limit = 12

    chunks = smart_split(text, limit, body_format=NotifyFormat.TEXT)

    # First chunk should end immediately after the newline
    assert chunks[0] == "line1\n"

    # Nothing lost: concatenating the chunks gives back the input
    assert "".join(chunks) == text
def test_smart_split_prefers_spaces_over_hard_split() -> None:
    """
    When there are no newlines, split on the last space/tab before falling
    back to a hard character-limit split.

    The sample text has no newline and the limit matches the length of
    "word1 word2 " exactly, so the expected first chunk ends with the
    space that follows "word2".
    """
    text = "word1 word2 word3"

    # Force a split between word2 and word3
    limit = 12  # "word1 word2 " is 12 characters

    chunks = smart_split(text, limit, body_format=NotifyFormat.TEXT)

    # The separating space stays attached to the first chunk
    assert chunks == ["word1 word2 ", "word3"]

    # Nothing lost: concatenating the chunks gives back the input
    assert "".join(chunks) == text
def test_smart_split_can_split_after_punctuation_plus_whitespace() -> None:
    """
    Exercise the punctuation+whitespace pattern. In practice this collapses
    to the same split point as the last space, but we verify the behaviour.

    The limit equals the length of "Hello world. ", so the period and the
    space after it are the last two characters inside the limit.
    """
    text = "Hello world. Again"

    # Force the split around ". "
    # "Hello world. " is 13 characters
    limit = 13

    chunks = smart_split(text, limit, body_format=NotifyFormat.TEXT)

    # First chunk should end at the space after the period
    assert chunks[0] == "Hello world. "
    assert chunks[1] == "Again"

    # Nothing lost: concatenating the chunks gives back the input
    assert "".join(chunks) == text
def test_smart_split_avoids_splitting_inside_html_entity() -> None:
    """
    HTML mode must never finish a chunk part-way through an '&...;' entity.

    Exact chunk values are intentionally not pinned. What is checked:
      - TEXT mode yields at least one chunk holding '&' with no ';' after
        it (i.e. the entity got cut).
      - HTML mode yields no such chunk: wherever a chunk has an '&', a ';'
        follows it inside that same chunk.
      - In both modes the chunks concatenate back to the input.
    """
    body = "1234&nbsp;5678"

    # A cut after exactly 8 characters would fall inside '&nbsp;'
    max_len = 8

    # --- plain text: no entity awareness expected ---
    plain = smart_split(body, max_len, body_format=NotifyFormat.TEXT)
    assert "".join(plain) == body

    # Sanity check that the entity really is broken apart in TEXT mode
    broken = [
        chunk
        for chunk in plain
        if "&" in chunk and ";" not in chunk[chunk.find("&"):]
    ]
    assert broken

    # --- HTML: entity aware ---
    marked_up = smart_split(body, max_len, body_format=NotifyFormat.HTML)
    assert "".join(marked_up) == body

    for chunk in marked_up:
        amp = chunk.find("&")
        if amp != -1:
            # The first '&' of the chunk needs its ';' in the same chunk
            terminator = chunk.find(";", amp + 1)
            assert terminator != -1, (
                f"Chunk ends inside HTML entity: {chunk!r}"
            )
def test_smart_split_avoids_splitting_inside_markdown_link() -> None:
    """
    MARKDOWN mode must keep a [text](url) link in one piece.

    The requirements checked are that the complete link shows up inside a
    single chunk, and that a chunk containing '[' also contains a ')'
    somewhere after it.
    """
    link = "[link](https://example.com)"
    prefix = "AAAA"
    body = prefix + link

    # The limit equals the link length, so prefix + link cannot share a chunk
    pieces = smart_split(
        body, len(link), body_format=NotifyFormat.MARKDOWN
    )

    assert "".join(pieces) == body

    # One of the chunks has to carry the whole link
    holders = [chunk for chunk in pieces if link in chunk]
    assert holders

    for chunk in pieces:
        opening = chunk.find("[")
        if opening != -1:
            # The first '[' of the chunk needs a ')' later in that chunk
            closing = chunk.find(")", opening + 1)
            assert closing != -1, (
                f"Markdown link was split inside chunk: {chunk!r}"
            )
def test_smart_split_avoids_splitting_inside_markdown_image() -> None:
    """
    MARKDOWN mode must keep the [alt](url) part of an image in one piece.

    The test deliberately does not require the leading '!' to stay attached
    to the bracketed part: a result such as "AAAA!" followed by
    "[alt](...)" satisfies every assertion below.
    """
    image = "![alt](https://example.com/image.png)"
    bracket_part = "[alt](https://example.com/image.png)"
    body = "AAAA" + image

    # The limit equals the image length, so "AAAA" + image cannot share a
    # single chunk
    pieces = smart_split(
        body, len(image), body_format=NotifyFormat.MARKDOWN
    )

    assert "".join(pieces) == body

    # One of the chunks has to carry the whole [alt](...) portion
    holders = [chunk for chunk in pieces if bracket_part in chunk]
    assert holders

    for chunk in pieces:
        opening = chunk.find("[")
        if opening != -1:
            # Same rule as for links: the first '[' of the chunk needs a
            # ')' later in that chunk
            closing = chunk.find(")", opening + 1)
            assert closing != -1, (
                f"Markdown image was split inside chunk: {chunk!r}"
            )
def test_smart_split_empty_and_none_input() -> None:
    """
    Degenerate calls are all expected to yield a single empty chunk.

    Covered here: an empty body, a zero limit (with and without content)
    and a None body.
    """
    degenerate_calls = (
        # (body, limit)
        ("", 10),
        ("", 0),
        ("content", 0),
        # None is passed in place of a string
        (None, 10),
    )

    for body, limit in degenerate_calls:
        result = smart_split(body, limit, body_format=NotifyFormat.TEXT)
        assert result == [""]
def test_smart_split_html_entity_exact_boundary() -> None:
    """
    A limit that lands precisely after an HTML entity needs no correction:
    the split is expected to stay exactly where the limit puts it.
    """
    head = "AAAA&nbsp;"
    body = "AAAA&nbsp;BBBB"

    # Limit falls right behind the ';' that closes the entity
    pieces = smart_split(body, len(head), body_format=NotifyFormat.HTML)

    # Entity stays whole at the end of the first chunk
    expected = ["AAAA&nbsp;", "BBBB"]
    assert pieces == expected
    assert "".join(pieces) == body
def test_smart_split_markdown_link_exact_boundary() -> None:
    """
    A limit that lands precisely after a Markdown link needs no correction:
    the first chunk is expected to be the link itself.
    """
    link = "[link](https://example.com)"
    body = link + " TAIL"

    # Limit falls right behind the closing ')'
    pieces = smart_split(
        body, len(link), body_format=NotifyFormat.MARKDOWN
    )

    # The link forms the first chunk; whatever follows is the remainder
    assert pieces[0] == link
    assert "".join(pieces) == body

    # Sanity: some chunk carries the complete link
    carriers = [chunk for chunk in pieces if link in chunk]
    assert carriers
def test_smart_split_whitespace_priority_with_tabs_and_newlines() -> None:
    """
    Check newline versus space/tab priority on text mixing both.

    The same text is split twice: once with a limit that stops short of
    the newline and once with a limit that just includes it.
    """
    body = "word1\tword2\nword3"
    newline_at = body.index("\n")

    # Scenario 1: the limit stops right before the newline, leaving the tab
    # as the only whitespace within reach.
    tab_only = smart_split(body, newline_at, body_format=NotifyFormat.TEXT)

    # Expected cut: directly behind the tab
    assert tab_only[0] == "word1\t"
    assert "".join(tab_only) == body

    # Scenario 2: one more character brings the newline within reach, and it
    # is expected to take precedence over the tab.
    with_newline = smart_split(
        body, newline_at + 1, body_format=NotifyFormat.TEXT
    )

    # Expected cut: directly behind the newline
    assert with_newline[0] == "word1\tword2\n"
    assert "".join(with_newline) == body
def test_smart_split_very_short_limit() -> None:
    """
    A limit of one must still produce a lossless, predictable split.
    """
    body = "ABC"
    pieces = smart_split(body, 1, body_format=NotifyFormat.TEXT)

    # Each character ends up in a chunk of its own
    expected = ["A", "B", "C"]
    assert pieces == expected
    assert "".join(pieces) == body
def test_smart_split_very_long_limit() -> None:
    """
    A limit far beyond the text length must leave the text in one chunk.
    """
    body = "A short message for testing"
    generous_limit = 10_000

    pieces = smart_split(
        body, generous_limit, body_format=NotifyFormat.TEXT
    )

    # The text comes back as the sole chunk
    assert pieces == [body]
    assert "".join(pieces) == body
def test_html_adjust_guard_paths_and_no_entity() -> None:
    """
    html_adjust is expected to hand split_at back untouched when:
      - split_at does not lie beyond window_start,
      - split_at lies past the end of the text,
      - the text holds no '&' at all.
    """
    sample = "abcdef"

    # split_at equal to window_start
    same_point = 2
    assert (
        html_adjust(sample, window_start=same_point, split_at=same_point)
        == same_point
    )

    # split_at past the end of the text
    overshoot = len(sample) + 5
    assert (
        html_adjust(sample, window_start=0, split_at=overshoot)
        == overshoot
    )

    # No '&' anywhere, so there is nothing to correct
    assert html_adjust(sample, window_start=0, split_at=3) == 3
def test_html_adjust_inside_and_at_boundary_of_entity() -> None:
    """
    Two html_adjust outcomes are checked on a text holding one entity:
      - a split point inside the entity is pulled back to its '&',
      - a split point right behind the closing ';' is left alone.
    """
    sample = "1234&nbsp;5678"
    # Layout: '1234' fills 0..3, '&' sits at 4, 'nbsp' at 5..8, ';' at 9
    # and '5678' starts at 10.

    # Index 8 is within '&nbsp;', expected result is the '&' position
    inside_entity = html_adjust(sample, window_start=0, split_at=8)
    assert inside_entity == 4

    # Index 10 is the first position behind ';', expected to be unchanged
    after_entity = html_adjust(sample, window_start=0, split_at=10)
    assert after_entity == 10
def test_markdown_adjust_guard_and_no_construct() -> None:
    """
    markdown_adjust is expected to hand split_at back untouched when:
      - split_at does not lie beyond window_start,
      - split_at lies past the end of the text,
      - the text holds neither '[' nor '!'.
    """
    sample = "plain text"

    # split_at equal to window_start
    same_point = 4
    assert (
        markdown_adjust(sample, window_start=same_point, split_at=same_point)
        == same_point
    )

    # split_at past the end of the text
    overshoot = len(sample) + 3
    assert (
        markdown_adjust(sample, window_start=0, split_at=overshoot)
        == overshoot
    )

    # Plain text without any markdown construct: nothing to correct
    assert markdown_adjust(sample, window_start=0, split_at=5) == 5
def test_markdown_adjust_inside_construct_moves_to_start() -> None:
    """
    A split point that falls within a [text](url) construct is expected to
    be moved back by markdown_adjust to where the construct begins.
    """
    link = "[link](https://example.com)"

    # Three characters past the '(' puts the split point within the URL
    paren_at = link.index("(")
    cut = paren_at + 3

    # The construct opens with '[' at index 0, which is the expected result
    assert markdown_adjust(link, window_start=0, split_at=cut) == 0
def test_smart_split_markdown_guard_split_at_start_is_reset() -> None:
    """
    Target the smart_split guard 'if split_at <= start: split_at =
    orig_split'.

    The text is a single Markdown link and the limit is much shorter than
    it, so the first candidate split lands inside the link. The assertions
    only verify the observable outcome: splitting makes progress (no empty
    chunks, at least two chunks) and nothing is lost.
    """
    body = "[link](https://example.com)"

    # Short enough that the first split candidate lies within the link
    tiny_limit = 5

    pieces = smart_split(
        body, tiny_limit, body_format=NotifyFormat.MARKDOWN
    )

    # Progress was made: several chunks, none of them empty
    assert len(pieces) >= 2
    assert all(pieces)

    # Lossless: the chunks concatenate back to the input
    assert "".join(pieces) == body
def test_smart_split_uses_punctuation_branch_on_rare_whitespace() -> None:
    """
    Punctuation followed by an uncommon whitespace character (a vertical
    tab here), with no space, tab or newline in the text, is expected to
    be used as the split point.
    """
    vt = "\x0b"  # vertical tab
    text = f"Hello.{vt}World"

    # One more than 'Hello.' so the vertical tab is still within the limit
    reach = len("Hello.") + 1

    pieces = smart_split(text, reach, body_format=NotifyFormat.TEXT)

    # Lossless: the chunks concatenate back to the input
    assert "".join(pieces) == text

    # The cut is expected directly behind the vertical tab
    assert pieces[0] == f"Hello.{vt}"
    assert pieces[1] == "World"