2022-01-04 23:07:50 +00:00
|
|
|
/** @file testUniConversion.cxx
|
|
|
|
** Unit Tests for Scintilla internal data structures
|
|
|
|
**/
|
2019-05-04 18:14:48 +00:00
|
|
|
|
|
|
|
#include <cstring>
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <string_view>
|
|
|
|
#include <vector>
|
2022-01-04 23:07:50 +00:00
|
|
|
#include <optional>
|
2019-05-04 18:14:48 +00:00
|
|
|
#include <algorithm>
|
|
|
|
#include <memory>
|
|
|
|
|
2022-01-04 23:07:50 +00:00
|
|
|
#include "Debugging.h"
|
2019-05-04 18:14:48 +00:00
|
|
|
|
|
|
|
#include "UniConversion.h"
|
|
|
|
|
|
|
|
#include "catch.hpp"
|
|
|
|
|
2022-01-04 23:07:50 +00:00
|
|
|
using namespace Scintilla::Internal;
|
2019-05-04 18:14:48 +00:00
|
|
|
|
|
|
|
// Test UniConversion.
|
|
|
|
// Use examples from Wikipedia:
|
|
|
|
// https://en.wikipedia.org/wiki/UTF-8
|
|
|
|
|
|
|
|
// Checks the length UTF16Length reports for valid UTF-8 samples of each
// encoded width and for three malformed byte sequences.
TEST_CASE("UTF16Length") {

	SECTION("UTF16Length ASCII") {
		// Latin Small Letter A: one byte
		const char *const text = "a";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 1U);
	}

	SECTION("UTF16Length Example1") {
		// Dollar Sign: one byte
		const char *const text = "\x24";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 1U);
	}

	SECTION("UTF16Length Example2") {
		// Cent Sign: two bytes
		const char *const text = "\xC2\xA2";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 1U);
	}

	SECTION("UTF16Length Example3") {
		// Euro Sign: three bytes
		const char *const text = "\xE2\x82\xAC";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 1U);
	}

	SECTION("UTF16Length Example4") {
		// Gothic Letter Hwair: four bytes, expected to count as 2
		const char *const text = "\xF0\x90\x8D\x88";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 2U);
	}

	SECTION("UTF16Length Invalid Trail byte in lead position") {
		// 0xB5 is a trail byte with no lead before it; every byte is expected to count as 1
		const char *const text = "a\xB5yz";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 4U);
	}

	SECTION("UTF16Length Invalid Lead byte at end") {
		// 0xC2 is a lead byte with nothing after it
		const char *const text = "a\xC2";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 2U);
	}

	SECTION("UTF16Length Invalid Lead byte implies 3 trails but only 2") {
		// 0xF1 announces three trail bytes but only "yz" follows; expected length is 2
		const char *const text = "a\xF1yz";
		const size_t units = UTF16Length(text);
		REQUIRE(units == 2U);
	}
}
|
|
|
|
|
|
|
|
// Exercises UnicodeFromUTF8, UTF16FromUTF8 and UTF32FromUTF8 with the
// Wikipedia UTF-8 examples (1 to 4 byte encodings) and, for the two buffer
// converters, with three malformed inputs.
TEST_CASE("UniConversion") {

	// UnicodeFromUTF8
	// The 1 to 3 byte inputs are zero-padded to four bytes;
	// presumably UnicodeFromUTF8 may read up to four bytes - TODO confirm.

	SECTION("UnicodeFromUTF8 ASCII") {
		const unsigned char s[]={'a', 0, 0, 0};
		REQUIRE(UnicodeFromUTF8(s) == 'a');
	}

	SECTION("UnicodeFromUTF8 Example1") {
		const unsigned char s[]={0x24, 0, 0, 0};
		REQUIRE(UnicodeFromUTF8(s) == 0x24);
	}

	SECTION("UnicodeFromUTF8 Example2") {
		const unsigned char s[]={0xC2, 0xA2, 0, 0};
		REQUIRE(UnicodeFromUTF8(s) == 0xA2);
	}

	SECTION("UnicodeFromUTF8 Example3") {
		const unsigned char s[]={0xE2, 0x82, 0xAC, 0};
		REQUIRE(UnicodeFromUTF8(s) == 0x20AC);
	}

	SECTION("UnicodeFromUTF8 Example4") {
		const unsigned char s[]={0xF0, 0x90, 0x8D, 0x88, 0};
		REQUIRE(UnicodeFromUTF8(s) == 0x10348);
	}

	// UTF16FromUTF8
	// Each section converts into a wchar_t buffer sized for the expected
	// output, then checks the returned length and the buffer contents.

	SECTION("UTF16FromUTF8 ASCII") {
		const char s[] = {'a', 0};
		wchar_t tbuf[1] = {0};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 'a');
	}

	SECTION("UTF16FromUTF8 Example1") {
		const char s[] = {'\x24', 0};
		wchar_t tbuf[1] = {0};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0x24);
	}

	SECTION("UTF16FromUTF8 Example2") {
		const char s[] = {'\xC2', '\xA2', 0};
		wchar_t tbuf[1] = {0};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0xA2);
	}

	SECTION("UTF16FromUTF8 Example3") {
		const char s[] = {'\xE2', '\x82', '\xAC', 0};
		wchar_t tbuf[1] = {0};
		// The original line ended in ";;"; the stray empty statement is removed.
		const size_t tlen = UTF16FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0x20AC);
	}

	SECTION("UTF16FromUTF8 Example4") {
		// U+10348 is expected as the surrogate pair D800 DF48
		const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
		wchar_t tbuf[2] = {0, 0};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 2);
		REQUIRE(tlen == 2U);
		REQUIRE(tbuf[0] == 0xD800);
		REQUIRE(tbuf[1] == 0xDF48);
	}

	SECTION("UTF16FromUTF8 Invalid Trail byte in lead position") {
		// The stray trail byte 0xB5 is expected to pass through as its own value
		const char s[] = "a\xB5yz";
		wchar_t tbuf[4] = {};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 4);
		REQUIRE(tlen == 4U);
		REQUIRE(tbuf[0] == 'a');
		REQUIRE(tbuf[1] == 0xB5);
		REQUIRE(tbuf[2] == 'y');
		REQUIRE(tbuf[3] == 'z');
	}

	SECTION("UTF16FromUTF8 Invalid Lead byte at end") {
		// The dangling lead byte 0xC2 is expected to pass through as its own value
		const char s[] = "a\xC2";
		wchar_t tbuf[2] = {};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 2);
		REQUIRE(tlen == 2U);
		REQUIRE(tbuf[0] == 'a');
		REQUIRE(tbuf[1] == 0xC2);
	}

	SECTION("UTF16FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
		// Only two elements are expected even though the buffer has room for four
		const char *s = "a\xF1yz";
		wchar_t tbuf[4] = {};
		const size_t tlen = UTF16FromUTF8(s, tbuf, 4);
		REQUIRE(tlen == 2U);
		REQUIRE(tbuf[0] == 'a');
		REQUIRE(tbuf[1] == 0xF1);
	}

	// UTF32FromUTF8
	// Same inputs as the UTF16FromUTF8 sections, converted into an
	// unsigned int buffer.

	SECTION("UTF32FromUTF8 ASCII") {
		const char s[] = {'a', 0};
		unsigned int tbuf[1] = {0};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
	}

	SECTION("UTF32FromUTF8 Example1") {
		const char s[] = {'\x24', 0};
		unsigned int tbuf[1] = {0};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0x24);
	}

	SECTION("UTF32FromUTF8 Example2") {
		const char s[] = {'\xC2', '\xA2', 0};
		unsigned int tbuf[1] = {0};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0xA2);
	}

	SECTION("UTF32FromUTF8 Example3") {
		const char s[] = {'\xE2', '\x82', '\xAC', 0};
		unsigned int tbuf[1] = {0};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0x20AC);
	}

	SECTION("UTF32FromUTF8 Example4") {
		// A four byte character is expected to fit in a single element here
		const char s[] = {'\xF0', '\x90', '\x8D', '\x88', 0};
		unsigned int tbuf[1] = {0};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 1);
		REQUIRE(tlen == 1U);
		REQUIRE(tbuf[0] == 0x10348);
	}

	SECTION("UTF32FromUTF8 Invalid Trail byte in lead position") {
		const char s[] = "a\xB5yz";
		unsigned int tbuf[4] = {};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 4);
		REQUIRE(tlen == 4U);
		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
		REQUIRE(tbuf[1] == 0xB5);
		REQUIRE(tbuf[2] == static_cast<unsigned int>('y'));
		REQUIRE(tbuf[3] == static_cast<unsigned int>('z'));
	}

	SECTION("UTF32FromUTF8 Invalid Lead byte at end") {
		const char s[] = "a\xC2";
		unsigned int tbuf[2] = {};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 2);
		REQUIRE(tlen == 2U);
		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
		REQUIRE(tbuf[1] == 0xC2);
	}

	SECTION("UTF32FromUTF8 Invalid Lead byte implies 3 trails but only 2") {
		const char *s = "a\xF1yz";
		unsigned int tbuf[4] = {};
		const size_t tlen = UTF32FromUTF8(s, tbuf, 4);
		REQUIRE(tlen == 2U);
		REQUIRE(tbuf[0] == static_cast<unsigned int>('a'));
		REQUIRE(tbuf[1] == 0xF1);
	}
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// Simple adapter to avoid casting
|
Updated to Scintilla 5.4.1 & Lexilla 5.3.0
Scintilla 5.4.1
https://www.scintilla.org/scintilla541.zip
Released 27 December 2023.
1. Add IDocumentEditable interface to allow efficient interaction with document objects which may not be visible in a Scintilla instance. This feature is provisonal and may change before being declared stable. For better type-safety, the ScintillaCall C++ API uses IDocumentEditable* where void* was used before which may require changes to client code that uses document pointer APIs DocPointer, SetDocPointer, CreateDocument, AddRefDocument, and ReleaseDocument.
2. Ctrl-click on a selection deselects it in multiple selection mode.
3. Add SCI_SELECTIONFROMPOINT for modifying multiple selections.
4. Add SCI_SETMOVEEXTENDSSELECTION and SCI_CHANGESELECTIONMODE to simplify selection mode manipulation.
5. Improve performance of global replace by reducing cache invalidation overhead. [Feature #1502](https://sourceforge.net/p/scintilla/feature-requests/1502/).
6. Fix regular expression search for "\<" matching beginning of search when not beginning of word and for "\>" not matching line end. [Bug #2157](https://sourceforge.net/p/scintilla/bugs/2157/).
7. Fix regular expression search failure when search for "\<" followed by search for "\>". [Bug #2413](https://sourceforge.net/p/scintilla/bugs/2413/).
8. Fix regular expression assertion (^, $, \b. \B) failures when using SCFIND_CXX11REGEX. [Bug #2405](https://sourceforge.net/p/scintilla/bugs/2405/).
9. Fix regular expression bug in reverse direction where shortened match returned. [Bug #2405](https://sourceforge.net/p/scintilla/bugs/2405/).
10. Avoid character fragments in regular expression search results. [Bug #2405](https://sourceforge.net/p/scintilla/bugs/2405/).
11. With a document that does not have the SC_DOCUMENTOPTION_TEXT_LARGE option set, allocating more than 2G (calling SCI_ALLOCATE or similar) will now fail with SC_STATUS_FAILURE.
12. Protect SCI_REPLACETARGET, SCI_REPLACETARGETMINIMAL, and SCI_REPLACETARGETRE from application changing target in notification handlers. [Bug #2289](https://sourceforge.net/p/scintilla/bugs/2289/).
Lexilla 5.3.0
https://www.scintilla.org/lexilla530.zip
Released 27 December 2023.
1. Fix calling AddStaticLexerModule by defining as C++ instead of C which matches header. [Bug #2421](https://sourceforge.net/p/scintilla/bugs/2421/).
2. Bash: Fix shift operator << incorrectly recognized as here-doc. [Issue #215](https://github.com/ScintillaOrg/lexilla/issues/215).
3. Bash: Fix termination of '${' with first unquoted '}' instead of nesting. [Issue #216](https://github.com/ScintillaOrg/lexilla/issues/216).
4. HTML: JavaScript double-quoted strings may escape line end with '\'. [Issue #214](https://github.com/ScintillaOrg/lexilla/issues/214).
5. Lua: recognize --- doc comments. Defined by [LDoc](https://github.com/lunarmodules/ldoc). Does not recognize --[[-- doc comments which seem less common.
Close #14375
2023-11-19 17:46:55 +00:00
|
|
|
int UTFClass(std::string_view sv) noexcept {
|
|
|
|
return UTF8Classify(sv);
|
2019-05-04 18:14:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_CASE("UTF8Classify") {

	// The sections are meant to reach every return statement in UTF8Classify
	// in order, some of them more than once.

	// Expected result for most malformed inputs in this test:
	// a width of 1 combined with the invalid flag.
	const int invalidSingleByte = 1 | UTF8MaskInvalid;

	// Single byte

	SECTION("UTF8Classify Simple ASCII") {
		REQUIRE(UTFClass("a") == 1);
	}
	SECTION("UTF8Classify Invalid Too large lead") {
		REQUIRE(UTFClass("\xF5") == invalidSingleByte);
	}
	SECTION("UTF8Classify Overlong") {
		REQUIRE(UTFClass("\xC0\x80") == invalidSingleByte);
	}
	SECTION("UTF8Classify single trail byte") {
		REQUIRE(UTFClass("\x80") == invalidSingleByte);
	}

	// Invalid length tests

	SECTION("UTF8Classify 2 byte lead, string less than 2 long") {
		REQUIRE(UTFClass("\xD0") == invalidSingleByte);
	}
	SECTION("UTF8Classify 3 byte lead, string less than 3 long") {
		REQUIRE(UTFClass("\xEF") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte lead, string less than 4 long") {
		REQUIRE(UTFClass("\xF0") == invalidSingleByte);
	}

	// Invalid first trail byte tests

	SECTION("UTF8Classify 2 byte lead trail is invalid") {
		REQUIRE(UTFClass("\xD0q") == invalidSingleByte);
	}
	SECTION("UTF8Classify 3 byte lead invalid trails") {
		REQUIRE(UTFClass("\xE2qq") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte bad trails") {
		REQUIRE(UTFClass("\xF0xyz") == invalidSingleByte);
	}

	// 2 byte lead

	SECTION("UTF8Classify 2 byte valid character") {
		REQUIRE(UTFClass("\xD0\x80") == 2);
	}

	// 3 byte lead

	SECTION("UTF8Classify 3 byte lead, overlong") {
		REQUIRE(UTFClass("\xE0\x80\xAF") == invalidSingleByte);
	}
	SECTION("UTF8Classify 3 byte lead, surrogate") {
		REQUIRE(UTFClass("\xED\xA0\x80") == invalidSingleByte);
	}
	// Non-characters are expected to keep their full width alongside the invalid flag
	SECTION("UTF8Classify FFFE non-character") {
		REQUIRE(UTFClass("\xEF\xBF\xBE") == (3 | UTF8MaskInvalid));
	}
	SECTION("UTF8Classify FFFF non-character") {
		REQUIRE(UTFClass("\xEF\xBF\xBF") == (3 | UTF8MaskInvalid));
	}
	SECTION("UTF8Classify FDD0 non-character") {
		REQUIRE(UTFClass("\xEF\xB7\x90") == (3 | UTF8MaskInvalid));
	}
	SECTION("UTF8Classify 3 byte valid character") {
		REQUIRE(UTFClass("\xE2\x82\xAC") == 3);
	}

	// 4 byte lead

	SECTION("UTF8Classify 1FFFF non-character") {
		REQUIRE(UTFClass("\xF0\x9F\xBF\xBF") == (4 | UTF8MaskInvalid));
	}
	SECTION("UTF8Classify 1 Greater than max Unicode 110000") {
		// 10FFFF is the largest Unicode value, so 110000 is out of range
		REQUIRE(UTFClass("\xF4\x90\x80\x80") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte overlong") {
		REQUIRE(UTFClass("\xF0\x80\x80\x80") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte valid character") {
		REQUIRE(UTFClass("\xF0\x9F\x8C\x90") == 4);
	}

	// Invalid 2nd or 3rd continuation bytes
	SECTION("UTF8Classify 3 byte lead invalid 2nd trail") {
		REQUIRE(UTFClass("\xE2\x82q") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte lead invalid 2nd trail") {
		REQUIRE(UTFClass("\xF0\x9Fq\x9F") == invalidSingleByte);
	}
	SECTION("UTF8Classify 4 byte lead invalid 3rd trail") {
		REQUIRE(UTFClass("\xF0\x9F\x9Fq") == invalidSingleByte);
	}
}
|