notepad-plus-plus/scintilla/src/UniConversion.cxx

407 lines
12 KiB
C++

// Scintilla source code edit control
/** @file UniConversion.cxx
** Functions to handle UTF-8 and UTF-16 strings.
**/
// Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <string_view>
#include "UniConversion.h"
using namespace Scintilla;
namespace Scintilla {
// Count the UTF-8 bytes needed to encode the UTF-16 text in wsv.
// Scanning stops at the end of the view or at the first NUL code unit.
// Any code unit inside the surrogate range is taken to begin a pair: it is
// counted as 4 bytes and the code unit that follows it is skipped.
size_t UTF8Length(std::wstring_view wsv) noexcept {
	size_t byteTotal = 0;
	size_t index = 0;
	while (index < wsv.length() && wsv[index]) {
		const unsigned int unit = wsv[index];
		size_t unitsUsed = 1;
		if (unit < 0x80) {
			byteTotal += 1;
		} else if (unit < 0x800) {
			byteTotal += 2;
		} else if ((unit < SURROGATE_LEAD_FIRST) ||
			(unit > SURROGATE_TRAIL_LAST)) {
			// Remaining BMP values outside the surrogate range
			byteTotal += 3;
		} else {
			// Surrogate: the pair forms one 4 byte sequence
			byteTotal += 4;
			unitsUsed = 2;
		}
		index += unitsUsed;
	}
	return byteTotal;
}
// Translate an offset measured in UTF-16 code units into a byte offset within u8Text.
// Walks u8Text one sequence at a time, using the lead byte alone to decide how many
// bytes the sequence occupies and how many UTF-16 code units it produces, until
// positionUTF16 code units have been accounted for or the text is exhausted.
// The returned offset is never greater than u8Text.length().
size_t UTF8PositionFromUTF16Position(std::string_view u8Text, size_t positionUTF16) noexcept {
	const size_t lengthUTF8 = u8Text.length();
	size_t positionUTF8 = 0;
	size_t lengthUTF16 = 0;
	while ((positionUTF8 < lengthUTF8) && (lengthUTF16 < positionUTF16)) {
		const unsigned char uch = u8Text[positionUTF8];
		const unsigned int byteCount = UTF8BytesOfLead[uch];
		lengthUTF16 += UTF16LengthFromUTF8ByteCount(byteCount);
		positionUTF8 += byteCount;
	}
	// A lead byte near the end may announce more trail bytes than are present,
	// which would step past the end of the text, so clamp to a usable offset.
	return (positionUTF8 > lengthUTF8) ? lengthUTF8 : positionUTF8;
}
// Encode the UTF-16 text in wsv as UTF-8 into putf.
// Conversion stops at the end of the view or at the first NUL code unit.
// putf must have room for every encoded byte (the amount UTF8Length(wsv) reports);
// len is only consulted to decide whether a terminating NUL can be appended.
// Any code unit inside the surrogate range is combined with the code unit that
// follows it into one 4 byte sequence, without checking that the two form a
// well-ordered lead/trail pair.
void UTF8FromUTF16(std::wstring_view wsv, char *putf, size_t len) {
	size_t k = 0;
	for (size_t i = 0; i < wsv.length() && wsv[i];) {
		const unsigned int uch = wsv[i];
		if (uch < 0x80) {
			putf[k++] = static_cast<char>(uch);
		} else if (uch < 0x800) {
			putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
		} else if ((uch >= SURROGATE_LEAD_FIRST) &&
			(uch <= SURROGATE_TRAIL_LAST)) {
			// Half a surrogate pair
			i++;
			// When the surrogate is the final code unit there is no partner to read.
			// Substitute 0 instead of indexing past the end of the view; 4 bytes are
			// still produced so the output size matches what UTF8Length counted.
			const unsigned int trail = (i < wsv.length()) ? static_cast<unsigned int>(wsv[i]) : 0U;
			const unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (trail & 0x3ff);
			putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
			putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
		} else {
			putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
			putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
			putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
		}
		i++;
	}
	// Terminate only when the caller's buffer has a spare element
	if (k < len)
		putf[k] = '\0';
}
// Encode the single UTF-32 value uch as UTF-8 into putf and append a NUL.
// Between 1 and 4 encoded bytes are written before the terminator, so putf
// needs room for 5 chars. The value is not validated.
void UTF8FromUTF32Character(int uch, char *putf) noexcept {
	// Build a continuation byte from the low 6 bits of the argument
	const auto continuation = [](int bits) noexcept {
		return static_cast<char>(0x80 | (bits & 0x3f));
	};
	char *out = putf;
	if (uch >= 0x10000) {
		*out++ = static_cast<char>(0xF0 | (uch >> 18));
		*out++ = continuation(uch >> 12);
		*out++ = continuation(uch >> 6);
		*out++ = continuation(uch);
	} else if (uch >= 0x800) {
		*out++ = static_cast<char>(0xE0 | (uch >> 12));
		*out++ = continuation(uch >> 6);
		*out++ = continuation(uch);
	} else if (uch >= 0x80) {
		*out++ = static_cast<char>(0xC0 | (uch >> 6));
		*out++ = continuation(uch);
	} else {
		*out++ = static_cast<char>(uch);
	}
	*out = '\0';
}
// Count the UTF-16 code units needed to represent the UTF-8 text in svu8.
// Sequence lengths are taken from the lead byte alone. A final sequence whose
// lead byte announces more bytes than remain is counted as a single code unit.
size_t UTF16Length(std::string_view svu8) noexcept {
	const size_t byteLength = svu8.length();
	size_t units = 0;
	size_t pos = 0;
	while (pos < byteLength) {
		const unsigned char lead = svu8[pos];
		const unsigned int byteCount = UTF8BytesOfLead[lead];
		pos += byteCount;
		if (pos > byteLength) {
			// Ran off the end: the truncated sequence stands for one unit
			units += 1;
		} else {
			units += UTF16LengthFromUTF8ByteCount(byteCount);
		}
	}
	return units;
}
// Extract the payload of a UTF-8 trail byte.
// A trail byte has the form 0b10xxxxxx; only the six x bits carry value,
// so the two marker bits at the top are masked away.
constexpr unsigned char TrailByteValue(unsigned char c) {
	constexpr unsigned char valueMask = 0x3F;
	return c & valueMask;
}
// Convert the UTF-8 text in svu8 to UTF-16, storing the code units in tbuf,
// which has room for tlen elements. Returns the number of code units stored.
// Sequence lengths come from the lead byte alone; trail bytes are not validated.
// If the final sequence is cut short by the end of the input, its lead byte is
// stored on its own when there is room and conversion stops.
// Throws std::runtime_error when a complete sequence would not fit in tbuf.
size_t UTF16FromUTF8(std::string_view svu8, wchar_t *tbuf, size_t tlen) {
	const size_t inputLength = svu8.length();
	size_t written = 0;
	size_t pos = 0;
	while (pos < inputLength) {
		const unsigned char lead = svu8[pos];
		const unsigned int byteCount = UTF8BytesOfLead[lead];
		if (pos + byteCount > inputLength) {
			// Trail bytes are missing: keep the lead byte if it fits, then stop
			if (written < tlen) {
				tbuf[written++] = lead;
			}
			break;
		}
		const size_t outLen = UTF16LengthFromUTF8ByteCount(byteCount);
		if (written + outLen > tlen) {
			throw std::runtime_error("UTF16FromUTF8: attempted write beyond end");
		}
		if (byteCount == 1) {
			tbuf[written++] = lead;
		} else if (byteCount == 2) {
			const unsigned int value = ((lead & 0x1F) << 6)
				+ TrailByteValue(svu8[pos + 1]);
			tbuf[written++] = static_cast<wchar_t>(value);
		} else if (byteCount == 3) {
			const unsigned int value = ((lead & 0xF) << 12)
				+ (TrailByteValue(svu8[pos + 1]) << 6)
				+ TrailByteValue(svu8[pos + 2]);
			tbuf[written++] = static_cast<wchar_t>(value);
		} else {
			// Beyond the BMP: split the value into a lead and a trail surrogate
			const unsigned int value = ((lead & 0x7) << 18)
				+ (TrailByteValue(svu8[pos + 1]) << 12)
				+ (TrailByteValue(svu8[pos + 2]) << 6)
				+ TrailByteValue(svu8[pos + 3]);
			tbuf[written++] = static_cast<wchar_t>(((value - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
			tbuf[written++] = static_cast<wchar_t>((value & 0x3ff) + SURROGATE_TRAIL_FIRST);
		}
		pos += byteCount;
	}
	return written;
}
// Count the characters (UTF-32 elements) represented by the UTF-8 text in svu8.
// Each step advances by the sequence length implied by the lead byte, so a
// sequence truncated by the end of the text still counts as one character.
size_t UTF32Length(std::string_view svu8) noexcept {
	const size_t byteLength = svu8.length();
	size_t characters = 0;
	size_t pos = 0;
	while (pos < byteLength) {
		const unsigned char lead = svu8[pos];
		pos += UTF8BytesOfLead[lead];
		++characters;
	}
	return characters;
}
// Convert the UTF-8 text in svu8 to UTF-32, storing the values in tbuf, which
// has room for tlen elements. Returns the number of elements stored.
// Sequence lengths come from the lead byte alone; trail bytes are not validated.
// If the final sequence is cut short by the end of the input, its lead byte is
// stored on its own when there is room and conversion stops.
// Throws std::runtime_error when tbuf is already full and a complete sequence remains.
size_t UTF32FromUTF8(std::string_view svu8, unsigned int *tbuf, size_t tlen) {
	const size_t inputLength = svu8.length();
	size_t written = 0;
	size_t pos = 0;
	while (pos < inputLength) {
		const unsigned char lead = svu8[pos];
		const unsigned int byteCount = UTF8BytesOfLead[lead];
		if (pos + byteCount > inputLength) {
			// Trail bytes are missing: keep the lead byte if it fits, then stop
			if (written < tlen) {
				tbuf[written++] = lead;
			}
			break;
		}
		if (written == tlen) {
			throw std::runtime_error("UTF32FromUTF8: attempted write beyond end");
		}
		unsigned int value;
		if (byteCount == 1) {
			value = lead;
		} else if (byteCount == 2) {
			value = ((lead & 0x1F) << 6)
				+ TrailByteValue(svu8[pos + 1]);
		} else if (byteCount == 3) {
			value = ((lead & 0xF) << 12)
				+ (TrailByteValue(svu8[pos + 1]) << 6)
				+ TrailByteValue(svu8[pos + 2]);
		} else {
			value = ((lead & 0x7) << 18)
				+ (TrailByteValue(svu8[pos + 1]) << 12)
				+ (TrailByteValue(svu8[pos + 2]) << 6)
				+ TrailByteValue(svu8[pos + 3]);
		}
		tbuf[written++] = value;
		pos += byteCount;
	}
	return written;
}
// Convert the UTF-8 text in svu8 to a std::wstring.
// The encoding of the result follows the platform's wchar_t: UTF-16 when
// wchar_t is 2 bytes wide, otherwise UTF-32. The string is sized up front
// with the matching length function and then filled in place.
std::wstring WStringFromUTF8(std::string_view svu8) {
	std::wstring converted;
	if constexpr (sizeof(wchar_t) == 2) {
		converted.assign(UTF16Length(svu8), L'\0');
		UTF16FromUTF8(svu8, converted.data(), converted.length());
	} else {
		converted.assign(UTF32Length(svu8), L'\0');
		// The UTF-32 converter writes unsigned int elements
		UTF32FromUTF8(svu8, reinterpret_cast<unsigned int *>(converted.data()), converted.length());
	}
	return converted;
}
// Encode the UTF-32 value val as UTF-16 into tbuf and return the number of
// code units written: 1 for values below SUPPLEMENTAL_PLANE_FIRST, otherwise
// 2 (a lead surrogate followed by a trail surrogate). No terminator is added.
unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noexcept {
	if (val >= SUPPLEMENTAL_PLANE_FIRST) {
		const unsigned int offset = val - SUPPLEMENTAL_PLANE_FIRST;
		tbuf[0] = static_cast<wchar_t>((offset >> 10) + SURROGATE_LEAD_FIRST);
		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
		return 2;
	}
	tbuf[0] = static_cast<wchar_t>(val);
	return 1;
}
// Length in bytes of the UTF-8 sequence announced by each possible first byte.
// Indexed by the byte value. Entries are 1 for ASCII (00-7F) and also for every
// byte that cannot begin a multi-byte sequence in this table: trail bytes (80-BF),
// C0 and C1, and F5-FF. Leads C2-DF give 2, E0-EF give 3 and F0-F4 give 4.
// The table looks only at the first byte; it says nothing about whether the
// bytes that follow are present or valid.
const unsigned char UTF8BytesOfLead[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00 - 0F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 10 - 1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 20 - 2F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 30 - 3F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40 - 4F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 50 - 5F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 60 - 6F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70 - 7F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80 - 8F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 90 - 9F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A0 - AF
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B0 - BF
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0 - CF
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0 - DF
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0 - EF
4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // F0 - FF
};
// Return both the width of the first character in the string and a status
// saying whether it is valid or invalid.
// Most invalid sequences return a width of 1 so are treated as isolated bytes but
// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
// reasonably treated as code points in some circumstances. They will, however,
// not have associated glyphs.
//
// us points at the bytes to examine and len is how many are available; us[0] is
// read unconditionally so len must be at least 1.
// The result is the width in bytes, ORed with UTF8MaskInvalid when the sequence
// is not valid. Callers separate the two with UTF8MaskInvalid and UTF8MaskWidth.
int UTF8Classify(const unsigned char *us, size_t len) noexcept {
// For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
if (us[0] < 0x80) {
// ASCII
return 1;
}
const size_t byteCount = UTF8BytesOfLead[us[0]];
if (byteCount == 1 || byteCount > len) {
// Invalid lead byte, or the sequence it announces does not fit in len bytes.
// Past this point byteCount >= 2 and every us[i] with i < byteCount is readable.
return UTF8MaskInvalid | 1;
}
if (!UTF8IsTrailByte(us[1])) {
// Invalid trail byte
return UTF8MaskInvalid | 1;
}
switch (byteCount) {
case 2:
// Leads C0 and C1 map to 1 in UTF8BytesOfLead so never reach here;
// a 2 byte lead followed by a trail byte needs no further checks.
return 2;
case 3:
if (UTF8IsTrailByte(us[2])) {
if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
// Overlong: E0 followed by 80..9F
return UTF8MaskInvalid | 1;
}
if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
// Surrogate: ED followed by A0..BF
return UTF8MaskInvalid | 1;
}
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
// U+FFFE non-character - 3 bytes long
return UTF8MaskInvalid | 3;
}
if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
// U+FFFF non-character - 3 bytes long
return UTF8MaskInvalid | 3;
}
if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
// U+FDD0 .. U+FDEF
return UTF8MaskInvalid | 3;
}
return 3;
}
// Third byte is not a trail byte: fall out to the invalid return below
break;
default:
// 4 byte sequences
if (UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
// This test runs before the range and overlong tests below, so any
// sequence matching the pattern reports a width of 4.
if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
// *FFFE or *FFFF non-character
return UTF8MaskInvalid | 4;
}
if (*us == 0xf4) {
// Check if encoding a value beyond the last Unicode character 10FFFF
if (us[1] > 0x8f) {
return UTF8MaskInvalid | 1;
}
} else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
// Overlong: F0 followed by 80..8F
return UTF8MaskInvalid | 1;
}
return 4;
}
// A later byte is not a trail byte: fall out to the invalid return below
break;
}
// Reached when a required trail byte was missing
return UTF8MaskInvalid | 1;
}
// Return how many bytes at us should be treated as one unit: the width
// reported by UTF8Classify for a valid sequence, or 1 when the sequence is
// flagged invalid, whatever width was reported alongside the flag.
int UTF8DrawBytes(const unsigned char *us, int len) noexcept {
	const int status = UTF8Classify(us, len);
	if (status & UTF8MaskInvalid) {
		return 1;
	}
	return status & UTF8MaskWidth;
}
// Report whether svu8 consists entirely of sequences that UTF8Classify accepts.
// The text is walked one classified sequence at a time; the first sequence
// flagged with UTF8MaskInvalid ends the scan with false. Empty text is valid.
bool UTF8IsValid(std::string_view svu8) noexcept {
	const unsigned char *cursor = reinterpret_cast<const unsigned char *>(svu8.data());
	for (size_t bytesLeft = svu8.length(); bytesLeft > 0;) {
		const int status = UTF8Classify(cursor, bytesLeft);
		if (status & UTF8MaskInvalid) {
			return false;
		}
		const int width = status & UTF8MaskWidth;
		cursor += width;
		bytesLeft -= width;
	}
	// The loop only finishes once every byte has been consumed
	return true;
}
// Return a copy of text in which each byte that starts a sequence flagged
// invalid by UTF8Classify is replaced by U+FFFD REPLACEMENT CHARACTER.
// Only one byte is consumed per invalid result, so the bytes after it are
// classified again on their own. Valid sequences are copied through unchanged.
std::string FixInvalidUTF8(const std::string &text) {
	std::string fixed;
	const char *cursor = text.c_str();
	const char *const end = cursor + text.size();
	while (cursor < end) {
		const size_t bytesLeft = static_cast<size_t>(end - cursor);
		const int status = UTF8Classify(reinterpret_cast<const unsigned char *>(cursor), bytesLeft);
		if (status & UTF8MaskInvalid) {
			// Replacement character 0xFFFD = UTF8:"efbfbd".
			fixed.append("\xef\xbf\xbd");
			cursor++;
		} else {
			const size_t width = status & UTF8MaskWidth;
			fixed.append(cursor, width);
			cursor += width;
		}
	}
	return fixed;
}
}