Fix UTF-16 decoding/encoding for code points above U+FFFF
Fix #9597, fix #3747, fix #5754, close #9599pull/9707/head
parent
dc106a4c95
commit
38bf76e843
|
@ -174,11 +174,15 @@ size_t Utf8_16_Read::convert(char* buf, size_t len)
|
|||
|
||||
m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);
|
||||
|
||||
for (; m_Iter16; ++m_Iter16)
|
||||
{
|
||||
*pCur++ = m_Iter16.get();
|
||||
}
|
||||
while (m_Iter16)
|
||||
{
|
||||
++m_Iter16;
|
||||
utf8 c;
|
||||
while (m_Iter16.get(&c))
|
||||
*pCur++ = c;
|
||||
}
|
||||
m_nNewBufSize = pCur - m_pNewBuf;
|
||||
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -342,18 +346,18 @@ size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
|
|||
|
||||
Utf8_Iter iter8;
|
||||
iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);
|
||||
|
||||
int bufIndex = 0;
|
||||
while (iter8) {
|
||||
if (iter8.canGet()) {
|
||||
buf[bufIndex++] = iter8.get();
|
||||
}
|
||||
|
||||
int bufIndex = 0;
|
||||
while (iter8) {
|
||||
++iter8;
|
||||
while ((bufIndex < bufSize) && iter8.canGet())
|
||||
iter8.get(&buf [bufIndex++]);
|
||||
|
||||
if (bufIndex == bufSize || !iter8) {
|
||||
if (!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0;
|
||||
bufIndex = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
|
@ -412,7 +416,7 @@ size_t Utf8_16_Write::convert(char* p, size_t _size)
|
|||
|
||||
for (; iter8; ++iter8) {
|
||||
if (iter8.canGet()) {
|
||||
*pCur++ = iter8.get();
|
||||
iter8.get(pCur++);
|
||||
}
|
||||
}
|
||||
m_nBufSize = (const char*)pCur - (const char*)m_pNewBuf;
|
||||
|
@ -461,7 +465,8 @@ void Utf8_Iter::reset()
|
|||
m_pRead = NULL;
|
||||
m_pEnd = NULL;
|
||||
m_eState = eStart;
|
||||
m_nCur = 0;
|
||||
m_out1st = 0;
|
||||
m_outLst = 0;
|
||||
m_eEncoding = uni8Bit;
|
||||
}
|
||||
|
||||
|
@ -471,35 +476,50 @@ void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
|
|||
m_pRead = pBuf;
|
||||
m_pEnd = pBuf + nLen;
|
||||
m_eEncoding = eEncoding;
|
||||
operator++();
|
||||
// Note: m_eState, m_nCur not set
|
||||
}
|
||||
|
||||
bool Utf8_Iter::get(utf16* c)
|
||||
{
|
||||
#ifdef _DEBUG
|
||||
assert(m_out1st != m_outLst);
|
||||
#endif
|
||||
if (m_out1st == m_outLst) return false;
|
||||
*c = m_out [m_out1st];
|
||||
m_out1st = (m_out1st + 1) % _countof (m_out);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Go to the next byte.
|
||||
void Utf8_Iter::operator++()
|
||||
{
|
||||
if (m_out1st != m_outLst) return;
|
||||
switch (m_eState)
|
||||
{
|
||||
case eStart:
|
||||
if (*m_pRead < 0x80) {
|
||||
m_nCur = *m_pRead;
|
||||
toStart();
|
||||
m_code = *m_pRead;
|
||||
toStart();
|
||||
} else if (*m_pRead < 0xE0) {
|
||||
m_nCur = static_cast<utf16>((0x1F & *m_pRead) << 6);
|
||||
m_eState = e2Bytes_Byte2;
|
||||
m_code = static_cast<utf16>(0x1f & *m_pRead);
|
||||
m_eState = eFollow;
|
||||
m_count = 1;
|
||||
} else if (*m_pRead < 0xF0) {
|
||||
m_code = static_cast<utf16>(0x0f & *m_pRead);
|
||||
m_eState = eFollow;
|
||||
m_count = 2;
|
||||
} else {
|
||||
m_nCur = static_cast<utf16>((0xF & *m_pRead) << 12);
|
||||
m_eState = e3Bytes_Byte2;
|
||||
m_code = static_cast<utf16>(0x07 & *m_pRead);
|
||||
m_eState = eFollow;
|
||||
m_count = 3;
|
||||
}
|
||||
break;
|
||||
case e2Bytes_Byte2:
|
||||
case e3Bytes_Byte3:
|
||||
m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
|
||||
toStart();
|
||||
break;
|
||||
case e3Bytes_Byte2:
|
||||
m_nCur |= static_cast<utf16>((0x3F & *m_pRead) << 6);
|
||||
m_eState = e3Bytes_Byte3;
|
||||
|
||||
case eFollow:
|
||||
m_code = (m_code << 6) | static_cast<utf8>(0x3F & *m_pRead);
|
||||
m_count--;
|
||||
if (m_count == 0)
|
||||
toStart();
|
||||
break;
|
||||
}
|
||||
++m_pRead;
|
||||
|
@ -507,19 +527,32 @@ void Utf8_Iter::operator++()
|
|||
|
||||
void Utf8_Iter::toStart()
|
||||
{
|
||||
m_eState = eStart;
|
||||
if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
||||
{
|
||||
swap();
|
||||
bool swap = (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM);
|
||||
if (m_code < 0x10000)
|
||||
{
|
||||
utf16 c = swap ? _byteswap_ushort((utf16)m_code) : (utf16)m_code;
|
||||
pushout (c);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_code -= 0x10000;
|
||||
utf16 c1 = (utf16)(0xD800 | (m_code >> 10));
|
||||
utf16 c2 = (utf16)(0xDC00 | (m_code & 0x3ff));
|
||||
if (swap)
|
||||
{
|
||||
c1 = _byteswap_ushort(c1);
|
||||
c2 = _byteswap_ushort(c2);
|
||||
}
|
||||
pushout(c1);
|
||||
pushout(c2);
|
||||
}
|
||||
m_eState = eStart;
|
||||
}
|
||||
|
||||
void Utf8_Iter::swap()
|
||||
void Utf8_Iter::pushout(utf16 c)
|
||||
{
|
||||
utf8* p = reinterpret_cast<utf8*>(&m_nCur);
|
||||
utf8 swapbyte = *p;
|
||||
*p = *(p + 1);
|
||||
*(p + 1) = swapbyte;
|
||||
m_out [m_outLst] = c;
|
||||
m_outLst = (m_outLst + 1) % _countof(m_out);
|
||||
}
|
||||
|
||||
//==================================================
|
||||
|
@ -534,20 +567,51 @@ void Utf16_Iter::reset()
|
|||
m_pRead = NULL;
|
||||
m_pEnd = NULL;
|
||||
m_eState = eStart;
|
||||
m_nCur = 0;
|
||||
m_out1st = 0;
|
||||
m_outLst = 0;
|
||||
m_nCur16 = 0;
|
||||
m_eEncoding = uni8Bit;
|
||||
}
|
||||
|
||||
bool Utf16_Iter::get(utf8 *c)
|
||||
{
|
||||
if (m_out1st != m_outLst)
|
||||
{
|
||||
*c = m_out [m_out1st];
|
||||
m_out1st = (m_out1st + 1) % _countof(m_out);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
void Utf16_Iter::pushout(ubyte c)
|
||||
{
|
||||
m_out [m_outLst] = c;
|
||||
m_outLst = (m_outLst + 1) % _countof(m_out);
|
||||
}
|
||||
|
||||
void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
|
||||
{
|
||||
m_pBuf = pBuf;
|
||||
m_pRead = pBuf;
|
||||
m_pEnd = pBuf + nLen;
|
||||
m_eEncoding = eEncoding;
|
||||
m_eState = eStart;
|
||||
operator++();
|
||||
// Note: m_eState, m_nCur, m_nCur16 not reinitalized.
|
||||
// Note: m_eState, m_out*, m_nCur16 not reinitalized.
|
||||
}
|
||||
|
||||
void Utf16_Iter::read()
|
||||
{
|
||||
if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
|
||||
{
|
||||
m_nCur16 = *m_pRead++;
|
||||
m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
|
||||
}
|
||||
else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
||||
{
|
||||
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
|
||||
m_nCur16 |= *m_pRead;
|
||||
}
|
||||
++m_pRead;
|
||||
}
|
||||
|
||||
// Goes to the next byte.
|
||||
|
@ -555,42 +619,46 @@ void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
|
|||
// This way we can continue from a partial buffer that doesn't align
|
||||
void Utf16_Iter::operator++()
|
||||
{
|
||||
if (m_out1st != m_outLst) return;
|
||||
switch (m_eState)
|
||||
{
|
||||
{
|
||||
case eStart:
|
||||
if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
|
||||
{
|
||||
m_nCur16 = *m_pRead++;
|
||||
m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
|
||||
}
|
||||
else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
||||
{
|
||||
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
|
||||
m_nCur16 |= *m_pRead;
|
||||
}
|
||||
++m_pRead;
|
||||
|
||||
if (m_nCur16 < 0x80) {
|
||||
m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);
|
||||
read();
|
||||
if ((m_nCur16 >= 0xd800) && (m_nCur16 < 0xdc00)) {
|
||||
m_eState = eSurrogate;
|
||||
m_highSurrogate = m_nCur16;
|
||||
}
|
||||
else if (m_nCur16 < 0x80) {
|
||||
pushout(static_cast<ubyte>(m_nCur16 & 0xFF));
|
||||
m_eState = eStart;
|
||||
} else if (m_nCur16 < 0x800) {
|
||||
m_nCur = static_cast<ubyte>(0xC0 | m_nCur16 >> 6);
|
||||
pushout(static_cast<ubyte>(0xC0 | m_nCur16 >> 6));
|
||||
m_eState = e2Bytes2;
|
||||
} else {
|
||||
m_nCur = static_cast<ubyte>(0xE0 | m_nCur16 >> 12);
|
||||
pushout(static_cast<ubyte>(0xE0 | m_nCur16 >> 12));
|
||||
m_eState = e3Bytes2;
|
||||
}
|
||||
break;
|
||||
case e2Bytes2:
|
||||
case e3Bytes3:
|
||||
m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
|
||||
pushout(static_cast<ubyte>(0x80 | m_nCur16 & 0x3F));
|
||||
m_eState = eStart;
|
||||
break;
|
||||
case e3Bytes2:
|
||||
m_nCur = static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F));
|
||||
pushout(static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F)));
|
||||
m_eState = e3Bytes3;
|
||||
break;
|
||||
case eSurrogate:
|
||||
read();
|
||||
if ((m_nCur16 >= 0xDC00) && (m_nCur16 < 0xE000))
|
||||
{ // valid surrogate pair
|
||||
UINT code = 0x10000 + ((m_highSurrogate & 0x3ff) << 10) + (m_nCur16 & 0x3ff);
|
||||
pushout(0xf0 | (code >> 18) & 0x07);
|
||||
pushout(0x80 | (code >> 12) & 0x3f);
|
||||
pushout(0x80 | (code >> 6) & 0x3f);
|
||||
pushout(0x80 | code & 0x3f);
|
||||
m_eState = eStart;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -42,25 +42,30 @@ public:
|
|||
eStart,
|
||||
e2Bytes2,
|
||||
e3Bytes2,
|
||||
e3Bytes3
|
||||
e3Bytes3,
|
||||
eSurrogate
|
||||
};
|
||||
|
||||
Utf16_Iter();
|
||||
void reset();
|
||||
void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);
|
||||
utf8 get() const { return m_nCur; };
|
||||
bool get(utf8 *c);
|
||||
void operator++();
|
||||
eState getState() { return m_eState; };
|
||||
operator bool() { return m_pRead <= m_pEnd; };
|
||||
operator bool() { return m_pRead < m_pEnd; };
|
||||
|
||||
protected:
|
||||
void toStart(); // Put to start state, swap bytes if necessary
|
||||
void read();
|
||||
void pushout(ubyte c);
|
||||
|
||||
protected:
|
||||
UniMode m_eEncoding;
|
||||
eState m_eState;
|
||||
utf8 m_nCur;
|
||||
utf8 m_out [16];
|
||||
int m_out1st;
|
||||
int m_outLst;
|
||||
utf16 m_nCur16;
|
||||
utf16 m_highSurrogate;
|
||||
const ubyte* m_pBuf;
|
||||
const ubyte* m_pRead;
|
||||
const ubyte* m_pEnd;
|
||||
|
@ -72,29 +77,22 @@ public:
|
|||
Utf8_Iter();
|
||||
void reset();
|
||||
void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);
|
||||
utf16 get() const {
|
||||
#ifdef _DEBUG
|
||||
assert(m_eState == eStart);
|
||||
#endif
|
||||
return m_nCur;
|
||||
}
|
||||
bool canGet() const { return m_eState == eStart; }
|
||||
bool get(utf16* c);
|
||||
bool canGet() const { return m_out1st != m_outLst; }
|
||||
void toStart();
|
||||
void operator++();
|
||||
operator bool() { return m_pRead <= m_pEnd; }
|
||||
operator bool() { return m_pRead < m_pEnd; }
|
||||
|
||||
protected:
|
||||
void swap();
|
||||
void toStart(); // Put to start state, swap bytes if necessary
|
||||
enum eState {
|
||||
eStart,
|
||||
e2Bytes_Byte2,
|
||||
e3Bytes_Byte2,
|
||||
e3Bytes_Byte3
|
||||
};
|
||||
enum eState {eStart, eFollow};
|
||||
void pushout(utf16 c);
|
||||
protected:
|
||||
UniMode m_eEncoding;
|
||||
eState m_eState;
|
||||
utf16 m_nCur;
|
||||
int m_code;
|
||||
int m_count;
|
||||
utf16 m_out [4];
|
||||
int m_out1st, m_outLst;
|
||||
const ubyte* m_pBuf;
|
||||
const ubyte* m_pRead;
|
||||
const ubyte* m_pEnd;
|
||||
|
@ -112,7 +110,6 @@ public:
|
|||
size_t getNewSize() const { return m_nNewBufSize; }
|
||||
|
||||
UniMode getEncoding() const { return m_eEncoding; }
|
||||
size_t calcCurPos(size_t pos);
|
||||
static UniMode determineEncoding(const unsigned char *buf, size_t bufLen);
|
||||
|
||||
protected:
|
||||
|
@ -147,7 +144,6 @@ public:
|
|||
|
||||
size_t convert(char* p, size_t _size);
|
||||
char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); }
|
||||
size_t calcCurPos(size_t pos);
|
||||
|
||||
protected:
|
||||
UniMode m_eEncoding;
|
||||
|
|
Loading…
Reference in New Issue