diff --git a/PowerEditor/src/Utf8_16.cpp b/PowerEditor/src/Utf8_16.cpp index bf952fb43..2be7ca3de 100644 --- a/PowerEditor/src/Utf8_16.cpp +++ b/PowerEditor/src/Utf8_16.cpp @@ -174,11 +174,15 @@ size_t Utf8_16_Read::convert(char* buf, size_t len) m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding); - for (; m_Iter16; ++m_Iter16) - { - *pCur++ = m_Iter16.get(); - } + while (m_Iter16) + { + ++m_Iter16; + utf8 c; + while (m_Iter16.get(&c)) + *pCur++ = c; + } m_nNewBufSize = pCur - m_pNewBuf; + break; } default: @@ -342,18 +346,18 @@ size_t Utf8_16_Write::fwrite(const void* p, size_t _size) Utf8_Iter iter8; iter8.set(static_cast(p), _size, m_eEncoding); - - int bufIndex = 0; - while (iter8) { - if (iter8.canGet()) { - buf[bufIndex++] = iter8.get(); - } + + int bufIndex = 0; + while (iter8) { ++iter8; + while ((bufIndex < bufSize) && iter8.canGet()) + iter8.get(&buf [bufIndex++]); + if (bufIndex == bufSize || !iter8) { if (!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0; bufIndex = 0; } - } + } ret = 1; break; } @@ -412,7 +416,7 @@ size_t Utf8_16_Write::convert(char* p, size_t _size) for (; iter8; ++iter8) { if (iter8.canGet()) { - *pCur++ = iter8.get(); + iter8.get(pCur++); } } m_nBufSize = (const char*)pCur - (const char*)m_pNewBuf; @@ -461,7 +465,8 @@ void Utf8_Iter::reset() m_pRead = NULL; m_pEnd = NULL; m_eState = eStart; - m_nCur = 0; + m_out1st = 0; + m_outLst = 0; m_eEncoding = uni8Bit; } @@ -471,35 +476,50 @@ void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding) m_pRead = pBuf; m_pEnd = pBuf + nLen; m_eEncoding = eEncoding; - operator++(); // Note: m_eState, m_nCur not set } +bool Utf8_Iter::get(utf16* c) +{ +#ifdef _DEBUG + assert(m_out1st != m_outLst); +#endif + if (m_out1st == m_outLst) return false; + *c = m_out [m_out1st]; + m_out1st = (m_out1st + 1) % _countof (m_out); + return true; +} + // Go to the next byte. void Utf8_Iter::operator++() { + if (m_out1st != m_outLst) return; switch (m_eState) { case eStart: if (*m_pRead < 0x80) { - m_nCur = *m_pRead; - toStart(); + m_code = *m_pRead; + toStart(); } else if (*m_pRead < 0xE0) { - m_nCur = static_cast((0x1F & *m_pRead) << 6); - m_eState = e2Bytes_Byte2; + m_code = static_cast(0x1f & *m_pRead); + m_eState = eFollow; + m_count = 1; + } else if (*m_pRead < 0xF0) { + m_code = static_cast(0x0f & *m_pRead); + m_eState = eFollow; + m_count = 2; } else { - m_nCur = static_cast((0xF & *m_pRead) << 12); - m_eState = e3Bytes_Byte2; + m_code = static_cast(0x07 & *m_pRead); + m_eState = eFollow; + m_count = 3; } break; - case e2Bytes_Byte2: - case e3Bytes_Byte3: - m_nCur |= static_cast(0x3F & *m_pRead); - toStart(); - break; - case e3Bytes_Byte2: - m_nCur |= static_cast((0x3F & *m_pRead) << 6); - m_eState = e3Bytes_Byte3; + + case eFollow: + m_code = (m_code << 6) | static_cast(0x3F & *m_pRead); + m_count--; + if (m_count == 0) + toStart(); break; } ++m_pRead; @@ -507,19 +527,32 @@ void Utf8_Iter::operator++() void Utf8_Iter::toStart() { - m_eState = eStart; - if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM) - { - swap(); + bool swap = (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM); + if (m_code < 0x10000) + { + utf16 c = swap ? _byteswap_ushort((utf16)m_code) : (utf16)m_code; + pushout (c); } + else + { + m_code -= 0x10000; + utf16 c1 = (utf16)(0xD800 | (m_code >> 10)); + utf16 c2 = (utf16)(0xDC00 | (m_code & 0x3ff)); + if (swap) + { + c1 = _byteswap_ushort(c1); + c2 = _byteswap_ushort(c2); + } + pushout(c1); + pushout(c2); + } + m_eState = eStart; } -void Utf8_Iter::swap() +void Utf8_Iter::pushout(utf16 c) { - utf8* p = reinterpret_cast(&m_nCur); - utf8 swapbyte = *p; - *p = *(p + 1); - *(p + 1) = swapbyte; + m_out [m_outLst] = c; + m_outLst = (m_outLst + 1) % _countof(m_out); } //================================================== @@ -534,20 +567,51 @@ void Utf16_Iter::reset() m_pRead = NULL; m_pEnd = NULL; m_eState = eStart; - m_nCur = 0; + m_out1st = 0; + m_outLst = 0; m_nCur16 = 0; m_eEncoding = uni8Bit; } +bool Utf16_Iter::get(utf8 *c) +{ + if (m_out1st != m_outLst) + { + *c = m_out [m_out1st]; + m_out1st = (m_out1st + 1) % _countof(m_out); + return true; + } + return false; +}; + +void Utf16_Iter::pushout(ubyte c) +{ + m_out [m_outLst] = c; + m_outLst = (m_outLst + 1) % _countof(m_out); +} + void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding) { m_pBuf = pBuf; m_pRead = pBuf; m_pEnd = pBuf + nLen; m_eEncoding = eEncoding; - m_eState = eStart; - operator++(); - // Note: m_eState, m_nCur, m_nCur16 not reinitalized. + // Note: m_eState, m_out*, m_nCur16 not reinitalized. +} + +void Utf16_Iter::read() +{ + if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM) + { + m_nCur16 = *m_pRead++; + m_nCur16 |= static_cast(*m_pRead << 8); + } + else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM) + { + m_nCur16 = static_cast(*m_pRead++ << 8); + m_nCur16 |= *m_pRead; + } + ++m_pRead; } // Goes to the next byte. @@ -555,42 +619,46 @@ void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding) // This way we can continue from a partial buffer that doesn't align void Utf16_Iter::operator++() { + if (m_out1st != m_outLst) return; switch (m_eState) - { + { case eStart: - if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM) - { - m_nCur16 = *m_pRead++; - m_nCur16 |= static_cast(*m_pRead << 8); - } - else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM) - { - m_nCur16 = static_cast(*m_pRead++ << 8); - m_nCur16 |= *m_pRead; - } - ++m_pRead; - - if (m_nCur16 < 0x80) { - m_nCur = static_cast(m_nCur16 & 0xFF); + read(); + if ((m_nCur16 >= 0xd800) && (m_nCur16 < 0xdc00)) { + m_eState = eSurrogate; + m_highSurrogate = m_nCur16; + } + else if (m_nCur16 < 0x80) { + pushout(static_cast(m_nCur16 & 0xFF)); m_eState = eStart; } else if (m_nCur16 < 0x800) { - m_nCur = static_cast(0xC0 | m_nCur16 >> 6); + pushout(static_cast(0xC0 | m_nCur16 >> 6)); m_eState = e2Bytes2; } else { - m_nCur = static_cast(0xE0 | m_nCur16 >> 12); + pushout(static_cast(0xE0 | m_nCur16 >> 12)); m_eState = e3Bytes2; } break; case e2Bytes2: case e3Bytes3: - m_nCur = static_cast(0x80 | m_nCur16 & 0x3F); + pushout(static_cast(0x80 | m_nCur16 & 0x3F)); m_eState = eStart; break; case e3Bytes2: - m_nCur = static_cast(0x80 | ((m_nCur16 >> 6) & 0x3F)); + pushout(static_cast(0x80 | ((m_nCur16 >> 6) & 0x3F))); m_eState = e3Bytes3; break; + case eSurrogate: + read(); + if ((m_nCur16 >= 0xDC00) && (m_nCur16 < 0xE000)) + { // valid surrogate pair + UINT code = 0x10000 + ((m_highSurrogate & 0x3ff) << 10) + (m_nCur16 & 0x3ff); + pushout(0xf0 | (code >> 18) & 0x07); + pushout(0x80 | (code >> 12) & 0x3f); + pushout(0x80 | (code >> 6) & 0x3f); + pushout(0x80 | code & 0x3f); + m_eState = eStart; + } + break; } } - - diff --git a/PowerEditor/src/Utf8_16.h b/PowerEditor/src/Utf8_16.h index eebe0585e..a703ca683 100644 --- a/PowerEditor/src/Utf8_16.h +++ b/PowerEditor/src/Utf8_16.h @@ -42,25 +42,30 @@ public: eStart, e2Bytes2, e3Bytes2, - e3Bytes3 + e3Bytes3, + eSurrogate }; Utf16_Iter(); void reset(); void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding); - utf8 get() const { return m_nCur; }; + bool get(utf8 *c); void operator++(); eState getState() { return m_eState; }; - operator bool() { return m_pRead <= m_pEnd; }; + operator bool() { return m_pRead < m_pEnd; }; protected: - void toStart(); // Put to start state, swap bytes if necessary + void read(); + void pushout(ubyte c); protected: UniMode m_eEncoding; eState m_eState; - utf8 m_nCur; + utf8 m_out [16]; + int m_out1st; + int m_outLst; utf16 m_nCur16; + utf16 m_highSurrogate; const ubyte* m_pBuf; const ubyte* m_pRead; const ubyte* m_pEnd; @@ -72,29 +77,22 @@ public: Utf8_Iter(); void reset(); void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding); - utf16 get() const { -#ifdef _DEBUG - assert(m_eState == eStart); -#endif - return m_nCur; - } - bool canGet() const { return m_eState == eStart; } + bool get(utf16* c); + bool canGet() const { return m_out1st != m_outLst; } + void toStart(); void operator++(); - operator bool() { return m_pRead <= m_pEnd; } + operator bool() { return m_pRead < m_pEnd; } protected: - void swap(); - void toStart(); // Put to start state, swap bytes if necessary - enum eState { - eStart, - e2Bytes_Byte2, - e3Bytes_Byte2, - e3Bytes_Byte3 - }; + enum eState {eStart, eFollow}; + void pushout(utf16 c); protected: UniMode m_eEncoding; eState m_eState; - utf16 m_nCur; + int m_code; + int m_count; + utf16 m_out [4]; + int m_out1st, m_outLst; const ubyte* m_pBuf; const ubyte* m_pRead; const ubyte* m_pEnd; @@ -112,7 +110,6 @@ public: size_t getNewSize() const { return m_nNewBufSize; } UniMode getEncoding() const { return m_eEncoding; } - size_t calcCurPos(size_t pos); static UniMode determineEncoding(const unsigned char *buf, size_t bufLen); protected: @@ -147,7 +144,6 @@ public: size_t convert(char* p, size_t _size); char* getNewBuf() { return reinterpret_cast(m_pNewBuf); } - size_t calcCurPos(size_t pos); protected: UniMode m_eEncoding;