notepad-plus-plus/PowerEditor/src/Utf8_16.cpp

// Utf8_16.cxx
// Copyright (C) 2002 Scott Kirkwood
//
// Permission to use, copy, modify, distribute and sell this code
// and its documentation for any purpose is hereby granted without fee,
// provided that the above copyright notice appear in all copies or
// any derived copies.  Scott Kirkwood makes no representations
// about the suitability of this software for any purpose.
// It is provided "as is" without express or implied warranty.
////////////////////////////////////////////////////////////////////////////////
// 
// Modificated 2006 Jens Lorenz
// 
// - Clean up the sources
// - Removing UCS-Bug in Utf8_Iter
// - Add convert function in Utf8_16_Write
////////////////////////////////////////////////////////////////////////////////

#include "precompiledHeaders.h"
#include "Utf8_16.h"

const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {
	{0x00, 0x00, 0x00},  // Unknown
	{0xEF, 0xBB, 0xBF},  // UTF8
	{0xFE, 0xFF, 0x00},  // Big endian
	{0xFF, 0xFE, 0x00},  // Little endian
};


// ==================================================================

Utf8_16_Read::Utf8_16_Read() {
	m_eEncoding		= uni8Bit;
	m_nBufSize		= 0;
	m_pNewBuf		= NULL;
	m_bFirstRead	= true;
}

Utf8_16_Read::~Utf8_16_Read()
{
	if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE) || (m_eEncoding == uni16BE_NoBOM) || (m_eEncoding == uni16LE_NoBOM))
    {
		delete [] m_pNewBuf;
		m_pNewBuf = NULL;
	}
}

// Returned value :
// 0 : utf8
// 1 : 7bits
// 2 : 8bits
u78 Utf8_16_Read::utf8_7bits_8bits()
{
	int rv = 1;
	int ASCII7only = 1;
	utf8 *sx	= (utf8 *)m_pBuf;
	utf8 *endx	= sx + m_nLen;

	while (sx<endx)
	{
		if (!*sx)
		{											// For detection, we'll say that NUL means not UTF8
			ASCII7only = 0;
			rv = 0;
			break;
		} 
		else if (*sx < 0x80)
		{			// 0nnnnnnn If the byte's first hex code begins with 0-7, it is an ASCII character.
			sx++;
		} 
		else if (*sx < (0x80 + 0x40)) 
		{											  // 10nnnnnn 8 through B cannot be first hex codes
			ASCII7only=0;
			rv=0;
			break;
		} 
		else if (*sx < (0x80 + 0x40 + 0x20))
		{					  // 110xxxvv 10nnnnnn  If it begins with C or D, it is an 11 bit character
			ASCII7only=0;
			if (sx>=endx-1) 
				break;
			if (!(*sx & 0x1F) || (sx[1]&(0x80+0x40)) != 0x80) {
				rv=0; break;
			}
			sx+=2;
		} 
		else if (*sx < (0x80 + 0x40 + 0x20 + 0x10))
		{								// 1110qqqq 10xxxxvv 10nnnnnn If it begins with E, it is 16 bit
			ASCII7only=0;
			if (sx>=endx-2) 
				break;
			if (!(*sx & 0xF) || (sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) {
				rv=0; break;
			}
			sx+=3;
		} 
		else 
		{													  // more than 16 bits are not allowed here
			ASCII7only=0;
			rv=0;
			break;
		}
	}
	if (ASCII7only) 
		return ascii7bits;
	if (rv)
		return utf8NoBOM;
	return ascii8bits;
}

size_t Utf8_16_Read::convert(char* buf, size_t len)
{
	// bugfix by Jens Lorenz
	static	size_t nSkip = 0;

    size_t  ret = 0;
    
	m_pBuf = (ubyte*)buf;
	m_nLen = len;

	if (m_bFirstRead == true)
    {
		determineEncoding();
		nSkip = m_nSkip;
		m_bFirstRead = false;
	}

    switch (m_eEncoding)
    {
		case uni7Bit:
        case uni8Bit:
        case uniCookie: {
            // Do nothing, pass through
            m_nBufSize = 0;
            m_pNewBuf = m_pBuf;
            ret = len;
            break;
        }
        case uniUTF8: {
            // Pass through after BOM
            m_nBufSize = 0;
            m_pNewBuf = m_pBuf + nSkip;
            ret = len - nSkip;
            break;
        }    
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE: {
            size_t newSize = len + len / 2 + 1;
            
            if (m_nBufSize != newSize)
            {
				if (m_pNewBuf)
					delete [] m_pNewBuf;
                m_pNewBuf  = NULL;
                m_pNewBuf  = new ubyte[newSize];
                m_nBufSize = newSize;
            }
            
            ubyte* pCur = m_pNewBuf;
            
            m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);

            for (; m_Iter16; ++m_Iter16)
            {
                *pCur++ = m_Iter16.get();
            }
            ret = pCur - m_pNewBuf;
            break;
        }
        default:
            break;
    }

	// necessary for second calls and more
	nSkip = 0;

	return ret;
}


void Utf8_16_Read::determineEncoding()
{
	m_eEncoding = uni8Bit;
	m_nSkip = 0;

    // detect UTF-16 big-endian with BOM
	if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
	{
		m_eEncoding = uni16BE;
		m_nSkip = 2;
	}
    // detect UTF-16 little-endian with BOM
	else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
	{
		m_eEncoding = uni16LE;
		m_nSkip = 2;
	}
    // detect UTF-8 with BOM
	else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] && 
		m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
	{
		m_eEncoding = uniUTF8;
		m_nSkip = 3;
	}
	// try to detect UTF-16 little-endian without BOM
	else if (m_nLen > 1 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, m_nLen, NULL))
	{
		m_eEncoding = uni16LE_NoBOM;
		m_nSkip = 0;
	}
	/* UTF-16 big-endian without BOM detection is taken away scince this detection is very week
    // try to detect UTF-16 big-endian without BOM
    else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)
	{
		m_eEncoding = uni16BE_NoBOM;
		m_nSkip = 0;
	}
	*/
	else
	{
		u78 detectedEncoding = utf8_7bits_8bits();
		if (detectedEncoding == utf8NoBOM)
			m_eEncoding = uniCookie;
		else if (detectedEncoding == ascii7bits)
			m_eEncoding = uni7Bit;
		else //(detectedEncoding == ascii8bits)
			m_eEncoding = uni8Bit;
		m_nSkip = 0;
	}
}

UniMode Utf8_16_Read::determineEncoding(const unsigned char *buf, int bufLen)
{
    // detect UTF-16 big-endian with BOM
	if (bufLen > 1 && buf[0] == k_Boms[uni16BE][0] && buf[1] == k_Boms[uni16BE][1])
	{
		return uni16BE;
	}
    
    // detect UTF-16 little-endian with BOM
    if (bufLen > 1 && buf[0] == k_Boms[uni16LE][0] && buf[1] == k_Boms[uni16LE][1])
	{
		return uni16LE;
	}
    
    // detect UTF-8 with BOM
	if (bufLen > 2 && buf[0] == k_Boms[uniUTF8][0] && 
		buf[1] == k_Boms[uniUTF8][1] && buf[2] == k_Boms[uniUTF8][2])
	{
		return uniUTF8;
	}

    return uni8Bit;
}


// ==================================================================

Utf8_16_Write::Utf8_16_Write()
{
	m_eEncoding = uni8Bit;
	m_pFile = NULL;
	m_pNewBuf = NULL;
	m_bFirstWrite = true;
	m_nBufSize = 0;
}

Utf8_16_Write::~Utf8_16_Write()
{
	fclose();
}

FILE * Utf8_16_Write::fopen(const TCHAR *_name, const TCHAR *_type)
{
	m_pFile = ::generic_fopen(_name, _type);

	m_bFirstWrite = true;

	return m_pFile;
}

size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
{
    // no file open
	if (!m_pFile)
    {
		return 0;
	}

    size_t  ret = 0;
    
	if (m_bFirstWrite)
    {
        switch (m_eEncoding)
        {
            case uniUTF8: {
                ::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);
                break;
            }    
            case uni16BE:
            case uni16LE:
                ::fwrite(k_Boms[m_eEncoding], 2, 1, m_pFile);
                break;
            default:
                // nothing to do
                break;
        }
		m_bFirstWrite = false;
    }
    
    switch (m_eEncoding)
    {
		case uni7Bit:
        case uni8Bit:
        case uniCookie:
        case uniUTF8: {
            // Normal write
            ret = ::fwrite(p, _size, 1, m_pFile);
            break;
        }
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE: {
			static const int bufSize = 64*1024;
			utf16 buf[bufSize];
            
            Utf8_Iter iter8;
            iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);
            
            int bufIndex = 0;
            while (iter8) {
                if (iter8.canGet()) {
                    buf[bufIndex++] = iter8.get();
                }
				++iter8;
				if(bufIndex == bufSize || !iter8) {
					if(!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0;
					bufIndex = 0;
				}
            }
            ret = 1;
            break;
        }    
        default:
            break;
    }
    
    return ret;
}


size_t Utf8_16_Write::convert(char* p, size_t _size)
{
	if (m_pNewBuf)
    {
		delete [] m_pNewBuf;
	}

    switch (m_eEncoding)
    {
		case uni7Bit:
        case uni8Bit:
        case uniCookie: {
            // Normal write
            m_nBufSize = _size;
            m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
            memcpy(m_pNewBuf, p, _size);
            break;
        }
        case uniUTF8: {
            m_nBufSize = _size + 3;
            m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
            memcpy(m_pNewBuf, k_Boms[m_eEncoding], 3);
            memcpy(&m_pNewBuf[3], p, _size);
            break;
        }
        case uni16BE_NoBOM:
        case uni16LE_NoBOM:
        case uni16BE:
        case uni16LE: {
            m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
            
            if (m_eEncoding == uni16BE || m_eEncoding == uni16LE) {
                // Write the BOM
                memcpy(m_pNewBuf, k_Boms[m_eEncoding], 2);
            }
            
            Utf8_Iter iter8;
            iter8.set(reinterpret_cast<const ubyte*>(p), _size, m_eEncoding);
            
            utf16* pCur = (utf16*)&m_pNewBuf[2];
            
            for (; iter8; ++iter8) {
                if (iter8.canGet()) {
                    *pCur++ = iter8.get();
                }
            }
            m_nBufSize = (const char*)pCur - (const char*)m_pNewBuf;
        }
        default:
            break;
    }
    
	return m_nBufSize;
}


void Utf8_16_Write::setEncoding(UniMode eType)
{
	m_eEncoding = eType;
}


void Utf8_16_Write::fclose()
{
	if (m_pNewBuf)
		delete [] m_pNewBuf;

	if (m_pFile)
		::fclose(m_pFile);
}


//=================================================================
Utf8_Iter::Utf8_Iter()
{
	reset();
}

void Utf8_Iter::reset()
{
	m_pBuf = NULL;
	m_pRead = NULL;
	m_pEnd = NULL;
	m_eState = eStart;
	m_nCur = 0;
	m_eEncoding = uni8Bit;
}

void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
{
	m_pBuf      = pBuf;
	m_pRead     = pBuf;
	m_pEnd      = pBuf + nLen;
	m_eEncoding = eEncoding;
	operator++();
	// Note: m_eState, m_nCur not set
}

// Go to the next byte.
void Utf8_Iter::operator++()
{
	switch (m_eState)
    {
        case eStart:
            if (*m_pRead < 0x80) {
                m_nCur = *m_pRead;
                toStart();
            } else if (*m_pRead < 0xE0) {
                m_nCur = static_cast<utf16>((0x1F & *m_pRead) << 6);
                m_eState = e2Bytes_Byte2;
            } else {
                m_nCur = static_cast<utf16>((0xF & *m_pRead) << 12);
                m_eState = e3Bytes_Byte2;
            }
            break;
        case e2Bytes_Byte2:
        case e3Bytes_Byte3:
            m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
            toStart();
            break;
        case e3Bytes_Byte2:
            m_nCur |= static_cast<utf16>((0x3F & *m_pRead) << 6);
            m_eState = e3Bytes_Byte3;
            break;
	}
	++m_pRead;
}

void Utf8_Iter::toStart()
{
	m_eState = eStart;
	if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
    {
		swap();
	}
}

void Utf8_Iter::swap()
{
	utf8* p = reinterpret_cast<utf8*>(&m_nCur);
	utf8 swapbyte = *p;
	*p = *(p + 1);
	*(p + 1) = swapbyte;
}

//==================================================
Utf16_Iter::Utf16_Iter()
{
	reset();
}

void Utf16_Iter::reset()
{
	m_pBuf = NULL;
	m_pRead = NULL;
	m_pEnd = NULL;
	m_eState = eStart;
	m_nCur = 0;
	m_nCur16 = 0;
	m_eEncoding = uni8Bit;
}

void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
{
	m_pBuf = pBuf;
	m_pRead = pBuf;
	m_pEnd = pBuf + nLen;
	m_eEncoding = eEncoding;
	m_eState = eStart;
	operator++();
	// Note: m_eState, m_nCur, m_nCur16 not reinitalized.
}

// Goes to the next byte.
// Not the next symbol which you might expect.
// This way we can continue from a partial buffer that doesn't align
void Utf16_Iter::operator++()
{
	switch (m_eState)
    {
        case eStart:
            if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM) 
            {
                m_nCur16 = *m_pRead++;
                m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
            }
            else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
            {
                m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
                m_nCur16 |= *m_pRead;
            }
            ++m_pRead;
            
            if (m_nCur16 < 0x80) {
                m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);
                m_eState = eStart;
            } else if (m_nCur16 < 0x800) {
                m_nCur = static_cast<ubyte>(0xC0 | m_nCur16 >> 6);
                m_eState = e2Bytes2;
            } else {
                m_nCur = static_cast<ubyte>(0xE0 | m_nCur16 >> 12);
                m_eState = e3Bytes2;
            }
            break;
        case e2Bytes2:
        case e3Bytes3:
            m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
            m_eState = eStart;
            break;
        case e3Bytes2:
            m_nCur = static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F));
            m_eState = e3Bytes3;
            break;
    }
}
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`// Utf8_16.cxx`
			`// Copyright (C) 2002 Scott Kirkwood`
			`//`
			`// Permission to use, copy, modify, distribute and sell this code`
			`// and its documentation for any purpose is hereby granted without fee,`
			`// provided that the above copyright notice appear in all copies or`
			`// any derived copies. Scott Kirkwood makes no representations`
			`// about the suitability of this software for any purpose.`
			`// It is provided "as is" without express or implied warranty.`
			`////////////////////////////////////////////////////////////////////////////////`
			`//`
			`// Modificated 2006 Jens Lorenz`
			`//`
			`// - Clean up the sources`
			`// - Removing UCS-Bug in Utf8_Iter`
			`// - Add convert function in Utf8_16_Write`
			`////////////////////////////////////////////////////////////////////////////////`

[ENHANCE] Thanks to Jocelyn Legault (precompiled headers), the build time is sped up. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@529 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-09-04 00:10:01 +00:00			`#include "precompiledHeaders.h"`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`#include "Utf8_16.h"`

			`const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {`
			`{0x00, 0x00, 0x00}, // Unknown`
			`{0xEF, 0xBB, 0xBF}, // UTF8`
			`{0xFE, 0xFF, 0x00}, // Big endian`
			`{0xFF, 0xFE, 0x00}, // Little endian`
			`};`


			`// ==================================================================`

			`Utf8_16_Read::Utf8_16_Read() {`
			`m_eEncoding = uni8Bit;`
			`m_nBufSize = 0;`
			`m_pNewBuf = NULL;`
			`m_bFirstRead = true;`
			`}`

			`Utf8_16_Read::~Utf8_16_Read()`
			`{`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`if ((m_eEncoding == uni16BE) \|\| (m_eEncoding == uni16LE) \|\| (m_eEncoding == uni16BE_NoBOM) \|\| (m_eEncoding == uni16LE_NoBOM))`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`{`
			`delete [] m_pNewBuf;`
			`m_pNewBuf = NULL;`
			`}`
			`}`

			`// Returned value :`
			`// 0 : utf8`
			`// 1 : 7bits`
			`// 2 : 8bits`
			`u78 Utf8_16_Read::utf8_7bits_8bits()`
			`{`
			`int rv = 1;`
			`int ASCII7only = 1;`
			`utf8 sx = (utf8 )m_pBuf;`
			`utf8 *endx = sx + m_nLen;`

			`while (sx<endx)`
			`{`
			`if (!*sx)`
			`{ // For detection, we'll say that NUL means not UTF8`
			`ASCII7only = 0;`
			`rv = 0;`
			`break;`
			`}`
			`else if (*sx < 0x80)`
			`{ // 0nnnnnnn If the byte's first hex code begins with 0-7, it is an ASCII character.`
			`sx++;`
			`}`
			`else if (*sx < (0x80 + 0x40))`
			`{ // 10nnnnnn 8 through B cannot be first hex codes`
			`ASCII7only=0;`
			`rv=0;`
			`break;`
			`}`
			`else if (*sx < (0x80 + 0x40 + 0x20))`
			`{ // 110xxxvv 10nnnnnn If it begins with C or D, it is an 11 bit character`
			`ASCII7only=0;`
			`if (sx>=endx-1)`
			`break;`
			`if (!(*sx & 0x1F) \|\| (sx[1]&(0x80+0x40)) != 0x80) {`
			`rv=0; break;`
			`}`
			`sx+=2;`
			`}`
			`else if (*sx < (0x80 + 0x40 + 0x20 + 0x10))`
			`{ // 1110qqqq 10xxxxvv 10nnnnnn If it begins with E, it is 16 bit`
			`ASCII7only=0;`
			`if (sx>=endx-2)`
			`break;`
			`if (!(*sx & 0xF) \|\| (sx[1]&(0x80+0x40)) != 0x80 \|\| (sx[2]&(0x80+0x40)) != 0x80) {`
			`rv=0; break;`
			`}`
			`sx+=3;`
			`}`
			`else`
			`{ // more than 16 bits are not allowed here`
			`ASCII7only=0;`
			`rv=0;`
			`break;`
			`}`
			`}`
			`if (ASCII7only)`
			`return ascii7bits;`
			`if (rv)`
			`return utf8NoBOM;`
			`return ascii8bits;`
			`}`

			`size_t Utf8_16_Read::convert(char* buf, size_t len)`
			`{`
			`// bugfix by Jens Lorenz`
			`static size_t nSkip = 0;`

			`size_t ret = 0;`

			`m_pBuf = (ubyte*)buf;`
			`m_nLen = len;`

			`if (m_bFirstRead == true)`
			`{`
			`determineEncoding();`
			`nSkip = m_nSkip;`
			`m_bFirstRead = false;`
			`}`

			`switch (m_eEncoding)`
			`{`
			`case uni7Bit:`
			`case uni8Bit:`
			`case uniCookie: {`
			`// Do nothing, pass through`
			`m_nBufSize = 0;`
			`m_pNewBuf = m_pBuf;`
			`ret = len;`
			`break;`
			`}`
			`case uniUTF8: {`
			`// Pass through after BOM`
			`m_nBufSize = 0;`
			`m_pNewBuf = m_pBuf + nSkip;`
			`ret = len - nSkip;`
			`break;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`case uni16BE_NoBOM:`
			`case uni16LE_NoBOM:`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`case uni16BE:`
			`case uni16LE: {`
			`size_t newSize = len + len / 2 + 1;`

			`if (m_nBufSize != newSize)`
			`{`
			`if (m_pNewBuf)`
			`delete [] m_pNewBuf;`
			`m_pNewBuf = NULL;`
			`m_pNewBuf = new ubyte[newSize];`
			`m_nBufSize = newSize;`
			`}`

			`ubyte* pCur = m_pNewBuf;`

			`m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);`

			`for (; m_Iter16; ++m_Iter16)`
			`{`
			`*pCur++ = m_Iter16.get();`
			`}`
			`ret = pCur - m_pNewBuf;`
			`break;`
			`}`
			`default:`
			`break;`
			`}`

			`// necessary for second calls and more`
			`nSkip = 0;`

			`return ret;`
			`}`


			`void Utf8_16_Read::determineEncoding()`
			`{`
			`m_eEncoding = uni8Bit;`
			`m_nSkip = 0;`

[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`// detect UTF-16 big-endian with BOM`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])`
			`{`
			`m_eEncoding = uni16BE;`
			`m_nSkip = 2;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`// detect UTF-16 little-endian with BOM`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])`
			`{`
			`m_eEncoding = uni16LE;`
			`m_nSkip = 2;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`// detect UTF-8 with BOM`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&`
			`m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])`
			`{`
			`m_eEncoding = uniUTF8;`
			`m_nSkip = 3;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`// try to detect UTF-16 little-endian without BOM`
			`else if (m_nLen > 1 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, m_nLen, NULL))`
			`{`
			`m_eEncoding = uni16LE_NoBOM;`
			`m_nSkip = 0;`
			`}`
[BUG_FIXED] Fix getting fonts problem under Chines/Korean/Japanese environment. [BUG_FIXED] Fix wrong File encoding auto-detection problem : binary file is detected as UTF-16 big-endian. [UPDATE] (Author : Christian Cuvier) Update NppHelp to v5.5. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@539 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-09-19 21:43:11 +00:00			`/* UTF-16 big-endian without BOM detection is taken away scince this detection is very week`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`// try to detect UTF-16 big-endian without BOM`
			`else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)`
			`{`
			`m_eEncoding = uni16BE_NoBOM;`
			`m_nSkip = 0;`
			`}`
[BUG_FIXED] Fix getting fonts problem under Chines/Korean/Japanese environment. [BUG_FIXED] Fix wrong File encoding auto-detection problem : binary file is detected as UTF-16 big-endian. [UPDATE] (Author : Christian Cuvier) Update NppHelp to v5.5. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@539 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-09-19 21:43:11 +00:00			`*/`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`else`
			`{`
			`u78 detectedEncoding = utf8_7bits_8bits();`
			`if (detectedEncoding == utf8NoBOM)`
			`m_eEncoding = uniCookie;`
			`else if (detectedEncoding == ascii7bits)`
			`m_eEncoding = uni7Bit;`
			`else //(detectedEncoding == ascii8bits)`
			`m_eEncoding = uni8Bit;`
			`m_nSkip = 0;`
			`}`
			`}`

[BUG_FIXED] Fix xml/html encoding overrides BOM if present. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@597 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-01-17 23:13:06 +00:00			`UniMode Utf8_16_Read::determineEncoding(const unsigned char *buf, int bufLen)`
			`{`
			`// detect UTF-16 big-endian with BOM`
			`if (bufLen > 1 && buf[0] == k_Boms[uni16BE][0] && buf[1] == k_Boms[uni16BE][1])`
			`{`
			`return uni16BE;`
			`}`

			`// detect UTF-16 little-endian with BOM`
			`if (bufLen > 1 && buf[0] == k_Boms[uni16LE][0] && buf[1] == k_Boms[uni16LE][1])`
			`{`
			`return uni16LE;`
			`}`

			`// detect UTF-8 with BOM`
			`if (bufLen > 2 && buf[0] == k_Boms[uniUTF8][0] &&`
			`buf[1] == k_Boms[uniUTF8][1] && buf[2] == k_Boms[uniUTF8][2])`
			`{`
			`return uniUTF8;`
			`}`

			`return uni8Bit;`
			`}`

Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00
			`// ==================================================================`

			`Utf8_16_Write::Utf8_16_Write()`
			`{`
			`m_eEncoding = uni8Bit;`
			`m_pFile = NULL;`
			`m_pNewBuf = NULL;`
			`m_bFirstWrite = true;`
			`m_nBufSize = 0;`
			`}`

			`Utf8_16_Write::~Utf8_16_Write()`
			`{`
			`fclose();`
			`}`

			`FILE * Utf8_16_Write::fopen(const TCHAR _name, const TCHAR _type)`
			`{`
			`m_pFile = ::generic_fopen(_name, _type);`

			`m_bFirstWrite = true;`

			`return m_pFile;`
			`}`

			`size_t Utf8_16_Write::fwrite(const void* p, size_t _size)`
			`{`
			`// no file open`
			`if (!m_pFile)`
			`{`
			`return 0;`
			`}`

			`size_t ret = 0;`

			`if (m_bFirstWrite)`
			`{`
			`switch (m_eEncoding)`
			`{`
			`case uniUTF8: {`
			`::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);`
			`break;`
			`}`
			`case uni16BE:`
			`case uni16LE:`
			`::fwrite(k_Boms[m_eEncoding], 2, 1, m_pFile);`
			`break;`
			`default:`
			`// nothing to do`
			`break;`
			`}`
			`m_bFirstWrite = false;`
			`}`

			`switch (m_eEncoding)`
			`{`
			`case uni7Bit:`
			`case uni8Bit:`
			`case uniCookie:`
			`case uniUTF8: {`
			`// Normal write`
			`ret = ::fwrite(p, _size, 1, m_pFile);`
			`break;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`case uni16BE_NoBOM:`
			`case uni16LE_NoBOM:`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`case uni16BE:`
			`case uni16LE: {`
[BUG_FIXED] (Author: François-R Boyer) Fix saving UCS-2 text file corrupted bug. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@671 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-09-25 18:14:22 +00:00			`static const int bufSize = 64*1024;`
			`utf16 buf[bufSize];`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00
			`Utf8_Iter iter8;`
			`iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);`

[BUG_FIXED] (Author: François-R Boyer) Fix saving UCS-2 text file corrupted bug. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@671 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-09-25 18:14:22 +00:00			`int bufIndex = 0;`
			`while (iter8) {`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`if (iter8.canGet()) {`
[BUG_FIXED] (Author: François-R Boyer) Fix saving UCS-2 text file corrupted bug. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@671 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-09-25 18:14:22 +00:00			`buf[bufIndex++] = iter8.get();`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`}`
[BUG_FIXED] (Author: François-R Boyer) Fix saving UCS-2 text file corrupted bug. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@671 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-09-25 18:14:22 +00:00			`++iter8;`
			`if(bufIndex == bufSize \|\| !iter8) {`
			`if(!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0;`
			`bufIndex = 0;`
			`}`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`}`
[BUG_FIXED] (Author: François-R Boyer) Fix saving UCS-2 text file corrupted bug. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@671 f5eea248-9336-0410-98b8-ebc06183d4e3 2010-09-25 18:14:22 +00:00			`ret = 1;`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`break;`
			`}`
			`default:`
			`break;`
			`}`

			`return ret;`
			`}`


			`size_t Utf8_16_Write::convert(char* p, size_t _size)`
			`{`
			`if (m_pNewBuf)`
			`{`
			`delete [] m_pNewBuf;`
			`}`

			`switch (m_eEncoding)`
			`{`
			`case uni7Bit:`
			`case uni8Bit:`
			`case uniCookie: {`
			`// Normal write`
			`m_nBufSize = _size;`
			`m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];`
			`memcpy(m_pNewBuf, p, _size);`
			`break;`
			`}`
			`case uniUTF8: {`
			`m_nBufSize = _size + 3;`
			`m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];`
			`memcpy(m_pNewBuf, k_Boms[m_eEncoding], 3);`
			`memcpy(&m_pNewBuf[3], p, _size);`
			`break;`
			`}`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`case uni16BE_NoBOM:`
			`case uni16LE_NoBOM:`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`case uni16BE:`
			`case uni16LE: {`
			`m_pNewBuf = (ubyte)new ubyte[sizeof(utf16) (_size + 1)];`

			`if (m_eEncoding == uni16BE \|\| m_eEncoding == uni16LE) {`
			`// Write the BOM`
			`memcpy(m_pNewBuf, k_Boms[m_eEncoding], 2);`
			`}`

			`Utf8_Iter iter8;`
			`iter8.set(reinterpret_cast<const ubyte*>(p), _size, m_eEncoding);`

			`utf16* pCur = (utf16*)&m_pNewBuf[2];`

			`for (; iter8; ++iter8) {`
			`if (iter8.canGet()) {`
			`*pCur++ = iter8.get();`
			`}`
			`}`
			`m_nBufSize = (const char)pCur - (const char)m_pNewBuf;`
			`}`
			`default:`
			`break;`
			`}`

			`return m_nBufSize;`
			`}`


			`void Utf8_16_Write::setEncoding(UniMode eType)`
			`{`
			`m_eEncoding = eType;`
			`}`


			`void Utf8_16_Write::fclose()`
			`{`
			`if (m_pNewBuf)`
			`delete [] m_pNewBuf;`

			`if (m_pFile)`
			`::fclose(m_pFile);`
			`}`


			`//=================================================================`
			`Utf8_Iter::Utf8_Iter()`
			`{`
			`reset();`
			`}`

			`void Utf8_Iter::reset()`
			`{`
			`m_pBuf = NULL;`
			`m_pRead = NULL;`
			`m_pEnd = NULL;`
			`m_eState = eStart;`
			`m_nCur = 0;`
			`m_eEncoding = uni8Bit;`
			`}`

			`void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)`
			`{`
			`m_pBuf = pBuf;`
			`m_pRead = pBuf;`
			`m_pEnd = pBuf + nLen;`
			`m_eEncoding = eEncoding;`
			`operator++();`
			`// Note: m_eState, m_nCur not set`
			`}`

			`// Go to the next byte.`
			`void Utf8_Iter::operator++()`
			`{`
			`switch (m_eState)`
			`{`
			`case eStart:`
			`if (*m_pRead < 0x80) {`
			`m_nCur = *m_pRead;`
			`toStart();`
			`} else if (*m_pRead < 0xE0) {`
			`m_nCur = static_cast<utf16>((0x1F & *m_pRead) << 6);`
			`m_eState = e2Bytes_Byte2;`
			`} else {`
			`m_nCur = static_cast<utf16>((0xF & *m_pRead) << 12);`
			`m_eState = e3Bytes_Byte2;`
			`}`
			`break;`
			`case e2Bytes_Byte2:`
			`case e3Bytes_Byte3:`
			`m_nCur \|= static_cast<utf8>(0x3F & *m_pRead);`
			`toStart();`
			`break;`
			`case e3Bytes_Byte2:`
			`m_nCur \|= static_cast<utf16>((0x3F & *m_pRead) << 6);`
			`m_eState = e3Bytes_Byte3;`
			`break;`
			`}`
			`++m_pRead;`
			`}`

			`void Utf8_Iter::toStart()`
			`{`
			`m_eState = eStart;`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`if (m_eEncoding == uni16BE \|\| m_eEncoding == uni16BE_NoBOM)`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`{`
			`swap();`
			`}`
			`}`

			`void Utf8_Iter::swap()`
			`{`
			`utf8* p = reinterpret_cast<utf8*>(&m_nCur);`
			`utf8 swapbyte = *p;`
			`p = (p + 1);`
			`*(p + 1) = swapbyte;`
			`}`

			`//==================================================`
			`Utf16_Iter::Utf16_Iter()`
			`{`
			`reset();`
			`}`

			`void Utf16_Iter::reset()`
			`{`
			`m_pBuf = NULL;`
			`m_pRead = NULL;`
			`m_pEnd = NULL;`
			`m_eState = eStart;`
			`m_nCur = 0;`
			`m_nCur16 = 0;`
			`m_eEncoding = uni8Bit;`
			`}`

			`void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)`
			`{`
			`m_pBuf = pBuf;`
			`m_pRead = pBuf;`
			`m_pEnd = pBuf + nLen;`
			`m_eEncoding = eEncoding;`
			`m_eState = eStart;`
			`operator++();`
			`// Note: m_eState, m_nCur, m_nCur16 not reinitalized.`
			`}`

			`// Goes to the next byte.`
			`// Not the next symbol which you might expect.`
			`// This way we can continue from a partial buffer that doesn't align`
			`void Utf16_Iter::operator++()`
			`{`
			`switch (m_eState)`
			`{`
			`case eStart:`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`if (m_eEncoding == uni16LE \|\| m_eEncoding == uni16LE_NoBOM)`
			`{`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`m_nCur16 = *m_pRead++;`
			`m_nCur16 \|= static_cast<utf16>(*m_pRead << 8);`
[NEW_FEATURE] Detect UTF16 encoding (BE and LE) w/o BOM. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-29 18:07:30 +00:00			`}`
			`else //(m_eEncoding == uni16BE \|\| m_eEncoding == uni16BE_NoBOM)`
			`{`
Moved PowerEditor in trunk remotely git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@460 f5eea248-9336-0410-98b8-ebc06183d4e3 2009-04-24 23:34:47 +00:00			`m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);`
			`m_nCur16 \|= *m_pRead;`
			`}`
			`++m_pRead;`

			`if (m_nCur16 < 0x80) {`
			`m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);`
			`m_eState = eStart;`
			`} else if (m_nCur16 < 0x800) {`
			`m_nCur = static_cast<ubyte>(0xC0 \| m_nCur16 >> 6);`
			`m_eState = e2Bytes2;`
			`} else {`
			`m_nCur = static_cast<ubyte>(0xE0 \| m_nCur16 >> 12);`
			`m_eState = e3Bytes2;`
			`}`
			`break;`
			`case e2Bytes2:`
			`case e3Bytes3:`
			`m_nCur = static_cast<ubyte>(0x80 \| m_nCur16 & 0x3F);`
			`m_eState = eStart;`
			`break;`
			`case e3Bytes2:`
			`m_nCur = static_cast<ubyte>(0x80 \| ((m_nCur16 >> 6) & 0x3F));`
			`m_eState = e3Bytes3;`
			`break;`
			`}`
			`}`