[BUG_FIXED] (Author: François-R Boyer) Fix a bug where Unicode files were corrupted on saving (the buffer alignment issue).

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@648 f5eea248-9336-0410-98b8-ebc06183d4e3
2010-08-14 09:21:59 +00:00 · 2010-08-14 09:21:59 +00:00 · 29143b3d6c
parent 49e4230a4a
commit 29143b3d6c
1 changed files with 25 additions and 2 deletions
--- a/PowerEditor/src/ScitillaComponent/Buffer.cpp
+++ b/PowerEditor/src/ScitillaComponent/Buffer.cpp
@ -692,10 +692,26 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
 		size_t lenFile = 0;
 		size_t lenConvert = 0;	//just in case conversion results in 0, but file not empty
 		bool isFirstTime = true;
+		int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one
+		char incompleteMultibyteChar_first = 0;

 		do {
-			lenFile = fread(data, 1, blockSize, fp);
-            
+			lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar;
+
+			// we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8
+			// TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code
+			if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize
+			{
+				incompleteMultibyteChar = 1;
+				while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode)
+					&& (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character
+					++incompleteMultibyteChar;
+				// leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer
+				lenFile -= incompleteMultibyteChar;
+				incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator
+			}
+			else incompleteMultibyteChar = 0;
+
            // check if file contain any BOM
            if (isFirstTime) 
            {
@ -722,6 +738,13 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea
 				lenConvert = UnicodeConvertor->convert(data, lenFile);
 				_pscratchTilla->execute(SCI_APPENDTEXT, lenConvert, (LPARAM)(UnicodeConvertor->getNewBuf()));
 			}
+
+			if(incompleteMultibyteChar != 0)
+			{
+				// copy bytes to next buffer
+				memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar);
+				data[0] = incompleteMultibyteChar_first;
+			}
 			
 		} while (lenFile > 0);
 	} __except(filter(GetExceptionCode(), GetExceptionInformation())) {