From ac09857656caf15752ea170d9cd4a080a6a58147 Mon Sep 17 00:00:00 2001 From: Silent Date: Sun, 14 Oct 2018 13:11:11 +0200 Subject: [PATCH] Fix UTF-8 detection for 4 byte characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes UTF-8 detection for 4-byte characters (the code from 2002 used by Notepad++ assumed that characters longer than 3 bytes are invalid -.-). This means such files will no longer be erroneously displayed as ANSI. Steps to reproduce: Create a new UTF-8 file (without BOM). Paste e.g. this character 🍪 and save. Reopen the file. Prior to this PR, the file is detected as ANSI (even if Notepad++ is configured to assume UTF-8 by default!!!). After this fix, the file is opened as UTF-8 correctly. Fixes #4730, Fixes #3986, Fixes #3441, Fixes #3405, Closes #4922 --- PowerEditor/src/Utf8_16.cpp | 41 ++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/PowerEditor/src/Utf8_16.cpp b/PowerEditor/src/Utf8_16.cpp index d04b00b86..75bc25cb1 100644 --- a/PowerEditor/src/Utf8_16.cpp +++ b/PowerEditor/src/Utf8_16.cpp @@ -58,44 +58,57 @@ u78 Utf8_16_Read::utf8_7bits_8bits() while (sx=endx-1) - break; - if ((*sx & 0xC0) != 0xC0 || (sx[1]&(0x80+0x40)) != 0x80) { + if (std::distance(sx, endx) < 2) { + rv=0; break; + } + if ( (sx[1]&(0x80+0x40)) != 0x80) { rv=0; break; } sx+=2; } - else if (*sx < (0x80 + 0x40 + 0x20 + 0x10)) - { // 1110qqqq 10xxxxvv 10nnnnnn If it begins with E, it is 16 bit + else if ((*sx & (0x80+0x40+0x20+0x10)) == (0x80+0x40+0x20)) + { // 1110qqqq 10xxxxvv 10nnnnnn, 16 bit character ASCII7only=0; - if (sx>=endx-2) - break; - if ((*sx & 0xE0) != 0xE0 || (sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) { + if (std::distance(sx, endx) < 3) { + rv=0; break; + } + if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) { rv=0; break; } sx+=3; } + else if ((*sx & (0x80+0x40+0x20+0x10+0x8)) == (0x80+0x40+0x20+0x10)) + { // 11110qqq 10xxxxvv 10nnnnnn 
10mmmmmm, 21 bit character + ASCII7only=0; + if (std::distance(sx, endx) < 4) { + rv=0; break; + } + if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80 || (sx[3]&(0x80+0x40)) != 0x80) { + rv=0; break; + } + sx+=4; + } else - { // more than 16 bits are not allowed here + { ASCII7only=0; rv=0; break;