notepad-plus-plus/boostregex/UTF8DocumentIterator.cxx

// This file is part of Notepad++ project
// Copyright (C) 2021 Notepad++ authors.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// at your option any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

#include "UTF8DocumentIterator.h"
#include <string_view>
#include <stdexcept>
#include <optional>

#include "ILoader.h"
#include "ILexer.h"
#include "Scintilla.h"
#include "ScintillaTypes.h"
#include "ScintillaMessages.h"
#include "Debugging.h"
#include "Geometry.h"
#include "Platform.h"

#include "CharacterCategoryMap.h"
#include "Position.h"
#include "SplitVector.h"
#include "Partitioning.h"
#include "RunStyles.h"
#include "CellBuffer.h"
#include "CharClassify.h"
#include "Decoration.h"
#include "CaseFolder.h"
#include "Document.h"

using namespace Scintilla::Internal;

// Creates an iterator over the bytes of `doc`, positioned at byte offset `pos`
// and bounded by byte offset `end`, then decodes the character found at the
// (possibly clamped) start position.
UTF8DocumentIterator::UTF8DocumentIterator(Document* doc, Sci::Position pos, Sci::Position end) :
	m_pos(pos),
	m_end(end),
	m_characterIndex(0),
	m_doc(doc)
{
	// A start position beyond the end is reported through the platform assert...
	PLATFORM_ASSERT(m_pos <= m_end);

	// ...and is additionally clamped to the end so the iterator never starts
	// outside the requested range.
	m_pos = (m_pos > m_end) ? m_end : m_pos;

	// Fills m_character, m_utf8Length and m_utf16Length for the start position.
	readCharacter();
}

// Copy constructor: duplicates the position, range end, decoded character
// state and document pointer of `copy`. The document is not re-read; the
// already decoded UTF-16 unit(s) are copied across.
UTF8DocumentIterator::UTF8DocumentIterator(const UTF8DocumentIterator& copy) :
	m_pos(copy.m_pos),
	m_end(copy.m_end),
	m_characterIndex(copy.m_characterIndex),
	m_utf8Length(copy.m_utf8Length),
	m_utf16Length(copy.m_utf16Length),
	m_doc(copy.m_doc)
{
	// The source iterator is expected to lie inside its own range.
	PLATFORM_ASSERT(m_pos <= m_end);

	// Both slots are copied unconditionally, whether or not the second one is
	// currently in use (m_utf16Length == 2).
	for (int unit = 0; unit < 2; ++unit)
	{
		m_character[unit] = copy.m_character[unit];
	}

	// Same clamp as the primary constructor.
	if (m_end < m_pos)
	{
		m_pos = m_end;
	}
}

// Pre-increment: advances by one UTF-16 code unit.
// A character decoded as a surrogate pair is exposed as two units, so the
// first increment only moves from the first unit to the second; the byte
// position moves on once the whole character has been consumed.
UTF8DocumentIterator& UTF8DocumentIterator::operator ++ ()
{
	PLATFORM_ASSERT(m_pos < m_end);

	const bool onFirstUnitOfPair = (m_utf16Length == 2) && (m_characterIndex == 0);
	if (onFirstUnitOfPair)
	{
		// Stay on the same bytes, expose the second unit of the pair.
		m_characterIndex = 1;
		return *this;
	}

	// Skip the bytes of the character just consumed, never moving past the end.
	m_pos += m_utf8Length;
	if (m_end < m_pos)
	{
		m_pos = m_end;
	}

	m_characterIndex = 0;
	readCharacter();
	return *this;
}

// Pre-decrement: steps back by one UTF-16 code unit.
// From the second unit of a surrogate pair this only selects the first unit.
// Otherwise the byte position moves back to the first byte of the previous
// character, which is decoded again; if that character is a surrogate pair the
// iterator lands on its second unit.
UTF8DocumentIterator& UTF8DocumentIterator::operator -- ()
{
	const bool onSecondUnitOfPair = (m_utf16Length == 2) && (m_characterIndex == 1);
	if (onSecondUnitOfPair)
	{
		m_characterIndex = 0;
		return *this;
	}

	--m_pos;

	// Walk backwards over UTF-8 continuation bytes (bit pattern 10xxxxxx),
	// stopping at the first other byte or at offset 0.
	for (;;)
	{
		const bool isContinuationByte = (0x80 == (m_doc->CharAt(m_pos) & 0xC0));
		if (!isContinuationByte || m_pos <= 0)
		{
			break;
		}
		--m_pos;
	}

	readCharacter();

	// Moving backwards enters a surrogate pair at its second unit.
	if (m_utf16Length == 2)
	{
		m_characterIndex = 1;
	}
	return *this;
}

// Decodes the UTF-8 sequence starting at m_pos into UTF-16.
//
// On return:
//   m_utf8Length   - number of document bytes that were consumed for the character
//   m_utf16Length  - 1 for a single UTF-16 unit, 2 for a surrogate pair
//   m_character[0] - the unit (or the high surrogate of a pair)
//   m_character[1] - the low surrogate (only written when m_utf16Length == 2)
//
// m_characterIndex is reset to 0 only on the single-byte (ASCII) path; on the
// multi-byte path it is left untouched.
void UTF8DocumentIterator::readCharacter()
{
	// Highest valid index into m_firstByteMask (7 entries, indices 0..6).
	// Without this cap, first bytes such as 0xBE, 0xBF, 0xFE and 0xFF drive the
	// length-detection loop below to 7 or 8 and index the table out of bounds.
	constexpr int maxSequenceBytes = 6;

	unsigned char currentChar = m_doc->CharAt(m_pos);
	if (currentChar & 0x80)
	{
		int mask = 0x40;
		int nBytes = 1;

		// Count the sequence length announced by the first byte by scanning its
		// bits from 0x20 downwards until a clear bit is found (or the cap is hit).
		do
		{
			mask >>= 1;
			++nBytes;
		} while ((currentChar & mask) && (nBytes < maxSequenceBytes));

		// Keep only the payload bits of the first byte.
		int result = currentChar & m_firstByteMask[nBytes];
		Sci::Position pos = m_pos;
		m_utf8Length = 1;
		// Work out the Unicode code point, and count the actual bytes.
		// If a byte does not start with 10xxxxxx then it's not part of
		// the code. Therefore invalid UTF-8 encodings are dealt with, simply by stopping when
		// the UTF-8 extension bytes are no longer valid.
		// NOTE(review): the bound is tested before pos is incremented, so the byte
		// at offset m_end itself can be read here - confirm this is intended.
		while ((--nBytes) && (pos < m_end) && (0x80 == ((currentChar = m_doc->CharAt(++pos)) & 0xC0)))
		{
			result = (result << 6) | (currentChar & 0x3F);
			++m_utf8Length;
		}

		if (result >= 0x10000)
		{
			// Outside the 16-bit range: split into a UTF-16 surrogate pair.
			result -= 0x10000;
			m_utf16Length = 2;
			m_character[0] = static_cast<wchar_t>(0xD800 + (result >> 10));
			m_character[1] = static_cast<wchar_t>(0xDC00 + (result & 0x3FF));
		}
		else
		{
			m_utf16Length = 1;
			m_character[0] = static_cast<wchar_t>(result);
		}
	}
	else
	{
		// Single byte with the top bit clear: the byte value is the character.
		m_utf8Length = 1;
		m_utf16Length = 1;
		m_characterIndex = 0;
		m_character[0] = static_cast<wchar_t>(currentChar);
	}
}


// Masks that keep the payload bits of the first byte of a UTF-8 sequence.
// readCharacter() indexes this table with the sequence length (in bytes) it
// derives from the first byte; that length is always at least 2 there, so
// entries [0] and [1] are not reached from readCharacter().
const unsigned char UTF8DocumentIterator::m_firstByteMask[7] =
{
	0x7F,	// [0]
	0x3F,	// [1]
	0x1F,	// [2] first byte 110xxxxx
	0x0F,	// [3] first byte 1110xxxx
	0x07,	// [4] first byte 11110xxx
	0x03,	// [5] first byte 111110xx
	0x01	// [6] first byte 1111110x
};
-												Fix the warnings issue during Scintilla's compiling

Remove std::iterator from Boost Regex search code:
std::iterator was deprecated in C++17.
Remove it to fix the warnings and avoid other issues.

Fix #10035, close #10036

											
										
										
											2021-06-19 09:04:27 +00:00
+								// This file is part of Notepad++ project
 								// Copyright (C) 2021 Notepad++ authors.
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
-												Fix the warnings issue during Scintilla's compiling

Remove std::iterator from Boost Regex search code:
std::iterator was deprecated in C++17.
Remove it to fix the warnings and avoid other issues.

Fix #10035, close #10036

											
										
										
											2021-06-19 09:04:27 +00:00
+								// This program is free software: you can redistribute it and/or modify
 								// it under the terms of the GNU General Public License as published by
 								// the Free Software Foundation, either version 3 of the License, or
 								// at your option any later version.
 								//
 								// This program is distributed in the hope that it will be useful,
 								// but WITHOUT ANY WARRANTY; without even the implied warranty of
 								// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 								// GNU General Public License for more details.
 								//
 								// You should have received a copy of the GNU General Public License
 								// along with this program.  If not, see <https://www.gnu.org/licenses/>.
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
-												Fix the warnings issue during Scintilla's compiling

Remove std::iterator from Boost Regex search code:
std::iterator was deprecated in C++17.
Remove it to fix the warnings and avoid other issues.

Fix #10035, close #10036

											
										
										
											2021-06-19 09:04:27 +00:00
+								#include "UTF8DocumentIterator.h"
-												Adapt for VS2019 builds with toolkit v142

Fix #9922, close #9942

											
										
										
											2020-09-06 07:24:24 +00:00
+								#include <string_view>
 								#include <stdexcept>
-												Update Scintilla from v4.4.6 to v5.2.1 and add Lexilla v5.1.5

Update with https://www.scintilla.org/scintilla521.zip
            https://www.scintilla.org/lexilla515.zip

- fix setting to bring Scintilla::PositionCR from ScintillaStructures.h inline with Sci_Position.h Sci_PositionCR
- add workaround to enable lexer for searchResult
commented out SCI_SETILEXER call on searchResult to get one result which is correctly handled by the lexer,
added comment about the current problem with property @MarkingsStruct which seems to disappear after call to SCI_SETILEXER or CreateLexer
- corrected usage of ObjC lexer
- removed unnecessary filter stuff
- use own sections for scintilla and lexilla build targets and allow parallel builds
- as libscilex is no longer existing, changed to libscintilla
- adapt makefiles and cmake
- use VS2019
- started simple changes for createlexer adaptations, nullpointercheck missing on return of lexer name from deprecated LexerNameFromID -> undefined behaviour
- movement from id -> lexer name, mostly done via LexerNameFromID + switching off corresponding compiler warning
- changed to SCI_SETILEXER from SCI_SETLEXER, SCI_SETLEXERLANGUAGE needs to be corrected, see Scintilla5Migration.html
- just commented out: SCI_LOADLEXERLIBRARY

Fix #10504, close #11419

											
										
										
											2022-01-04 23:07:50 +00:00
+								#include <optional>
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
 								#include "ILoader.h"
 								#include "ILexer.h"
 								#include "Scintilla.h"
-												Update Scintilla from v4.4.6 to v5.2.1 and add Lexilla v5.1.5

Update with https://www.scintilla.org/scintilla521.zip
            https://www.scintilla.org/lexilla515.zip

- fix setting to bring Scintilla::PositionCR from ScintillaStructures.h inline with Sci_Position.h Sci_PositionCR
- add workaround to enable lexer for searchResult
commented out SCI_SETILEXER call on searchResult to get one result which is correctly handled by the lexer,
added comment about the current problem with property @MarkingsStruct which seems to disappear after call to SCI_SETILEXER or CreateLexer
- corrected usage of ObjC lexer
- removed unnecessary filter stuff
- use own sections for scintilla and lexilla build targets and allow parallel builds
- as libscilex is no longer existing, changed to libscintilla
- adapt makefiles and cmake
- use VS2019
- started simple changes for createlexer adaptations, nullpointercheck missing on return of lexer name from deprecated LexerNameFromID -> undefined behaviour
- movement from id -> lexer name, mostly done via LexerNameFromID + switching off corresponding compiler warning
- changed to SCI_SETILEXER from SCI_SETLEXER, SCI_SETLEXERLANGUAGE needs to be corrected, see Scintilla5Migration.html
- just commented out: SCI_LOADLEXERLIBRARY

Fix #10504, close #11419

											
										
										
											2022-01-04 23:07:50 +00:00
+								#include "ScintillaTypes.h"
 								#include "ScintillaMessages.h"
 								#include "Debugging.h"
 								#include "Geometry.h"
-												Adapt for VS2019 builds with toolkit v142

Fix #9922, close #9942

											
										
										
											2020-09-06 07:24:24 +00:00
+								#include "Platform.h"
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
-												Update Scintilla from v4.4.6 to v5.2.1 and add Lexilla v5.1.5

Update with https://www.scintilla.org/scintilla521.zip
            https://www.scintilla.org/lexilla515.zip

- fix setting to bring Scintilla::PositionCR from ScintillaStructures.h inline with Sci_Position.h Sci_PositionCR
- add workaround to enable lexer for searchResult
commented out SCI_SETILEXER call on searchResult to get one result which is correctly handled by the lexer,
added comment about the current problem with property @MarkingsStruct which seems to disappear after call to SCI_SETILEXER or CreateLexer
- corrected usage of ObjC lexer
- removed unnecessary filter stuff
- use own sections for scintilla and lexilla build targets and allow parallel builds
- as libscilex is no longer existing, changed to libscintilla
- adapt makefiles and cmake
- use VS2019
- started simple changes for createlexer adaptations, nullpointercheck missing on return of lexer name from deprecated LexerNameFromID -> undefined behaviour
- movement from id -> lexer name, mostly done via LexerNameFromID + switching off corresponding compiler warning
- changed to SCI_SETILEXER from SCI_SETLEXER, SCI_SETLEXERLANGUAGE needs to be corrected, see Scintilla5Migration.html
- just commented out: SCI_LOADLEXERLIBRARY

Fix #10504, close #11419

											
										
										
											2022-01-04 23:07:50 +00:00
+								#include "CharacterCategoryMap.h"
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
+								#include "Position.h"
 								#include "SplitVector.h"
 								#include "Partitioning.h"
 								#include "RunStyles.h"
 								#include "CellBuffer.h"
 								#include "CharClassify.h"
 								#include "Decoration.h"
 								#include "CaseFolder.h"
 								#include "Document.h"
-												Update Scintilla from v4.4.6 to v5.2.1 and add Lexilla v5.1.5

Update with https://www.scintilla.org/scintilla521.zip
            https://www.scintilla.org/lexilla515.zip

- fix setting to bring Scintilla::PositionCR from ScintillaStructures.h inline with Sci_Position.h Sci_PositionCR
- add workaround to enable lexer for searchResult
commented out SCI_SETILEXER call on searchResult to get one result which is correctly handled by the lexer,
added comment about the current problem with property @MarkingsStruct which seems to disappear after call to SCI_SETILEXER or CreateLexer
- corrected usage of ObjC lexer
- removed unnecessary filter stuff
- use own sections for scintilla and lexilla build targets and allow parallel builds
- as libscilex is no longer existing, changed to libscintilla
- adapt makefiles and cmake
- use VS2019
- started simple changes for createlexer adaptations, nullpointercheck missing on return of lexer name from deprecated LexerNameFromID -> undefined behaviour
- movement from id -> lexer name, mostly done via LexerNameFromID + switching off corresponding compiler warning
- changed to SCI_SETILEXER from SCI_SETLEXER, SCI_SETLEXERLANGUAGE needs to be corrected, see Scintilla5Migration.html
- just commented out: SCI_LOADLEXERLIBRARY

Fix #10504, close #11419

											
										
										
											2022-01-04 23:07:50 +00:00
+								using namespace Scintilla::Internal;
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
+								UTF8DocumentIterator::UTF8DocumentIterator(Document* doc, Sci::Position pos, Sci::Position end) :
 												m_pos(pos),
 												m_end(end),
 												m_characterIndex(0),
 												m_doc(doc)
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
+								{
 										// Check for debug builds
 										PLATFORM_ASSERT(m_pos <= m_end);
 										// Ensure for release.
 										if (m_pos > m_end)
 										{
 												m_pos = m_end;
 										}
 										readCharacter();
 								}
 								UTF8DocumentIterator::UTF8DocumentIterator(const UTF8DocumentIterator& copy) :
 										m_pos(copy.m_pos),
 										m_end(copy.m_end),
 										m_characterIndex(copy.m_characterIndex),
 										m_utf8Length(copy.m_utf8Length),
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
+										m_utf16Length(copy.m_utf16Length),
 										m_doc(copy.m_doc)
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
+								{
 										// Check for debug builds
 										PLATFORM_ASSERT(m_pos <= m_end);
 										m_character[0] = copy.m_character[0];
 										m_character[1] = copy.m_character[1];
 										// Ensure for release.
 										if (m_pos > m_end)
 										{
 												m_pos = m_end;
 										}
 								}
-												Adapt for VS2019 builds with toolkit v142

Fix #9922, close #9942

											
										
										
											2020-09-06 07:24:24 +00:00
+								UTF8DocumentIterator& UTF8DocumentIterator::operator ++ ()
 								{
 									PLATFORM_ASSERT(m_pos < m_end);
 									if (m_utf16Length == 2 && m_characterIndex == 0)
 									{
 										m_characterIndex = 1;
 									}
 									else
 									{
 										m_pos += m_utf8Length;
 										if (m_pos > m_end)
 										{
 											m_pos = m_end;
 										}
 										m_characterIndex = 0;
 										readCharacter();
 									}
 									return *this;
 								}
-												Upgrade Scintilla from v3.56 to v4.14

											
										
										
											2019-05-04 18:14:48 +00:00
+								UTF8DocumentIterator& UTF8DocumentIterator::operator -- ()
 								{
 									if (m_utf16Length == 2 && m_characterIndex == 1)
 									{
 										m_characterIndex = 0;
 									}
 									else
 									{
 										--m_pos;
 										// Skip past the UTF-8 extension bytes
 										while (0x80 == (m_doc->CharAt(m_pos) & 0xC0) && m_pos > 0)
 											--m_pos;
 										readCharacter();
 										if (m_utf16Length == 2)
 										{
 											m_characterIndex = 1;
 										}
 									}
 									return *this;
 								}
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
 								void UTF8DocumentIterator::readCharacter()
 								{
 									unsigned char currentChar = m_doc->CharAt(m_pos);
 									if (currentChar & 0x80)
 									{
 										int mask = 0x40;
 										int nBytes = 1;
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
 										do
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
+										{
 											mask >>= 1;
 											++nBytes;
 										} while (currentChar & mask);
 										int result = currentChar & m_firstByteMask[nBytes];
-												Upgrade Scintilla - integrate boost's PCRE in 64 build

And remove compiling warning.

											
										
										
											2019-05-08 08:43:30 +00:00
+										Sci::Position pos = m_pos;
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
+										m_utf8Length = 1;
 										// work out the unicode point, and count the actual bytes.
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
+										// If a byte does not start with 10xxxxxx then it's not part of the
 										// the code. Therefore invalid UTF-8 encodings are dealt with, simply by stopping when
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
+										// the UTF8 extension bytes are no longer valid.
 										while ((--nBytes) && (pos < m_end) && (0x80 == ((currentChar = m_doc->CharAt(++pos)) & 0xC0)))
 										{
 											result = (result << 6) | (currentChar & 0x3F);
 											++m_utf8Length;
 										}
 										if (result >= 0x10000)
 										{
 											result -= 0x10000;
 											m_utf16Length = 2;
 											// UTF-16 Pair
-												Upgrade Scintilla - integrate boost's PCRE in 64 build

And remove compiling warning.

											
										
										
											2019-05-08 08:43:30 +00:00
+											m_character[0] = static_cast<wchar_t>(0xD800 + (result >> 10));
 											m_character[1] = static_cast<wchar_t>(0xDC00 + (result & 0x3FF));
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
+										}
 										else
 										{
 											m_utf16Length = 1;
 											m_character[0] = static_cast<wchar_t>(result);
 										}
 									}
 									else
 									{
 										m_utf8Length = 1;
 										m_utf16Length = 1;
 										m_characterIndex = 0;
 										m_character[0] = static_cast<wchar_t>(currentChar);
 									}
 								}
-												One button to compile them all

1. Build Notepad++ with Scintilla static lib (libscintilla.lib) and Boost (v1.76) RegExpr.
2. ARM64 build is available.

Fix #5158, close #9594

											
										
										
											2021-02-28 00:45:09 +00:00
-												[NEW_FEATURE] (Author: Dave Brotherstone) Add PCRE (Perl Compatible Regular Expressions) support.


git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@863 f5eea248-9336-0410-98b8-ebc06183d4e3

											
										
										
											2012-02-13 01:45:05 +00:00
+								const unsigned char UTF8DocumentIterator::m_firstByteMask[7] = { 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };