notepad-plus-plus/scintilla/boostregex/BoostRegExSearch.cxx

323 lines
8.3 KiB
C++

/**
* Copyright (c) since 2009 Simon Steele - http://untidy.net/
* Based on the work of Simon Steele for Programmer's Notepad 2 (http://untidy.net)
* Converted from boost::xpressive to boost::regex and performance improvements
* (principally caching the compiled regex), and support for UTF8 encoded text
* (c) 2012 Dave Brotherstone - Changes for boost::regex
*
*
*/
#include <stdlib.h>
#include <iterator>
#include "scintilla.h"
#include "Platform.h"
#include "SplitVector.h"
#include "Partitioning.h"
#include "RunStyles.h"
#include "CellBuffer.h"
#include "CharClassify.h"
#include "Decoration.h"
#include "ILexer.h"
#include "Document.h"
#include "UniConversion.h"
#include "UTF8DocumentIterator.h"
#include "AnsiDocumentIterator.h"
#include "BoostRegexSearch.h"
#include <boost/regex.hpp>
// UTF-8 code page identifier. CP_UTF8 is not referenced in the code visible
// here; SC_CP_UTF8 is what Document::CodePage() is compared against below.
#define CP_UTF8 65001
#define SC_CP_UTF8 65001
#ifdef SCI_NAMESPACE
using namespace Scintilla;
#endif
using namespace boost;
// Regex over narrow chars: compiled for documents that are not UTF-8.
typedef basic_regex<char> charregex_t;
// Regex over wide chars: compiled for UTF-8 documents (the pattern is widened first).
typedef boost::wregex wcharregex_t;
// , std::vector<boost::sub_match<DocumentIterator> >::allocator_type
// Match results over the document iterators used for each encoding.
typedef match_results<UTF8DocumentIterator> utf8match_t;
typedef match_results<AnsiDocumentIterator> ansimatch_t;
/**
 * Regular expression search backend built on boost::regex.
 *
 * Keeps one compiled regex and one set of match results per encoding
 * (narrow for non-UTF-8 documents, wide for UTF-8 documents) so that a
 * repeated search with the same pattern does not recompile, and so that
 * SubstituteByPosition() can format a replacement from the last match.
 */
class BoostRegexSearch : public RegexSearchBase
{
public:
    BoostRegexSearch() : substituted(NULL), lastCompileFlags(-1) {}

    virtual ~BoostRegexSearch()
    {
        // delete[] on a null pointer is a no-op, so no guard is needed.
        delete [] substituted;
        substituted = NULL;
    }

    /// Searches the document; see the definition for the return codes.
    virtual long FindText(Document* doc, int minPos, int maxPos, const char *s,
        bool caseSensitive, bool word, bool wordStart, int flags, int *length);

    /// Expands a replacement template against the last match. The returned
    /// buffer is owned by this object and is freed on the next call.
    virtual const char *SubstituteByPosition(Document* doc, const char *text, int *length);

private:
    // This class owns the raw 'substituted' buffer and frees it in the
    // destructor, so a member-wise copy would lead to a double delete.
    // Copying is disabled: declared private and intentionally not defined.
    BoostRegexSearch(const BoostRegexSearch &);
    BoostRegexSearch &operator=(const BoostRegexSearch &);

    // Both helpers return a new[]-allocated, zero-terminated buffer that the
    // caller must release with delete[].
    wchar_t *utf8ToWchar(const char *utf8);
    char *wcharToUtf8(const wchar_t *w);

    charregex_t m_charre;               // compiled regex for non-UTF-8 documents
    wcharregex_t m_wcharre;             // compiled regex for UTF-8 documents
    utf8match_t m_utf8match;            // last match results, UTF-8 documents
    ansimatch_t m_ansimatch;            // last match results, non-UTF-8 documents
    char *substituted;                  // result buffer of SubstituteByPosition (owned)
    std::string m_lastRegexString;      // pattern last compiled into m_charre
    std::string m_lastRegexUtf8string;  // pattern last compiled into m_wcharre
    int lastCompileFlags;               // flags of the last compile; -1 before any compile
};
#ifdef SCI_NAMESPACE
namespace Scintilla
{
#endif
/**
 * Factory for the regex search backend: returns a newly allocated
 * BoostRegexSearch. The character classification table is not used.
 */
RegexSearchBase *CreateRegexSearch(CharClassify* /* charClassTable */)
{
    RegexSearchBase *engine = new BoostRegexSearch();
    return engine;
}
#ifdef SCI_NAMESPACE
}
#endif
/**
 * Find text in document, supporting both forward and backward
 * searches (just pass minPos > maxPos to do a backward search).
 *
 * @param doc           document to search
 * @param minPos        position the search starts from
 * @param maxPos        position the search runs towards
 * @param s             zero-terminated regular expression (ECMAScript syntax)
 * @param caseSensitive when false the expression is compiled with icase
 * @param searchFlags   only SCFIND_REGEXP_DOTMATCHESNL is examined here
 * @param length        receives the length of the match; written only when
 *                      a match is found
 * @return position of the match, -1 when nothing matched, -2 when the
 *         expression could not be compiled
 *
 * On success the match results member for the document's encoding describes
 * the match that is returned, so SubstituteByPosition() can be called next.
 */
long BoostRegexSearch::FindText(Document* doc, int minPos, int maxPos, const char *s,
    bool caseSensitive, bool /*word*/, bool /*wordStart*/, int searchFlags, int *length)
{
    const bool backward = (minPos > maxPos);
    int startPos = backward ? maxPos : minPos;
    int endPos = backward ? minPos : maxPos;

    // Range endpoints should not be inside DBCS characters, but just in case, move them.
    startPos = doc->MovePositionOutsideChar(startPos, 1, false);
    endPos = doc->MovePositionOutsideChar(endPos, 1, false);

    int compileFlags(regex_constants::ECMAScript);
    if (!caseSensitive)
    {
        compileFlags |= regex_constants::icase;
    }
    const regex_constants::syntax_option_type syntax =
        static_cast<regex_constants::syntax_option_type>(compileFlags);

    const bool isUtf8 = (doc->CodePage() == SC_CP_UTF8);
    try
    {
        // Recompile only when the flags or the pattern for this encoding changed.
        const bool flagsChanged = (compileFlags != lastCompileFlags);
        const std::string &cachedPattern = isUtf8 ? m_lastRegexUtf8string : m_lastRegexString;
        if (flagsChanged || cachedPattern != s)
        {
            if (isUtf8)
            {
                // Move the converted pattern into a std::wstring straight away so
                // the raw buffer is released even if the regex constructor throws.
                wchar_t *rawPattern = utf8ToWchar(s);
                const std::wstring widePattern(rawPattern);
                delete [] rawPattern;
                m_wcharre = wcharregex_t(widePattern.c_str(), syntax);
                m_lastRegexUtf8string = s;
                if (flagsChanged)
                {
                    // lastCompileFlags is shared by both encodings. The narrow regex
                    // was built with other flags (or never built), so make sure its
                    // cache entry cannot match: a string holding a single NUL never
                    // compares equal to a C string.
                    m_lastRegexString.assign(1, '\0');
                }
            }
            else
            { // Ansi
                m_charre = charregex_t(s, syntax);
                m_lastRegexString = s;
                if (flagsChanged)
                {
                    // Same reasoning as above, for the wide regex.
                    m_lastRegexUtf8string.assign(1, '\0');
                }
            }
            lastCompileFlags = compileFlags;
        }
    }
    catch (const regex_error& /*ex*/)
    {
        // -1 is normally used for not found, -2 is used here for invalid regex
        return -2;
    }

    // Work out the range of lines we're searching across, moving beyond an empty end-of-line
    int lineRangeStart = doc->LineFromPosition(startPos);
    const int lineRangeEnd = doc->LineFromPosition(endPos);
    if (!backward &&
        (startPos >= doc->LineEnd(lineRangeStart)) &&
        (lineRangeStart < lineRangeEnd))
    {
        // the start position is at end of line or between line end characters.
        lineRangeStart++;
        startPos = doc->LineStart(lineRangeStart);
    }

    // Work out the match flags:
    regex_constants::match_flag_type flags(regex_constants::match_default);
    if (startPos != doc->LineStart(lineRangeStart))
    {
        flags |= regex_constants::match_not_bol;
    }
    if (endPos != doc->LineEnd(lineRangeEnd))
    {
        flags |= regex_constants::match_not_eol;
    }
    if (0 == (searchFlags & SCFIND_REGEXP_DOTMATCHESNL))
    {
        flags |= regex_constants::match_not_dot_newline;
    }

    int pos(-1);
    int lenRet(0);
    if (isUtf8)
    {
        UTF8DocumentIterator end(doc, endPos, endPos);
        bool success = boost::regex_search(UTF8DocumentIterator(doc, startPos, endPos), end, m_utf8match, m_wcharre, flags);
        if (success)
        {
            pos = m_utf8match[0].first.pos();
            // Use iterator positions rather than length() to get the extent
            // in document positions.
            lenRet = m_utf8match[0].second.pos() - pos;
            if (backward)
            {
                // Walk forward through the range to find the last match.
                // Every probe overwrites m_utf8match, and the probe that ends
                // the loop has failed or been rejected, so keep a copy of the
                // match that is actually reported and restore it afterwards.
                utf8match_t reported(m_utf8match);
                int repetitions = 100; // Break out of infinite loop
                int previousPos = pos;
                while (success && ((pos + lenRet) <= endPos))
                {
                    if (previousPos >= pos && 0 >= (--repetitions))
                        break;
                    previousPos = pos;
                    success = boost::regex_search(UTF8DocumentIterator(doc, pos + 1, endPos), end, m_utf8match, m_wcharre, flags);
                    if (success)
                    {
                        if ((pos + lenRet) <= minPos)
                        {
                            pos = m_utf8match[0].first.pos();
                            lenRet = m_utf8match[0].second.pos() - pos;
                            reported = m_utf8match;
                        }
                        else
                        {
                            success = false;
                        }
                    }
                }
                m_utf8match = reported;
            }
            *length = lenRet;
        }
    }
    else
    {
        AnsiDocumentIterator end(doc, endPos, endPos);
        bool success = boost::regex_search(AnsiDocumentIterator(doc, startPos, endPos), end, m_ansimatch, m_charre, flags);
        if (success)
        {
            pos = m_ansimatch[0].first.pos();
            lenRet = m_ansimatch.length();
            if (backward)
            {
                // Same scheme as the UTF-8 branch: remember the reported match
                // so that later probes cannot leave stale results behind.
                ansimatch_t reported(m_ansimatch);
                int repetitions = 100; // Break out of infinite loop
                int previousPos = pos;
                while (success && ((pos + lenRet) <= endPos))
                {
                    if (previousPos >= pos && 0 >= (--repetitions))
                        break;
                    previousPos = pos;
                    success = boost::regex_search(AnsiDocumentIterator(doc, pos + 1, endPos), end, m_ansimatch, m_charre, flags);
                    if (success)
                    {
                        if ((pos + lenRet) <= minPos)
                        {
                            pos = m_ansimatch[0].first.pos();
                            lenRet = m_ansimatch[0].length();
                            reported = m_ansimatch;
                        }
                        else
                        {
                            success = false;
                        }
                    }
                }
                m_ansimatch = reported;
            }
            *length = lenRet;
        }
    }
    return pos;
}
/**
 * Expand a replacement template against the most recent match.
 *
 * @param doc    document the match belongs to; its code page selects which
 *               stored match results are used
 * @param text   zero-terminated replacement template, expanded with
 *               boost::format_all
 * @param length receives the length of the returned text, excluding the
 *               terminating zero
 * @return zero-terminated replacement text. The buffer is owned by this
 *         object and stays valid until the next call or until destruction.
 */
const char *BoostRegexSearch::SubstituteByPosition(Document* doc, const char *text, int *length) {
    delete [] substituted;
    substituted = NULL;
    if (doc->CodePage() == SC_CP_UTF8)
    {
        // Move the converted template into a std::wstring straight away so the
        // raw buffer is released even if format() throws.
        wchar_t *rawFormat = utf8ToWchar(text);
        const std::wstring wideFormat(rawFormat);
        delete [] rawFormat;
        const std::wstring replaced = m_utf8match.format(wideFormat.c_str(), boost::format_all);
        substituted = wcharToUtf8(replaced.c_str());
        *length = static_cast<int>(strlen(substituted));
    }
    else
    {
        const std::string replaced = m_ansimatch.format(text, boost::format_all);
        const std::string::size_type byteCount = replaced.size();
        char *buffer = new char[byteCount + 1];
        // Copy by size rather than with strcpy: the reported length is
        // replaced.size(), so every one of those bytes must be initialised
        // even when the text contains an embedded NUL.
        replaced.copy(buffer, byteCount);
        buffer[byteCount] = '\0';
        substituted = buffer;
        *length = static_cast<int>(byteCount);
    }
    return substituted;
}
/**
 * Convert a zero-terminated UTF-8 string to a newly allocated,
 * zero-terminated wide string using UTF16Length / UTF16FromUTF8.
 * The caller owns the result and must release it with delete[].
 */
wchar_t *BoostRegexSearch::utf8ToWchar(const char *utf8)
{
    const int byteCount = strlen(utf8);
    const int unitCount = UTF16Length(utf8, byteCount);
    // One extra element for the terminator.
    const int bufferUnits = unitCount + 1;
    wchar_t *wide = new wchar_t[bufferUnits];
    UTF16FromUTF8(utf8, byteCount, wide, bufferUnits);
    wide[unitCount] = 0;
    return wide;
}
/**
 * Convert a zero-terminated wide string to a newly allocated,
 * zero-terminated UTF-8 string using UTF8Length / UTF8FromUTF16.
 * The caller owns the result and must release it with delete[].
 */
char* BoostRegexSearch::wcharToUtf8(const wchar_t *w)
{
    const int unitCount = wcslen(w);
    const int byteCount = UTF8Length(w, unitCount);
    // One extra byte for the terminator.
    char *narrow = new char[byteCount + 1];
    UTF8FromUTF16(w, unitCount, narrow, byteCount);
    narrow[byteCount] = 0;
    return narrow;
}