notepad-plus-plus/scintilla/scripts/GenerateCaseConvert.py

#!/usr/bin/env python3
# Script to generate CaseConvert.cxx from Python's Unicode data
# Should be run rarely when a Python with a new version of Unicode data is available.
# Requires Python 3.3 or later
# Should not be run with old versions of Python.

# Current best approach divides case conversions into two cases:
# simple symmetric and complex.
# Simple symmetric is where a lower and upper case pair convert to each
# other and the folded form is the same as the lower case.
# There are 1006 symmetric pairs.
# These are further divided into ranges (stored as lower, upper, range length,
# range pitch) and singletons (stored as lower, upper).
# Complex is for cases that don't fit the above: where there are multiple
# characters in one of the forms or fold is different to lower or
# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8
# strings with original, folded, upper, and lower separated by '|'.
# There are 126 complex cases.

import itertools, string, sys

from FileGenerator import Regenerate

def contiguousRanges(ll, diff):
    """Split ll into runs whose leading values step by exactly diff.

    ll is a sequence of indexable items (lists or tuples); only element [0]
    of each item is examined. A new run is started whenever an item's first
    element is not the previous item's first element plus diff.

    Returns a list of runs, each run being a list of the original items in
    their original order. An empty ll produces an empty list.
    """
    if not ll:
        # Nothing to group; avoids indexing ll[0] on an empty sequence
        return []
    runs = [[ll[0]]]
    for item in ll[1:]:
        previous = runs[-1][-1]
        if item[0] != previous[0] + diff:
            # Step broken: open a new run for this item
            runs.append([])
        runs[-1].append(item)
    return runs

def flatten(listOfLists):
    """Return an iterator over the items of each inner iterable in turn.

    Only one level of nesting is removed.
    """
    chainAll = itertools.chain.from_iterable
    return chainAll(listOfLists)

def conversionSets(codePoints=None):
    """Classify code points by how their case conversions behave.

    codePoints is an optional iterable of integer code points to examine.
    When omitted, every code point from 0 to sys.maxunicode is scanned.
    Code points in the surrogate range 0xD800-0xDFFF are always skipped.

    Returns a pair of lists (symmetrics, complexes):
    symmetrics holds (lower, upper, lower - upper) integer tuples, one per
    simple symmetric pair, added when the lower case member is examined.
    complexes holds (original, folded, upper, lower) string tuples for
    characters that have some conversion but are not simple symmetric; any
    form equal to the original is stored as "".
    """
    if codePoints is None:
        codePoints = range(sys.maxunicode + 1)

    complexes = []
    symmetrics = []
    for ch in codePoints:
        if 0xD800 <= ch <= 0xDFFF:
            # Skip both the high and low surrogate halves
            continue
        uch = chr(ch)

        fold = uch.casefold()
        upper = uch.upper()
        lower = uch.lower()

        symmetric = False
        if uch != upper and len(upper) == 1 and uch == lower and uch == fold:
            # uch is a lower case character with a one-character upper form;
            # it is symmetric only if that upper form lowers and folds back
            if upper.lower() == upper.casefold() == uch:
                symmetric = True
                symmetrics.append((ch, ord(upper), ch - ord(upper)))
        if uch != lower and len(lower) == 1 and uch == upper and lower == fold:
            # uch is the upper case member of a pair. Nothing is appended
            # here; only the lower case branch above adds an entry.
            if lower.upper() == uch:
                symmetric = True

        # Forms identical to the original carry no information
        if fold == uch:
            fold = ""
        if upper == uch:
            upper = ""
        if lower == uch:
            lower = ""

        if (fold or upper or lower) and not symmetric:
            complexes.append((uch, fold, upper, lower))

    return symmetrics, complexes

def groupRanges(symmetrics):
    """Compress runs of symmetric pairs into ranges.

    symmetrics is a list of (lower, upper, lower - upper) tuples.

    Returns a pair (rangeGroups, nonRanges):
    rangeGroups is a list of (lower, upper, length, pitch) tuples sorted by
    lower. Pitch 1 ranges come from adjacent entries that share the same
    distance and whose lower values step by 1. Pitch 2 ranges come from
    entries with distance 1 whose lower values step by 2. Only runs of more
    than 4 entries become ranges.
    nonRanges is a list of (lower, upper) tuples for every pair whose lower
    value is not covered by any range.
    """

    def distance(s):
        return s[2]

    # groupby only merges adjacent entries, so each group is a run of
    # neighbouring pairs with the same lower - upper distance
    groups = [list(g) for _key, g in itertools.groupby(symmetrics, distance)]

    contiguousGroups = flatten(contiguousRanges(g, 1) for g in groups)
    longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4]

    oneDiffs = [s for s in symmetrics if s[2] == 1]
    # Only call contiguousRanges when there is at least one distance-1 pair
    contiguousOnes = contiguousRanges(oneDiffs, 2) if oneDiffs else []
    longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4]

    rangeGroups = sorted(longGroups + longOneGroups, key=lambda s: s[0])

    # A set keeps the membership test below constant time per pair
    rangeCoverage = set(flatten(range(r[0], r[0] + r[2] * r[3], r[3]) for r in rangeGroups))

    nonRanges = [(x, u) for x, u, _d in symmetrics if x not in rangeCoverage]

    return rangeGroups, nonRanges

def escape(s):
    """Return s as text suitable for the inside of a C/C++ string literal.

    s is encoded as UTF-8. ASCII letters are emitted as themselves and every
    other byte as a \\x hex escape. A letter that is also a hex digit
    (a-f, A-F) is hex-escaped as well when it directly follows a hex escape,
    because a C/C++ hex escape absorbs every hex digit that follows it.
    """
    pieces = []
    afterHexEscape = False
    for c in s.encode('utf-8'):
        ch = chr(c)
        literal = ch in string.ascii_letters
        if literal and afterHexEscape and ch in string.hexdigits:
            # Would be read as part of the preceding \x escape
            literal = False
        pieces.append(ch if literal else "\\x%x" % c)
        afterHexEscape = not literal
    return "".join(pieces)

def updateCaseConvert():
    """Build the case conversion tables and pass them to Regenerate.

    Computes the symmetric and complex conversion data, formats ranges,
    singletons and complex cases as text lines, prints a count for each
    category, then calls Regenerate for ../src/CaseConvert.cxx.
    """
    symmetricPairs, complexCases = conversionSets()
    ranges, singletons = groupRanges(symmetricPairs)

    # lower, upper, range length, range pitch
    print(len(ranges), "ranges")
    rangeLines = [
        "%d,%d,%d,%d," % (lower, upper, length, pitch)
        for lower, upper, length, pitch in ranges
    ]

    # lower, upper
    print(len(singletons), "non ranges")
    nonRangeLines = ["%d,%d," % (lower, upper) for lower, upper in singletons]

    print(len(symmetricPairs), "symmetric")

    # original|folded|upper|lower| with each field escaped
    complexLines = [
        '"%s|%s|%s|%s|"' % (escape(original), escape(folded), escape(upper), escape(lower))
        for original, folded, upper, lower in complexCases
    ]
    print(len(complexLines), "complex")

    Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines)

# Run the regeneration when this file is executed. There is no __main__
# guard, so importing this module triggers it as well.
updateCaseConvert()
Upgrade Scintilla from v4.2.0 to v4.4.6 Close #8900, close #9550 4 years ago			`#!/usr/bin/env python3`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# Script to generate CaseConvert.cxx from Python's Unicode data`
			`# Should be run rarely when a Python with a new version of Unicode data is available.`
[UPGRADE] Upgrade Scintilla from v3.34 to v3.56. 10 years ago			`# Requires Python 3.3 or later`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# Should not be run with old versions of Python.`

Upgrade Scintilla from v3.56 to v4.14 6 years ago			`# Current best approach divides case conversions into two cases:`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# simple symmetric and complex.`
			`# Simple symmetric is where a lower and upper case pair convert to each`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`# other and the folded form is the same as the lower case.`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# There are 1006 symmetric pairs.`
			`# These are further divided into ranges (stored as lower, upper, range length,`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`# range pitch and singletons (stored as lower, upper).`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# Complex is for cases that don't fit the above: where there are multiple`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`# characters in one of the forms or fold is different to lower or`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# lower(upper(x)) or upper(lower(x)) are not x. These are represented as UTF-8`
			`# strings with original, folded, upper, and lower separated by '\|'.`
			`# There are 126 complex cases.`

Update Scintilla from v4.4.6 to v5.2.1 and add Lexilla v5.1.5 Update with https://www.scintilla.org/scintilla521.zip https://www.scintilla.org/lexilla515.zip - fix setting to bring Scintilla::PositionCR from ScintillaStructures.h inline with Sci_Position.h Sci_PositionCR - add workaround to enable lexer for searchResult commented out SCI_SETILEXER call on searchResult to get one result which is correctly handled by the lexer, added comment about the current problem with property @MarkingsStruct which seems to disappear after call to SCI_SETILEXER or CreateLexer - corrected usage of ObjC lexer - removed unnecessary filter stuff - use own sections for scintilla and lexilla build targets and allow parallel builds - as libscilex is no longer existing, changed to libscintilla - adapt makefiles and cmake - use VS2019 - started simple changes for createlexer adaptations, nullpointercheck missing on return of lexer name from deprecated LexerNameFromID -> undefined behaviour - movement from id -> lexer name, mostly done via LexerNameFromID + switching off corresponding compiler warning - changed to SCI_SETILEXER from SCI_SETLEXER, SCI_SETLEXERLANGUAGE needs to be corrected, see Scintilla5Migration.html - just commented out: SCI_LOADLEXERLIBRARY Fix #10504, close #11419 3 years ago			`import itertools, string, sys`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago
			`from FileGenerator import Regenerate`

Update: Scintilla 5.3.5 Lexilla 5.2.5 update to Scinitlla Release 5.3.5 (https://www.scintilla.org/scintilla535.zip) Released 31 May 2023. On Win32, implement IME context sensitivity with IMR_DOCUMENTFEED. Feature #1310. On Win32 remove dependence on MSIMG32.DLL by replacing AlphaBlend by GdiAlphaBlend. Bug #1923. On Qt, stop movement of IME candidate box. On Qt, report correct caret position within paragraph for IME retrieve surrounding text. On Qt for Cocoa, fix crash in entry of multi-character strings with IME. and Lexilla Release 5.2.5 (https://www.scintilla.org/lexilla525.zip) Released 31 May 2023. Add CharacterSetArray constructor without setBase initial argument for common case where this is setNone and the initialSet argument completely defines the characters. This shortens and clarifies use of CharacterSetArray. Bash: implement highlighting inside quoted elements and here-docs. Controlled with properties lexer.bash.styling.inside.string, lexer.bash.styling.inside.backticks, lexer.bash.styling.inside.parameter, and lexer.bash.styling.inside.heredoc. Issue #154, Issue #153, Feature #1033. Bash: add property lexer.bash.command.substitution to choose how to style command substitutions. 0 → SCE_SH_BACKTICKS; 1 → surrounding "$(" and ")" as operators and contents styled as bash code; 2 → use distinct styles (base style + 64) for contents. Choice (2) is a provisional feature and details may change before it is finalized. Issue #153. Bash: fix nesting of parameters (SCE_SH_PARAM) like ${var/$sub/"${rep}}"}. Issue #154. Bash: fix single character special parameters like $? by limiting style. Issue #154. Bash: treat "$$" as special parameter and end scalars before "$". Issue #154. Bash: treat "<<" in arithmetic contexts as left bitwise shift operator instead of here-doc. Issue #137. Batch: style SCE_BAT_AFTER_LABEL used for rest of line after label which is not executed. Issue #148. F#: Lex interpolated verbatim strings as verbatim. Issue #156. 
VB: allow multiline strings when lexer.vb.strings.multiline set. Issue #151. Close #13729 2 years ago			`def contiguousRanges(ll, diff):`
			`# ll is a list of lists`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`# group into lists where first element of each element differs by diff`
Update: Scintilla 5.3.5 Lexilla 5.2.5 update to Scinitlla Release 5.3.5 (https://www.scintilla.org/scintilla535.zip) Released 31 May 2023. On Win32, implement IME context sensitivity with IMR_DOCUMENTFEED. Feature #1310. On Win32 remove dependence on MSIMG32.DLL by replacing AlphaBlend by GdiAlphaBlend. Bug #1923. On Qt, stop movement of IME candidate box. On Qt, report correct caret position within paragraph for IME retrieve surrounding text. On Qt for Cocoa, fix crash in entry of multi-character strings with IME. and Lexilla Release 5.2.5 (https://www.scintilla.org/lexilla525.zip) Released 31 May 2023. Add CharacterSetArray constructor without setBase initial argument for common case where this is setNone and the initialSet argument completely defines the characters. This shortens and clarifies use of CharacterSetArray. Bash: implement highlighting inside quoted elements and here-docs. Controlled with properties lexer.bash.styling.inside.string, lexer.bash.styling.inside.backticks, lexer.bash.styling.inside.parameter, and lexer.bash.styling.inside.heredoc. Issue #154, Issue #153, Feature #1033. Bash: add property lexer.bash.command.substitution to choose how to style command substitutions. 0 → SCE_SH_BACKTICKS; 1 → surrounding "$(" and ")" as operators and contents styled as bash code; 2 → use distinct styles (base style + 64) for contents. Choice (2) is a provisional feature and details may change before it is finalized. Issue #153. Bash: fix nesting of parameters (SCE_SH_PARAM) like ${var/$sub/"${rep}}"}. Issue #154. Bash: fix single character special parameters like $? by limiting style. Issue #154. Bash: treat "$$" as special parameter and end scalars before "$". Issue #154. Bash: treat "<<" in arithmetic contexts as left bitwise shift operator instead of here-doc. Issue #137. Batch: style SCE_BAT_AFTER_LABEL used for rest of line after label which is not executed. Issue #148. F#: Lex interpolated verbatim strings as verbatim. Issue #156. 
VB: allow multiline strings when lexer.vb.strings.multiline set. Issue #151. Close #13729 2 years ago			`out = [[ll[0]]]`
			`for s in ll[1:]:`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`if s[0] != out[-1][-1][0] + diff:`
			`out.append([])`
			`out[-1].append(s)`
			`return out`

			`def flatten(listOfLists):`
			`"Flatten one level of nesting"`
			`return itertools.chain.from_iterable(listOfLists)`
Upgrade Scintilla from v3.56 to v4.14 6 years ago
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`def conversionSets():`
			`# For all Unicode characters, see whether they have case conversions`
			`# Return 2 sets: one of simple symmetric conversion cases and another`
			`# with complex cases.`
			`complexes = []`
			`symmetrics = []`
Upgrade Scintilla from v4.2.0 to v4.4.6 Close #8900, close #9550 4 years ago			`for ch in range(sys.maxunicode + 1):`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`if ch >= 0xd800 and ch <= 0xDBFF:`
			`continue`
			`if ch >= 0xdc00 and ch <= 0xDFFF:`
			`continue`
			`uch = chr(ch)`

			`fold = uch.casefold()`
			`upper = uch.upper()`
			`lower = uch.lower()`
			`symmetric = False`
			`if uch != upper and len(upper) == 1 and uch == lower and uch == fold:`
			`lowerUpper = upper.lower()`
			`foldUpper = upper.casefold()`
			`if lowerUpper == foldUpper and lowerUpper == uch:`
			`symmetric = True`
			`symmetrics.append((ch, ord(upper), ch - ord(upper)))`
			`if uch != lower and len(lower) == 1 and uch == upper and lower == fold:`
			`upperLower = lower.upper()`
			`if upperLower == uch:`
			`symmetric = True`

			`if fold == uch:`
			`fold = ""`
			`if upper == uch:`
			`upper = ""`
			`if lower == uch:`
			`lower = ""`

			`if (fold or upper or lower) and not symmetric:`
			`complexes.append((uch, fold, upper, lower))`

			`return symmetrics, complexes`

			`def groupRanges(symmetrics):`
			`# Group the symmetrics into groups where possible, returning a list`
			`# of ranges and a list of symmetrics that didn't fit into a range`

			`def distance(s):`
			`return s[2]`

			`groups = []`
			`uniquekeys = []`
			`for k, g in itertools.groupby(symmetrics, distance):`
			`groups.append(list(g)) # Store group iterator as a list`
			`uniquekeys.append(k)`

			`contiguousGroups = flatten([contiguousRanges(g, 1) for g in groups])`
			`longGroups = [(x[0][0], x[0][1], len(x), 1) for x in contiguousGroups if len(x) > 4]`
Upgrade Scintilla from v3.56 to v4.14 6 years ago
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`oneDiffs = [s for s in symmetrics if s[2] == 1]`
			`contiguousOnes = flatten([contiguousRanges(g, 2) for g in [oneDiffs]])`
			`longOneGroups = [(x[0][0], x[0][1], len(x), 2) for x in contiguousOnes if len(x) > 4]`

			`rangeGroups = sorted(longGroups+longOneGroups, key=lambda s: s[0])`

			`rangeCoverage = list(flatten([range(r[0], r[0]+r[2]*r[3], r[3]) for r in rangeGroups]))`
Upgrade Scintilla from v3.56 to v4.14 6 years ago
Update: Scintilla 5.3.5 Lexilla 5.2.5 update to Scinitlla Release 5.3.5 (https://www.scintilla.org/scintilla535.zip) Released 31 May 2023. On Win32, implement IME context sensitivity with IMR_DOCUMENTFEED. Feature #1310. On Win32 remove dependence on MSIMG32.DLL by replacing AlphaBlend by GdiAlphaBlend. Bug #1923. On Qt, stop movement of IME candidate box. On Qt, report correct caret position within paragraph for IME retrieve surrounding text. On Qt for Cocoa, fix crash in entry of multi-character strings with IME. and Lexilla Release 5.2.5 (https://www.scintilla.org/lexilla525.zip) Released 31 May 2023. Add CharacterSetArray constructor without setBase initial argument for common case where this is setNone and the initialSet argument completely defines the characters. This shortens and clarifies use of CharacterSetArray. Bash: implement highlighting inside quoted elements and here-docs. Controlled with properties lexer.bash.styling.inside.string, lexer.bash.styling.inside.backticks, lexer.bash.styling.inside.parameter, and lexer.bash.styling.inside.heredoc. Issue #154, Issue #153, Feature #1033. Bash: add property lexer.bash.command.substitution to choose how to style command substitutions. 0 → SCE_SH_BACKTICKS; 1 → surrounding "$(" and ")" as operators and contents styled as bash code; 2 → use distinct styles (base style + 64) for contents. Choice (2) is a provisional feature and details may change before it is finalized. Issue #153. Bash: fix nesting of parameters (SCE_SH_PARAM) like ${var/$sub/"${rep}}"}. Issue #154. Bash: fix single character special parameters like $? by limiting style. Issue #154. Bash: treat "$$" as special parameter and end scalars before "$". Issue #154. Bash: treat "<<" in arithmetic contexts as left bitwise shift operator instead of here-doc. Issue #137. Batch: style SCE_BAT_AFTER_LABEL used for rest of line after label which is not executed. Issue #148. F#: Lex interpolated verbatim strings as verbatim. Issue #156. 
VB: allow multiline strings when lexer.vb.strings.multiline set. Issue #151. Close #13729 2 years ago			`nonRanges = [(x, u) for x, u, _d in symmetrics if x not in rangeCoverage]`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago
			`return rangeGroups, nonRanges`

[UPGRADE] Upgrade Scintilla from v3.34 to v3.56. 10 years ago			`def escape(s):`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`return "".join((chr(c) if chr(c) in string.ascii_letters else "\\x%x" % c) for c in s.encode('utf-8'))`
[UPGRADE] Upgrade Scintilla from v3.34 to v3.56. 10 years ago
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`def updateCaseConvert():`
			`symmetrics, complexes = conversionSets()`
Upgrade Scintilla from v3.56 to v4.14 6 years ago
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`rangeGroups, nonRanges = groupRanges(symmetrics)`

			`print(len(rangeGroups), "ranges")`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`rangeLines = ["%d,%d,%d,%d," % x for x in rangeGroups]`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago
			`print(len(nonRanges), "non ranges")`
Upgrade Scintilla from v3.56 to v4.14 6 years ago			`nonRangeLines = ["%d,%d," % x for x in nonRanges]`

[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`print(len(symmetrics), "symmetric")`
Upgrade Scintilla from v3.56 to v4.14 6 years ago
[UPGRADE] Upgrade Scintilla from v3.34 to v3.56. 10 years ago			`complexLines = ['"%s\|%s\|%s\|%s\|"' % tuple(escape(t) for t in x) for x in complexes]`
[UPDATE] Update Scintilla to 3.3.4 [BUG_FIXED] (Author: Dave Brotherstone) Fix scintilla crash bug while closing a document. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@1104 f5eea248-9336-0410-98b8-ebc06183d4e3 11 years ago			`print(len(complexLines), "complex")`

			`Regenerate("../src/CaseConvert.cxx", "//", rangeLines, nonRangeLines, complexLines)`

			`updateCaseConvert()`