// notepad-plus-plus/lexilla/lexers/LexMarkdown.cxx
//
// Web viewer listing details (not part of the original source):
// 487 lines, 18 KiB, C++ (Raw / Normal View / History)

/******************************************************************
* LexMarkdown.cxx
*
* A simple Markdown lexer for scintilla.
*
* Includes highlighting for some extra features from the
* Pandoc implementation; strikeout, using '#.' as a default
* ordered list item marker, and delimited code blocks.
*
* Limitations:
*
* Standard indented code blocks are not highlighted at all,
* as it would conflict with other indentation schemes. Use
* delimited code blocks for blanket highlighting of an
* entire code block. Embedded HTML is not highlighted either.
* Blanket HTML highlighting has issues, because some Markdown
* implementations allow Markdown markup inside of the HTML. Also,
* there is a following blank line issue that can't be ignored,
* explained in the next paragraph. Embedded HTML and code
* blocks would be better supported with language specific
* highlighting.
*
* The highlighting aims to accurately reflect correct syntax,
* but a few restrictions are relaxed. Delimited code blocks are
* highlighted, even if the line following the code block is not blank.
* Requiring a blank line after a block breaks the highlighting
* in certain cases because of the way Scintilla ends up calling
* the lexer.
*
* Written by Jon Strait - jstrait@moonloop.net
*
* The License.txt file describes the conditions under which this
* software may be distributed.
*
*****************************************************************/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdarg.h>
#include <assert.h>
#include <string>
#include <string_view>
#include "ILexer.h"
#include "Scintilla.h"
#include "SciLexer.h"
#include "WordList.h"
#include "LexAccessor.h"
#include "Accessor.h"
#include "StyleContext.h"
#include "CharacterSet.h"
#include "LexerModule.h"
using namespace Lexilla;
namespace {

// Reports whether ch ends a line. Besides '\n' and '\r', NUL is treated as a
// terminator because sc.GetRelative(i) returns '\0' if out of range.
constexpr bool IsNewline(const int ch) {
    switch (ch) {
    case '\n':
    case '\r':
    case '\0':
        return true;
    default:
        return false;
    }
}

}
// True if can follow ch down to the end with possibly trailing whitespace
// Does not set the state SCE_MARKDOWN_LINE_BEGIN as to allow further processing
static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
Sci_Position i = 0;
while (sc.GetRelative(++i) == ch)
;
// Skip over whitespace
while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
++i;
if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
sc.SetState(state);
sc.Forward(i);
return true;
}
else return false;
}
// Set the state on text section from current to length characters,
// then set the rest until the newline to default, except for any characters matching token.
//
// state  - style applied to the leading marker and to later runs of token
// length - number of characters in the leading marker
// token  - character whose runs in the rest of the line also receive state
// sc     - style context, positioned on the first character of the marker
//
// On return the context is at the end of the line (or of the lexed range) and
// its state is SCE_MARKDOWN_LINE_BEGIN.
static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
    sc.SetState(state);
    sc.Forward(length);
    sc.SetState(SCE_MARKDOWN_DEFAULT);
    // Step over the character that follows the marker (normally the separating
    // space), but never over a line end. When the marker is the last thing on
    // the line, e.g. "#\n", stepping over the '\n' would make the loop below
    // consume the whole next line as default text, so that line would lose its
    // line-begin handling and inline markup.
    if (!IsNewline(sc.ch))
        sc.Forward();
    // True while inside a run of token characters that already received state
    bool started = false;
    while (sc.More() && !IsNewline(sc.ch)) {
        if (sc.ch == token && !started) {
            sc.SetState(state);
            started = true;
        }
        else if (sc.ch != token) {
            sc.SetState(SCE_MARKDOWN_DEFAULT);
            started = false;
        }
        sc.Forward();
    }
    sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
// Reports whether the line before the current one holds anything other than
// spaces, tabs and carriage returns. Returns false when the start of the
// document is reached before such a character is found.
static bool HasPrevLineContent(StyleContext &sc) {
    const Sci_Position here = static_cast<Sci_Position>(sc.currentPos);
    // Walk backwards to the line end that terminates the previous line
    Sci_Position offset = -1;
    while (here + offset >= 0 && !IsNewline(sc.GetRelative(offset)))
        --offset;
    // Inspect the previous line from its end towards its start
    for (--offset; here + offset >= 0; --offset) {
        const int ch = sc.GetRelative(offset);
        if (ch == '\n')
            break;
        const bool blank = (ch == '\r') || IsASpaceOrTab(ch);
        if (!blank)
            return true;
    }
    return false;
}
// Reports whether the current position can start a term: it is the very first
// position, or the previous character is NUL or a space character.
static bool AtTermStart(StyleContext &sc) {
    if (sc.currentPos == 0)
        return true;
    if (sc.chPrev == 0)
        return true;
    return isspacechar(sc.chPrev);
}
// Reports whether an inline region opened by token at the current position is
// also closed before the end of the line, and whether the current position is
// at the start of a term.
//
// token is the opening/closing delimiter; only its first two characters are
// looked at. The scan for the closing delimiter starts right after the opener.
static bool IsCompleteStyleRegion(StyleContext &sc, const char *token) {
    const size_t tokenLength = strlen(token);
    const int marker = *token;
    bool closed = false;
    for (Sci_Position offset = static_cast<Sci_Position>(tokenLength);
            !IsNewline(sc.GetRelative(offset)); ++offset) {
        // make sure an empty pair of single-char tokens doesn't match
        // with a longer token: {*}{*} != {**}
        if (sc.GetRelative(offset) != marker || sc.GetRelative(offset - 1) == marker)
            continue;
        // For a two-character token the following character must match as well
        closed = (tokenLength > 1U) ? (sc.GetRelative(offset + 1) == token[1]) : true;
        break;
    }
    return AtTermStart(sc) && closed;
}
static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
int count = 1;
Sci_Position i = 0;
for (;;) {
++i;
int c = sc.GetRelative(i);
if (c == sc.ch)
++count;
// hit a terminating character
else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
// Are we a valid HRULE
if ((IsNewline(c) || sc.currentPos + i == endPos) &&
count >= 3 && !HasPrevLineContent(sc)) {
sc.SetState(SCE_MARKDOWN_HRULE);
sc.Forward(i);
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
return true;
}
else {
sc.SetState(SCE_MARKDOWN_DEFAULT);
return false;
}
}
}
}
// Lexer function for Markdown: assigns styles to the text in the range
// [startPos, startPos + length).
//
// startPos  - position of the first character to style
// length    - number of characters to style
// initStyle - style state in effect at startPos
// (unnamed) - keyword lists; not used by this lexer
// styler    - accessor used to read the text and properties and to set styles
//
// Each loop iteration looks at the current character in four stages:
//   1. state-based actions that end or continue the current inline/block state,
//      including the dispatch done at the beginning of a line;
//   2. SCE_MARKDOWN_PRECHAR handling of the leading part of a line
//      (blockquote, horizontal rule, list items);
//   3. link handling while in SCE_MARKDOWN_LINK;
//   4. detection of new inline states while in SCE_MARKDOWN_DEFAULT.
static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
                                WordList **, Accessor &styler) {
    const Sci_PositionU endPos = startPos + length;
    // Number of leading spaces seen so far while in SCE_MARKDOWN_PRECHAR
    int precharCount = 0;
    // True after "](" has been seen in a link, until the closing ')'
    bool isLinkNameDetecting = false;
    // Don't advance on a new loop iteration and retry at the same position.
    // Useful in the corner case of having to start at the beginning file position
    // in the default state.
    bool freezeCursor = false;

    // property lexer.markdown.header.eolfill
    // Set to 1 to highlight all ATX header text.
    const bool headerEOLFill = styler.GetPropertyInt("lexer.markdown.header.eolfill", 0) == 1;

    StyleContext sc(startPos, static_cast<Sci_PositionU>(length), initStyle, styler);

    while (sc.More()) {
        // Skip past escaped characters
        if (sc.ch == '\\') {
            sc.Forward();
            continue;
        }

        // A blockquote resets the line semantics
        if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
            sc.SetState(SCE_MARKDOWN_LINE_BEGIN);

        // Conditional state-based actions
        if (sc.state == SCE_MARKDOWN_CODE2) {
            if (sc.Match("``")) {
                const int closingSpan = (sc.GetRelative(2) == '`') ? 3 : 2;
                sc.Forward(closingSpan);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_CODE) {
            if (sc.ch == '`' && sc.chPrev != ' ')
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        /* De-activated because it gets in the way of other valid indentation
         * schemes, for example multiple paragraphs inside a list item.
        // Code block
        else if (sc.state == SCE_MARKDOWN_CODEBK) {
            bool d = true;
            if (IsNewline(sc.ch)) {
                if (sc.chNext != '\t') {
                    for (int c = 1; c < 5; ++c) {
                        if (sc.GetRelative(c) != ' ')
                            d = false;
                    }
                }
            }
            else if (sc.atLineStart) {
                if (sc.ch != '\t' ) {
                    for (int i = 0; i < 4; ++i) {
                        if (sc.GetRelative(i) != ' ')
                            d = false;
                    }
                }
            }
            if (!d)
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
        }
        */
        // Strong
        else if (sc.state == SCE_MARKDOWN_STRONG1) {
            if ((sc.Match("**") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_STRONG2) {
            if ((sc.Match("__") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        // Emphasis
        else if (sc.state == SCE_MARKDOWN_EM1) {
            if ((sc.ch == '*' && sc.chPrev != ' ') || IsNewline(sc.chNext))
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        else if (sc.state == SCE_MARKDOWN_EM2) {
            if ((sc.ch == '_' && sc.chPrev != ' ') || IsNewline(sc.chNext))
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
        }
        // Delimited code block: lasts until a line starting with "~~~"
        else if (sc.state == SCE_MARKDOWN_CODEBK) {
            if (sc.atLineStart && sc.Match("~~~")) {
                Sci_Position i = 1;
                while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
                    i++;
                sc.Forward(i);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        // Strikeout
        else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
            if ((sc.Match("~~") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
        }
        else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
            // Header - the longest run of '#' has to be tested first
            if (sc.Match("######")) {
                if (headerEOLFill)
                    sc.SetState(SCE_MARKDOWN_HEADER6);
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
            }
            else if (sc.Match("#####")) {
                if (headerEOLFill)
                    sc.SetState(SCE_MARKDOWN_HEADER5);
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
            }
            else if (sc.Match("####")) {
                if (headerEOLFill)
                    sc.SetState(SCE_MARKDOWN_HEADER4);
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
            }
            else if (sc.Match("###")) {
                if (headerEOLFill)
                    sc.SetState(SCE_MARKDOWN_HEADER3);
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
            }
            else if (sc.Match("##")) {
                if (headerEOLFill)
                    sc.SetState(SCE_MARKDOWN_HEADER2);
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
            }
            else if (sc.Match("#")) {
                // Catch the special case of the '#.' ordered list item marker,
                // which is handled in the SCE_MARKDOWN_PRECHAR section below
                if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
                    precharCount = 0;
                    sc.SetState(SCE_MARKDOWN_PRECHAR);
                }
                else if (headerEOLFill) {
                    sc.SetState(SCE_MARKDOWN_HEADER1);
                }
                else
                    SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
            }
            // Code block
            else if (sc.Match("~~~")) {
                if (!HasPrevLineContent(sc))
                    sc.SetState(SCE_MARKDOWN_CODEBK);
                else
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // Underline of '=' below a line with content: level 1 header
            else if (sc.ch == '=') {
                if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc)) {
                    if (!headerEOLFill)
                        sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
                }
                else
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            // Underline of '-' below a line with content: level 2 header
            else if (sc.ch == '-') {
                if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc)) {
                    if (!headerEOLFill)
                        sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
                }
                else {
                    precharCount = 0;
                    sc.SetState(SCE_MARKDOWN_PRECHAR);
                }
            }
            else if (IsNewline(sc.ch))
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
            else {
                precharCount = 0;
                sc.SetState(SCE_MARKDOWN_PRECHAR);
            }
        }
        // The header lasts until the newline
        else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
                 sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
                 sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
            if (headerEOLFill) {
                if (sc.atLineStart) {
                    // Re-examine this character as the beginning of a new line
                    sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
                    freezeCursor = true;
                }
            }
            else if (IsNewline(sc.ch))
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
        }

        // New state only within the initial whitespace
        if (sc.state == SCE_MARKDOWN_PRECHAR) {
            // Blockquote
            if (sc.ch == '>' && precharCount < 5)
                sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
            /*
            // Begin of code block
            else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
                sc.SetState(SCE_MARKDOWN_CODEBK);
            */
            // HRule - Total of three or more hyphens, asterisks, or underscores
            // on a line by themselves
            else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
                ;
            // Unordered list
            else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
                sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
                sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
            }
            // Ordered list
            else if (IsADigit(sc.ch)) {
                int digitCount = 0;
                while (IsADigit(sc.GetRelative(++digitCount)))
                    ;
                if (sc.GetRelative(digitCount) == '.' &&
                        IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
                    sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
                    sc.Forward(digitCount + 1);
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
                } else {
                    // a textual number at the margin should be plain text
                    sc.SetState(SCE_MARKDOWN_DEFAULT);
                }
            }
            // Alternate Ordered list
            else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
                sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            else if (sc.ch != ' ' || precharCount > 2)
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            else
                ++precharCount;
        }

        // Any link
        if (sc.state == SCE_MARKDOWN_LINK) {
            if (sc.Match("](") && sc.GetRelative(-1) != '\\') {
                sc.Forward(2);
                isLinkNameDetecting = true;
            }
            else if (sc.Match("]:") && sc.GetRelative(-1) != '\\') {
                sc.Forward(2);
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
                sc.Forward();
                sc.SetState(SCE_MARKDOWN_DEFAULT);
            }
            else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
                sc.Forward();
                sc.SetState(SCE_MARKDOWN_DEFAULT);
                isLinkNameDetecting = false;
            }
        }

        // New state anywhere in doc
        if (sc.state == SCE_MARKDOWN_DEFAULT) {
            if (sc.atLineStart && sc.ch == '#') {
                // Retry this position with line-begin semantics
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
                freezeCursor = true;
            }
            // Links and Images
            if (sc.Match("![")) {
                sc.SetState(SCE_MARKDOWN_LINK);
                sc.Forward(1);
            }
            else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
                sc.SetState(SCE_MARKDOWN_LINK);
            }
            // Code - also a special case for alternate inside spacing
            else if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
                const int openingSpan = (sc.GetRelative(2) == '`') ? 2 : 1;
                sc.SetState(SCE_MARKDOWN_CODE2);
                sc.Forward(openingSpan);
            }
            else if (sc.ch == '`' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "`")) {
                sc.SetState(SCE_MARKDOWN_CODE);
            }
            // Strong
            else if (sc.Match("**") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "**")) {
                sc.SetState(SCE_MARKDOWN_STRONG1);
                sc.Forward();
            }
            else if (sc.Match("__") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "__")) {
                sc.SetState(SCE_MARKDOWN_STRONG2);
                sc.Forward();
            }
            // Emphasis
            else if (sc.ch == '*' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "*")) {
                sc.SetState(SCE_MARKDOWN_EM1);
            }
            else if (sc.ch == '_' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "_")) {
                sc.SetState(SCE_MARKDOWN_EM2);
            }
            // Strikeout
            else if (sc.Match("~~") && !(sc.GetRelative(2) == '~' || sc.GetRelative(2) == ' ') &&
                     IsCompleteStyleRegion(sc, "~~")) {
                sc.SetState(SCE_MARKDOWN_STRIKEOUT);
                sc.Forward();
            }
            // Beginning of line
            else if (IsNewline(sc.ch)) {
                sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
            }
        }
        // Advance if not holding back the cursor for this iteration.
        if (!freezeCursor)
            sc.Forward();
        freezeCursor = false;
    }
    sc.Complete();
}
// Lexer module object for Markdown, constructed from the SCLEX_MARKDOWN
// identifier, the ColorizeMarkdownDoc lexing function and the language name
// "markdown". No folding function is supplied.
LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");