From 909194dd7b9562094f1231d17d72c44b274a3711 Mon Sep 17 00:00:00 2001 From: dail8859 Date: Mon, 16 Mar 2026 20:00:49 -0400 Subject: [PATCH] PCRE2 Regex Engine --- CMakeLists.txt | 8 +- src/CMakeLists.txt | 6 +- src/PCRE2Search.cpp | 326 ++++++++++++++++++++++++++++++ src/PCRE2Search.h | 21 ++ src/QRegexSearch.cpp | 127 ------------ src/QRegexSearch.h | 101 --------- src/dialogs/FindReplaceDialog.cpp | 4 + thirdparty/CMakeLists.txt | 12 ++ 8 files changed, 371 insertions(+), 234 deletions(-) create mode 100644 src/PCRE2Search.cpp create mode 100644 src/PCRE2Search.h delete mode 100644 src/QRegexSearch.cpp delete mode 100644 src/QRegexSearch.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 265c68b4b..c7a67e607 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,14 +6,14 @@ project(NotepadNext LANGUAGES CXX ) -set(APP_DISTRIBUTION "" CACHE STRING "Distribution type") -string(TIMESTAMP CURRENT_YEAR "%Y") -set(APP_COPYRIGHT "Copyright 2019-${CURRENT_YEAR} Justin Dailey") - set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +set(APP_DISTRIBUTION "" CACHE STRING "Distribution type") +string(TIMESTAMP CURRENT_YEAR "%Y") +set(APP_COPYRIGHT "Copyright 2019-${CURRENT_YEAR} Justin Dailey") + add_compile_definitions(SCI_OWNREGEX) if(MSVC) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 712a3336f..31cb7b88f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,7 +32,7 @@ qt_add_executable(NotepadNext MacroStepTableModel.h NotepadNextApplication.h NppImporter.h - QRegexSearch.h + RangeAllocator.h RecentFilesListManager.h RecentFilesListMenuBuilder.h @@ -79,7 +79,7 @@ qt_add_executable(NotepadNext main.cpp NotepadNextApplication.cpp NppImporter.cpp - QRegexSearch.cpp + RangeAllocator.cpp RecentFilesListManager.cpp RecentFilesListMenuBuilder.cpp @@ -179,6 +179,7 @@ qt_add_executable(NotepadNext widgets/QuickFindWidget.ui widgets/StatusLabel.cpp widgets/StatusLabel.h + PCRE2Search.h PCRE2Search.cpp ) set_target_properties(NotepadNext PROPERTIES @@ -234,6 +235,7 @@ target_link_libraries(NotepadNext lexilla scintilla uchardet + pcre2-8 ) target_compile_definitions(NotepadNext diff --git a/src/PCRE2Search.cpp b/src/PCRE2Search.cpp new file mode 100644 index 000000000..04b8807ea --- /dev/null +++ b/src/PCRE2Search.cpp @@ -0,0 +1,326 @@ +/* + * This file is part of Notepad Next. + * Copyright 2026 Justin Dailey + * + * Notepad Next is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Notepad Next is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Notepad Next. If not, see . + */ + + +// TODO: Fix this mess. Scintilla makes you include everything...in the correct order... +// this was copied from Editor.cxx just to get it to compile + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ScintillaTypes.h" +#include "ScintillaMessages.h" +#include "ScintillaStructures.h" +#include "ILoader.h" +#include "ILexer.h" + +#include "Debugging.h" +#include "Geometry.h" +#include "Platform.h" + +#include "CharacterType.h" +#include "CharacterCategoryMap.h" +#include "Position.h" +#include "UniqueString.h" +#include "SplitVector.h" +#include "Partitioning.h" +#include "RunStyles.h" +#include "ContractionState.h" +#include "CellBuffer.h" +#include "PerLine.h" +#include "KeyMap.h" +#include "Indicator.h" +#include "LineMarker.h" +#include "Style.h" +#include "ViewStyle.h" +#include "CharClassify.h" +#include "Decoration.h" +#include "CaseFolder.h" +#include "Document.h" +#include "Scintilla.h" + +#include "PCRE2Search.h" + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + +#include +#include + + +using namespace Scintilla; +using namespace Scintilla::Internal; + + +class PCRE2Search : public RegexSearchBase +{ +public: + PCRE2Search() { qInfo() << Q_FUNC_INFO; } + + Sci::Position FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *pattern, bool caseSensitive, bool word, bool wordStart, Scintilla::FindOption flags, Sci::Position *length) override; + const char *SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) override; + +private: + struct RegexContext { + pcre2_code* re = nullptr; + pcre2_match_data* match_data = nullptr; + uint32_t compile_options = 0; + uint32_t match_options = 0; + Sci::Position search_start; + Sci::Position search_end; + std::string replacement_buffer; + + ~RegexContext() { + if (match_data) pcre2_match_data_free(match_data); + if (re) pcre2_code_free(re); + } + }; + + RegexContext ctx; +}; + +#ifdef SCI_OWNREGEX +RegexSearchBase *Scintilla::Internal::CreateRegexSearch(CharClassify *charClassTable) +{ + Q_UNUSED(charClassTable); + + qInfo(Q_FUNC_INFO); + + return new PCRE2Search(); +} +#endif + +Sci::Position PCRE2Search::FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s, bool caseSensitive, bool word, bool wordStart, Scintilla::FindOption flags, Sci::Position *length) +{ + // Clear out any context from the previous search + if (ctx.re) { + pcre2_match_data_free(ctx.match_data); + ctx.match_data = nullptr; + pcre2_code_free(ctx.re); + ctx.re = nullptr; + } + + const bool searchForward = (minPos <= maxPos); + Q_ASSERT(searchForward); + + const bool dotMatchesNewLine = static_cast(flags) & SCFIND_REGEXP_DOTMATCHESNL; + std::string pattern = s; + ctx.search_start = std::min(minPos, maxPos); + ctx.search_end = std::max(minPos, maxPos); + + // Range endpoints should not be inside DBCS characters or between a CR and LF, but just in case, move them. + ctx.search_start= doc->MovePositionOutsideChar(ctx.search_start, 1, true); + ctx.search_end= doc->MovePositionOutsideChar(ctx.search_end, 1, true); + + // No need to search an empty range or empty pattern + if (ctx.search_start == ctx.search_end || pattern.empty()) { + return -1; + } + + // Wrap in word boundaries. Also include additional grouping to fix potential precedence issue + if (word) pattern = "\\b(?:" + pattern + ")\\b"; + if (wordStart) pattern = "\\b(?:" + pattern + ")"; + // For some reason PCRE2_NEWLINE_ANYCRLF | PCRE2_BSR_ANYCRLF don't seem to have an effect + // Handles mixed line endings, could use e.g. (*LF) based on doc settings + pattern = "(*ANYCRLF)(*BSR_ANYCRLF)" + pattern; + + //qDebug() << "--- FindText ---"; + //qDebug() << "Min Pos:" << minPos; + //qDebug() << "Max Pos:" << maxPos; + //qDebug() << "Min Pos (actual):" << ctx.search_start; + //qDebug() << "Max Pos (actual):" << ctx.search_end; + //qDebug() << "Pattern:" << s; + //qDebug() << "Computed Pattern:" << pattern; + //qDebug() << "Case Sensitive:" << caseSensitive; + //qDebug() << "Word:" << word; + //qDebug() << "Word Start:" << wordStart; + //qDebug() << "Direction:" << (searchForward ? "forwards" : "backwards"); + //qDebug() << "BOL:" << doc->IsLineStartPosition(ctx.search_start); + //qDebug() << "EOL:" << doc->IsLineEndPosition(ctx.search_end); + + ctx.compile_options = PCRE2_MULTILINE; + if (!caseSensitive) + ctx.compile_options |= PCRE2_CASELESS; + if (dotMatchesNewLine) + ctx.compile_options |= PCRE2_DOTALL; + if (doc->CodePage() == SC_CP_UTF8) + ctx.compile_options |= PCRE2_UTF | PCRE2_NO_UTF_CHECK; + + int errornumber; + PCRE2_SIZE erroroffset; + + ctx.re = pcre2_compile( + reinterpret_cast(pattern.c_str()), + PCRE2_ZERO_TERMINATED, + ctx.compile_options, + &errornumber, + &erroroffset, + nullptr + ); + + if (!ctx.re) { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); + + qWarning().noquote() << pattern; + qWarning().noquote() << QByteArray(erroroffset - 1, ' ') + '^';; + qWarning().noquote() << QString::fromUtf8(reinterpret_cast(buffer)); + + doc->SetErrorStatus(SC_STATUS_WARN_REGEX); + + *length = 0; + return -1; + } + + ctx.match_data = pcre2_match_data_create_from_pattern(ctx.re, nullptr); + ctx.match_options = PCRE2_NOTEMPTY | PCRE2_NOTEMPTY_ATSTART; // TODO: use 0 and handle empty matches + + // Since this string is in the middle of the document there is additional information to pass + // to pcre2 about the context of the string. + if (!doc->IsLineStartPosition(ctx.search_start)) + ctx.match_options |= PCRE2_NOTBOL; + if (!doc->IsLineEndPosition(ctx.search_end)) + ctx.match_options |= PCRE2_NOTEOL; + + const int subjectLength = ctx.search_end - ctx.search_start; + const char *subject = doc->RangePointer(ctx.search_start, subjectLength); + + int rc = pcre2_match( + ctx.re, + reinterpret_cast(subject), + subjectLength, + 0, // start offset always needs to be 0 since we only give the target to pcre2 + ctx.match_options, + ctx.match_data, + nullptr + ); + + int ret = -1; + if (rc >= 0) { + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(ctx.match_data); + PCRE2_SIZE matchStart = ovector[0]; + PCRE2_SIZE matchEnd = ovector[1]; + + *length = matchEnd - matchStart; + ret = ctx.search_start + matchStart; // Offset it by the start of the search + + //QByteArray a = QByteArray::fromRawData(&subject[matchStart], matchEnd - matchStart); + //qDebug() << "First match:" << a << "at" << matchStart << "to" << matchEnd; + } else { + *length = 0; + ret = -1; + //qDebug() << "No Match"; + } + + return ret; +} + +const char *PCRE2Search::SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) +{ + Q_ASSERT(ctx.re); + Q_ASSERT(ctx.match_data); + Q_ASSERT(ctx.search_end > ctx.search_start); + + //qDebug() << "--- SubstituteByPosition ----"; + //qDebug() << "Replacement Text:" << text; + + // A common pattern is to just remove the match, so short circuit this case + if (!text[0]) { + *length = 0; + return ""; + } + + const int subjectLength = ctx.search_end - ctx.search_start; + const char *subject = doc->RangePointer(ctx.search_start, subjectLength); + const uint32_t options = ctx.match_options| + PCRE2_SUBSTITUTE_EXTENDED | + PCRE2_SUBSTITUTE_REPLACEMENT_ONLY | + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | + PCRE2_SUBSTITUTE_MATCHED; + + PCRE2_SIZE outlen = 1024; // Arbitrary size which should handle most cases + ctx.replacement_buffer.resize(outlen); + + int rc = pcre2_substitute( + ctx.re, + reinterpret_cast(subject), + subjectLength, + 0, + options, + ctx.match_data, + nullptr, + reinterpret_cast(text), + PCRE2_ZERO_TERMINATED, + reinterpret_cast(ctx.replacement_buffer.data()), + &outlen + ); + + // Check to see if a second pass is necessary if there was not enough memory + if (rc == PCRE2_ERROR_NOMEMORY) { + ctx.replacement_buffer.resize(outlen); + + rc = pcre2_substitute( + ctx.re, + reinterpret_cast(subject), + subjectLength, + 0, + options, + ctx.match_data, + nullptr, + reinterpret_cast(text), + PCRE2_ZERO_TERMINATED, + reinterpret_cast(ctx.replacement_buffer.data()), + &outlen + ); + } + + //if (rc == PCRE2_ERROR_NOMATCH) { + // // Is this bad? There should have been a match + // *length = 0; + // return nullptr; + //} + + if (rc < 0) { + PCRE2_UCHAR buffer[256]; + pcre2_get_error_message(rc, buffer, sizeof(buffer)); + qWarning().noquote() << QString::fromUtf8(reinterpret_cast(buffer)); + + *length = 0; + return nullptr; + } + + *length = static_cast(outlen); + return ctx.replacement_buffer.c_str(); +} diff --git a/src/PCRE2Search.h b/src/PCRE2Search.h new file mode 100644 index 000000000..89e70e559 --- /dev/null +++ b/src/PCRE2Search.h @@ -0,0 +1,21 @@ +/* + * This file is part of Notepad Next. + * Copyright 2026 Justin Dailey + * + * Notepad Next is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * Notepad Next is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Notepad Next. If not, see . + */ + +#pragma once + +#define SCFIND_REGEXP_DOTMATCHESNL 0x10000000 diff --git a/src/QRegexSearch.cpp b/src/QRegexSearch.cpp deleted file mode 100644 index 96a98a8bc..000000000 --- a/src/QRegexSearch.cpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * This file is part of Notepad Next. - * Copyright 2019 Justin Dailey - * - * Notepad Next is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Notepad Next is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Notepad Next. If not, see . - */ - - -#include "QRegexSearch.h" - -#include -#include - -using namespace Scintilla; - -#ifdef SCI_OWNREGEX -RegexSearchBase *Scintilla::Internal::CreateRegexSearch(CharClassify *charClassTable) -{ - Q_UNUSED(charClassTable); - - qInfo(Q_FUNC_INFO); - - return new QRegexSearch(); -} -#endif - -QRegexSearch::QRegexSearch() -{ - -} - -Sci::Position QRegexSearch::FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s, bool caseSensitive, bool word, bool wordStart, Scintilla::FindOption flags, Sci::Position *length) -{ - Q_UNUSED(caseSensitive); - Q_UNUSED(word) - Q_UNUSED(wordStart) - // ----------------------------------------------------------------------------------------------------------------------- - // NOTE: This section of code has to be very careful about what units of measure is being used. Scintilla wants to operate - // in units of bytes (e.g. position 3 is 3 bytes into the text). Qt wants to operate in units of UTF16 chars. The trouble is - // when you start using characters that are >1 byte a piece. Meaning position 3 (3 bytes into a file) could be 1 character. - // ----------------------------------------------------------------------------------------------------------------------- - - // Make sure the positiosn are outside of characters - minPos = doc->MovePositionOutsideChar(minPos, 1, false); - maxPos = doc->MovePositionOutsideChar(maxPos, -1, false); - - //qInfo(Q_FUNC_INFO); - //qInfo("\tminPos %d", minPos); - //qInfo("\tmaxPos %d", maxPos); - //qInfo("\ts %s", s); - //qInfo("\tcaseSensitive %s", caseSensitive ? "true" : "false"); - //qInfo("\tword %s", word ? "true" : "false"); - //qInfo("\twordStart %s", wordStart ? "true" : "false"); - //qInfo("\tflags %d", flags); - - // No need to search an empty range - if (minPos == maxPos) - return -1; - - auto options = QRegularExpression::MultilineOption | QRegularExpression::UseUnicodePropertiesOption; - - if (!FlagSet(flags, FindOption::MatchCase)) - options |= QRegularExpression::CaseInsensitiveOption; - - // TODO: does (*ANYCRLF) need prepended to the search string? - QRegularExpression re(s, options); - if (!re.isValid()) - return -1; // Invalid regular expression - - // Get the bytes from the document. No need to go past maxPos bytes - // Not actually sure if this copies the data or not - const Sci::Position rangeLength = maxPos - minPos; - const QString utf8 = QString::fromUtf8(doc->RangePointer(minPos, rangeLength), rangeLength); - - // NOTE: QString uses UTF16 counts since QChars are 16 bits - QRegularExpressionMatch m = re.match(utf8, 0, QRegularExpression::NormalMatch, QRegularExpression::NoMatchOption); - - if (!m.hasMatch()) - return -1; // No match - - match = m; - - // NOTE: Returned started is the index into the QString which uses UTF16 - const int positionStart = doc->GetRelativePositionUTF16(minPos, match.capturedStart(0)); - - // Now move ahead however many characters we matched. Again, based on UTF16 count - const int positionEnd = doc->GetRelativePositionUTF16(positionStart, match.capturedLength(0)); - - // The length is the number of bytes that was matched - *length = positionEnd - positionStart; - - return positionStart; -} - -const char *QRegexSearch::SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) -{ - Q_UNUSED(doc); - - qInfo(Q_FUNC_INFO); - - Q_ASSERT(match.isValid()); - Q_ASSERT(match.hasMatch()); - - // Get the captured text and replace the match - QString newString = match.captured(); - newString.replace(match.regularExpression(), QByteArray(text, *length)); - - // TODO: figure out why this has to be new'd and can't be an instantiated class member - if (substituted) { - delete substituted; - } - - substituted = new QByteArray(newString.toUtf8()); - *length = substituted->length(); - return substituted->data(); -} diff --git a/src/QRegexSearch.h b/src/QRegexSearch.h deleted file mode 100644 index 9a44af559..000000000 --- a/src/QRegexSearch.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * This file is part of Notepad Next. - * Copyright 2019 Justin Dailey - * - * Notepad Next is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * Notepad Next is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Notepad Next. If not, see . - */ - - -#ifndef QREGEXSEARCH_H -#define QREGEXSEARCH_H - -#include - -#include -#include -#include -#include - - -// TODO: Fix this mess. Scintilla makes you include everything...in the correct order... -// this was copied from Editor.cxx just to get it to compile - - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ScintillaTypes.h" -#include "ScintillaMessages.h" -#include "ScintillaStructures.h" -#include "ILoader.h" -#include "ILexer.h" - -#include "Debugging.h" -#include "Geometry.h" -#include "Platform.h" - -#include "CharacterType.h" -#include "CharacterCategoryMap.h" -#include "Position.h" -#include "UniqueString.h" -#include "SplitVector.h" -#include "Partitioning.h" -#include "RunStyles.h" -#include "ContractionState.h" -#include "CellBuffer.h" -#include "PerLine.h" -#include "KeyMap.h" -#include "Indicator.h" -#include "LineMarker.h" -#include "Style.h" -#include "ViewStyle.h" -#include "CharClassify.h" -#include "Decoration.h" -#include "CaseFolder.h" -#include "Document.h" -#include "Scintilla.h" - -using namespace Scintilla::Internal; - -class QRegexSearch : public RegexSearchBase -{ -public: - QRegexSearch(); - - Sci::Position FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s, bool caseSensitive, bool word, bool wordStart, Scintilla::FindOption flags, Sci::Position *length) override; - const char *SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) override; - -private: - QRegularExpressionMatch match; - QByteArray *substituted = Q_NULLPTR; -}; - -#endif // QREGEXSEARCH_H diff --git a/src/dialogs/FindReplaceDialog.cpp b/src/dialogs/FindReplaceDialog.cpp index a8d8d5ad6..c2db00f67 100644 --- a/src/dialogs/FindReplaceDialog.cpp +++ b/src/dialogs/FindReplaceDialog.cpp @@ -26,6 +26,8 @@ #include #include "ScintillaNext.h" +#include "PCRE2Search.h" + #include "MainWindow.h" @@ -618,6 +620,8 @@ int FindReplaceDialog::computeSearchFlags() flags |= SCFIND_MATCHCASE; if (ui->radioRegexSearch->isChecked()) flags |= SCFIND_REGEXP; + if (ui->checkBoxRegexMatchesNewline->isChecked()) + flags |= SCFIND_REGEXP_DOTMATCHESNL; return flags; } diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index 43a01e337..d1f72f469 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -30,6 +30,18 @@ CPMAddPackage( GIT_TAG ab62f0554abf2bbe4d45427b36a8b2f81ca7b4ab ) +CPMAddPackage( + NAME PCRE2 + GITHUB_REPOSITORY PCRE2Project/pcre2 + GIT_TAG pcre2-10.47 + OPTIONS + "PCRE2_BUILD_PCRE2_8:BOOL=ON" + "PCRE2_BUILD_PCRE2_16:BOOL=OFF" + "PCRE2_BUILD_PCRE2_32:BOOL=OFF" + "BUILD_SHARED_LIBS:BOOL=OFF" + "PCRE2_SUPPORT_JIT:BOOL=ON" +) + add_subdirectory(lua) add_subdirectory(scintilla) add_subdirectory(lexilla)