diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index bd0c1dea9..cf5e92312 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -29,7 +29,6 @@ jobs: BUILD_CMD: swift build -c release BENCH_CMD: .build/release/RegexBenchmark BASELINE_FILE: benchmark-baseline - MAIN_FILE: benchmark-main COMPARE_FILE: benchmark-pr COMPARE_OUT_FILE: benchmark-results.txt steps: @@ -50,26 +49,6 @@ jobs: set -euo pipefail eval "$BENCH_CMD --save $RUNNER_TEMP/$BASELINE_FILE" test -s "$RUNNER_TEMP/$BASELINE_FILE" || { echo "Baseline not created at $BASELINE_FILE"; exit 1; } - - name: Check out main branch - if: ${{ github.event.pull_request.base.ref != 'main' }} - uses: actions/checkout@v4 - with: - ref: main - path: main-branch - fetch-depth: 0 - - name: Build main branch - if: ${{ github.event.pull_request.base.ref != 'main' }} - working-directory: main-branch - run: | - set -euo pipefail - eval "$BUILD_CMD" - - name: Run main benchmark - if: ${{ github.event.pull_request.base.ref != 'main' }} - working-directory: main-branch - run: | - set -euo pipefail - eval "$BENCH_CMD --save $RUNNER_TEMP/$MAIN_FILE" - test -s "$RUNNER_TEMP/$MAIN_FILE" || { echo "Baseline (main) not created at $MAIN_FILE"; exit 1; } - name: Check out PR branch uses: actions/checkout@v4 with: @@ -88,17 +67,11 @@ jobs: eval "$BENCH_CMD --save $RUNNER_TEMP/$COMPARE_FILE" test -s "$RUNNER_TEMP/$COMPARE_FILE" || { echo "Comparison not created at $COMPARE_FILE"; exit 1; } eval "$BENCH_CMD --compare $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" - - name: 📊 Compare benchmarks with base + - name: 📊 Compare benchmarks working-directory: pr run: | set -euo pipefail eval "$BENCH_CMD --load $RUNNER_TEMP/$COMPARE_FILE --compare $RUNNER_TEMP/$BASELINE_FILE --compare-compile-time $RUNNER_TEMP/$BASELINE_FILE" | tee "$RUNNER_TEMP/$COMPARE_OUT_FILE" - - name: 📊 Compare benchmarks with `main` - if: ${{ github.event.pull_request.base.ref != 'main' }} - working-directory: pr - run: | - set -euo pipefail - eval "$BENCH_CMD --load $RUNNER_TEMP/$COMPARE_FILE --compare $RUNNER_TEMP/$MAIN_FILE --compare-compile-time $RUNNER_TEMP/$MAIN_FILE" - name: Upload benchmark artifacts uses: actions/upload-artifact@v4 with: diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift deleted file mode 100644 index 3394b319f..000000000 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ /dev/null @@ -1,904 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2025 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -internal import _RegexParser - -extension Compiler.ByteCodeGen { - mutating func emitRoot(_ root: inout DSLList) throws -> MEProgram { - // If the whole regex is a matcher, then the whole-match value - // is the constructed value. Denote that the current value - // register is the processor's value output. - switch root.nodes.first { - case .matcher: - builder.denoteCurrentValueIsWholeMatchValue() - default: - break - } - - if optimizationsEnabled { - root.autoPossessify() - } - - var list = root.nodes[...] - try emitNode(&list) - - builder.canOnlyMatchAtStart = canOnlyMatchAtStart(in: root) - builder.buildAccept() - return try builder.assemble() - } -} - -fileprivate extension Compiler.ByteCodeGen { - /// Implementation for `canOnlyMatchAtStart`, which maintains the option - /// state. - /// - /// For a given specific node, this method can return one of three values: - /// - /// - `true`: This node is guaranteed to match only at the start of a subject. - /// - `false`: This node can match anywhere in the subject. - /// - `nil`: This node is inconclusive about where it can match. - /// - /// In particular, non-required groups and option-setting groups are - /// inconclusive about where they can match. - private mutating func _canOnlyMatchAtStartImpl( - _ list: inout ArraySlice - ) -> Bool? { - guard let node = list.popFirst() else { return false } - switch node { - // Defining cases - case .atom(.assertion(.startOfSubject)): - return true - case .atom(.assertion(.caretAnchor)): - return !options.anchorsMatchNewlines - - // Changing options doesn't determine `true`/`false`. - case .atom(.changeMatchingOptions(let sequence)): - options.apply(sequence.ast) - return nil - - // Any other atom or consuming node returns `false`. - case .atom, .customCharacterClass, .quotedLiteral: - return false - - // Trivia/empty have no effect. - case .trivia, .empty: - return nil - - // In an alternation, all of its children must match only at start. - case .orderedChoice(let children): - for _ in 0.. Bool { - let currentOptions = options - options = MatchingOptions() - defer { options = currentOptions } - - var list = list.nodes[...] - return _canOnlyMatchAtStartImpl(&list) ?? false - } - - mutating func emitAlternationGen( - _ elements: inout ArraySlice, - alternationCount: Int, - withBacktracking: Bool, - _ body: (inout Compiler.ByteCodeGen, inout ArraySlice) throws -> Void - ) rethrows { - // Alternation: p0 | p1 | ... | pn - // save next_p1 - // - // branch done - // next_p1: - // save next_p2 - // - // branch done - // next_p2: - // save next_p... - // - // branch done - // ... - // next_pn: - // - // done: - let done = builder.makeAddress() - for _ in 1.., - alternationCount count: Int - ) throws { - try emitAlternationGen(&list, alternationCount: count, withBacktracking: true) { - try $0.emitNode(&$1) - } - } - - mutating func emitPositiveLookahead(_ list: inout ArraySlice) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(&list) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Lookahead succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - - mutating func emitNegativeLookahead(_ list: inout ArraySlice) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - clearSavePoint // remove success - fail // propagate failure - intercept: - fail // ->success - success: - ... - */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(&list) - builder.buildClearThrough(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(intercept) - builder.buildFail() - - builder.label(success) - } - - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ list: inout ArraySlice - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - if kind.positive { - try emitPositiveLookahead(&list) - } else { - try emitNegativeLookahead(&list) - } - } - - mutating func emitAtomicNoncapturingGroup( - _ list: inout ArraySlice - ) throws { - /* - save(continuingAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ - - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSaveAddress(success) - builder.buildSave(intercept) - try emitNode(&list) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Atomic group succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - - mutating func emitNoncapturingGroup( - _ kind: AST.Group.Kind, - _ list: inout ArraySlice - ) throws { - assert(!kind.isCapturing) - - options.beginScope() - defer { options.endScope() } - - if let lookaround = kind.lookaroundKind { - try emitLookaround(lookaround, &list) - return - } - - switch kind { - case .lookahead, .negativeLookahead, - .lookbehind, .negativeLookbehind: - throw Unreachable("TODO: reason") - - case .capture, .namedCapture, .balancedCapture: - throw Unreachable("These should produce a capture node") - - case .changeMatchingOptions(let optionSequence): - if !hasEmittedFirstMatchableAtom { - builder.initialOptions.apply(optionSequence) - } - options.apply(optionSequence) - try emitNode(&list) - - case .atomicNonCapturing: - try emitAtomicNoncapturingGroup(&list) - - default: - // FIXME: Other kinds... - try emitNode(&list) - } - } - - func _guaranteesForwardProgressImpl(_ list: ArraySlice, position: inout Int) -> Bool { - guard position < list.endIndex else { return false } - let node = list[position] - position += 1 - switch node { - case .orderedChoice(let children): - return (0.. 0 else { return false } - return _guaranteesForwardProgressImpl(list, position: &position) - case .limitCaptureNesting, .ignoreCapturesInTypedOutput: - return _guaranteesForwardProgressImpl(list, position: &position) - default: return false - } - } - - func guaranteesForwardProgress(_ list: ArraySlice) -> Bool { - var pos = list.startIndex - return _guaranteesForwardProgressImpl(list, position: &pos) - } - - mutating func emitQuantification( - _ amount: AST.Quantification.Amount, - _ kind: DSLTree.QuantificationKind, - _ list: inout ArraySlice - ) throws { - let updatedKind = kind.applying(options: options) - - let (low, high) = amount.bounds - guard let low = low else { - throw Unreachable("Must have a lower bound") - } - switch (low, high) { - case (_, 0): - try skipNode(&list) - return - case let (n, m?) where n > m: - // TODO: Should error out earlier, maybe DSL and parser - // has validation logic? - return - - case let (n, m) where m == nil || n <= m!: - // Ok - break - default: - throw Unreachable("TODO: reason") - } - - // Compiler and/or parser should enforce these invariants - // before we are called - assert(high != 0) - assert((0...(high ?? Int.max)).contains(low)) - - let maxExtraTrips: Int? - if let h = high { - maxExtraTrips = h - low - } else { - maxExtraTrips = nil - } - let minTrips = low - assert((maxExtraTrips ?? 1) >= 0) - - var tmp = list - if tryEmitFastQuant(&tmp, updatedKind, minTrips, maxExtraTrips) { - list = tmp - return - } - - // The below is a general algorithm for bounded and unbounded - // quantification. It can be specialized when the min - // is 0 or 1, or when extra trips is 1 or unbounded. - // - // Stuff inside `<` and `>` are decided at compile time, - // while run-time values stored in registers start with a `%` - _ = """ - min-trip-count control block: - if %minTrips is zero: - goto exit-policy control block - else: - decrement %minTrips and fallthrough - - loop-body: - : - mov currentPosition %pos - evaluate the subexpression - : - if %pos is currentPosition: - goto exit - goto min-trip-count control block - - exit-policy control block: - if %maxExtraTrips is zero: - goto exit - else: - decrement %maxExtraTrips and fallthrough - - : - save exit and goto loop-body - : - ratchet and goto loop - : - save loop-body and fallthrough (i.e. goto exit) - - exit - ... the rest of the program ... - """ - - // Specialization based on `minTrips` for 0 or 1: - _ = """ - min-trip-count control block: - : - goto exit-policy - : - /* fallthrough */ - - loop-body: - evaluate the subexpression - - /* fallthrough */ - """ - - // Specialization based on `maxExtraTrips` for 0 or unbounded - _ = """ - exit-policy control block: - : - goto exit - : - /* fallthrough */ - """ - - /* - NOTE: These specializations don't emit the optimal - code layout (e.g. fallthrough vs goto), but that's better - done later (not prematurely) and certainly better - done by an optimizing compiler. - - NOTE: We're intentionally emitting essentially the same - algorithm for all quantifications for now, for better - testing and surfacing difficult bugs. We can specialize - for other things, like `.*`, later. - - When it comes time for optimizing, we can also look into - quantification instructions (e.g. reduce save-point traffic) - */ - - let minTripsControl = builder.makeAddress() - let loopBody = builder.makeAddress() - let exitPolicy = builder.makeAddress() - let exit = builder.makeAddress() - - // We'll need registers if we're (non-trivially) bounded - let minTripsReg: IntRegister? - if minTrips > 1 { - minTripsReg = builder.makeIntRegister( - initialValue: minTrips) - } else { - minTripsReg = nil - } - - let maxExtraTripsReg: IntRegister? - if (maxExtraTrips ?? 0) > 0 { - maxExtraTripsReg = builder.makeIntRegister( - initialValue: maxExtraTrips!) - } else { - maxExtraTripsReg = nil - } - - // Set up a dummy save point for possessive to update - if updatedKind == .possessive { - builder.pushEmptySavePoint() - } - - // min-trip-count: - // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) - builder.label(minTripsControl) - switch minTrips { - case 0: builder.buildBranch(to: exitPolicy) - case 1: break - default: - assert(minTripsReg != nil, "logic inconsistency") - builder.buildCondBranch( - to: exitPolicy, ifZeroElseDecrement: minTripsReg!) - } - - // FIXME: Possessive needs a "dummy" save point to ratchet - - // loop: - // - // branch min-trip-count - builder.label(loopBody) - - // if we aren't sure if the child node will have forward progress and - // we have an unbounded quantification - let startPosition: PositionRegister? - // FIXME: forward progress check?! - let emitPositionChecking = - (!optimizationsEnabled || !guaranteesForwardProgress(list)) - && maxExtraTrips == nil - - if emitPositionChecking { - startPosition = builder.makePositionRegister() - builder.buildMoveCurrentPosition(into: startPosition!) - } else { - startPosition = nil - } - try emitNode(&list) - if emitPositionChecking { - // in all quantifier cases, no matter what minTrips or maxExtraTrips is, - // if we have a successful non-advancing match, branch to exit because it - // can match an arbitrary number of times - builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) - } - - if minTrips <= 1 { - // fallthrough - } else { - builder.buildBranch(to: minTripsControl) - } - - // exit-policy: - // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) - // - // - // , - _ kind: AST.Quantification.Kind, - _ minTrips: Int, - _ maxExtraTrips: Int? - ) -> Bool { - let isScalarSemantics = options.semanticLevel == .unicodeScalar - guard optimizationsEnabled - && minTrips <= QuantifyPayload.maxStorableTrips - && maxExtraTrips ?? 0 <= QuantifyPayload.maxStorableTrips - && kind != .reluctant else { - return false - } - guard let child = list.popFirst() else { return false } - - switch child { - case .customCharacterClass(let ccc): - // ascii only custom character class - guard let bitset = ccc.asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .atom(let atom): - switch atom { - case .char(let c): - if options.isCaseInsensitive && c.isCased { - // Cased character with case-insensitive matching; match only as an ASCII bitset - guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } else { - // Uncased character OR case-sensitive matching; match as a single scalar ascii value character - guard let val = c._singleScalarAsciiValue else { - return false - } - builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } - - case .any: - builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .anyNonNewline: - builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .dot: - builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .characterClass(let cc): - // Custom character class that consumes a single grapheme - let model = cc.asRuntimeModel(options) - builder.buildQuantify( - model: model, - kind, - minTrips, - maxExtraTrips, - isScalarSemantics: isScalarSemantics) - default: - return false - } - case .limitCaptureNesting(let node): - if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { - return true - } else { - return false - } - case .nonCapturingGroup(let groupKind, let node): - // .nonCapture nonCapturingGroups are ignored during compilation - guard groupKind.ast == .nonCapture else { - return false - } - if tryEmitFastQuant(&list, kind, minTrips, maxExtraTrips) { - return true - } else { - return false - } - default: - return false - } - return true - } - - mutating func emitConcatenation( - _ list: inout ArraySlice, - componentCount: Int - ) throws { - // Unlike the tree-based bytecode generator, in a DSLList concatenations - // have already been flattened. - for _ in 0..) throws -> ValueRegister? { - guard let node = list.popFirst() else { return nil } - switch node { - - case let .orderedChoice(children): - let n = children.count - try emitAlternation(&list, alternationCount: n) - - case let .concatenation(children): - let n = children.count - try emitConcatenation(&list, componentCount: n) - - case let .capture(name, refId, _, transform): - options.beginScope() - defer { options.endScope() } - - let cap = builder.makeCapture(id: refId, name: name) - builder.buildBeginCapture(cap) - let value = try emitNode(&list) - builder.buildEndCapture(cap) - // If the child node produced a custom capture value, e.g. the result of - // a matcher, this should override the captured substring. - if let value { - builder.buildMove(value, into: cap) - } - // If there's a capture transform, apply it now. - if let transform = transform { - let fn = builder.makeTransformFunction { input, cap in - // If it's a substring capture with no custom value, apply the - // transform directly to the substring to avoid existential traffic. - // - // FIXME: separate out this code path. This is fragile, - // slow, and these are clearly different constructs - if let range = cap.range, cap.value == nil { - return try transform(input[range]) - } - - let value = constructExistentialOutputComponent( - from: input, - component: cap.deconstructed, - optionalCount: 0) - return try transform(value) - } - builder.buildTransformCapture(cap, fn) - } - - case let .nonCapturingGroup(kind, _): - try emitNoncapturingGroup(kind.ast, &list) - - case let .ignoreCapturesInTypedOutput(_): - try emitNode(&list) - - case let .limitCaptureNesting(_): - return try emitNode(&list) - - case .conditional: - throw Unsupported("Conditionals") - - case let .quantification(amt, kind, _): - try emitQuantification(amt.ast, kind, &list) - - case let .customCharacterClass(ccc): - if ccc.containsDot { - if !ccc.isInverted { - try emitDot() - } else { - throw Unsupported("Inverted any") - } - } else { - try emitCustomCharacterClass(ccc) - } - - case let .atom(a): - try emitAtom(a) - - case let .quotedLiteral(s): - emitQuotedLiteral(s) - - case .absentFunction: - throw Unsupported("absent function") - case .consumer: - throw Unsupported("consumer") - - case let .matcher(_, f): - return emitMatcher(f) - - case .characterPredicate: - throw Unsupported("character predicates") - - case .trivia, .empty: - return nil - } - return nil - } -} - -// MARK: Skip node - -extension Compiler.ByteCodeGen { - mutating func skipNode( - _ list: inout ArraySlice, - preservingCaptures: Bool = true - ) throws { - guard let node = list.popFirst() else { return } - switch node { - case let .orderedChoice(children): - let n = children.count - for _ in 0.. 0 && child.guaranteesForwardProgress - case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): - return node.guaranteesForwardProgress default: return false } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index b34e0e5f7..33cffaf20 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -12,59 +12,33 @@ internal import _RegexParser class Compiler { - let tree: DSLList + let tree: DSLTree // TODO: Or are these stored on the tree? var options = MatchingOptions() private var compileOptions: _CompileOptions = .default init(ast: AST) { - self.tree = DSLList(tree: ast.dslTree) + self.tree = ast.dslTree } init(tree: DSLTree) { - self.tree = DSLList(tree: tree) - } - - init(list: DSLList) { - self.tree = list + self.tree = tree } init(tree: DSLTree, compileOptions: _CompileOptions) { - self.tree = DSLList(tree: tree) - self.compileOptions = compileOptions - } - - init(tree: DSLList, compileOptions: _CompileOptions) { self.tree = tree self.compileOptions = compileOptions } __consuming func emit() throws -> MEProgram { - try emitViaList() - } - - __consuming func emitViaTree() throws -> MEProgram { - // TODO: Handle global options - var codegen = ByteCodeGen( - options: options, - compileOptions: - compileOptions, - captureList: tree.captureList) - fatalError() -// return try codegen.emitRoot(tree.root) - } - - __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options -// var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: compileOptions, captureList: tree.captureList) - var tree = tree - return try codegen.emitRoot(&tree) + return try codegen.emitRoot(tree.root) } } @@ -116,22 +90,20 @@ func _compileRegex( _ syntax: SyntaxOptions = .traditional, _ semanticLevel: RegexSemanticLevel? = nil ) throws -> MEProgram { - var ast = try parse(regex, syntax) - let dsl: DSLList + let ast = try parse(regex, syntax) + let dsl: DSLTree switch semanticLevel?.base { case .graphemeCluster: let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)]) - ast.root = AST.Node.group(AST.Group(.init(faking: .changeMatchingOptions(sequence)), ast.root, .fake)) - dsl = DSLList(ast: ast) + dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) case .unicodeScalar: let sequence = AST.MatchingOptionSequence(adding: [.init(.unicodeScalarSemantics, location: .fake)]) - ast.root = AST.Node.group(AST.Group(.init(faking: .changeMatchingOptions(sequence)), ast.root, .fake)) - dsl = DSLList(ast: ast) + dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root)) case .none: - dsl = DSLList(ast: ast) + dsl = ast.dslTree } - let program = try Compiler(list: dsl).emit() + let program = try Compiler(tree: dsl).emit() return program } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index d9cdbb04e..5c136827c 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -36,8 +36,7 @@ extension Regex { @available(SwiftStdlib 6.0, *) public var _literalPattern: String? { var gen = LiteralPrinter(options: MatchingOptions()) - var list = self.program.list.nodes[...] - try? gen.outputList(&list) + gen.outputNode(self.program.tree.root) return gen.canonicalLiteralString } } @@ -84,159 +83,6 @@ fileprivate struct LiteralPrinter { mutating func saveInconvertible(_ node: DSLTree.Node) { segments.append(.inconvertible(node)) } - - mutating func inconvertible(_ node: DSLTree.Node) throws { - segments.append(.inconvertible(node)) - throw Incovertible.error - } -} - -extension LiteralPrinter { - enum Incovertible: Error { - case error - } - - mutating func outputList(_ list: inout ArraySlice) throws { - guard let node = list.popFirst() else { - return - } - - switch node { - case let .orderedChoice(children): - try outputAlternation(&list, count: children.count) - case let .concatenation(children): - try outputConcatenation(&list, count: children.count) - - case let .capture(name, nil, _, nil): - options.beginScope() - defer { options.endScope() } - try outputCapture(&list, name: name) - case .capture: - // Captures that use a reference or a transform are unsupported - try inconvertible(node) - return - - case let .nonCapturingGroup(kind, _): - guard let kindPattern = kind._patternString else { - try inconvertible(node) - return - } - options.beginScope() - defer { options.endScope() } - - output(kindPattern) - if case .changeMatchingOptions(let optionSequence) = kind.ast { - options.apply(optionSequence) - } - try outputList(&list) - output(")") - - case .ignoreCapturesInTypedOutput(_), - .limitCaptureNesting(_): - try outputList(&list) - case let .quantification(amount, kind, _): - try outputQuantification(&list, amount: amount, kind: kind) - case let .customCharacterClass(charClass): - outputCustomCharacterClass(charClass) - case let .atom(atom): - outputAtom(atom) - case let .quotedLiteral(literal): - output(prepareQuotedLiteral(literal)) - - case .trivia(_): - // TODO: Include trivia? - return - case .empty: - return - - case .conditional, .absentFunction, .consumer, .matcher, .characterPredicate: - saveInconvertible(node) - } - } - - mutating func outputAlternation(_ list: inout ArraySlice, count: Int) throws { - for i in 0.., count: Int) throws { - for _ in 0.., name: String?) throws { - if let name { - output("(?<\(name)>") - } else { - output("(") - } - try outputList(&list) - output(")") - } - - func requiresGrouping(_ list: ArraySlice) -> Bool { - guard let node = list.first else { return false } // malformed? - switch node { - case .concatenation(let children): - switch children.count { - case 0: - return false - case 1: - return requiresGrouping(list.dropFirst()) - default: - return true - } - - case .quotedLiteral(let literal): - return prepareQuotedLiteral(literal).count > 1 - - default: - return false - } - } - - mutating func outputQuantification( - _ list: inout ArraySlice, - amount: DSLTree._AST.QuantificationAmount, - kind: DSLTree.QuantificationKind - ) throws { - // RegexBuilder regexes can have children that need - if requiresGrouping(list) { - output("(?:") - try outputList(&list) - output(")") - } else { - try outputList(&list) - } - - switch amount.ast { - case .zeroOrMore: - output("*") - case .oneOrMore: - output("+") - case .zeroOrOne: - output("?") - case let .exactly(n): - output("{\(n.value!)}") - case let .nOrMore(n): - output("{\(n.value!),}") - case let .upToN(n): - output("{,\(n.value!)}") - case let .range(low, high): - output("{\(low.value!),\(high.value!)}") - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - - outputQuantificationKind(kind) - } } extension LiteralPrinter { @@ -270,9 +116,11 @@ extension LiteralPrinter { outputNode(child) output(")") - case let .ignoreCapturesInTypedOutput(child), - let .limitCaptureNesting(child): + case let .ignoreCapturesInTypedOutput(child): outputNode(child) + case .convertedRegexLiteral(let node, _): + outputNode(node) + case let .quantification(amount, kind, node): outputQuantification(amount, kind, node) case let .customCharacterClass(charClass): @@ -378,16 +226,13 @@ extension LiteralPrinter { } mutating func outputQuantificationKind(_ kind: DSLTree.QuantificationKind) { - guard let astKind = kind.quantificationKind?.ast else { + switch kind { + case .`default`: // We can treat this as if the current default had been given explicity. outputQuantificationKind( .explicit(.init(ast: options.defaultQuantificationKind))) - return - } - - if kind.isExplicit { - // Explicitly provided modifiers need to match the current option state. - switch astKind { + case let .explicit(kind): + switch kind.ast { case .eager: output(options.isReluctantByDefault ? "?" : "") case .reluctant: @@ -399,9 +244,9 @@ extension LiteralPrinter { fatalError() #endif } - } else { + case let .syntax(kind): // Syntactically-specified quantification modifiers can stay as-is. - switch astKind { + switch kind.ast { case .eager: output("") case .reluctant: @@ -609,15 +454,7 @@ extension String { } func escapingConfusableCharacters() -> String { - reduce(into: "") { result, ch in - for scalar in ch.unicodeScalars { - if scalar.isPrintableASCII { - result.append(Character(scalar)) - } else { - result.append(scalar.escapedString) - } - } - } + lazy.map(\.escapingConfusable).joined() } } diff --git a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift deleted file mode 100644 index 7a728365c..000000000 --- a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift +++ /dev/null @@ -1,398 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2025 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -extension DSLList { - private func _requiredAtomImpl( - _ position: inout Int, - options: inout MatchingOptions, - allowOptionsChanges: Bool - ) -> DSLTree.Atom?? { - guard position < nodes.count else { - return nil - } - - switch nodes[position] { - case .atom(let atom): - switch atom { - case .changeMatchingOptions(let seq): - // Exit early if an atom changes the matching options. - if allowOptionsChanges { - options.apply(seq.ast) - return nil - } else { - return .some(nil) - } - default: - return atom - } - - // In a concatenation, the first definitive child provides the answer, - // and then we need to skip past (in some cases at least) the remaining - // concatenation elements. - case .concatenation(let children): - var result: DSLTree.Atom?? = nil - var i = 0 - while i < children.count { - i += 1 - position += 1 - if let r = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) { - result = r - break - } - } - - for _ in i.. DSLTree.Atom? { - var position = 0 - var options = MatchingOptions() - return _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) ?? nil - } - - internal mutating func autoPossessifyNextQuantification( - _ position: inout Int, - options: inout MatchingOptions - ) -> (Int, DSLTree.Atom)? { - guard position < nodes.count else { - return nil - } - - switch nodes[position] { - case .quantification(_, _, _): - let quantPosition = position - position += 1 - - // Limit auto-possessification to a single quantified atom, to avoid - // issues of overlapped matches. - guard position < nodes.count else { - return nil - } - switch nodes[position] { - case .atom(let atom) where atom.isMatchable: - return (quantPosition, atom) - default: - var innerPosition = position - _ = autoPossessifyNextQuantification(&innerPosition, options: &options) - return nil - } - - case .concatenation(let children): - // If we find a valid quantification among this concatenation's components, - // we must look for a required atom in the sibling. If a definitive result - // is not found, pop up the recursion stack to find a sibling at a higher - // level. - var foundQuantification: (Int, DSLTree.Atom)? = nil - var foundNextAtom: DSLTree.Atom? = nil - var i = 0 - position += 1 - while i < children.count { - i += 1 - if let result = autoPossessifyNextQuantification(&position, options: &options) { - foundQuantification = result - break - } - } - - while i < children.count { - i += 1 - position += 1 - if let result = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: false) { - foundNextAtom = result - break - } - } - - for _ in i.. Bool { - switch (self, other) { - case (.char(let a), .char(let b)): - // Two characters are mutually exclusive if one does not match against - // the other. - // - // Relevant options: - // - semantic level - // - case insensitivity - - if options.semanticLevel == .graphemeCluster { - // Just call String.match(Character, ...) - let s = String(a) - return nil == s.match( - b, at: s.startIndex, - limitedBy: s.endIndex, - isCaseInsensitive: options.isCaseInsensitive) - } else { - // Call String.matchScalar(Scalar, ...) for each in scalar sequence - let s = String(a) - var i = s.startIndex - var j = b.unicodeScalars.startIndex - while i < s.endIndex { - guard j < b.unicodeScalars.endIndex else { return true } - guard let nextIndex = s.matchScalar(b.unicodeScalars[j], at: i, limitedBy: s.endIndex, boundaryCheck: false, isCaseInsensitive: options.isCaseInsensitive) else { - return true - } - i = nextIndex - b.unicodeScalars.formIndex(after: &j) - } - return false - } - - case (.scalar(let a), .scalar(let b)): - // Two scalars are mutually exclusive if one does not match against - // the other. - // - // Relevant options: - // - case insensitivity - let s = String(a) - return nil == s.matchScalar( - b, at: s.startIndex, - limitedBy: s.endIndex, - boundaryCheck: false, - isCaseInsensitive: options.isCaseInsensitive) - - case (.characterClass(let a), .characterClass(let b)): - // Certain character classes are mutually exclusive of each other. - return a.excludes(b, options: options) - - // For character class and char/scalar, we can test against the class's model. - case (.characterClass(let a), .char(let b)), (.char(let b), .characterClass(let a)): - let s = "\(b)" - return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) - case (.characterClass(let a), .scalar(let b)), (.scalar(let b), .characterClass(let a)): - let s = "\(b)" - return nil == a.asRuntimeModel(options).matches(in: s, at: s.startIndex, limitedBy: s.endIndex) - - default: - return false - } - } -} - -extension DSLTree.Atom.CharacterClass { - func excludes(_ other: Self, options: MatchingOptions) -> Bool { - if other == .anyGrapheme || other == .anyUnicodeScalar { - return false - } - - return switch self { - case .anyGrapheme, .anyUnicodeScalar: - false - - case .digit: - switch other { - case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, - .notWord, .notDigit: true - default: false - } - case .notDigit: - other == .digit - - case .horizontalWhitespace: - switch other { - case .word, .digit, .verticalWhitespace, .newlineSequence, - .notWhitespace, .notHorizontalWhitespace: true - default: false - } - case .notHorizontalWhitespace: - other == .horizontalWhitespace - - case .newlineSequence: - switch other { - case .word, .digit, .horizontalWhitespace, .notNewline: true - default: false - } - case .notNewline: - other == .newlineSequence - - case .whitespace: - switch other { - case .word, .digit, .notWhitespace: true - default: false - } - case .notWhitespace: - other == .whitespace - - case .verticalWhitespace: - switch other { - case .word, .digit, .notWhitespace, .notVerticalWhitespace: true - default: false - } - case .notVerticalWhitespace: - other == .verticalWhitespace - - case .word: - switch other { - case .whitespace, .horizontalWhitespace, .verticalWhitespace, .newlineSequence, - .notWord: true - default: false - } - case .notWord: - other == .word - } - } -} diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 2f6ebab64..34ca44f0d 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -179,9 +179,6 @@ extension PrettyPrinter { case let .ignoreCapturesInTypedOutput(child): printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) - case let .limitCaptureNesting(child): - printAsPattern(convertedFromAST: child, isTopLevel: isTopLevel) - case .conditional: print("/* TODO: conditional */") @@ -261,6 +258,20 @@ extension PrettyPrinter { break + case let .convertedRegexLiteral(.atom(a), _): + if let pattern = a._patternBase(&self), pattern.canBeWrapped { + printAtom(pattern.0) + return + } + + break + case let .convertedRegexLiteral(.customCharacterClass(ccc), _): + if ccc.isSimplePrint { + printSimpleCCC(ccc) + return + } + + break default: break } @@ -294,6 +305,13 @@ extension PrettyPrinter { case let .quotedLiteral(v): print(v._quoted) + case let .convertedRegexLiteral(n, _): + // FIXME: This recursion coordinates with back-off + // check above, so it should work out. Need a + // cleaner way to do this. This means the argument + // label is a lie. + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) + case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -1413,6 +1431,9 @@ extension DSLTree.Node { result += node.getNamedCaptures() } + case .convertedRegexLiteral(let node, _): + result += node.getNamedCaptures() + case .quantification(_, _, let node): result += node.getNamedCaptures() diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 2c376fd6d..49094d4f1 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -13,146 +13,28 @@ internal import _RegexParser extension AST { var dslTree: DSLTree { - return DSLTree(.limitCaptureNesting(root.dslTreeNode)) + return DSLTree(root.dslTreeNode) } } extension AST.Node { - func convert(into list: inout [DSLTree.Node]) throws { - switch self { - case .alternation(let alternation): - list.append(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: alternation.children.count))) - for child in alternation.children { - try child.convert(into: &list) - } - case .concatenation(let concatenation): - let coalesced = self.coalescedChildren - list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: coalesced.count))) - for child in coalesced { - try child.convert(into: &list) - } - case .group(let group): - let child = group.child - switch group.kind.value { - case .capture: - list.append(.capture(TEMP_FAKE_NODE)) - try child.convert(into: &list) - case .namedCapture(let name): - list.append(.capture(name: name.value, TEMP_FAKE_NODE)) - try child.convert(into: &list) - case .balancedCapture: - throw Unsupported("TODO: balanced captures") - default: - list.append(.nonCapturingGroup(.init(ast: group.kind.value), TEMP_FAKE_NODE)) - try child.convert(into: &list) - } - case .conditional(let conditional): - list.append(.conditional(.init(ast: conditional.condition.kind), TEMP_FAKE_NODE, TEMP_FAKE_NODE)) - try conditional.trueBranch.convert(into: &list) - try conditional.falseBranch.convert(into: &list) - case .quantification(let quant): - list.append( - .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)), TEMP_FAKE_NODE)) - try quant.child.convert(into: &list) - case .quote(let node): - list.append(.quotedLiteral(node.literal)) - case .trivia(let node): - list.append(.trivia(node.contents)) - case .interpolation(_): - throw Unsupported("TODO: interpolation") - case .atom(let atom): - switch atom.kind { - case .scalarSequence(let seq): - // The DSL doesn't have an equivalent node for scalar sequences. Splat - // them into a concatenation of scalars. - // list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: seq.scalarValues.count))) - list.append(.quotedLiteral(String(seq.scalarValues))) - default: - list.append(.atom(atom.dslTreeAtom)) - } - case .customCharacterClass(let ccc): - list.append(.customCharacterClass(ccc.dslTreeClass)) - case .absentFunction(let abs): - // TODO: What should this map to? - list.append(.absentFunction(.init(ast: abs))) - case .empty(_): - list.append(.empty) - } - } - - var coalescedChildren: [AST.Node] { - // Before converting a concatenation in a tree to list form, we need to - // flatten out any nested concatenations, and coalesce any adjacent - // characters and scalars, forming quoted literals of their contents, - // over which we can perform grapheme breaking. - - func flatten(_ node: AST.Node) -> [AST.Node] { + /// Converts an AST node to a `convertedRegexLiteral` node. + var dslTreeNode: DSLTree.Node { + func wrap(_ node: DSLTree.Node) -> DSLTree.Node { switch node { - case .concatenation(let concat): - return concat.children.flatMap(flatten) - default: - return [node] - } - } - - func appendAtom(_ atom: AST.Atom, to str: inout String) -> Bool { - switch atom.kind { - case .char(let c): - str.append(c) - return true - case .scalar(let s): - str.append(Character(s.value)) - return true - case .escaped(let c): - guard let value = c.scalarValue else { return false } - str.append(Character(value)) - return true - case .scalarSequence(let seq): - str.append(contentsOf: seq.scalarValues.lazy.map(Character.init)) - return true - + case .convertedRegexLiteral: + // FIXME: DSL can have one item concats +// assertionFailure("Double wrapping?") + return node default: - return false + break } + // TODO: Should we do this for the + // single-concatenation child too, or should? + // we wrap _that_? + return .convertedRegexLiteral(node, .init(ast: self)) } - - switch self { - case .alternation(let v): return v.children - case .concatenation(let v): - let children = v.children - .flatMap(flatten) - .coalescing(with: "", into: { AST.Node.quote(.init($0, .fake)) }) { str, node in - switch node { - case .atom(let a): - return appendAtom(a, to: &str) - case .quote(let q): - str += q.literal - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - return children - case .group(let group): - return [group.child] - case .conditional(let conditional): - return [conditional.trueBranch, conditional.falseBranch] - case .quantification(let quant): - return [quant.child] - case .quote, .trivia, .interpolation, .atom, .customCharacterClass, .absentFunction, .empty: - return [] - } - } -} - -extension AST.Node { - /// Converts an AST node to a `convertedRegexLiteral` node. - var dslTreeNode: DSLTree.Node { // Convert the top-level node without wrapping func convert() throws -> DSLTree.Node { switch self { @@ -223,8 +105,9 @@ extension AST.Node { } } + // FIXME: make total function again let converted = try! convert() - return converted + return wrap(converted) } } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index a4e405f8c..ae8193804 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -265,7 +265,7 @@ extension Regex { /// - Parameter name: The name to look for among the regular expression's /// capture groups. Capture group names are case sensitive. public func contains(captureNamed name: String) -> Bool { - program.list.captureList.captures.contains(where: { + program.tree.captureList.captures.contains(where: { $0.name == name }) } @@ -284,7 +284,7 @@ extension Regex where Output == AnyRegexOutput { /// - Parameter regex: A regular expression to convert to use a dynamic /// capture list. public init(_ regex: Regex) { - self.init(list: regex.list) + self.init(node: regex.root) } } @@ -331,7 +331,7 @@ extension Regex { _ regex: Regex, as outputType: Output.Type = Output.self ) { - self.init(list: regex.list) + self.init(node: regex.root) guard _verifyType().0 else { return nil diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 425f64549..11445531c 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -11,8 +11,6 @@ internal import _RegexParser -let TEMP_FAKE_NODE = DSLTree.Node.empty - /// A type that represents a regular expression. /// /// You can use types that conform to `RegexComponent` as parameters to string @@ -93,10 +91,7 @@ public struct Regex: RegexComponent { let program: Program var hasCapture: Bool { - program.list.hasCapture - } - var hasChildren: Bool { - program.list.hasChildren + program.tree.hasCapture } init(ast: AST) { @@ -153,7 +148,7 @@ extension Regex { /// FIXME: If Regex is the unit of composition, then it should be a Node instead, /// and we should have a separate type that handled both global options and, /// likely, compilation/caching. - var list: DSLList + let tree: DSLTree /// OptionSet of compiler options for testing purposes fileprivate var compileOptions: _CompileOptions = .default @@ -183,7 +178,7 @@ extension Regex { } // Compile the DSLTree into a lowered program and store it atomically. - let compiledProgram = try! Compiler(tree: list, compileOptions: compileOptions).emit() + let compiledProgram = try! Compiler(tree: tree, compileOptions: compileOptions).emit() let storedNewProgram = _stdlib_atomicInitializeARCRef( object: _loweredProgramStoragePtr, desired: ProgramBox(compiledProgram)) @@ -196,15 +191,11 @@ extension Regex { } init(ast: AST) { - self.list = DSLList(ast: ast) + self.tree = ast.dslTree } init(tree: DSLTree) { - self.list = DSLList(tree: tree) - } - - init(list: DSLList) { - self.list = list + self.tree = tree } } @@ -223,77 +214,12 @@ extension Regex { @available(SwiftStdlib 5.7, *) extension Regex { - var list: DSLList { - program.list - } - - init(node: DSLTree.Node) { - self.program = Program(list: .init(node)) + var root: DSLTree.Node { + program.tree.root } - init(list: DSLList) { - self.program = Program(list: list) - } - - func appending(_ node: DSLTree.Node) -> Regex { - var list = program.list - list.append(node) - return Regex(list: list) - } - - func appending(contentsOf node: [DSLTree.Node]) -> Regex { - var list = program.list - list.append(contentsOf: node) - return Regex(list: list) - } - - func concatenating(_ other: DSLList) -> Regex { - // TODO: Quick check to see if these copies are necessary? - var list = program.list - var other = other - list.coalesce(withFirstAtomIn: &other) - - // Sometimes coalescing consumes all of `other` - guard !other.nodes.isEmpty else { - return Regex(list: list) - } - - // Use an existing concatenation if it's already the root; - // otherwise, embed self and other in a new concatenation root. - switch list.nodes[0] { - case .concatenation(let children): - list.nodes[0] = .concatenation(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) - list.nodes.append(contentsOf: other.nodes) - default: - list.nodes.insert(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) - list.nodes.append(contentsOf: other.nodes) - } - return Regex(list: list) - } - - func alternating(with other: some Collection) -> Regex { - var nodes = program.list.nodes - switch nodes[0] { - case .orderedChoice(let children): - nodes[0] = .orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) - nodes.append(contentsOf: other) - default: - nodes.insert(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) - nodes.append(contentsOf: other) - } - return Regex(list: DSLList(nodes)) - } - - func prepending(_ node: DSLTree.Node) -> Regex { - var list = program.list - list.prepend(node) - return Regex(list: list) - } - - func prepending(contentsOf node: some Collection) -> Regex { - var list = program.list - list.prepend(contentsOf: node) - return Regex(list: list) + init(node: DSLTree.Node) { + self.program = Program(tree: .init(node)) } } @@ -316,7 +242,7 @@ extension Regex { return true case .recompile: let _ = try Compiler( - tree: program.list, + tree: program.tree, compileOptions: program.compileOptions).emit() return true } diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift deleted file mode 100644 index 8e53c87d1..000000000 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ /dev/null @@ -1,221 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2025 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -internal import _RegexParser - -struct DSLList { - var nodes: [DSLTree.Node] - - // experimental - var hasCapture: Bool = false - var hasChildren: Bool { - (nodes.first?.directChildren ?? 0) > 0 - } - - var captureList: CaptureList { - .Builder.build(self) - } - - init(_ initial: DSLTree.Node) { - self.nodes = [initial] - } - - init(_ nodes: [DSLTree.Node]) { - self.nodes = nodes - } - - init(tree: DSLTree) { - self.nodes = Array(tree.depthFirst) - } - - init(ast: AST) { - self.nodes = [.limitCaptureNesting(TEMP_FAKE_NODE)] - try! ast.root.convert(into: &nodes) - } - - var first: DSLTree.Node { - nodes.first ?? .empty - } -} - -extension DSLList { - mutating func append(_ node: DSLTree.Node) { - nodes.append(node) - } - - mutating func append(contentsOf other: some Sequence) { - nodes.append(contentsOf: other) - } - - mutating func prepend(_ node: DSLTree.Node) { - nodes.insert(node, at: 0) - } - - mutating func prepend(contentsOf other: some Collection) { - nodes.insert(contentsOf: other, at: 0) - } -} - -extension DSLTree.Node { - var directChildren: Int { - switch self { - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return 0 - - case .orderedChoice(let c), .concatenation(let c): - return c.count - - case .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, - .limitCaptureNesting, .conditional: - return 1 - - case .absentFunction: - return 0 - } - } -} - -extension DSLTree { - struct DepthFirst: Sequence, IteratorProtocol { - typealias Element = DSLTree.Node - private var stack: [Frame] - private let getChildren: (Element) -> [Element] - - private struct Frame { - let node: Element - let children: [Element] - var nextIndex: Int = 0 - } - - fileprivate init( - root: Element, - getChildren: @escaping (Element) -> [Element] - ) { - self.getChildren = getChildren - self.stack = [Frame(node: root, children: getChildren(root))] - } - - mutating func next() -> Element? { - guard let top = stack.popLast() else { return nil } - // Push children in reverse so leftmost comes out first. - for child in top.children.reversed() { - stack.append(Frame(node: child, children: getChildren(child))) - } - - // Since we coalesce the children before adding them to the stack, - // we need an exact matching number of children in the list's - // concatenation node, so that it can provide the correct component - // count. This will go away/change when .concatenation only stores - // a count. - return switch top.node { - case .concatenation: - .concatenation(top.node.coalescedChildren) - default: - top.node - } - } - } - - var depthFirst: DepthFirst { - DepthFirst(root: root, getChildren: { - $0.coalescedChildren - }) - } -} - -extension DSLList { - internal func skipNode(_ position: inout Int) { - guard position < nodes.count else { - return - } - switch nodes[position] { - case let .orderedChoice(children): - let n = children.count - for _ in 0.. Int? { - switch nodes[position] { - case .concatenation(let children): - var position = position + 1 - if findLast { - for _ in 0..<(children.count - 1) { - skipNode(&position) - position += 1 - } - } - return indexOfCoalescableAtom(startingAt: position, findLast: findLast) - case .ignoreCapturesInTypedOutput, .limitCaptureNesting: - return indexOfCoalescableAtom(startingAt: position + 1, findLast: findLast) - case .atom(let atom): - if atom.literalCharacterValue != nil { - return position - } - case .quotedLiteral: - return position - default: - break - } - return nil - } - - mutating func coalesce(withFirstAtomIn other: inout DSLList) { - // Find the last coalescable node in the LHS and the first in the RHS - guard let prefixIndex = indexOfCoalescableAtom(startingAt: 0, findLast: true), - let postfixIndex = other.indexOfCoalescableAtom(startingAt: 0), - let prefixValue = nodes[prefixIndex].literalStringValue, - let postfixValue = other.nodes[postfixIndex].literalStringValue - else { return } - - // Replace the prefix node with a coalesced version of the two - nodes[prefixIndex] = .quotedLiteral(prefixValue + postfixValue) - - // Remove the postfix node and fix up any parent concatenations - other.nodes.remove(at: postfixIndex) - var i = postfixIndex - 1 - Loop: - while i >= 0 { - switch other.nodes[i] { - case .concatenation(let children): - other.nodes[i] = .concatenation(.init(repeating: .empty, count: children.count - 1)) - break Loop - case .limitCaptureNesting, .ignoreCapturesInTypedOutput: - other.nodes.remove(at: i) - i -= 1 - default: - break Loop - } - } - } -} diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 12f559729..5971cd93a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -44,8 +44,7 @@ extension DSLTree { /// Marks all captures in a subpattern as ignored in strongly-typed output. case ignoreCapturesInTypedOutput(Node) - case limitCaptureNesting(Node) - + // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -80,6 +79,13 @@ extension DSLTree { /// TODO: Consider splitting off expression functions, or have our own kind case absentFunction(_AST.AbsentFunction) + // MARK: - Tree conversions + + /// The target of AST conversion. + /// + /// Keeps original AST around for rich syntactic and source information + case convertedRegexLiteral(Node, _AST.ASTNode) + // MARK: - Extensibility points case consumer(_ConsumerInterface) @@ -92,38 +98,19 @@ extension DSLTree { } extension DSLTree { - struct QuantificationKind { - var quantificationKind: _AST.QuantificationKind? - var isExplicit: Bool - var canAutoPossessify: Bool? - + enum QuantificationKind { /// The default quantification kind, as set by options. - static var `default`: Self { - .init(quantificationKind: nil, isExplicit: false, canAutoPossessify: nil) - } - + case `default` /// An explicitly chosen kind, overriding any options. - static func explicit(_ kind: _AST.QuantificationKind) -> Self { - .init(quantificationKind: kind, isExplicit: true, canAutoPossessify: nil) - } - + case explicit(_AST.QuantificationKind) /// A kind set via syntax, which can be affected by options. - static func syntax(_ kind: _AST.QuantificationKind) -> Self { - .init(quantificationKind: kind, isExplicit: false, canAutoPossessify: nil) - } + case syntax(_AST.QuantificationKind) var ast: AST.Quantification.Kind? { - quantificationKind?.ast - } - - func applying(options: MatchingOptions) -> AST.Quantification.Kind { - guard let kind = quantificationKind?.ast else { - return options.defaultQuantificationKind - } - return if isExplicit { - kind - } else { - kind.applying(options) + switch self { + case .default: return nil + case .explicit(let kind), .syntax(let kind): + return kind.ast } } } @@ -397,9 +384,8 @@ extension DSLTree.Node { case .orderedChoice(let c), .concatenation(let c): return !c.isEmpty - case .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, - .conditional: + case .convertedRegexLiteral, .capture, .nonCapturingGroup, + .quantification, .ignoreCapturesInTypedOutput, .conditional: return true case .absentFunction(let abs): @@ -412,72 +398,16 @@ extension DSLTree.Node { switch self { case let .orderedChoice(v): return v - case let .concatenation(v): return v - - case let .capture(_, _, n, _): return [n] - case let .nonCapturingGroup(_, n): return [n] - case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] - - case let .conditional(_, t, f): return [t,f] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode) - } - } - - public var coalescedChildren: [DSLTree.Node] { - // Before converting a concatenation in a tree to list form, we need to - // flatten out any nested concatenations, and coalesce any adjacent - // characters and scalars, forming quoted literals of their contents, - // over which we can perform grapheme breaking. + case let .concatenation(v): return v - func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { - switch node { - case .concatenation(let ch): - return ch.flatMap(flatten) - case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): - return flatten(n) - default: - return [node] - } - } - - switch self { - case let .orderedChoice(v): return v - case let .concatenation(v): - let children = v - .flatMap(flatten) - .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in - switch node { - case .atom(let a): - guard let c = a.literalCharacterValue else { return false } - str.append(c) - return true - case .quotedLiteral(let q): - str += q - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - return children + case let .convertedRegexLiteral(n, _): + // Treat this transparently + return n.children case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] + case let .ignoreCapturesInTypedOutput(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -494,12 +424,18 @@ extension DSLTree.Node { extension DSLTree.Node { var astNode: AST.Node? { - nil + switch self { + case let .convertedRegexLiteral(_, literal): return literal.ast + default: return nil + } } /// If this node is for a converted literal, look through it. var lookingThroughConvertedLiteral: Self { - self + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } } } @@ -514,16 +450,6 @@ extension DSLTree.Atom { } } -extension DSLTree.Node { - var literalStringValue: String? { - switch self { - case .atom(let a): return a.literalCharacterValue.map(String.init) - case .quotedLiteral(let s): return s - default: return nil - } - } -} - extension DSLTree { struct Options { // TBD @@ -542,6 +468,10 @@ extension DSLTree.Node { switch self { case .capture: return true + case let .convertedRegexLiteral(n, re): + assert(n.hasCapture == re.ast.hasCapture) + return n.hasCapture + default: return self.children.any(\.hasCapture) } @@ -725,9 +655,6 @@ extension CaptureList.Builder { case let .ignoreCapturesInTypedOutput(child): addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) - case let .limitCaptureNesting(child): - addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - case let .conditional(cond, trueBranch, falseBranch): switch cond.ast { case .group(let g): @@ -758,11 +685,11 @@ extension CaptureList.Builder { #endif } -// case let .convertedRegexLiteral(n, _): -// // We disable nesting for converted AST trees, as literals do not nest -// // captures. This includes literals nested in a DSL. -// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) -// + case let .convertedRegexLiteral(n, _): + // We disable nesting for converted AST trees, as literals do not nest + // captures. This includes literals nested in a DSL. + return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) + case .matcher: break @@ -779,91 +706,6 @@ extension CaptureList.Builder { builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) return builder.captures } - - mutating func addCaptures( - in list: inout ArraySlice, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool - ) { - guard let node = list.popFirst() else { return } - switch node { - case let .orderedChoice(children): - for _ in 0.. CaptureList { - var builder = Self() - builder.captures.append( - .init(type: dsl.first.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) - var nodes = dsl.nodes[...] - builder.addCaptures(in: &nodes, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) - return builder.captures - } } extension DSLTree.Node { @@ -875,8 +717,8 @@ extension DSLTree.Node { return true case .orderedChoice, .concatenation, .capture, .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .limitCaptureNesting, - .consumer, .absentFunction, + .trivia, .empty, .quotedLiteral, .absentFunction, + .convertedRegexLiteral, .consumer, .characterPredicate, .matcher: return false } @@ -901,23 +743,6 @@ extension DSLTree.Node { } } -extension DSLList { - - /// Returns the output-defining node, peering through any output-forwarding - /// nodes. - var outputDefiningNode: DSLTree.Node? { - nodes.first(where: { !$0.isOutputForwarding }) - } - - /// Returns the type of the whole match, i.e. `.0` element type of the output. - var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { - return type - } - return Substring.self - } -} - extension DSLTree.Node { /// Implementation for `canOnlyMatchAtStart`, which maintains the option /// state. @@ -980,7 +805,8 @@ extension DSLTree.Node { options.beginScope() defer { options.endScope() } return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): + case .ignoreCapturesInTypedOutput(let child), + .convertedRegexLiteral(let child, _): return child._canOnlyMatchAtStartImpl(&options) // A quantification that doesn't require its child to exist can still @@ -1020,146 +846,6 @@ extension DSLTree.Node { } } -// MARK: Required first and last atoms - -extension DSLTree.Node { - private func _requiredAtomImpl(forward: Bool) -> DSLTree.Atom?? { - switch self { - case .atom(let atom): - return switch atom { - case .changeMatchingOptions: - nil - default: - atom - } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - if forward { - for child in children { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } else { - for child in children.reversed() { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } - return nil - - // For a quoted literal, we can look at the first char - // TODO: matching semantics??? - case .quotedLiteral(let str): - return str.first.map(DSLTree.Atom.char) - - // TODO: custom character classes could/should participate here somehow - case .customCharacterClass: - return .some(nil) - - // Trivia/empty have no effect. - case .trivia, .empty: - return nil - - // For alternation and conditional, no required first (this could change - // if we identify the _same_ required first atom across all possibilities). - case .orderedChoice, .conditional: - return .some(nil) - - // Groups (and other parent nodes) defer to the child. - case .nonCapturingGroup(_, let child), .capture(_, _, let child, _), - .ignoreCapturesInTypedOutput(let child), - .limitCaptureNesting(let child): - return child._requiredAtomImpl(forward: forward) - - // A quantification that doesn't require its child to exist can still - // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, let child): - return amount.requiresAtLeastOne - ? child._requiredAtomImpl(forward: forward) - : .some(nil) - - // Extended behavior isn't known, so we return `false` for safety. - case .consumer, .matcher, .characterPredicate, .absentFunction: - return .some(nil) - } - } - - internal func requiredFirstAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: true) ?? nil - } - - internal func requiredLastAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: false) ?? nil - } -} - - -private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTree.Atom?? { - guard let node = list.popFirst() else { - return nil - } - switch node { - case .atom(let atom): - return switch atom { - case .changeMatchingOptions: - nil - default: - atom - } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - for _ in 0..) -> DSLTree.Atom? { - _requiredAtomImpl(&list) ?? nil -} - // MARK: AST wrapper types // // These wrapper types are required because even @_spi-marked public APIs can't @@ -1183,13 +869,14 @@ extension DSLTree { case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) + case let .convertedRegexLiteral(n, _): + // Treat this transparently + return _Tree(n).children + case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] - case let .limitCaptureNesting(n): - // This is a transparent wrapper - return _Tree(n).children + case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] @@ -1223,14 +910,6 @@ extension DSLTree { internal var isNegativeLookahead: Bool { self.ast == .negativeLookahead } - - internal var isChangeMatchingOptions: Bool { - if case let .changeMatchingOptions = ast { - return true - } else { - return false - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 5b3121831..6911af911 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -294,9 +294,7 @@ extension RegexComponent { let sequence = shouldAdd ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) - - var list = regex.program.list - list.nodes.insert(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), TEMP_FAKE_NODE), at: 0) - return Regex(list: list) + return Regex(node: .nonCapturingGroup( + .init(ast: .changeMatchingOptions(sequence)), regex.root)) } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 5f8dc83a2..0c224e159 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -26,9 +26,9 @@ public struct _RegexFactory { _ child: some RegexComponent ) -> Regex { // Don't wrap `child` again if it's a leaf node. - child.regex.list.hasChildren - ? child.regex.prepending(.ignoreCapturesInTypedOutput(TEMP_FAKE_NODE)) as Regex - : .init(list: child.regex.program.list) + child.regex.root.hasChildNodes + ? .init(node: .ignoreCapturesInTypedOutput(child.regex.root)) + : .init(node: child.regex.root) } @available(SwiftStdlib 5.7, *) @@ -36,7 +36,7 @@ public struct _RegexFactory { _ left: some RegexComponent, _ right: some RegexComponent ) -> Regex { - left.regex.concatenating(right.regex.program.list) + .init(node: left.regex.root.appending(right.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -44,7 +44,7 @@ public struct _RegexFactory { _ left: some RegexComponent, _ right: some RegexComponent ) -> Regex { - left.regex.alternating(with: right.regex.program.list.nodes) + .init(node: left.regex.root.appendingAlternationCase(right.regex.root)) } @_spi(RegexBuilder) @@ -107,7 +107,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrOne, kind, TEMP_FAKE_NODE)) + return .init(node: .quantification(.zeroOrOne, kind, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -116,7 +116,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrMore, kind, TEMP_FAKE_NODE)) + return .init(node: .quantification(.zeroOrMore, kind, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -125,7 +125,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.oneOrMore, kind, TEMP_FAKE_NODE)) + return .init(node: .quantification(.oneOrMore, kind, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -133,7 +133,7 @@ public struct _RegexFactory { _ count: Int, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.quantification(.exactly(count), .default, TEMP_FAKE_NODE)) + .init(node: .quantification(.exactly(count), .default, component.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -142,14 +142,14 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior?, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.repeating(range, behavior, TEMP_FAKE_NODE)) + .init(node: .repeating(range, behavior, component.regex.root)) } @available(SwiftStdlib 5.7, *) public func atomicNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing, TEMP_FAKE_NODE)) + .init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) } @_spi(RegexBuilder) @@ -157,7 +157,7 @@ public struct _RegexFactory { public func lookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.lookahead, TEMP_FAKE_NODE)) + .init(node: .nonCapturingGroup(.lookahead, component.regex.root)) } @_spi(RegexBuilder) @@ -165,21 +165,21 @@ public struct _RegexFactory { public func negativeLookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.negativeLookahead, TEMP_FAKE_NODE)) + .init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) } @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.orderedChoice([TEMP_FAKE_NODE])) + .init(node: .orderedChoice([component.regex.root])) } @available(SwiftStdlib 5.7, *) public func capture( - _ component: some RegexComponent + _ r: some RegexComponent ) -> Regex { - component.regex.prepending(.capture(TEMP_FAKE_NODE)) + .init(node: .capture(r.regex.root)) } @available(SwiftStdlib 5.7, *) @@ -187,7 +187,10 @@ public struct _RegexFactory { _ component: some RegexComponent, _ reference: Int ) -> Regex { - component.regex.prepending(.capture(reference: ReferenceID(reference), TEMP_FAKE_NODE)) + .init(node: .capture( + reference: ReferenceID(reference), + component.regex.root + )) } @available(SwiftStdlib 5.7, *) @@ -196,12 +199,11 @@ public struct _RegexFactory { _ reference: Int? = nil, _ transform: @escaping (W) throws -> NewCapture ) -> Regex { - component.regex.prepending( - .capture( - reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, - CaptureTransform(transform) - )) + .init(node: .capture( + reference: reference.map { ReferenceID($0) }, + component.regex.root, + CaptureTransform(transform) + )) } @available(SwiftStdlib 5.7, *) @@ -210,11 +212,10 @@ public struct _RegexFactory { _ reference: Int? = nil, _ transform: @escaping (W) throws -> NewCapture? ) -> Regex { - component.regex.prepending( - .capture( - reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, - CaptureTransform(transform) - )) + .init(node: .capture( + reference: reference.map { ReferenceID($0) }, + component.regex.root, + CaptureTransform(transform) + )) } } diff --git a/Sources/_StringProcessing/Utility/TypeVerification.swift b/Sources/_StringProcessing/Utility/TypeVerification.swift index 566127220..11796d1e3 100644 --- a/Sources/_StringProcessing/Utility/TypeVerification.swift +++ b/Sources/_StringProcessing/Utility/TypeVerification.swift @@ -21,7 +21,7 @@ extension Regex { var tupleElements: [Any.Type] = [] var labels = "" - for capture in program.list.captureList.captures { + for capture in program.tree.captureList.captures { var captureType = capture.type var i = capture.optionalDepth @@ -41,7 +41,7 @@ extension Regex { // If we have no captures, then our Regex must be Regex. if tupleElements.count == 1 { - let wholeMatchType = program.list.wholeMatchType + let wholeMatchType = program.tree.root.wholeMatchType return (Output.self == wholeMatchType, wholeMatchType) } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 34cc20ad7..63ee266ec 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -157,7 +157,7 @@ func captureTest( } // Ensure DSLTree preserves literal captures - var dslCapList = DSLList(ast: ast).captureList + var dslCapList = ast.dslTree.captureList // Peel off the whole match element. dslCapList.captures.removeFirst() guard dslCapList == capList else { diff --git a/Tests/RegexTests/DSLListTests.swift b/Tests/RegexTests/DSLListTests.swift deleted file mode 100644 index 3b99b40f3..000000000 --- a/Tests/RegexTests/DSLListTests.swift +++ /dev/null @@ -1,35 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2025 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -import Testing -@testable import _StringProcessing - -@Suite -struct DSLListTests { - @available(macOS 9999, *) - @Test(arguments: [ - (#/a/#, 2), // literal, a - (#/abcd+/#, 5), // literal, concat, abc, quant, d - (#/a(?:b+)c*/#, 8), // literal, concat, a, noncap grp, quant, b, quant, c - ]) - func convertedNodeCount(regex: Regex, nodeCount: Int) { - let dslList = regex.program.list - #expect(dslList.nodes.count == nodeCount) - } - - @Test(arguments: [#/a|b/#, #/a+b?c/#, #/abc/#, #/a(?:b+)c*/#, #/;[\r\n]/#, #/(?=(?:[1-9]|(?:a|b)))/#]) - func compilationComparison(regex: Regex) throws { - let listCompiler = Compiler(list: regex.program.list) - let listProgram = try listCompiler.emitViaList() - -// #expect(treeProgram.instructions == listProgram.instructions) - } -} diff --git a/Tests/RegexTests/LiteralPrinterTests.swift b/Tests/RegexTests/LiteralPrinterTests.swift index 69f273fd5..dd15d8cd1 100644 --- a/Tests/RegexTests/LiteralPrinterTests.swift +++ b/Tests/RegexTests/LiteralPrinterTests.swift @@ -41,9 +41,6 @@ extension RegexTests { } func testUnicodeEscapes() throws { - let regex0 = #/[a]\u0301/# - _literalTest(regex0, expected: #"[a]\u0301"#) - let regex = #/\r\n\t cafe\u{301} \u{1D11E}/# _literalTest(regex, expected: #"\r\n\t cafe\u0301 \U0001D11E"#) } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 387a71d62..494acb3a2 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -26,16 +26,14 @@ struct MatchError: Error { @available(SwiftStdlib 6.0, *) func _roundTripLiteral( _ regexStr: String, - syntax: SyntaxOptions, - file: StaticString = #file, - line: UInt = #line + syntax: SyntaxOptions ) throws -> Regex? { guard let pattern = try Regex(regexStr, syntax: syntax)._literalPattern else { return nil } let remadeRegex = try Regex(pattern) - XCTAssertEqual(pattern, remadeRegex._literalPattern, file: file, line: line) + XCTAssertEqual(pattern, remadeRegex._literalPattern) return remadeRegex } @@ -44,13 +42,11 @@ func _firstMatch( input: String, validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, - syntax: SyntaxOptions = .traditional, - file: StaticString = #file, - line: UInt = #line + syntax: SyntaxOptions = .traditional ) throws -> (String, [String?])? { var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) let result = try regex.firstMatch(in: input) - + func validateSubstring(_ substringInput: Substring) throws { // Sometimes the characters we add to a substring merge with existing // string members. This messes up cross-validation, so skip the test. @@ -93,14 +89,14 @@ func _firstMatch( } if #available(SwiftStdlib 6.0, *) { - let roundTripRegex = try? _roundTripLiteral(regexStr, syntax: syntax, file: file, line: line) + let roundTripRegex = try? _roundTripLiteral(regexStr, syntax: syntax) let roundTripResult = try? roundTripRegex? .matchingSemantics(semanticLevel) .firstMatch(in: input)?[0] .substring switch (result?[0].substring, roundTripResult) { case let (match?, rtMatch?): - XCTAssertEqual(match, rtMatch, file: file, line: line) + XCTAssertEqual(match, rtMatch) case (nil, nil): break // okay case let (match?, _): @@ -109,18 +105,14 @@ func _firstMatch( For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex?._literalPattern ?? "")' - """, - file: file, - line: line) + """) case let (_, rtMatch?): XCTFail(""" Incorrectly matched as '\(rtMatch)' For input '\(input)' Original: '\(regexStr)' _literalPattern: '\(roundTripRegex!._literalPattern!)' - """, - file: file, - line: line) + """) } } @@ -192,8 +184,7 @@ func flatCaptureTest( input: test, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax, - file: file, line: line + syntax: syntax ) else { if expect == nil { continue @@ -312,8 +303,7 @@ func firstMatchTest( input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax, - file: file, line: line)?.0 + syntax: syntax)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) @@ -720,31 +710,6 @@ extension RegexTests { ("baaaaabc", nil), ("baaaaaaaabc", nil)) - // Auto-possessification tests: - // - case sensitive - firstMatchTests( - "a+A", - ("aaaaA", "aaaaA"), - ("aaaaa", nil), - ("aaAaa", "aaA")) - // - case insensitive - firstMatchTests( - "(?i:a+A)", - ("aaaaA", "aaaaA"), - ("aaaaa", "aaaaa")) - firstMatchTests( - "(?i)a+A", - ("aaaaA", "aaaaA"), - ("aaaaa", "aaaaa")) - firstMatchTests( - "a+(?i:A)", - ("aaaaA", "aaaaA"), - ("aaaaa", "aaaaa")) - firstMatchTests( - "a+(?:(?i)A)", - ("aaaaA", "aaaaA"), - ("aaaaa", "aaaaa")) - // XFAIL'd possessive tests firstMatchTests( "a?+a", diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift deleted file mode 100644 index a60d9bf5f..000000000 --- a/Tests/RegexTests/OptimizationTests.swift +++ /dev/null @@ -1,68 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2025 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -import Testing -@testable @_spi(RegexBuilder) import _StringProcessing -@testable import _RegexParser - -@Suite struct OptimizationTests { - @available(macOS 9999, *) - @Test(arguments: [#/a/#, #/a+/#, #/(?:a+)/#, #/(?:a)+/#, #/(?m)a+/#, #/ab?c/#, #/(?:a+)+$/#, #/(?:(?:a+b)+b)/#]) - func requiredFirstAtom(pattern: Regex) throws { - let list = pattern.program.list - let atom = list.requiredFirstAtom(allowOptionsChanges: true) - #expect(atom?.literalCharacterValue == "a", "Missing first character atom in '\(pattern._literalPattern!)'") - } - - @available(macOS 9999, *) - @Test(arguments: [#/a?/#, #/(?:a|b)/#, #/[a]/#, #/a?bc/#]) - func noRequiredFirstAtom(pattern: Regex) throws { - let list = pattern.program.list - let atom = list.requiredFirstAtom(allowOptionsChanges: true) - #expect(atom == nil, "Unexpected required first atom in '\(pattern._literalPattern!)'") - } - - @available(macOS 9999, *) - @Test(arguments: [#/a+b/#, #/a*b/#, #/\w+\s/#, #/(?:a+b|b+a)/#, #/\d+a/#, #/a+A/#]) - func autoPossessify(pattern: Regex) throws { - var list = pattern.program.list - list.autoPossessify() - for node in list.nodes { - switch node { - case .quantification(_, let kind, _): - #expect( - kind.isExplicit && kind.quantificationKind?.ast == .possessive, - "Expected possessification in '\(pattern._literalPattern!)'") - default: break - } - } - } - - @available(macOS 9999, *) - @Test(arguments: [ - #/a?/#, #/a+a/#, #/a+(?:b|c)/#, #/(?:a+|b+)/#, #/[a]/#, #/a?a/#, - #/(?i)a+A/#, #/(?i:a+A)/#, // case insensitivity when checking exclusion - #/(?:(?:ab)+b)/#, // single atom quantifications only - ]) - func noAutoPossessify(pattern: Regex) throws { - var list = pattern.program.list - list.autoPossessify() - for node in list.nodes { - switch node { - case .quantification(_, let kind, _): - #expect( - kind.quantificationKind?.ast != .possessive, - "Unexpected possessification in '\(pattern._literalPattern!)'") - default: break - } - } - } -}