diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts new file mode 100644 index 00000000..f78f3d76 --- /dev/null +++ b/__tests__/extraction-resolution-accuracy.test.ts @@ -0,0 +1,266 @@ +/** + * Extraction & Resolution Accuracy Tests + * + * Regression tests for three accuracy bugs fixed in one PR: + * 1. Parse-retry comment strip was hardcoded to `//`, no-op on Python/Ruby/etc. + * 2. Framework route extractors ran regex over raw file content, matching + * examples in docstrings/comments as real routes. + * 3. UTF-8 BOM caused spurious "modified" hash mismatches between editors. + */ + +import { describe, it, expect } from 'vitest'; +import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils'; +import { hashContent } from '../src/extraction'; +import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python'; +import { expressResolver } from '../src/resolution/frameworks/express'; +import { aspnetResolver } from '../src/resolution/frameworks/csharp'; +import { rustResolver } from '../src/resolution/frameworks/rust'; +import { laravelResolver } from '../src/resolution/frameworks/laravel'; + +describe('UTF-8 BOM normalization (bug #5)', () => { + it('stripBom removes leading U+FEFF', () => { + expect(stripBom('hello')).toBe('hello'); + expect(stripBom('hello')).toBe('hello'); + expect(stripBom('')).toBe(''); + }); + + it('stripBom only removes leading BOM, not embedded ones', () => { + expect(stripBom('ab')).toBe('ab'); + }); + + it('hashContent treats BOM and no-BOM as identical', () => { + const withBom = 'export function hello() { return 42; }'; + const withoutBom = 'export function hello() { return 42; }'; + expect(hashContent(withBom)).toBe(hashContent(withoutBom)); + }); +}); + +describe('Per-language comment-line stripping (bug #1)', () => { + it('strips `#` lines for Python', () => { + const input = ['# CHECK: foo', 'def x():', ' 
pass'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n')).toEqual(['', 'def x():', ' pass']); + }); + + it('strips `#` lines for Ruby', () => { + const input = ['# top comment', 'def x; end'].join('\n'); + const out = stripCommentLinesForRetry(input, 'ruby'); + expect(out.split('\n')).toEqual(['', 'def x; end']); + }); + + it('strips `//` lines for TypeScript', () => { + const input = ['// header', 'function x() {}'].join('\n'); + const out = stripCommentLinesForRetry(input, 'typescript'); + expect(out.split('\n')).toEqual(['', 'function x() {}']); + }); + + it('strips both `//` and `#` lines for PHP', () => { + const input = ['// js-style', '# perl-style', ' { + const input = '// looks like a comment\ncode'; + expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input); + }); + + it('preserves line count so node positions stay correct', () => { + const input = ['# c1', 'a', '# c2', 'b'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n').length).toBe(input.split('\n').length); + }); + + it('does NOT strip indented `#` inside Python (still recognized as line comment)', () => { + // The marker matches optional leading whitespace + `#`, so an indented + // pure comment line is correctly stripped. Non-comment code on the same + // line as `#` (mid-line comment) is intentionally not stripped here. 
+ const input = [' # indented comment', ' pass # trailing'].join('\n'); + const out = stripCommentLinesForRetry(input, 'python'); + expect(out.split('\n')).toEqual(['', ' pass # trailing']); + }); +}); + +describe('Framework regex no longer matches docstrings/comments (bug #4)', () => { + describe('Flask', () => { + it('skips routes inside `#` comments', () => { + const content = [ + 'from flask import Flask', + 'app = Flask(__name__)', + '# Example: @app.route("/fake")', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + + it('skips routes inside triple-quoted docstrings', () => { + const content = [ + 'def example():', + ' """', + ' Usage: @app.route("/fake")', + ' """', + ' pass', + '@app.route("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = flaskResolver.extractNodes!('app.py', content); + const paths = nodes.map((n) => n.name); + expect(paths).toContain('/real'); + expect(paths).not.toContain('/fake'); + }); + }); + + describe('FastAPI', () => { + it('skips routes inside `#` comments and triple-quoted docstrings', () => { + const content = [ + '"""', + 'Module docs — example: @app.get("/docfake")', + '"""', + '# @app.post("/commentfake")', + '@app.get("/real")', + 'def real(): pass', + ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + expect(names.some((n) => n.includes('/commentfake'))).toBe(false); + }); + + it('preserves correct line numbers for real routes after stripping', () => { + const content = [ + '"""', // line 1 + '@app.get("/fake")', // line 2 — inside docstring + '"""', // line 3 + '', // line 4 + '@app.get("/real")', // line 5 — real 
+ ].join('\n'); + const nodes = fastapiResolver.extractNodes!('app.py', content); + const real = nodes.find((n) => n.name.includes('/real')); + expect(real).toBeDefined(); + expect(real!.startLine).toBe(5); + }); + }); + + describe('Django URL patterns', () => { + it('skips path() inside `#` comments', () => { + const content = [ + 'from django.urls import path', + '# example: path("fake/", fake_view)', + 'urlpatterns = [path("real/", real_view)]', + ].join('\n'); + const nodes = djangoResolver.extractNodes!('urls.py', content); + const names = nodes.map((n) => n.name); + expect(names).toContain('real/'); + expect(names).not.toContain('fake/'); + }); + }); + + describe('Express', () => { + it('skips routes inside `//` comments', () => { + const content = [ + 'const app = express();', + '// app.get("/fake", fakeHandler);', + 'app.get("/real", realHandler);', + ].join('\n'); + const nodes = expressResolver.extractNodes!('server.js', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/fake'))).toBe(false); + }); + + it('skips routes inside `/* ... 
*/` block comments', () => { + const content = [ + '/*', + ' * app.post("/blockfake", h);', + ' */', + 'app.get("/real", h);', + ].join('\n'); + const nodes = expressResolver.extractNodes!('server.js', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/blockfake'))).toBe(false); + }); + }); + + describe('Laravel', () => { + it('skips routes inside PHP `//` and `#` comments', () => { + const content = [ + ' n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/jsfake'))).toBe(false); + expect(names.some((n) => n.includes('/perlfake'))).toBe(false); + }); + }); + + describe('Rust', () => { + it('skips actix/rocket routes inside `///` doc comments', () => { + const content = [ + '/// Example route: #[get("/docfake")]', + '#[get("/real")]', + 'fn real() {}', + ].join('\n'); + const nodes = rustResolver.extractNodes!('main.rs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + }); + }); + + describe('ASP.NET (C#)', () => { + it('skips route attributes inside `///` XML doc comments', () => { + const content = [ + '/// ', + '/// Example: [HttpGet("/docfake")]', + '/// ', + '[HttpGet("/real")]', + 'public class C {}', + ].join('\n'); + const nodes = aspnetResolver.extractNodes!('Controller.cs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/docfake'))).toBe(false); + }); + + it('skips minimal-API MapGet/MapPost calls inside comments', () => { + // Regression: the minimalApiPattern loop below the routePatterns + // loop was initially missed when applying the strip helper, leaving + // commented-out `app.MapGet("/x")` calls extracted as real routes. 
+ const content = [ + '// app.MapGet("/linefake", h);', + '/*', + ' * app.MapPost("/blockfake", h);', + ' */', + 'app.MapGet("/real", h);', + ].join('\n'); + const nodes = aspnetResolver.extractNodes!('Program.cs', content); + const names = nodes.map((n) => n.name); + expect(names.some((n) => n.includes('/real'))).toBe(true); + expect(names.some((n) => n.includes('/linefake'))).toBe(false); + expect(names.some((n) => n.includes('/blockfake'))).toBe(false); + }); + }); +}); + +describe('stripCommentsForRegex preserves line offsets', () => { + it('keeps newlines so match.index → original line number', () => { + const input = '"""\n@app.get("/x")\n"""\n@app.get("/y")'; + const out = stripCommentsForRegex(input, 'python'); + // Newlines preserved + expect(out.split('\n').length).toBe(input.split('\n').length); + // The /y route survives + expect(out).toContain('/y'); + // The docstring contents are blanked + expect(out).not.toContain('/x'); + }); +}); diff --git a/src/extraction/index.ts b/src/extraction/index.ts index 4ad056fb..f4acda24 100644 --- a/src/extraction/index.ts +++ b/src/extraction/index.ts @@ -20,7 +20,7 @@ import { QueryBuilder } from '../db/queries'; import { extractFromSource } from './tree-sitter'; import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars'; import { logDebug, logWarn } from '../errors'; -import { validatePathWithinRoot, normalizePath } from '../utils'; +import { validatePathWithinRoot, normalizePath, stripBom, stripCommentLinesForRetry } from '../utils'; import picomatch from 'picomatch'; /** @@ -85,10 +85,15 @@ export interface SyncResult { } /** - * Calculate SHA256 hash of file contents + * Calculate SHA256 hash of file contents. + * + * A leading UTF-8 BOM is stripped before hashing so files round-tripped + * through editors that disagree about BOM handling (VSCode strips by + * default; some Windows editors preserve it) hash identically and don't + * appear "modified" on every sync. 
*/ export function hashContent(content: string): string { - return crypto.createHash('sha256').update(content).digest('hex'); + return crypto.createHash('sha256').update(stripBom(content)).digest('hex'); } /** @@ -820,11 +825,12 @@ export class ExtractionOrchestrator { } // Strip lines that are entirely comments (preserving line numbers - // by replacing with empty lines so node positions stay correct) - const stripped = fullContent - .split('\n') - .map(line => /^\s*\/\//.test(line) ? '' : line) - .join('\n'); + // by replacing with empty lines so node positions stay correct). + // The marker is language-specific — the previous hardcoded `//` + // was a no-op for Python (`#`), Ruby (`#`), etc., so those files + // would silently keep failing on the retry. + const language = detectLanguage(filePath, fullContent); + const stripped = stripCommentLinesForRetry(fullContent, language); let result: ExtractionResult; try { @@ -834,7 +840,6 @@ export class ExtractionOrchestrator { } if (result.nodes.length > 0 || result.errors.length === 0) { - const language = detectLanguage(filePath, fullContent); const stats = await fsp.stat(path.join(this.rootDir, filePath)); this.storeExtractionResult(filePath, fullContent, language, stats, result); diff --git a/src/resolution/frameworks/csharp.ts b/src/resolution/frameworks/csharp.ts index 1e170be4..9effb53f 100644 --- a/src/resolution/frameworks/csharp.ts +++ b/src/resolution/frameworks/csharp.ts @@ -6,6 +6,7 @@ import { Node } from '../../types'; import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types'; +import { stripCommentsForRegex } from '../../utils'; export const aspnetResolver: FrameworkResolver = { name: 'aspnet', @@ -117,6 +118,9 @@ export const aspnetResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + // Strip `//` and `/* */` comments so XML-doc examples like + // `/// [HttpGet("/x")]` aren't 
treated as real route attributes. + const safe = stripCommentsForRegex(content, 'csharp'); // Extract route attributes // [HttpGet("path")], [HttpPost("path")], [Route("path")] @@ -128,8 +132,8 @@ export const aspnetResolver: FrameworkResolver = { for (const pattern of routePatterns) { let match; - while ((match = pattern.exec(content)) !== null) { - const line = content.slice(0, match.index).split('\n').length; + while ((match = pattern.exec(safe)) !== null) { + const line = safe.slice(0, match.index).split('\n').length; if (pattern.source.includes('Http')) { if (match[3]) { @@ -190,9 +194,9 @@ export const aspnetResolver: FrameworkResolver = { const minimalApiPattern = /\.Map(Get|Post|Put|Patch|Delete)\s*\(\s*["']([^"']+)["']/g; let match; - while ((match = minimalApiPattern.exec(content)) !== null) { + while ((match = minimalApiPattern.exec(safe)) !== null) { const [, method, path] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`, diff --git a/src/resolution/frameworks/express.ts b/src/resolution/frameworks/express.ts index 0afa7e03..07851769 100644 --- a/src/resolution/frameworks/express.ts +++ b/src/resolution/frameworks/express.ts @@ -6,6 +6,7 @@ import { Node } from '../../types'; import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types'; +import { stripCommentsForRegex } from '../../utils'; export const expressResolver: FrameworkResolver = { name: 'express', @@ -93,6 +94,9 @@ export const expressResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + // Neutralize comments and JSDoc blocks so a `app.get('/x')` example in + // a comment isn't extracted as a real route. 
+ const safe = stripCommentsForRegex(content, 'javascript'); // Extract route definitions // app.get('/path', handler) or router.get('/path', handler) @@ -102,9 +106,9 @@ export const expressResolver: FrameworkResolver = { for (const pattern of routePatterns) { let match; - while ((match = pattern.exec(content)) !== null) { + while ((match = pattern.exec(safe)) !== null) { const [, _obj, method, path] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; // Skip middleware use() without paths if (method === 'use' && !path?.startsWith('/')) { diff --git a/src/resolution/frameworks/laravel.ts b/src/resolution/frameworks/laravel.ts index d6a79885..4b3b5e00 100644 --- a/src/resolution/frameworks/laravel.ts +++ b/src/resolution/frameworks/laravel.ts @@ -6,6 +6,7 @@ import { Node } from '../../types'; import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types'; +import { stripCommentsForRegex } from '../../utils'; /** * Laravel facade mappings to underlying classes @@ -93,6 +94,7 @@ export const laravelResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + const safe = stripCommentsForRegex(content, 'php'); // Extract route definitions const routePatterns = [ @@ -106,10 +108,10 @@ export const laravelResolver: FrameworkResolver = { for (const pattern of routePatterns) { let match; - while ((match = pattern.exec(content)) !== null) { + while ((match = pattern.exec(safe)) !== null) { if (pattern.source.includes('resource')) { const [, resourceName] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:resource:${resourceName}:${line}`, kind: 'route', @@ -125,7 +127,7 @@ export const laravelResolver: FrameworkResolver = { }); } else { const [, method, path] = 
match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`, kind: 'route', diff --git a/src/resolution/frameworks/python.ts b/src/resolution/frameworks/python.ts index 88f5034a..021fbd1d 100644 --- a/src/resolution/frameworks/python.ts +++ b/src/resolution/frameworks/python.ts @@ -6,6 +6,7 @@ import { Node } from '../../types'; import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types'; +import { stripCommentsForRegex } from '../../utils'; export const djangoResolver: FrameworkResolver = { name: 'django', @@ -77,6 +78,10 @@ export const djangoResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + // Neutralize comments and docstrings so a `path('/x', view)` example in + // a docstring isn't extracted as a real route. Newlines preserved so + // line numbers stay correct. 
+ const safe = stripCommentsForRegex(content, 'python'); // Extract URL patterns // path('route/', view, name='name') @@ -87,9 +92,9 @@ export const djangoResolver: FrameworkResolver = { for (const pattern of urlPatterns) { let match; - while ((match = pattern.exec(content)) !== null) { + while ((match = pattern.exec(safe)) !== null) { const [, urlPath] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${urlPath}:${line}`, @@ -157,15 +162,16 @@ export const flaskResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + const safe = stripCommentsForRegex(content, 'python'); // Extract Flask route decorators // @app.route('/path') or @blueprint.route('/path') const routePattern = /@(\w+)\.route\s*\(\s*['"]([^'"]+)['"]/g; let match; - while ((match = routePattern.exec(content)) !== null) { + while ((match = routePattern.exec(safe)) !== null) { const [, _appOrBp, routePath] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${routePath}:${line}`, @@ -245,15 +251,16 @@ export const fastapiResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + const safe = stripCommentsForRegex(content, 'python'); // Extract FastAPI route decorators // @app.get('/path') or @router.post('/path') const routePattern = /@(\w+)\.(get|post|put|patch|delete|options|head)\s*\(\s*['"]([^'"]+)['"]/g; let match; - while ((match = routePattern.exec(content)) !== null) { + while ((match = routePattern.exec(safe)) !== null) { const [, _appOrRouter, method, routePath] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, 
match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${method!.toUpperCase()}:${routePath}:${line}`, diff --git a/src/resolution/frameworks/rust.ts b/src/resolution/frameworks/rust.ts index 5ab10bc3..92d92060 100644 --- a/src/resolution/frameworks/rust.ts +++ b/src/resolution/frameworks/rust.ts @@ -6,6 +6,7 @@ import { Node } from '../../types'; import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types'; +import { stripCommentsForRegex } from '../../utils'; export const rustResolver: FrameworkResolver = { name: 'rust', @@ -74,15 +75,18 @@ export const rustResolver: FrameworkResolver = { extractNodes(filePath: string, content: string): Node[] { const nodes: Node[] = []; const now = Date.now(); + // Strip `//` and `/* */` comments so doc-comment examples like + // `/// #[get("/x")]` aren't treated as real route attributes. + const safe = stripCommentsForRegex(content, 'rust'); // Extract Actix-web routes // #[get("/path")], #[post("/path")], etc. 
const actixRoutePattern = /#\[(get|post|put|patch|delete)\s*\(\s*["']([^"']+)["']/g; let match; - while ((match = actixRoutePattern.exec(content)) !== null) { + while ((match = actixRoutePattern.exec(safe)) !== null) { const [, method, path] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`, @@ -103,9 +107,9 @@ export const rustResolver: FrameworkResolver = { // #[get("/path")], #[post("/path", ...)] const rocketRoutePattern = /#\[(get|post|put|patch|delete|head|options)\s*\(\s*["']([^"']+)["']/g; - while ((match = rocketRoutePattern.exec(content)) !== null) { + while ((match = rocketRoutePattern.exec(safe)) !== null) { const [, method, path] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; // Avoid duplicates from actix pattern const routeId = `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`; @@ -130,9 +134,9 @@ export const rustResolver: FrameworkResolver = { // .route("/path", get(handler)) const axumRoutePattern = /\.route\s*\(\s*["']([^"']+)["']\s*,\s*(get|post|put|patch|delete)/g; - while ((match = axumRoutePattern.exec(content)) !== null) { + while ((match = axumRoutePattern.exec(safe)) !== null) { const [, path, method] = match; - const line = content.slice(0, match.index).split('\n').length; + const line = safe.slice(0, match.index).split('\n').length; nodes.push({ id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`, diff --git a/src/utils.ts b/src/utils.ts index e75e58e0..64741ab6 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -174,6 +174,135 @@ export function normalizePath(filePath: string): string { return filePath.replace(/\\/g, '/'); } +/** + * Strip a leading UTF-8 BOM (U+FEFF) if present. + * + * Editors disagree about whether to write the BOM. 
/**
 * Strip a leading UTF-8 BOM (U+FEFF) if present.
 *
 * Editors disagree about whether to write the BOM. Without normalization
 * the same logical content hashes to two different values depending on
 * which editor last touched the file, producing spurious "modified"
 * detections on every sync.
 *
 * @param content - File content as a decoded string.
 * @returns `content` without its first character when that character is
 *   U+FEFF; otherwise `content` unchanged. BOMs that are not at index 0
 *   are left alone.
 */
export function stripBom(content: string): string {
  // ''.charCodeAt(0) is NaN, so the empty string falls through unchanged.
  return content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
}

/**
 * Replace every non-newline character in `text` with a space. Preserves
 * line count and column offsets so subsequent regex matches against the
 * processed content map back to the same line numbers in the original.
 */
function blankPreservingNewlines(text: string): string {
  return text.replace(/[^\n]/g, ' ');
}

/**
 * Languages whose slash-star block comments are neutralized before
 * coarse-grained regex extraction (e.g. framework route decorators).
 *
 * For each supported language `stripCommentsForRegex` blanks:
 *   - Block comments (newlines preserved so line numbers stay correct).
 *   - Whole-line single-line comments (only when the line contains nothing
 *     but optional whitespace before the marker; trailing comments that
 *     share a line with code are intentionally left in place).
 *   - Python triple-quoted strings (the common docstring carrier).
 *
 * Ordinary string literals are deliberately NOT blanked: that would remove
 * the legitimate route paths the extraction regexes need to see.
 */
const BLOCK_COMMENT_LANGUAGES: ReadonlySet<string> = new Set([
  'javascript', 'typescript', 'tsx', 'jsx',
  'java', 'csharp', 'cpp', 'c',
  'go', 'rust', 'swift', 'kotlin', 'dart', 'scala',
  'php',
]);

/**
 * Subset of the block-comment languages in which a single quote delimits
 * a string. Everywhere else a single quote is treated as a one-character
 * literal, so an unpaired quote such as a Rust lifetime (`'a`) cannot
 * pair up with a later quote and hide a comment opener between them.
 */
const SINGLE_QUOTE_STRING_LANGUAGES: ReadonlySet<string> = new Set([
  'javascript', 'typescript', 'tsx', 'jsx', 'php', 'dart',
]);

// Token sources for the comment scanner. Every quoted form is confined to
// a single line, so one unbalanced quote can mis-tokenize at most the rest
// of its own line.
const DOUBLE_QUOTED = /"(?:\\.|[^"\\\n])*"/.source;
const SINGLE_QUOTED = /'(?:\\.|[^'\\\n])*'/.source;
const BACKTICK_QUOTED = /`(?:\\.|[^`\\\n])*`/.source;
const CHAR_LITERAL = /'(?:\\.[^'\n]{0,9}|[^'\\\n])'/.source;
const SLASH_LINE_COMMENT = /\/\/[^\n]*/.source;
const HASH_LINE_COMMENT = /#[^\n]*/.source;
const BLOCK_COMMENT = /\/\*[\s\S]*?\*\//.source;
const PYTHON_TRIPLE_QUOTED = /"""[\s\S]*?"""|'''[\s\S]*?'''/.source;

/**
 * Build a global scanner regex. The `blankable` alternative is wrapped in
 * capture group 1; the `skipped` alternatives are matched only so that the
 * scan position moves past them, which keeps comment openers that occur
 * inside a string or a line comment from being mistaken for real ones.
 */
function buildTokenPattern(blankable: string, skipped: readonly string[]): RegExp {
  return new RegExp(`(${blankable})|${skipped.join('|')}`, 'g');
}

const C_STYLE_STRING_TOKENS = buildTokenPattern(BLOCK_COMMENT, [
  DOUBLE_QUOTED, SINGLE_QUOTED, BACKTICK_QUOTED, SLASH_LINE_COMMENT,
]);
const C_STYLE_CHAR_TOKENS = buildTokenPattern(BLOCK_COMMENT, [
  DOUBLE_QUOTED, CHAR_LITERAL, BACKTICK_QUOTED, SLASH_LINE_COMMENT,
]);
const PYTHON_TOKENS = buildTokenPattern(PYTHON_TRIPLE_QUOTED, [
  DOUBLE_QUOTED, SINGLE_QUOTED, HASH_LINE_COMMENT,
]);
const RUBY_BLOCK_COMMENT = /^=begin\b[\s\S]*?^=end\b[^\n]*/gm;

/**
 * Run `tokens` over `text`, blanking every match of capture group 1 and
 * copying every other match through untouched.
 */
function blankCapturedTokens(text: string, tokens: RegExp): string {
  return text.replace(tokens, (token: string, blankable: string | undefined) =>
    blankable === undefined ? token : blankPreservingNewlines(token),
  );
}

// Line-anchored, stateless (no `g`/`y` flag) prefixes, safe to share
// between languages and to reuse across many `.test` calls.
const SLASH_PREFIX = /^[ \t]*\/\//;
const HASH_PREFIX = /^[ \t]*#/;
const SLASH_OR_HASH_PREFIX = /^[ \t]*(?:\/\/|#)/;

/**
 * Per-language line-comment marker as a line-anchored prefix regex.
 *
 * Stored in a Map rather than an object literal so that an arbitrary
 * language string (for example `toString` or `constructor`) can never
 * resolve to an inherited Object.prototype member.
 */
const LINE_COMMENT_MARKER: ReadonlyMap<string, RegExp> = new Map<string, RegExp>([
  ...[
    'javascript', 'typescript', 'tsx', 'jsx',
    'java', 'csharp', 'cpp', 'c',
    'go', 'rust', 'swift', 'kotlin', 'dart', 'scala',
    'pascal',
  ].map((lang): [string, RegExp] => [lang, SLASH_PREFIX]),
  ['python', HASH_PREFIX],
  ['ruby', HASH_PREFIX],
  ['php', SLASH_OR_HASH_PREFIX],
]);

/**
 * Best-effort comment stripper for use before coarse-grained regex
 * extraction. Returns content with comments and (for Python) triple-quoted
 * strings replaced by spaces. Newlines are preserved and the result has
 * the same length as the input, so line/column offsets derived from it
 * still map onto the original file.
 *
 * Block comments and triple-quoted strings are located with a small token
 * scan instead of a bare regex: same-line string literals and line
 * comments are stepped over first, so a comment opener inside a string
 * (such as a wildcard route path ending in slash-star) or inside a line
 * comment does not blank the real code that follows it.
 *
 * @param content - Source text to neutralize.
 * @param language - Language identifier (e.g. `python`, `javascript`).
 * @returns The neutralized text; languages without an entry are returned
 *   unchanged.
 */
export function stripCommentsForRegex(content: string, language: string): string {
  let out = content;

  if (BLOCK_COMMENT_LANGUAGES.has(language)) {
    const tokens = SINGLE_QUOTE_STRING_LANGUAGES.has(language)
      ? C_STYLE_STRING_TOKENS
      : C_STYLE_CHAR_TOKENS;
    out = blankCapturedTokens(out, tokens);
  }
  if (language === 'python') {
    out = blankCapturedTokens(out, PYTHON_TOKENS);
  }
  if (language === 'ruby') {
    out = out.replace(RUBY_BLOCK_COMMENT, blankPreservingNewlines);
  }

  const lineMarker = LINE_COMMENT_MARKER.get(language);
  if (lineMarker !== undefined) {
    // Runs after the block pass so a line reduced to "spaces + marker" by
    // block-comment blanking is also recognized as a whole-line comment.
    // Only whole lines are blanked; code with a trailing comment is kept.
    out = out
      .split('\n')
      .map((line) => (lineMarker.test(line) ? blankPreservingNewlines(line) : line))
      .join('\n');
  }

  return out;
}

/**
 * Strip lines that are entirely a single-line comment for the given
 * language, replacing them with empty lines. Preserves the line count so
 * tree-sitter node rows stay correct.
 *
 * Used by the parser-retry "shrink the file" fallback. Unlike
 * {@link stripCommentsForRegex} this does NOT strip block comments or
 * docstrings: the goal is to remove the easiest dead weight (e.g.
 * compiler test files dominated by `# CHECK:` / `// CHECK:` lines)
 * without risking semantic changes.
 *
 * @param content - Source text that failed to parse.
 * @param language - Language identifier used to pick the comment marker.
 * @returns The text with whole-line comments emptied; returned unchanged
 *   for languages without a known line-comment marker.
 */
export function stripCommentLinesForRetry(content: string, language: string): string {
  const marker = LINE_COMMENT_MARKER.get(language);
  if (marker === undefined) return content;
  return content
    .split('\n')
    .map((line) => (marker.test(line) ? '' : line))
    .join('\n');
}