diff --git a/__tests__/extraction-resolution-accuracy.test.ts b/__tests__/extraction-resolution-accuracy.test.ts
new file mode 100644
index 00000000..f78f3d76
--- /dev/null
+++ b/__tests__/extraction-resolution-accuracy.test.ts
@@ -0,0 +1,266 @@
+/**
+ * Extraction & Resolution Accuracy Tests
+ *
+ * Regression tests for three accuracy bugs fixed in one PR (the `bug #N`
+ * labels match the `describe` titles below):
+ *   - bug #1: Parse-retry comment strip was hardcoded to `//`, a no-op on
+ *     Python/Ruby/etc.
+ *   - bug #4: Framework route extractors ran regex over raw file content,
+ *     matching examples in docstrings/comments as real routes.
+ *   - bug #5: UTF-8 BOM caused spurious "modified" hash mismatches between
+ *     editors.
+ */
+
+import { describe, it, expect } from 'vitest';
+import { stripBom, stripCommentLinesForRetry, stripCommentsForRegex } from '../src/utils';
+import { hashContent } from '../src/extraction';
+import { flaskResolver, fastapiResolver, djangoResolver } from '../src/resolution/frameworks/python';
+import { expressResolver } from '../src/resolution/frameworks/express';
+import { aspnetResolver } from '../src/resolution/frameworks/csharp';
+import { rustResolver } from '../src/resolution/frameworks/rust';
+import { laravelResolver } from '../src/resolution/frameworks/laravel';
+
+describe('UTF-8 BOM normalization (bug #5)', () => {
+  // The BOM is spelled as the `\uFEFF` escape instead of a literal character.
+  // A literal U+FEFF is invisible in editors and diffs and is silently dropped
+  // by some tools, which would turn every assertion below into a no-op.
+  const BOM = '\uFEFF';
+
+  it('stripBom removes leading U+FEFF', () => {
+    expect(stripBom(`${BOM}hello`)).toBe('hello');
+    expect(stripBom('hello')).toBe('hello');
+    expect(stripBom('')).toBe('');
+    expect(stripBom(BOM)).toBe('');
+  });
+
+  it('stripBom only removes leading BOM, not embedded ones', () => {
+    expect(stripBom(`a${BOM}b`)).toBe(`a${BOM}b`);
+    // Only one leading BOM is removed; a second one counts as content.
+    expect(stripBom(`${BOM}${BOM}x`)).toBe(`${BOM}x`);
+  });
+
+  it('hashContent treats BOM and no-BOM as identical', () => {
+    const withoutBom = 'export function hello() { return 42; }';
+    const withBom = `${BOM}${withoutBom}`;
+    // Guard against the fixture itself losing its BOM.
+    expect(withBom).not.toBe(withoutBom);
+    expect(hashContent(withBom)).toBe(hashContent(withoutBom));
+  });
+});
+
+describe('Per-language comment-line stripping (bug #1)', () => {
+  it('strips `#` lines for Python', () => {
+    const input = ['# CHECK: foo', 'def x():', ' pass'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', 'def x():', ' pass']);
+  });
+
+  it('strips `#` lines for Ruby', () => {
+    const input = ['# top comment', 'def x; end'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'ruby');
+    expect(out.split('\n')).toEqual(['', 'def x; end']);
+  });
+
+  it('strips `//` lines for TypeScript', () => {
+    const input = ['// header', 'function x() {}'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'typescript');
+    expect(out.split('\n')).toEqual(['', 'function x() {}']);
+  });
+
+  it('strips both `//` and `#` lines for PHP', () => {
+    // PHP accepts both line-comment markers; the code line must survive.
+    const input = ['// js-style', '# perl-style', '<?php echo 1;'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'php');
+    expect(out.split('\n')).toEqual(['', '', '<?php echo 1;']);
+  });
+
+  it('returns content unchanged for languages without a known marker', () => {
+    const input = '// looks like a comment\ncode';
+    expect(stripCommentLinesForRetry(input, 'unknown-lang')).toBe(input);
+  });
+
+  it('preserves line count so node positions stay correct', () => {
+    const input = ['# c1', 'a', '# c2', 'b'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n').length).toBe(input.split('\n').length);
+  });
+
+  it('strips indented `#` comment lines but keeps trailing comments (Python)', () => {
+    // The marker matches optional leading whitespace + `#`, so an indented
+    // pure comment line is stripped. Code followed by a trailing `#` comment
+    // on the same line is intentionally left untouched.
+    const input = [' # indented comment', ' pass # trailing'].join('\n');
+    const out = stripCommentLinesForRetry(input, 'python');
+    expect(out.split('\n')).toEqual(['', ' pass # trailing']);
+  });
+});
+
+describe('Framework regex no longer matches docstrings/comments (bug #4)', () => {
+  describe('Flask', () => {
+    // Runs the Flask extractor over a Python source given line-by-line and
+    // returns the names of the nodes it produced.
+    const extractedNames = (sourceLines: string[]) =>
+      flaskResolver.extractNodes!('app.py', sourceLines.join('\n')).map((node) => node.name);
+
+    it('skips routes inside `#` comments', () => {
+      const routeNames = extractedNames([
+        'from flask import Flask',
+        'app = Flask(__name__)',
+        '# Example: @app.route("/fake")',
+        '@app.route("/real")',
+        'def real(): pass',
+      ]);
+      // Only the decorator that is actual code may be reported.
+      expect(routeNames).toContain('/real');
+      expect(routeNames).not.toContain('/fake');
+    });
+
+    it('skips routes inside triple-quoted docstrings', () => {
+      const routeNames = extractedNames([
+        'def example():',
+        ' """',
+        ' Usage: @app.route("/fake")',
+        ' """',
+        ' pass',
+        '@app.route("/real")',
+        'def real(): pass',
+      ]);
+      // The usage example lives in a docstring and must be ignored.
+      expect(routeNames).toContain('/real');
+      expect(routeNames).not.toContain('/fake');
+    });
+  });
+
+  describe('FastAPI', () => {
+    it('skips routes inside `#` comments and triple-quoted docstrings', () => {
+      const source = [
+        '"""',
+        'Module docs — example: @app.get("/docfake")',
+        '"""',
+        '# @app.post("/commentfake")',
+        '@app.get("/real")',
+        'def real(): pass',
+      ].join('\n');
+      const routeNames = fastapiResolver.extractNodes!('app.py', source).map((node) => node.name);
+      // True when any extracted node name mentions the given path fragment.
+      const mentions = (fragment: string) => routeNames.some((name) => name.includes(fragment));
+
+      expect(mentions('/real')).toBe(true);
+      expect(mentions('/docfake')).toBe(false);
+      expect(mentions('/commentfake')).toBe(false);
+    });
+
+    it('preserves correct line numbers for real routes after stripping', () => {
+      // Five-line fixture: a docstring on lines 1-3 (with a fake route on
+      // line 2), a blank line 4, and the real decorator on line 5.
+      const source = ['"""', '@app.get("/fake")', '"""', '', '@app.get("/real")'].join('\n');
+      const extracted = fastapiResolver.extractNodes!('app.py', source);
+      const realRoute = extracted.find((node) => node.name.includes('/real'));
+
+      expect(realRoute).toBeDefined();
+      expect(realRoute!.startLine).toBe(5);
+    });
+  });
+
+  describe('Django URL patterns', () => {
+    it('skips path() inside `#` comments', () => {
+      // Line 2 is a commented-out example; line 3 holds the only real pattern.
+      const source = [
+        'from django.urls import path',
+        '# example: path("fake/", fake_view)',
+        'urlpatterns = [path("real/", real_view)]',
+      ].join('\n');
+
+      const routeNames = djangoResolver.extractNodes!('urls.py', source).map((node) => node.name);
+
+      expect(routeNames).toContain('real/');
+      expect(routeNames).not.toContain('fake/');
+    });
+  });
+
+  describe('Express', () => {
+    // Extracts nodes from a JavaScript source given as lines and returns a
+    // predicate telling whether any node name mentions a path fragment.
+    const routeMentions = (sourceLines: string[]) => {
+      const extracted = expressResolver.extractNodes!('server.js', sourceLines.join('\n'));
+      const routeNames = extracted.map((node) => node.name);
+      return (fragment: string) => routeNames.some((name) => name.includes(fragment));
+    };
+
+    it('skips routes inside `//` comments', () => {
+      const mentions = routeMentions([
+        'const app = express();',
+        '// app.get("/fake", fakeHandler);',
+        'app.get("/real", realHandler);',
+      ]);
+      expect(mentions('/real')).toBe(true);
+      expect(mentions('/fake')).toBe(false);
+    });
+
+    it('skips routes inside `/* ... */` block comments', () => {
+      const mentions = routeMentions([
+        '/*',
+        ' * app.post("/blockfake", h);',
+        ' */',
+        'app.get("/real", h);',
+      ]);
+      expect(mentions('/real')).toBe(true);
+      expect(mentions('/blockfake')).toBe(false);
+    });
+  });
+
+  describe('Laravel', () => {
+    it('skips routes inside PHP `//` and `#` comments', () => {
+      // Two commented-out routes (one per PHP line-comment marker) followed
+      // by the only real route definition.
+      const content = [
+        '<?php',
+        "// Route::get('/jsfake', [C::class, 'index']);",
+        "# Route::get('/perlfake', [C::class, 'index']);",
+        "Route::get('/real', [C::class, 'index']);",
+      ].join('\n');
+      const nodes = laravelResolver.extractNodes!('routes/web.php', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/jsfake'))).toBe(false);
+      expect(names.some((n) => n.includes('/perlfake'))).toBe(false);
+    });
+  });
+
+  describe('Rust', () => {
+    it('skips actix/rocket routes inside `///` doc comments', () => {
+      // The attribute on line 1 is only an example inside a doc comment;
+      // the attribute on line 2 is real code.
+      const source = [
+        '/// Example route: #[get("/docfake")]',
+        '#[get("/real")]',
+        'fn real() {}',
+      ].join('\n');
+
+      const routeNames = rustResolver.extractNodes!('main.rs', source).map((node) => node.name);
+      const mentions = (fragment: string) => routeNames.some((name) => name.includes(fragment));
+
+      expect(mentions('/real')).toBe(true);
+      expect(mentions('/docfake')).toBe(false);
+    });
+  });
+
+  describe('ASP.NET (C#)', () => {
+    it('skips route attributes inside `///` XML doc comments', () => {
+      // The XML tags are spelled out so the fixture is a realistic `///`
+      // documentation block wrapping an example attribute.
+      const content = [
+        '/// <summary>',
+        '/// Example: [HttpGet("/docfake")]',
+        '/// </summary>',
+        '[HttpGet("/real")]',
+        'public class C {}',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Controller.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/docfake'))).toBe(false);
+    });
+
+    it('skips minimal-API MapGet/MapPost calls inside comments', () => {
+      // Regression: the minimalApiPattern loop below the routePatterns
+      // loop was initially missed when applying the strip helper, leaving
+      // commented-out `app.MapGet("/x")` calls extracted as real routes.
+      const content = [
+        '// app.MapGet("/linefake", h);',
+        '/*',
+        ' * app.MapPost("/blockfake", h);',
+        ' */',
+        'app.MapGet("/real", h);',
+      ].join('\n');
+      const nodes = aspnetResolver.extractNodes!('Program.cs', content);
+      const names = nodes.map((n) => n.name);
+      expect(names.some((n) => n.includes('/real'))).toBe(true);
+      expect(names.some((n) => n.includes('/linefake'))).toBe(false);
+      expect(names.some((n) => n.includes('/blockfake'))).toBe(false);
+    });
+  });
+});
+
+describe('stripCommentsForRegex preserves line offsets', () => {
+  it('keeps newlines so match.index → original line number', () => {
+    const source = '"""\n@app.get("/x")\n"""\n@app.get("/y")';
+    const stripped = stripCommentsForRegex(source, 'python');
+    const lineCount = (text: string) => text.split('\n').length;
+
+    // Same number of lines before and after stripping.
+    expect(lineCount(stripped)).toBe(lineCount(source));
+    // The route outside the docstring is still visible.
+    expect(stripped).toContain('/y');
+    // The route inside the docstring has been blanked out.
+    expect(stripped).not.toContain('/x');
+  });
+});
diff --git a/src/extraction/index.ts b/src/extraction/index.ts
index 4ad056fb..f4acda24 100644
--- a/src/extraction/index.ts
+++ b/src/extraction/index.ts
@@ -20,7 +20,7 @@ import { QueryBuilder } from '../db/queries';
import { extractFromSource } from './tree-sitter';
import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
import { logDebug, logWarn } from '../errors';
-import { validatePathWithinRoot, normalizePath } from '../utils';
+import { validatePathWithinRoot, normalizePath, stripBom, stripCommentLinesForRetry } from '../utils';
import picomatch from 'picomatch';
/**
@@ -85,10 +85,15 @@ export interface SyncResult {
}
/**
- * Calculate SHA256 hash of file contents
+ * Calculate SHA256 hash of file contents.
+ *
+ * A leading UTF-8 BOM is stripped before hashing so files round-tripped
+ * through editors that disagree about BOM handling (VSCode strips by
+ * default; some Windows editors preserve it) hash identically and don't
+ * appear "modified" on every sync.
*/
export function hashContent(content: string): string {
- return crypto.createHash('sha256').update(content).digest('hex');
+ return crypto.createHash('sha256').update(stripBom(content)).digest('hex');
}
/**
@@ -820,11 +825,12 @@ export class ExtractionOrchestrator {
}
// Strip lines that are entirely comments (preserving line numbers
- // by replacing with empty lines so node positions stay correct)
- const stripped = fullContent
- .split('\n')
- .map(line => /^\s*\/\//.test(line) ? '' : line)
- .join('\n');
+ // by replacing with empty lines so node positions stay correct).
+ // The marker is language-specific — the previous hardcoded `//`
+ // was a no-op for Python (`#`), Ruby (`#`), etc., so those files
+ // would silently keep failing on the retry.
+ const language = detectLanguage(filePath, fullContent);
+ const stripped = stripCommentLinesForRetry(fullContent, language);
let result: ExtractionResult;
try {
@@ -834,7 +840,6 @@ export class ExtractionOrchestrator {
}
if (result.nodes.length > 0 || result.errors.length === 0) {
- const language = detectLanguage(filePath, fullContent);
const stats = await fsp.stat(path.join(this.rootDir, filePath));
this.storeExtractionResult(filePath, fullContent, language, stats, result);
diff --git a/src/resolution/frameworks/csharp.ts b/src/resolution/frameworks/csharp.ts
index 1e170be4..9effb53f 100644
--- a/src/resolution/frameworks/csharp.ts
+++ b/src/resolution/frameworks/csharp.ts
@@ -6,6 +6,7 @@
import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
export const aspnetResolver: FrameworkResolver = {
name: 'aspnet',
@@ -117,6 +118,9 @@ export const aspnetResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ // Strip `//` and `/* */` comments so XML-doc examples like
+ // `/// [HttpGet("/x")]` aren't treated as real route attributes.
+ const safe = stripCommentsForRegex(content, 'csharp');
// Extract route attributes
// [HttpGet("path")], [HttpPost("path")], [Route("path")]
@@ -128,8 +132,8 @@ export const aspnetResolver: FrameworkResolver = {
for (const pattern of routePatterns) {
let match;
- while ((match = pattern.exec(content)) !== null) {
- const line = content.slice(0, match.index).split('\n').length;
+ while ((match = pattern.exec(safe)) !== null) {
+ const line = safe.slice(0, match.index).split('\n').length;
if (pattern.source.includes('Http')) {
if (match[3]) {
@@ -190,9 +194,9 @@ export const aspnetResolver: FrameworkResolver = {
const minimalApiPattern = /\.Map(Get|Post|Put|Patch|Delete)\s*\(\s*["']([^"']+)["']/g;
let match;
- while ((match = minimalApiPattern.exec(content)) !== null) {
+ while ((match = minimalApiPattern.exec(safe)) !== null) {
const [, method, path] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,
diff --git a/src/resolution/frameworks/express.ts b/src/resolution/frameworks/express.ts
index 0afa7e03..07851769 100644
--- a/src/resolution/frameworks/express.ts
+++ b/src/resolution/frameworks/express.ts
@@ -6,6 +6,7 @@
import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
export const expressResolver: FrameworkResolver = {
name: 'express',
@@ -93,6 +94,9 @@ export const expressResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ // Neutralize comments and JSDoc blocks so a `app.get('/x')` example in
+ // a comment isn't extracted as a real route.
+ const safe = stripCommentsForRegex(content, 'javascript');
// Extract route definitions
// app.get('/path', handler) or router.get('/path', handler)
@@ -102,9 +106,9 @@ export const expressResolver: FrameworkResolver = {
for (const pattern of routePatterns) {
let match;
- while ((match = pattern.exec(content)) !== null) {
+ while ((match = pattern.exec(safe)) !== null) {
const [, _obj, method, path] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
// Skip middleware use() without paths
if (method === 'use' && !path?.startsWith('/')) {
diff --git a/src/resolution/frameworks/laravel.ts b/src/resolution/frameworks/laravel.ts
index d6a79885..4b3b5e00 100644
--- a/src/resolution/frameworks/laravel.ts
+++ b/src/resolution/frameworks/laravel.ts
@@ -6,6 +6,7 @@
import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
/**
* Laravel facade mappings to underlying classes
@@ -93,6 +94,7 @@ export const laravelResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ const safe = stripCommentsForRegex(content, 'php');
// Extract route definitions
const routePatterns = [
@@ -106,10 +108,10 @@ export const laravelResolver: FrameworkResolver = {
for (const pattern of routePatterns) {
let match;
- while ((match = pattern.exec(content)) !== null) {
+ while ((match = pattern.exec(safe)) !== null) {
if (pattern.source.includes('resource')) {
const [, resourceName] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:resource:${resourceName}:${line}`,
kind: 'route',
@@ -125,7 +127,7 @@ export const laravelResolver: FrameworkResolver = {
});
} else {
const [, method, path] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,
kind: 'route',
diff --git a/src/resolution/frameworks/python.ts b/src/resolution/frameworks/python.ts
index 88f5034a..021fbd1d 100644
--- a/src/resolution/frameworks/python.ts
+++ b/src/resolution/frameworks/python.ts
@@ -6,6 +6,7 @@
import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
export const djangoResolver: FrameworkResolver = {
name: 'django',
@@ -77,6 +78,10 @@ export const djangoResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ // Neutralize comments and docstrings so a `path('/x', view)` example in
+ // a docstring isn't extracted as a real route. Newlines preserved so
+ // line numbers stay correct.
+ const safe = stripCommentsForRegex(content, 'python');
// Extract URL patterns
// path('route/', view, name='name')
@@ -87,9 +92,9 @@ export const djangoResolver: FrameworkResolver = {
for (const pattern of urlPatterns) {
let match;
- while ((match = pattern.exec(content)) !== null) {
+ while ((match = pattern.exec(safe)) !== null) {
const [, urlPath] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${urlPath}:${line}`,
@@ -157,15 +162,16 @@ export const flaskResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ const safe = stripCommentsForRegex(content, 'python');
// Extract Flask route decorators
// @app.route('/path') or @blueprint.route('/path')
const routePattern = /@(\w+)\.route\s*\(\s*['"]([^'"]+)['"]/g;
let match;
- while ((match = routePattern.exec(content)) !== null) {
+ while ((match = routePattern.exec(safe)) !== null) {
const [, _appOrBp, routePath] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${routePath}:${line}`,
@@ -245,15 +251,16 @@ export const fastapiResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ const safe = stripCommentsForRegex(content, 'python');
// Extract FastAPI route decorators
// @app.get('/path') or @router.post('/path')
const routePattern = /@(\w+)\.(get|post|put|patch|delete|options|head)\s*\(\s*['"]([^'"]+)['"]/g;
let match;
- while ((match = routePattern.exec(content)) !== null) {
+ while ((match = routePattern.exec(safe)) !== null) {
const [, _appOrRouter, method, routePath] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${routePath}:${line}`,
diff --git a/src/resolution/frameworks/rust.ts b/src/resolution/frameworks/rust.ts
index 5ab10bc3..92d92060 100644
--- a/src/resolution/frameworks/rust.ts
+++ b/src/resolution/frameworks/rust.ts
@@ -6,6 +6,7 @@
import { Node } from '../../types';
import { FrameworkResolver, UnresolvedRef, ResolvedRef, ResolutionContext } from '../types';
+import { stripCommentsForRegex } from '../../utils';
export const rustResolver: FrameworkResolver = {
name: 'rust',
@@ -74,15 +75,18 @@ export const rustResolver: FrameworkResolver = {
extractNodes(filePath: string, content: string): Node[] {
const nodes: Node[] = [];
const now = Date.now();
+ // Strip `//` and `/* */` comments so doc-comment examples like
+ // `/// #[get("/x")]` aren't treated as real route attributes.
+ const safe = stripCommentsForRegex(content, 'rust');
// Extract Actix-web routes
// #[get("/path")], #[post("/path")], etc.
const actixRoutePattern = /#\[(get|post|put|patch|delete)\s*\(\s*["']([^"']+)["']/g;
let match;
- while ((match = actixRoutePattern.exec(content)) !== null) {
+ while ((match = actixRoutePattern.exec(safe)) !== null) {
const [, method, path] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,
@@ -103,9 +107,9 @@ export const rustResolver: FrameworkResolver = {
// #[get("/path")], #[post("/path", ...)]
const rocketRoutePattern = /#\[(get|post|put|patch|delete|head|options)\s*\(\s*["']([^"']+)["']/g;
- while ((match = rocketRoutePattern.exec(content)) !== null) {
+ while ((match = rocketRoutePattern.exec(safe)) !== null) {
const [, method, path] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
// Avoid duplicates from actix pattern
const routeId = `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`;
@@ -130,9 +134,9 @@ export const rustResolver: FrameworkResolver = {
// .route("/path", get(handler))
const axumRoutePattern = /\.route\s*\(\s*["']([^"']+)["']\s*,\s*(get|post|put|patch|delete)/g;
- while ((match = axumRoutePattern.exec(content)) !== null) {
+ while ((match = axumRoutePattern.exec(safe)) !== null) {
const [, path, method] = match;
- const line = content.slice(0, match.index).split('\n').length;
+ const line = safe.slice(0, match.index).split('\n').length;
nodes.push({
id: `route:${filePath}:${method!.toUpperCase()}:${path}:${line}`,
diff --git a/src/utils.ts b/src/utils.ts
index e75e58e0..64741ab6 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -174,6 +174,135 @@ export function normalizePath(filePath: string): string {
return filePath.replace(/\\/g, '/');
}
+/**
+ * Drop a single leading byte-order mark (U+FEFF) from `content`.
+ *
+ * Not every editor writes the BOM, so without this normalization the
+ * same logical text can hash to two different values depending on which
+ * editor saved it last, which shows up as a spurious "modified" file on
+ * every sync.
+ *
+ * @param content - Text to normalize; may be empty.
+ * @returns `content` without its first character when that character is
+ *   U+FEFF, otherwise `content` unchanged. BOMs at any other position are
+ *   left in place.
+ */
+export function stripBom(content: string): string {
+  if (content.charCodeAt(0) !== 0xfeff) {
+    return content;
+  }
+  return content.slice(1);
+}
+
+/**
+ * Overwrite `text` with spaces while leaving every `\n` where it is.
+ *
+ * The result has the same length and the same line breaks as the input,
+ * so an offset found in the blanked text points at the same line and
+ * column in the original.
+ *
+ * @param text - Span to blank out.
+ * @returns A string of spaces and newlines with the same length as `text`.
+ */
+function blankPreservingNewlines(text: string): string {
+  return text
+    .split('\n')
+    .map((segment) => ' '.repeat(segment.length))
+    .join('\n');
+}
+
+/**
+ * Languages whose C-style block comments are neutralized before
+ * coarse-grained regex extraction (e.g., framework route decorators).
+ *
+ * Neutralizing comments keeps commented-out examples and docstring
+ * snippets from being extracted as real code constructs, without having
+ * to build a full lexer. Per language, the stripping covers:
+ *   - block comments, with newlines kept so line numbers stay correct;
+ *   - whole-line single-line comments, i.e. lines that hold nothing but
+ *     optional whitespace before the marker, which avoids corrupting
+ *     string literals that share a line with code;
+ *   - Python triple-quoted strings, the usual docstring carrier.
+ *
+ * Arbitrary string literals are deliberately NOT stripped: doing so
+ * could remove legitimate route paths the regexes need to see.
+ */
+const BLOCK_COMMENT_LANGUAGES = new Set<string>([
+  // JavaScript family
+  'javascript',
+  'typescript',
+  'tsx',
+  'jsx',
+  // C family and JVM/.NET
+  'java',
+  'csharp',
+  'cpp',
+  'c',
+  // Other languages with `/* ... */` comments
+  'go',
+  'rust',
+  'swift',
+  'kotlin',
+  'dart',
+  'scala',
+  'php',
+]);
+
+/**
+ * Per-language line-comment marker as a *line-anchored* prefix regex:
+ * optional spaces/tabs followed by the marker. A line matches only when
+ * the comment starts the line, so code with a trailing comment never
+ * matches.
+ *
+ * The regexes are stateless (no `/g`, no `/m`), so one instance can be
+ * shared between languages and reused across many `.test` calls without
+ * `lastIndex` pitfalls.
+ *
+ * Languages without an entry yield `undefined`; callers must treat that
+ * as "no known marker" and leave the content untouched.
+ */
+const SLASH_LINE_COMMENT = /^[ \t]*\/\//;
+const HASH_LINE_COMMENT = /^[ \t]*#/;
+// PHP accepts both `//` and `#`, but `#[` opens a PHP 8 attribute, which is
+// code rather than a comment and therefore must never be stripped.
+const PHP_LINE_COMMENT = /^[ \t]*(?:\/\/|#(?!\[))/;
+
+const LINE_COMMENT_MARKER: Record<string, RegExp> = {
+  javascript: SLASH_LINE_COMMENT,
+  typescript: SLASH_LINE_COMMENT,
+  tsx: SLASH_LINE_COMMENT,
+  jsx: SLASH_LINE_COMMENT,
+  java: SLASH_LINE_COMMENT,
+  csharp: SLASH_LINE_COMMENT,
+  cpp: SLASH_LINE_COMMENT,
+  c: SLASH_LINE_COMMENT,
+  go: SLASH_LINE_COMMENT,
+  rust: SLASH_LINE_COMMENT,
+  swift: SLASH_LINE_COMMENT,
+  kotlin: SLASH_LINE_COMMENT,
+  dart: SLASH_LINE_COMMENT,
+  scala: SLASH_LINE_COMMENT,
+  pascal: SLASH_LINE_COMMENT,
+  python: HASH_LINE_COMMENT,
+  ruby: HASH_LINE_COMMENT,
+  php: PHP_LINE_COMMENT,
+};
+
+/**
+ * Best-effort comment stripper for use before coarse-grained regex
+ * extraction. Returns content with comments and (for Python) triple-quoted
+ * strings replaced by spaces — newlines preserved so line/column offsets
+ * derived from the result still map onto the original file.
+ *
+ * Comment openers are only honored where a comment can actually start:
+ * single-line string literals and line comments are consumed first and
+ * passed through untouched, so a `/*` inside `'/api/*'` or a `"""` inside
+ * a `#` comment cannot open a bogus span that blanks real code up to the
+ * next closer. This is still not a full lexer; template literals, raw
+ * strings, and similar constructs are not recognized.
+ *
+ * @param content - Source text to neutralize.
+ * @param language - Language id used to choose the comment syntax.
+ * @returns Text of identical length and line structure with comment
+ *   bodies blanked. Languages without an entry are returned unchanged.
+ */
+export function stripCommentsForRegex(content: string, language: string): string {
+  let out = content;
+
+  if (BLOCK_COMMENT_LANGUAGES.has(language)) {
+    // Alternation order matters only at equal start positions; the regex
+    // engine always takes the leftmost match, so whichever construct opens
+    // first swallows the openers nested inside it. Only block comments are
+    // blanked here; strings and `//` comments are returned as-is.
+    out = out.replace(
+      /"(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*'|\/\/[^\n]*|\/\*[\s\S]*?\*\//g,
+      (token) => (token.startsWith('/*') ? blankPreservingNewlines(token) : token),
+    );
+  }
+  if (language === 'python') {
+    // Both triple-quote flavors are handled in one pass so a `'''` inside a
+    // `"""` string (or the reverse) cannot pair up with a later delimiter.
+    // Triple quotes come first in the alternation so `"""` is never read as
+    // an empty string followed by a quote.
+    out = out.replace(
+      /"""[\s\S]*?"""|'''[\s\S]*?'''|"(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*'|#[^\n]*/g,
+      (token) =>
+        token.startsWith('"""') || token.startsWith("'''")
+          ? blankPreservingNewlines(token)
+          : token,
+    );
+  }
+  if (language === 'ruby') {
+    out = out.replace(/^=begin\b[\s\S]*?^=end\b[^\n]*/gm, blankPreservingNewlines);
+  }
+
+  const lineMarker = LINE_COMMENT_MARKER[language];
+  if (lineMarker) {
+    // Walk lines; blank any line that starts with optional whitespace
+    // then the marker. Done line-at-a-time so we never touch content
+    // inside string literals on other lines.
+    out = out
+      .split('\n')
+      .map((line) => (lineMarker.test(line) ? blankPreservingNewlines(line) : line))
+      .join('\n');
+  }
+
+  return out;
+}
+
+/**
+ * Empty out every line that consists solely of a single-line comment in
+ * the given language. Each such line becomes an empty line, so the line
+ * count is unchanged and tree-sitter node positions stay correct.
+ *
+ * This backs the parser-retry "shrink the file" fallback. In contrast to
+ * {@link stripCommentsForRegex}, block comments and docstrings are left
+ * alone: the aim is to shed the cheapest dead weight (e.g. compiler test
+ * files dominated by `# CHECK:` / `// CHECK:` lines) without risking
+ * semantic changes.
+ *
+ * @param content - Full file content.
+ * @param language - Language id used to look up the line-comment marker.
+ * @returns The content with comment-only lines emptied, or `content`
+ *   itself when the language has no known line-comment marker.
+ */
+export function stripCommentLinesForRetry(content: string, language: string): string {
+  const marker = LINE_COMMENT_MARKER[language];
+  if (!marker) {
+    return content;
+  }
+
+  const kept: string[] = [];
+  for (const line of content.split('\n')) {
+    kept.push(marker.test(line) ? '' : line);
+  }
+  return kept.join('\n');
+}
+
/**
* Cross-process file lock using a lock file with PID tracking.
*