diff --git a/.husky/commit-msg b/.husky/commit-msg new file mode 100644 index 0000000..a78cc75 --- /dev/null +++ b/.husky/commit-msg @@ -0,0 +1 @@ +npx commitlint --edit $1 diff --git a/README.md b/README.md index ad479b7..8254045 100644 --- a/README.md +++ b/README.md @@ -122,8 +122,30 @@ const markdown = await summarizeGitDiff({ | `provider` | `LlmProviderId` — wins over `LLM_PROVIDER` env and auto-detection. | | `model` | Chat model id; overrides `LLM_MODEL` and the provider default. | | `maxDiffChars` | Caps unified diff size for the request. | +| `contextLines` | Number of context lines around each change (`git diff -U`). Lower values (1 or 0) are the single biggest token saver on modification-heavy diffs. | +| `ignoreWhitespace` | Passes `-w` / `--ignore-all-space` to `git diff` so pure-whitespace hunks don't consume tokens. Also applies to `--numstat` / `--name-status` so counts stay consistent. | +| `stripDiffPreamble` | Removes low-value lines from the unified diff (`diff --git`, `index`, mode changes, `similarity/rename/copy` metadata). `--- a/…`, `+++ b/…`, and `@@` hunk headers are kept. | +| `maxHunkLines` | Caps the body of each hunk; anything past the limit is replaced with a single elision marker. The `@@` header and `DiffSummary` totals are preserved. | +| `excludeDefaultNoise` | Merges the built-in `DEFAULT_NOISE_EXCLUDES` list (lockfiles, `dist`, `build`, `out`, `coverage`, `node_modules`, `__snapshots__`) into `excludeFolders`. | | `llmModelProvider` | `() => Promise` — bypass env-based resolution entirely; hand-wire a Vercel AI SDK `LanguageModel` (required in tests or custom setups). | +#### Reducing tokens + +For most repos, the cheapest wins are: + +```ts +await summarizeGitDiff({ + from: 'origin/main', + contextLines: 1, // -U1 cuts 30-60% of tokens on typical diffs + ignoreWhitespace: true, // drop pure-whitespace hunks entirely + stripDiffPreamble: true, // kill `index`/`mode`/`similarity` lines + maxHunkLines: 400, // truncate monster hunks but keep the @@ header + excludeDefaultNoise: true // skip lockfiles, dist/, coverage/, node_modules/ +}); +``` + +These options only reshape the *unified diff text* — the structured `DiffSummary` still reports true file counts and line totals, so the model always sees the full change inventory. + ### Injecting your own `LanguageModel` If you want full control — for example, to configure retries, middlewares, or hit an in-process mock — pass `llmModelProvider`: @@ -150,7 +172,7 @@ const md = await summarizeGitDiff({ The package also exports helpers for building a custom pipeline on top of the same git and LLM behavior: -- **Git**: `createGitClient`, `getRepoRoot`, `getCommits`, `getDiff`, `getDiffSummary`, `getChangedFiles`, `filterCommitsByMessageRegexes`, `buildDiffPathspecs` +- **Git**: `createGitClient`, `getRepoRoot`, `getCommits`, `getDiff`, `getDiffSummary`, `getChangedFiles`, `filterCommitsByMessageRegexes`, `buildDiffPathspecs`, `buildDiffShapingGitArgs`, `shapeUnifiedDiff`, `DEFAULT_NOISE_EXCLUDES` - **AI**: `generateSummary`, `resolveLlmMaxDiffChars`, `truncateUnifiedDiffForLlm` - **Provider resolution**: `resolveLanguageModel`, `detectLlmProvider`, `isLlmProviderConfigured`, `defaultModelForProvider`, `resolveLlmBaseUrl`, `parseLlmDefaultHeadersFromEnv` - **Constants / types**: `DEFAULT_GIT_DIFF_SYSTEM_PROMPT`, `LLM_GATEWAY_REQUIRED_MESSAGE`, `LlmProviderId`, `LlmModelProvider`, `ResolveLanguageModelOptions`, `GenerateSummaryInput`, `SummarizeFlags` diff --git a/commitlint.config.cjs b/commitlint.config.cjs new file mode 100644 index 0000000..3d88028 --- /dev/null +++ b/commitlint.config.cjs @@ -0,0 +1 @@ +module.exports = { extends: ['@commitlint/config-conventional'] }; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index dd26f01..2ab945d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,6 +17,7 @@ "tslib": "^2.6.2" }, "devDependencies": { + "@commitlint/config-conventional": "^20.5.0", "@rollup/plugin-terser": "^1.0.0", "@rollup/plugin-typescript": "^12.1.4", "@types/fs-extra": "^11.0.4", @@ -789,6 +790,32 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@commitlint/config-conventional": { + "version": "20.5.0", + "resolved": "https://registry.npmjs.org/@commitlint/config-conventional/-/config-conventional-20.5.0.tgz", + "integrity": "sha512-t3Ni88rFw1XMa4nZHgOKJ8fIAT9M2j5TnKyTqJzsxea7FUetlNdYFus9dz+MhIRZmc16P0PPyEfh6X2d/qw8SA==", + "dev": true, + "dependencies": { + "@commitlint/types": "^20.5.0", + "conventional-changelog-conventionalcommits": "^9.2.0" + }, + "engines": { + "node": ">=v18" + } + }, + "node_modules/@commitlint/types": { + "version": "20.5.0", + "resolved": "https://registry.npmjs.org/@commitlint/types/-/types-20.5.0.tgz", + "integrity": "sha512-ZJoS8oSq2CAZEpc/YI9SulLrdiIyXeHb/OGqGrkUP6Q7YV+0ouNAa7GjqRdXeQPncHQIDz/jbCTlHScvYvO/gA==", + "dev": true, + "dependencies": { + "conventional-commits-parser": "^6.3.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=v18" + } + }, "node_modules/@emnapi/core": { "version": "1.9.2", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.9.2.tgz", @@ -1901,6 +1928,18 @@ "@simple-git/args-pathspec": "^1.0.3" } }, + "node_modules/@simple-libs/stream-utils": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@simple-libs/stream-utils/-/stream-utils-1.2.0.tgz", + "integrity": "sha512-KxXvfapcixpz6rVEB6HPjOUZT22yN6v0vI0urQSk1L8MlEWPDFCZkhw2xmkyoTGYeFw7tWTZd7e3lVzRZRN/EA==", + "dev": true, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://ko-fi.com/dangreen" + } + }, "node_modules/@sinclair/typebox": { "version": "0.34.49", "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.34.49.tgz", @@ -2553,6 +2592,12 @@ "sprintf-js": "~1.0.2" } }, + "node_modules/array-ify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/array-ify/-/array-ify-1.0.0.tgz", + "integrity": "sha512-c5AMf34bKdvPhQ7tBGhqkgKNUzMr4WUs+WDtC2ZUGOUncbxKMTvqxYctiseW3+L4bA8ec+GcZ6/A/FW4m8ukng==", + "dev": true + }, "node_modules/aws4fetch": { "version": "1.0.20", "resolved": "https://registry.npmjs.org/aws4fetch/-/aws4fetch-1.0.20.tgz", @@ -3004,12 +3049,50 @@ "node": ">=20" } }, + "node_modules/compare-func": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/compare-func/-/compare-func-2.0.0.tgz", + "integrity": "sha512-zHig5N+tPWARooBnb0Zx1MFcdfpyJrfTJ3Y5L+IFvUm8rM74hHz66z0gw0x4tijh5CorKkKUCnW82R2vmpeCRA==", + "dev": true, + "dependencies": { + "array-ify": "^1.0.0", + "dot-prop": "^5.1.0" + } + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", "dev": true }, + "node_modules/conventional-changelog-conventionalcommits": { + "version": "9.3.1", + "resolved": "https://registry.npmjs.org/conventional-changelog-conventionalcommits/-/conventional-changelog-conventionalcommits-9.3.1.tgz", + "integrity": "sha512-dTYtpIacRpcZgrvBYvBfArMmK2xvIpv2TaxM0/ZI5CBtNUzvF2x0t15HsbRABWprS6UPmvj+PzHVjSx4qAVKyw==", + "dev": true, + "dependencies": { + "compare-func": "^2.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/conventional-commits-parser": { + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/conventional-commits-parser/-/conventional-commits-parser-6.4.0.tgz", + "integrity": "sha512-tvRg7FIBNlyPzjdG8wWRlPHQJJHI7DylhtRGeU9Lq+JuoPh5BKpPRX83ZdLrvXuOSu5Eo/e7SzOQhU4Hd2Miuw==", + "dev": true, + "dependencies": { + "@simple-libs/stream-utils": "^1.2.0", + "meow": "^13.0.0" + }, + "bin": { + "conventional-commits-parser": "dist/cli/index.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/convert-source-map": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", @@ -3084,6 +3167,18 @@ "node": ">=8" } }, + "node_modules/dot-prop": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/dot-prop/-/dot-prop-5.3.0.tgz", + "integrity": "sha512-QM8q3zDe58hqUqjraQOmzZ1LIH9SWQJTlEKCH4kJ2oQvLZk7RbQXvtDM2XEq3fwkV9CCvvH4LA0AV+ogFsBM2Q==", + "dev": true, + "dependencies": { + "is-obj": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/eastasianwidth": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/eastasianwidth/-/eastasianwidth-0.2.0.tgz", @@ -3855,6 +3950,15 @@ "node": ">=0.10.0" } }, + "node_modules/is-obj": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-obj/-/is-obj-2.0.0.tgz", + "integrity": "sha512-drqDG3cbczxxEJRoOXcOjtdp1J/lyp1mNn0xaznRs8+muBhgQcrnbspox5X5fOw0HnMnbfDzvnEMEtqDEJEo8w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, "node_modules/is-stream": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", @@ -4916,6 +5020,18 @@ "tmpl": "1.0.5" } }, + "node_modules/meow": { + "version": "13.2.0", + "resolved": "https://registry.npmjs.org/meow/-/meow-13.2.0.tgz", + "integrity": "sha512-pxQJQzB6djGPXh08dacEloMFopsOqGVRKFPYvPOt9XDZ1HasbgDZA74CJGreSU4G3Ak7EFJGoiH2auq+yXISgA==", + "dev": true, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/merge-stream": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz", diff --git a/package.json b/package.json index ba8391f..76074f3 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "**/*": "prettier --write \"**/*.{js,ts}\" --ignore-unknown" }, "devDependencies": { + "@commitlint/config-conventional": "^20.5.0", "@rollup/plugin-terser": "^1.0.0", "@rollup/plugin-typescript": "^12.1.4", "@types/fs-extra": "^11.0.4", diff --git a/src/git/diffShaping.ts b/src/git/diffShaping.ts new file mode 100644 index 0000000..562681c --- /dev/null +++ b/src/git/diffShaping.ts @@ -0,0 +1,184 @@ +/** + * Options that reshape a unified diff to reduce token cost before sending it to an LLM. + * Every option is opt-in; none of them alter the logical meaning of the diff. + */ +export type DiffShapingOptions = { + /** + * Number of context lines around each change. Passed as `-U` to `git diff`. + * Git defaults to 3; lowering to 0 or 1 commonly saves 30–60% of tokens on + * modification-heavy diffs with little loss of fidelity. + */ + contextLines?: number; + /** + * Pass `-w` / `--ignore-all-space` to `git diff`. Pure-whitespace hunks vanish, + * which is usually what you want when the model is reasoning about behavior. + */ + ignoreWhitespace?: boolean; + /** + * Strip low-value preamble lines from the unified diff: `diff --git`, `index`, + * `new/deleted file mode`, `old/new mode`, `similarity index`, `rename/copy from/to`. + * `--- a/...`, `+++ b/...`, and `@@` lines are kept so the model still sees file + * identity and hunk positions. + */ + stripDiffPreamble?: boolean; + /** + * Replace any hunk body longer than this many lines with a single elision marker + * after the truncation point, preserving the `@@` header. Totals are still + * reflected by the structured `DiffSummary`. + */ + maxHunkLines?: number; +}; + +/** + * Common high-token-cost files/folders that rarely help an LLM summarize a change. + * Entries are repo-root relative paths suitable for `excludeFolders` / git + * `:(exclude)` pathspecs. Opt in via `excludeDefaultNoise: true` on + * `summarizeGitDiff`, or merge this list into your own `excludeFolders`. + */ +export const DEFAULT_NOISE_EXCLUDES: readonly string[] = [ + "package-lock.json", + "yarn.lock", + "pnpm-lock.yaml", + "npm-shrinkwrap.json", + "bun.lockb", + "go.sum", + "Cargo.lock", + "Gemfile.lock", + "composer.lock", + "Pipfile.lock", + "poetry.lock", + "uv.lock", + "Podfile.lock", + "node_modules", + "dist", + "build", + "out", + "coverage", + "__snapshots__", +]; + +function normalizeContextLines(raw: number): number { + if (!Number.isFinite(raw) || raw < 0) return 0; + return Math.trunc(raw); +} + +/** Build the leading args for `git diff` implied by shaping options. */ +export function buildDiffShapingGitArgs( + shaping?: DiffShapingOptions, +): string[] { + const args: string[] = []; + if (shaping?.contextLines !== undefined) { + args.push(`-U${normalizeContextLines(shaping.contextLines)}`); + } + if (shaping?.ignoreWhitespace) { + args.push("-w"); + } + return args; +} + +const PREAMBLE_NOISE_PREFIXES = [ + "diff --git ", + "index ", + "new file mode ", + "deleted file mode ", + "old mode ", + "new mode ", + "similarity index ", + "dissimilarity index ", + "rename from ", + "rename to ", + "copy from ", + "copy to ", +]; + +function isPreambleNoiseLine(line: string): boolean { + for (const prefix of PREAMBLE_NOISE_PREFIXES) { + if (line.startsWith(prefix)) return true; + } + return false; +} + +function stripPreambleLines(text: string): string { + return text + .split(/\r?\n/) + .filter((line) => !isPreambleNoiseLine(line)) + .join("\n"); +} + +/** + * True when a line is a real unified-diff file header (`--- a/...`, `+++ b/...`, + * or `/dev/null`), not a deleted/added body line that happens to begin with `---`/`+++`. + */ +function isFileHeaderLine(line: string): boolean { + return ( + /^--- (a\/|b\/|"a\/|"b\/|\/dev\/null)/.test(line) || + /^\+\+\+ (a\/|b\/|"a\/|"b\/|\/dev\/null)/.test(line) + ); +} + +function elideLargeHunks(text: string, maxHunkLines: number): string { + const limit = normalizeContextLines(maxHunkLines); + const lines = text.split(/\r?\n/); + const out: string[] = []; + let inHunk = false; + let hunkBuf: string[] = []; + + const flushHunk = (): void => { + if (hunkBuf.length > limit) { + const elided = hunkBuf.length - limit; + out.push(...hunkBuf.slice(0, limit)); + out.push( + `[... ${elided} diff line${elided === 1 ? "" : "s"} elided ...]`, + ); + } else { + out.push(...hunkBuf); + } + hunkBuf = []; + inHunk = false; + }; + + for (const line of lines) { + if (line.startsWith("@@")) { + if (inHunk) flushHunk(); + out.push(line); + inHunk = true; + continue; + } + if (line.startsWith("diff --git ") || isFileHeaderLine(line)) { + if (inHunk) flushHunk(); + out.push(line); + continue; + } + if (inHunk) { + hunkBuf.push(line); + } else { + out.push(line); + } + } + if (inHunk) flushHunk(); + return out.join("\n"); +} + +/** + * Apply post-processing shaping (preamble stripping and hunk elision) to a + * unified diff. `-U` / `-w` are handled separately via + * {@link buildDiffShapingGitArgs} since they need to reach `git diff`. + * + * Order: strip preamble, then elide hunks. + */ +export function shapeUnifiedDiff( + text: string, + shaping?: DiffShapingOptions, +): string { + if (!shaping?.stripDiffPreamble && shaping?.maxHunkLines === undefined) { + return text; + } + let out = text; + if (shaping.stripDiffPreamble) { + out = stripPreambleLines(out); + } + if (shaping.maxHunkLines !== undefined) { + out = elideLargeHunks(out, shaping.maxHunkLines); + } + return out; +} diff --git a/src/git/diffTypes.ts b/src/git/diffTypes.ts index 898266f..0c5a20a 100644 --- a/src/git/diffTypes.ts +++ b/src/git/diffTypes.ts @@ -1,3 +1,5 @@ +import type { DiffShapingOptions } from "./diffShaping.js"; + export type CommitInfo = { hash: string; message: string; @@ -45,4 +47,6 @@ export type GitDiffRangeQuery = { pathFilter?: DiffPathFilter; /** When set, skips `git rev-parse` and uses this as the repo root for pathspecs. */ repoRootOverride?: string; + /** Token-reduction controls applied to the unified diff produced by `getDiff`. */ + shaping?: DiffShapingOptions; }; diff --git a/src/git/gitDiff.ts b/src/git/gitDiff.ts index 755485f..5fd3d9f 100644 --- a/src/git/gitDiff.ts +++ b/src/git/gitDiff.ts @@ -18,3 +18,9 @@ export { getRepoRoot, } from "./gitDiffOps.js"; export { parseDiffSummary } from "./diffSummaryParse.js"; +export type { DiffShapingOptions } from "./diffShaping.js"; +export { + DEFAULT_NOISE_EXCLUDES, + buildDiffShapingGitArgs, + shapeUnifiedDiff, +} from "./diffShaping.js"; diff --git a/src/git/gitDiffOps.ts b/src/git/gitDiffOps.ts index 350c66f..d2fa85c 100644 --- a/src/git/gitDiffOps.ts +++ b/src/git/gitDiffOps.ts @@ -8,6 +8,10 @@ import type { GitDiffRangeQuery, } from "./diffTypes.js"; import { buildDiffPathspecs } from "./diffPathspecs.js"; +import { + buildDiffShapingGitArgs, + shapeUnifiedDiff, +} from "./diffShaping.js"; import { buildDiffSummaryFromGitOutputs } from "./diffSummaryBuild.js"; export function createGitClient(cwd = process.cwd()): SimpleGit { @@ -47,33 +51,66 @@ export async function getDiff( git: SimpleGit, query: GitDiffRangeQuery, ): Promise { - const { from, to, commits, filterByCommits, pathFilter, repoRootOverride } = - query; + const { + from, + to, + commits, + filterByCommits, + pathFilter, + repoRootOverride, + shaping, + } = query; const { specs } = await getDiffPathContext(git, pathFilter, repoRootOverride); + const shapingArgs = buildDiffShapingGitArgs(shaping); if (!filterByCommits) { - return git.diff([`${from}..${to}`, "--", ...specs]); + const raw = await git.diff([ + ...shapingArgs, + `${from}..${to}`, + "--", + ...specs, + ]); + return shapeUnifiedDiff(raw, shaping); } const patches = await Promise.all( - commits.map((c) => git.diff([`${c.hash}^!`, "--", ...specs])), + commits.map((c) => + git.diff([...shapingArgs, `${c.hash}^!`, "--", ...specs]), + ), ); - return patches.filter(Boolean).join("\n"); + return patches + .map((p) => shapeUnifiedDiff(p, shaping)) + .filter(Boolean) + .join("\n"); } export async function getDiffSummary( git: SimpleGit, query: GitDiffRangeQuery, ): Promise { - const { from, to, commits, filterByCommits, pathFilter, repoRootOverride } = - query; + const { + from, + to, + commits, + filterByCommits, + pathFilter, + repoRootOverride, + shaping, + } = query; const { specs } = await getDiffPathContext(git, pathFilter, repoRootOverride); + const whitespaceArgs = shaping?.ignoreWhitespace ? ["-w"] : []; if (!filterByCommits) { const [numOutput, nameOutput] = await Promise.all([ - git.diff(["--numstat", `${from}..${to}`, "--", ...specs]), - git.diff(["--name-status", `${from}..${to}`, "--", ...specs]), + git.diff([...whitespaceArgs, "--numstat", `${from}..${to}`, "--", ...specs]), + git.diff([ + ...whitespaceArgs, + "--name-status", + `${from}..${to}`, + "--", + ...specs, + ]), ]); return buildDiffSummaryFromGitOutputs(nameOutput, numOutput); } @@ -82,8 +119,8 @@ export async function getDiffSummary( commits.map(async (c) => { const range = `${c.hash}^!`; const [numOutput, nameOutput] = await Promise.all([ - git.diff(["--numstat", range, "--", ...specs]), - git.diff(["--name-status", range, "--", ...specs]), + git.diff([...whitespaceArgs, "--numstat", range, "--", ...specs]), + git.diff([...whitespaceArgs, "--name-status", range, "--", ...specs]), ]); return { numOutput, nameOutput }; }), diff --git a/src/index.ts b/src/index.ts index a0aa423..658fd11 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,6 +5,7 @@ import type { LlmModelProvider, SummarizeFlags } from "./ai/aiTypes.js"; import type { LlmProviderId } from "./ai/llmProviders.js"; import { createGitClient, + DEFAULT_NOISE_EXCLUDES, filterCommitsByMessageRegexes, getChangedFiles, getCommits, @@ -12,6 +13,7 @@ import { getDiffSummary, type CommitInfo, type DiffPathFilter, + type DiffShapingOptions, } from "./git/gitDiff.js"; export type GitDiffAiSummaryOptions = { @@ -51,6 +53,29 @@ export type GitDiffAiSummaryOptions = { */ provider?: LlmProviderId; maxDiffChars?: number; + /** + * Number of context lines around each change (git `-U`). Default git behavior is 3; + * dropping to 0 or 1 is the single biggest token saver on modification-heavy diffs. + */ + contextLines?: number; + /** Pass `-w` / `--ignore-all-space` so pure-whitespace hunks don't consume tokens. */ + ignoreWhitespace?: boolean; + /** + * Strip low-value preamble lines (`diff --git`, `index`, mode changes, rename/copy metadata) + * from the unified diff. `--- a/...`, `+++ b/...`, and `@@` hunk headers are kept. + */ + stripDiffPreamble?: boolean; + /** + * Replace any hunk body longer than this many lines with an elision marker after + * the truncation point. The `@@` header is preserved and the structured diff + * summary still reflects the true counts. + */ + maxHunkLines?: number; + /** + * Merge the built-in high-noise path list ({@link DEFAULT_NOISE_EXCLUDES}) into + * `excludeFolders` — lockfiles, `dist`, `build`, `node_modules`, `coverage`, etc. + */ + excludeDefaultNoise?: boolean; /** * Optional factory returning a Vercel AI SDK `LanguageModel` — bypass env-based * provider resolution (useful in tests and bespoke setups). @@ -58,6 +83,39 @@ export type GitDiffAiSummaryOptions = { llmModelProvider?: LlmModelProvider; }; +function buildShapingFromOptions( + options: GitDiffAiSummaryOptions, +): DiffShapingOptions | undefined { + const shaping: DiffShapingOptions = {}; + if (options.contextLines !== undefined) { + shaping.contextLines = options.contextLines; + } + if (options.ignoreWhitespace) shaping.ignoreWhitespace = true; + if (options.stripDiffPreamble) shaping.stripDiffPreamble = true; + if (options.maxHunkLines !== undefined) { + shaping.maxHunkLines = options.maxHunkLines; + } + return Object.keys(shaping).length > 0 ? shaping : undefined; +} + +function buildEffectiveExcludeFolders( + options: GitDiffAiSummaryOptions, +): string[] | undefined { + const userExcludes = options.excludeFolders ?? []; + if (!options.excludeDefaultNoise) { + return userExcludes.length > 0 ? userExcludes : undefined; + } + const seen = new Set(); + const merged: string[] = []; + for (const p of [...DEFAULT_NOISE_EXCLUDES, ...userExcludes]) { + const key = p.trim(); + if (!key || seen.has(key)) continue; + seen.add(key); + merged.push(p); + } + return merged; +} + function hasNonEmptyTrimmed(arr?: string[]): boolean { return (arr ?? []).some((s) => s.trim().length > 0); } @@ -90,12 +148,13 @@ export async function summarizeGitDiff( const from = options.from; const to = options.to ?? "HEAD"; + const effectiveExcludeFolders = buildEffectiveExcludeFolders(options); const pathFilter: DiffPathFilter | undefined = hasNonEmptyTrimmed(options.includeFolders) || - hasNonEmptyTrimmed(options.excludeFolders) + hasNonEmptyTrimmed(effectiveExcludeFolders) ? { includeFolders: options.includeFolders, - excludeFolders: options.excludeFolders, + excludeFolders: effectiveExcludeFolders, } : undefined; @@ -111,12 +170,14 @@ export async function summarizeGitDiff( options, ); + const shaping = buildShapingFromOptions(options); const rangeQuery = { from, to, commits: filteredCommits, filterByCommits, pathFilter, + shaping, }; const [diffText, fileNames, diffSummary] = await Promise.all([ @@ -151,11 +212,14 @@ export type { CommitInfo, DiffFileSummary, DiffPathFilter, + DiffShapingOptions, DiffSummary, GitDiffRangeQuery, } from "./git/gitDiff.js"; export { + DEFAULT_NOISE_EXCLUDES, buildDiffPathspecs, + buildDiffShapingGitArgs, createGitClient, filterCommitsByMessageRegexes, getChangedFiles, @@ -163,6 +227,7 @@ export { getDiff, getDiffSummary, getRepoRoot, + shapeUnifiedDiff, } from "./git/gitDiff.js"; export type { diff --git a/test/diffShaping.spec.ts b/test/diffShaping.spec.ts new file mode 100644 index 0000000..10ad056 --- /dev/null +++ b/test/diffShaping.spec.ts @@ -0,0 +1,282 @@ +import { + DEFAULT_NOISE_EXCLUDES, + buildDiffShapingGitArgs, + shapeUnifiedDiff, +} from "../src/git/gitDiff"; + +describe("buildDiffShapingGitArgs", () => { + it("returns an empty array when shaping is undefined", () => { + expect(buildDiffShapingGitArgs()).toEqual([]); + }); + + it("returns an empty array when no fields are set", () => { + expect(buildDiffShapingGitArgs({})).toEqual([]); + }); + + it("emits -U for valid contextLines", () => { + expect(buildDiffShapingGitArgs({ contextLines: 1 })).toEqual(["-U1"]); + }); + + it("truncates fractional contextLines", () => { + expect(buildDiffShapingGitArgs({ contextLines: 2.9 })).toEqual(["-U2"]); + }); + + it("clamps negative contextLines to 0", () => { + expect(buildDiffShapingGitArgs({ contextLines: -5 })).toEqual(["-U0"]); + }); + + it("clamps non-finite contextLines to 0", () => { + expect(buildDiffShapingGitArgs({ contextLines: Number.NaN })).toEqual([ + "-U0", + ]); + }); + + it("emits -w when ignoreWhitespace is true", () => { + expect(buildDiffShapingGitArgs({ ignoreWhitespace: true })).toEqual(["-w"]); + }); + + it("omits -w when ignoreWhitespace is false", () => { + expect(buildDiffShapingGitArgs({ ignoreWhitespace: false })).toEqual([]); + }); + + it("combines contextLines and ignoreWhitespace", () => { + expect( + buildDiffShapingGitArgs({ contextLines: 0, ignoreWhitespace: true }), + ).toEqual(["-U0", "-w"]); + }); +}); + +describe("DEFAULT_NOISE_EXCLUDES", () => { + it("includes common lockfiles and build outputs", () => { + expect(DEFAULT_NOISE_EXCLUDES).toEqual( + expect.arrayContaining([ + "package-lock.json", + "yarn.lock", + "pnpm-lock.yaml", + "node_modules", + "dist", + "coverage", + ]), + ); + }); +}); + +const SAMPLE_DIFF = [ + "diff --git a/app.ts b/app.ts", + "index abc123..def456 100644", + "--- a/app.ts", + "+++ b/app.ts", + "@@ -1,4 +1,5 @@", + " function add(a: number, b: number): number {", + "- return a + b;", + "+ const result = a + b;", + "+ return result;", + " }", + "diff --git a/lib.ts b/lib.ts", + "similarity index 80%", + "rename from src/old.ts", + "rename to src/new.ts", + "index 111..222 100644", + "--- a/src/old.ts", + "+++ b/src/new.ts", + "@@ -10,2 +10,2 @@", + "-const x = 1;", + "+const x = 2;", +].join("\n"); + +describe("shapeUnifiedDiff", () => { + it("returns input untouched when shaping is undefined", () => { + expect(shapeUnifiedDiff(SAMPLE_DIFF)).toBe(SAMPLE_DIFF); + }); + + it("returns input untouched when shaping has no post-processing options", () => { + expect( + shapeUnifiedDiff(SAMPLE_DIFF, { + contextLines: 1, + ignoreWhitespace: true, + }), + ).toBe(SAMPLE_DIFF); + }); + + it("strips preamble noise while keeping ---/+++/@@ lines", () => { + const out = shapeUnifiedDiff(SAMPLE_DIFF, { stripDiffPreamble: true }); + expect(out).not.toMatch(/^diff --git /m); + expect(out).not.toMatch(/^index /m); + expect(out).not.toMatch(/^similarity index /m); + expect(out).not.toMatch(/^rename from /m); + expect(out).not.toMatch(/^rename to /m); + expect(out).toContain("--- a/app.ts"); + expect(out).toContain("+++ b/app.ts"); + expect(out).toContain("@@ -1,4 +1,5 @@"); + expect(out).toContain("- return a + b;"); + }); + + it("strips new/deleted file mode and old/new mode lines", () => { + const raw = [ + "diff --git a/new.ts b/new.ts", + "new file mode 100644", + "index 0000000..abcdef", + "--- /dev/null", + "+++ b/new.ts", + "@@ -0,0 +1,1 @@", + "+added", + "diff --git a/gone.ts b/gone.ts", + "deleted file mode 100644", + "index abcdef..0000000", + "--- a/gone.ts", + "+++ /dev/null", + "@@ -1,1 +0,0 @@", + "-removed", + "diff --git a/exec.sh b/exec.sh", + "old mode 100644", + "new mode 100755", + ].join("\n"); + const out = shapeUnifiedDiff(raw, { stripDiffPreamble: true }); + expect(out).not.toMatch(/^new file mode /m); + expect(out).not.toMatch(/^deleted file mode /m); + expect(out).not.toMatch(/^old mode /m); + expect(out).not.toMatch(/^new mode /m); + expect(out).toContain("--- /dev/null"); + expect(out).toContain("+++ /dev/null"); + }); + + it("elides hunk bodies longer than maxHunkLines", () => { + const bigHunk = [ + "--- a/big.ts", + "+++ b/big.ts", + "@@ -1,5 +1,5 @@", + " line1", + "-line2", + "+line2b", + " line3", + " line4", + " line5", + ].join("\n"); + const out = shapeUnifiedDiff(bigHunk, { maxHunkLines: 3 }); + const lines = out.split("\n"); + expect(lines).toContain("@@ -1,5 +1,5 @@"); + expect(lines).toContain(" line1"); + expect(lines).toContain("-line2"); + expect(lines).toContain("+line2b"); + expect(lines.some((l) => /3 diff lines elided/.test(l))).toBe(true); + expect(lines).not.toContain(" line5"); + }); + + it("does not elide hunks at or below maxHunkLines", () => { + const smallHunk = [ + "--- a/small.ts", + "+++ b/small.ts", + "@@ -1,2 +1,2 @@", + "-old", + "+new", + ].join("\n"); + const out = shapeUnifiedDiff(smallHunk, { maxHunkLines: 10 }); + expect(out).toBe(smallHunk); + }); + + it("uses singular 'line' when exactly one line is elided", () => { + const hunk = [ + "@@ -1,3 +1,3 @@", + " a", + "-b", + "+c", + " d", + ].join("\n"); + const out = shapeUnifiedDiff(hunk, { maxHunkLines: 3 }); + expect(out).toMatch(/1 diff line elided/); + expect(out).not.toMatch(/1 diff lines elided/); + }); + + it("flushes an open hunk when a new @@ header begins", () => { + const raw = [ + "@@ -1,3 +1,3 @@", + " a", + "-b", + "+c", + " d", + "@@ -20,2 +20,2 @@", + "-x", + "+y", + ].join("\n"); + const out = shapeUnifiedDiff(raw, { maxHunkLines: 2 }); + const lines = out.split("\n"); + expect(lines.filter((l) => /diff line.* elided/.test(l)).length).toBe(1); + expect(lines).toContain("@@ -20,2 +20,2 @@"); + expect(lines).toContain("-x"); + }); + + it("flushes an open hunk when a new diff --git boundary appears", () => { + const raw = [ + "@@ -1,3 +1,3 @@", + " a", + "-b", + "+c", + " d", + "diff --git a/other.ts b/other.ts", + "--- a/other.ts", + "+++ b/other.ts", + "@@ -1,1 +1,1 @@", + "-x", + "+y", + ].join("\n"); + const out = shapeUnifiedDiff(raw, { maxHunkLines: 2 }); + const lines = out.split("\n"); + expect(lines).toContain("diff --git a/other.ts b/other.ts"); + expect(lines.filter((l) => /diff line.* elided/.test(l)).length).toBe(1); + }); + + it("composes preamble strip with hunk elision", () => { + const out = shapeUnifiedDiff(SAMPLE_DIFF, { + stripDiffPreamble: true, + maxHunkLines: 1, + }); + expect(out).not.toMatch(/^diff --git /m); + expect(out).toMatch(/diff line.* elided/); + }); + + it("keeps hunk body lines that look like '---' deletions, not file headers", () => { + const raw = [ + "--- a/foo.md", + "+++ b/foo.md", + "@@ -1,3 +1,3 @@", + "-separator", + "+replacement", + "---", + ].join("\n"); + const out = shapeUnifiedDiff(raw, { maxHunkLines: 100 }); + expect(out).toContain("---"); + expect(out).toContain("-separator"); + }); + + it("treats maxHunkLines of 0 as eliding the entire body", () => { + const hunk = [ + "@@ -1,2 +1,2 @@", + "-a", + "+b", + ].join("\n"); + const out = shapeUnifiedDiff(hunk, { maxHunkLines: 0 }); + expect(out).toMatch(/^@@ -1,2 \+1,2 @@/); + expect(out).toContain("2 diff lines elided"); + expect(out).not.toContain("\n-a"); + }); + + it("normalizes negative and non-finite maxHunkLines to 0", () => { + const hunk = ["@@ -1,1 +1,1 @@", "-a", "+b"].join("\n"); + expect(shapeUnifiedDiff(hunk, { maxHunkLines: -3 })).toContain( + "diff lines elided", + ); + expect(shapeUnifiedDiff(hunk, { maxHunkLines: Number.NaN })).toContain( + "diff lines elided", + ); + }); + + it("does not collapse lines outside of any hunk when only maxHunkLines is set", () => { + const raw = [ + "diff --git a/a.ts b/a.ts", + "index abc..def 100644", + "--- a/a.ts", + "+++ b/a.ts", + ].join("\n"); + expect(shapeUnifiedDiff(raw, { maxHunkLines: 1 })).toBe(raw); + }); +}); diff --git a/test/gitDiff.async.spec.ts b/test/gitDiff.async.spec.ts index 7faa61b..8c9f6a8 100644 --- a/test/gitDiff.async.spec.ts +++ b/test/gitDiff.async.spec.ts @@ -81,6 +81,80 @@ describe("getDiff", () => { ).not.toHaveBeenCalled(); }); + it("forwards shaping args and post-processes the range diff", async () => { + const { git, diff } = makeGitWithDiff(); + diff.mockResolvedValue( + [ + "diff --git a/a.ts b/a.ts", + "index 111..222 100644", + "--- a/a.ts", + "+++ b/a.ts", + "@@ -1,1 +1,1 @@", + "-old", + "+new", + ].join("\n"), + ); + + const out = await getDiff(git, { + from: "a", + to: "b", + commits: [{ hash: "x", message: "m" }], + filterByCommits: false, + repoRootOverride: join(__dirname, "fixture-repo"), + shaping: { + contextLines: 1, + ignoreWhitespace: true, + stripDiffPreamble: true, + }, + }); + + expect(diff).toHaveBeenCalledWith([ + "-U1", + "-w", + "a..b", + "--", + ".", + ]); + expect(out).not.toContain("diff --git"); + expect(out).not.toContain("index 111..222"); + expect(out).toContain("--- a/a.ts"); + expect(out).toContain("@@ -1,1 +1,1 @@"); + }); + + it("shapes each per-commit patch independently", async () => { + const { git, diff } = makeGitWithDiff(); + diff + .mockResolvedValueOnce( + [ + "diff --git a/a.ts b/a.ts", + "index 111..222 100644", + "--- a/a.ts", + "+++ b/a.ts", + "@@ -1,1 +1,1 @@", + "-a", + "+b", + ].join("\n"), + ) + .mockResolvedValueOnce(""); + + const out = await getDiff(git, { + from: "f", + to: "t", + commits: [ + { hash: "aaa", message: "1" }, + { hash: "bbb", message: "2" }, + ], + filterByCommits: true, + repoRootOverride: join(__dirname, "fixture-repo"), + shaping: { stripDiffPreamble: true, contextLines: 0 }, + }); + + expect(diff).toHaveBeenCalledWith(["-U0", "aaa^!", "--", "."]); + expect(diff).toHaveBeenCalledWith(["-U0", "bbb^!", "--", "."]); + expect(out).not.toContain("diff --git"); + expect(out).toContain("--- a/a.ts"); + }); + it("joins per-commit patches and drops empty", async () => { const { git, diff } = makeGitWithDiff(); diff.mockResolvedValueOnce("").mockResolvedValueOnce("patch-b"); @@ -242,6 +316,57 @@ describe("getDiffSummary", () => { expect(shared?.oldPath).toBe("old/name.ts"); }); + it("passes -w to numstat and name-status when ignoreWhitespace is set", async () => { + const { git, diff } = makeGitWithDiff(); + diff.mockResolvedValue(""); + + await getDiffSummary(git, { + from: "a", + to: "b", + commits: [{ hash: "h", message: "m" }], + filterByCommits: false, + repoRootOverride: join(__dirname, "fixture-repo"), + shaping: { ignoreWhitespace: true }, + }); + + expect(diff).toHaveBeenCalledWith([ + "-w", + "--numstat", + "a..b", + "--", + ".", + ]); + expect(diff).toHaveBeenCalledWith([ + "-w", + "--name-status", + "a..b", + "--", + ".", + ]); + }); + + it("passes -w per-commit when ignoreWhitespace is set with filterByCommits", async () => { + const { git, diff } = makeGitWithDiff(); + diff.mockResolvedValue(""); + + await getDiffSummary(git, { + from: "a", + to: "b", + commits: [{ hash: "c1", message: "m" }], + filterByCommits: true, + repoRootOverride: join(__dirname, "fixture-repo"), + shaping: { ignoreWhitespace: true }, + }); + + expect(diff).toHaveBeenCalledWith([ + "-w", + "--numstat", + "c1^!", + "--", + ".", + ]); + }); + it("aggregates per-commit summaries", async () => { const { git, diff } = makeGitWithDiff(); let call = 0; diff --git a/test/summarizeGitDiff.spec.ts b/test/summarizeGitDiff.spec.ts index 5fba2ed..3c0a895 100644 --- a/test/summarizeGitDiff.spec.ts +++ b/test/summarizeGitDiff.spec.ts @@ -70,4 +70,130 @@ describe("summarizeGitDiff", () => { ); expect(hasPerCommitPatch).toBe(true); }); + + const SUMMARY_MODE_FLAGS = ["--numstat", "--name-status", "--name-only"]; + + function findPatchCall(diffCalls: string[][]): string[] | undefined { + return diffCalls.find( + (args) => + args.includes("a..b") && + !SUMMARY_MODE_FLAGS.some((f) => args.includes(f)), + ); + } + + it("forwards flat shaping options to git diff", async () => { + const git = createMockGit(repoRoot); + + await summarizeGitDiff({ + from: "a", + to: "b", + git, + contextLines: 0, + ignoreWhitespace: true, + stripDiffPreamble: true, + maxHunkLines: 200, + llmModelProvider: mockLlmProvider("ok"), + }); + + const diffCalls = (git.diff as jest.Mock).mock.calls.map( + (c) => c[0] as string[], + ); + const patchCall = findPatchCall(diffCalls); + expect(patchCall).toEqual(["-U0", "-w", "a..b", "--", "."]); + const numstatCall = diffCalls.find( + (args) => args.includes("--numstat") && args.includes("a..b"), + ); + expect(numstatCall?.[0]).toBe("-w"); + }); + + it("merges excludeDefaultNoise into excludeFolders and deduplicates user entries", async () => { + const git = createMockGit(repoRoot); + + await summarizeGitDiff({ + from: "a", + to: "b", + git, + excludeFolders: ["node_modules", "custom-out"], + excludeDefaultNoise: true, + llmModelProvider: mockLlmProvider("ok"), + }); + + const diffCalls = (git.diff as jest.Mock).mock.calls.map( + (c) => c[0] as string[], + ); + const patchCall = findPatchCall(diffCalls); + expect(patchCall).toBeDefined(); + const excludes = (patchCall ?? []).filter((a) => + a.startsWith(":(exclude)"), + ); + expect(excludes).toContain(":(exclude)package-lock.json"); + expect(excludes).toContain(":(exclude)custom-out"); + const nodeModulesCount = excludes.filter( + (e) => e === ":(exclude)node_modules", + ).length; + expect(nodeModulesCount).toBe(1); + }); + + it("ignores blank/duplicate entries when merging default noise excludes", async () => { + const git = createMockGit(repoRoot); + + await summarizeGitDiff({ + from: "a", + to: "b", + git, + excludeFolders: [" ", "custom-out", "custom-out"], + excludeDefaultNoise: true, + llmModelProvider: mockLlmProvider("ok"), + }); + + const diffCalls = (git.diff as jest.Mock).mock.calls.map( + (c) => c[0] as string[], + ); + const patchCall = findPatchCall(diffCalls); + const customCount = (patchCall ?? []).filter( + (e) => e === ":(exclude)custom-out", + ).length; + expect(customCount).toBe(1); + }); + + it("merges default noise excludes even when no user excludes are supplied", async () => { + const git = createMockGit(repoRoot); + + await summarizeGitDiff({ + from: "a", + to: "b", + git, + excludeDefaultNoise: true, + llmModelProvider: mockLlmProvider("ok"), + }); + + const diffCalls = (git.diff as jest.Mock).mock.calls.map( + (c) => c[0] as string[], + ); + const patchCall = findPatchCall(diffCalls); + const excludes = (patchCall ?? []).filter((a) => + a.startsWith(":(exclude)"), + ); + expect(excludes).toContain(":(exclude)package-lock.json"); + }); + + it("leaves path filter untouched when excludeDefaultNoise is not set", async () => { + const git = createMockGit(repoRoot); + + await summarizeGitDiff({ + from: "a", + to: "b", + git, + llmModelProvider: mockLlmProvider("ok"), + }); + + const diffCalls = (git.diff as jest.Mock).mock.calls.map( + (c) => c[0] as string[], + ); + const patchCall = findPatchCall(diffCalls); + expect(patchCall).toBeDefined(); + expect( + (patchCall ?? []).some((a) => a.startsWith(":(exclude)")), + ).toBe(false); + }); });