@@ -25,8 +25,10 @@ const WIKI_ETAG_CONCURRENCY = 5
2525const FILE_BATCH_SIZE = 100
2626/**
2727 * Max repository file size to index. The Items list API does not return file
28- * size, so this cap is enforced at content-fetch time in getDocument via the
29- * decoded byte length. Larger files are skipped.
28+ * size, so this cap is enforced at content-fetch time in getDocument: the raw
29+ * octet-stream body is read through `readBodyWithLimit`, which streams the bytes
30+ * and aborts (returning null) the moment the cap is exceeded. Larger files are
31+ * skipped without being fully buffered.
3032 */
3133const MAX_FILE_SIZE = 10 * 1024 * 1024
3234/** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */
@@ -197,10 +199,21 @@ function parseFileExternalId(externalId: string): { repoId: string; path: string
197199
198200/**
199201 * Builds the change-detection hash for a repository file. The git blob objectId
200- * is content-addressable, so it changes exactly when the file content changes —
201- * and it is available both on the tree listing (`objectId`) and the file fetch
202- * (`objectId`), so the stub and hydrated document hash identically without a
203- * content fetch during listing.
202+ * is content-addressable, so it changes exactly when the file content changes,
203+ * and it is reported both by the tree listing (`objectId`) and the per-file
204+ * metadata fetch (`objectId`) — so the listing stub and the hydrated document
205+ * normally hash identically without a content fetch during listing.
206+ *
207+ * Hydration in getFileDocument is a two-step fetch against the same branch ref:
208+ * a JSON metadata call yields the objectId used for this hash, then a raw
209+ * octet-stream call yields the content. Both pin to the branch *name*, not a
210+ * commit SHA, so if the branch advances between the two calls the stored hash
211+ * (metadata call's objectId) and the stored content (content call's bytes) can
212+ * be one commit apart. This window is bounded and self-heals: the next listing
213+ * reports the branch's current objectId, which differs from the stored
214+ * one-commit-old hash, queuing an update that re-fetches and re-converges
215+ * content and hash. (NOTE(review): a revert to the metadata call's bytes restores
216+ * the stored objectId, so no update is queued and the mismatch may persist — confirm.)
204217 */
205218function buildFileContentHash ( repoId : string , objectId : string ) : string {
206219 return `ado:file:${ repoId } :${ objectId } `
0 commit comments