Skip to content

Commit 1f1f1af

Browse files
committed
fix(knowledge): engine truncation flag is an absolute deletion block (fullSync cannot override); s3 byte-exact size fallback; ado tsdoc accuracy
1 parent 5f4516b commit 1f1f1af

3 files changed

Lines changed: 34 additions & 10 deletions

File tree

apps/sim/connectors/azure-devops/azure-devops.ts

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@ const WIKI_ETAG_CONCURRENCY = 5
2525
const FILE_BATCH_SIZE = 100
2626
/**
2727
* Max repository file size to index. The Items list API does not return file
28-
* size, so this cap is enforced at content-fetch time in getDocument via the
29-
* decoded byte length. Larger files are skipped.
28+
* size, so this cap is enforced at content-fetch time in getDocument: the raw
29+
* octet-stream body is read through `readBodyWithLimit`, which streams the bytes
30+
* and aborts (returning null) the moment the cap is exceeded. Larger files are
31+
* skipped without being fully buffered.
3032
*/
3133
const MAX_FILE_SIZE = 10 * 1024 * 1024
3234
/** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */
@@ -197,10 +199,21 @@ function parseFileExternalId(externalId: string): { repoId: string; path: string
197199

198200
/**
199201
* Builds the change-detection hash for a repository file. The git blob objectId
200-
* is content-addressable, so it changes exactly when the file content changes —
201-
* and it is available both on the tree listing (`objectId`) and the file fetch
202-
* (`objectId`), so the stub and hydrated document hash identically without a
203-
* content fetch during listing.
202+
* is content-addressable, so it changes exactly when the file content changes,
203+
* and it is reported both by the tree listing (`objectId`) and the per-file
204+
* metadata fetch (`objectId`) — so the listing stub and the hydrated document
205+
* normally hash identically without a content fetch during listing.
206+
*
207+
* Hydration in getFileDocument is a two-step fetch against the same branch ref:
208+
* a JSON metadata call yields the objectId used for this hash, then a raw
209+
* octet-stream call yields the content. Both pin to the branch *name*, not a
210+
* commit SHA, so if the branch advances between the two calls the stored hash
211+
* (metadata call's objectId) and the stored content (content call's bytes) can
212+
* be one commit apart. This window is bounded and self-heals: the next listing
213+
* reports the branch's current objectId, which differs from the stored
214+
* one-commit-old hash, queuing an update that re-fetches and re-converges
215+
* content and hash. (A revert to identical bytes yields the identical objectId
216+
* by content-addressing, so the stored content is already correct in that case.)
204217
*/
205218
function buildFileContentHash(repoId: string, objectId: string): string {
206219
return `ado:file:${repoId}:${objectId}`

apps/sim/connectors/s3/s3.ts

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -646,9 +646,7 @@ export const s3Connector: ConnectorConfig = {
646646
etag,
647647
lastModified,
648648
size:
649-
Number.isNaN(declaredLength) || declaredLength <= 0
650-
? Buffer.byteLength(content)
651-
: declaredLength,
649+
Number.isNaN(declaredLength) || declaredLength <= 0 ? body.byteLength : declaredLength,
652650
}
653651
const stub = objectToStub(ctx, entry)
654652
return { ...stub, content, contentDeferred: false }

apps/sim/lib/knowledge/connectors/sync-engine.ts

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,15 @@ export async function executeSync(
422422
* cursor. The listing is incomplete, so flag it to suppress deletion
423423
* reconciliation; otherwise documents beyond the truncation point would
424424
* be removed even though they still exist in the source.
425+
*
426+
* `listingTruncated` is distinct from connector-set `listingCapped`:
427+
* a forced fullSync may legitimately override a connector's soft cap
428+
* (the user opted to reconcile the capped scope), but engine-level
429+
* truncation can never be resolved by forcing a fullSync — the next
430+
* fullSync truncates identically — so it is an absolute deletion block.
425431
*/
426432
syncContext.listingCapped = true
433+
syncContext.listingTruncated = true
427434
logger.warn('Pagination ended before source exhaustion; skipping deletion reconciliation', {
428435
connectorId,
429436
docsSoFar: externalDocs.length,
@@ -652,7 +659,13 @@ export async function executeSync(
652659

653660
// Reconcile deletions for non-incremental syncs that returned ALL docs.
654661
// Skip when a connector capped the listing (maxFiles/maxThreads) — unseen docs may still exist in the source.
655-
if (!isIncremental && (!syncContext?.listingCapped || options?.fullSync)) {
662+
// A forced fullSync overrides connector-set caps, but never engine-level truncation
663+
// (listingTruncated): a truncated listing is incomplete no matter how the sync was triggered.
664+
if (
665+
!isIncremental &&
666+
!syncContext?.listingTruncated &&
667+
(!syncContext?.listingCapped || options?.fullSync)
668+
) {
656669
const removedIds = existingDocs
657670
.filter((d) => d.externalId && !seenExternalIds.has(d.externalId))
658671
.map((d) => d.id)

0 commit comments

Comments
 (0)