fix(connectors): audit fixes across new connectors

waleedlatif1 · waleedlatif1 · commit c16e94a3e8f6 · 2026-06-04T14:47:13.000-07:00
- registry: register x connector (was dead code, never wired in)
- google-docs/google-drive/google-forms: skip deletion reconciliation when
  Drive reports incompleteSearch (partial listing); google-docs also now
  sets listingCapped on its maxDocs cap path
- jsm: add read:jira-user scope so reporter resolves on requests
- gong: only set listingCapped on genuine truncation, not when the cap is
  reached exactly as the source is exhausted
- gitlab: issues phase switched to keyset pagination (removes ~50k
  offset ceiling), matching the repo-tree phase
- grain: parallelize recording + transcript fetch in getDocument
- ashby: document updatedAt-based content-hash limitation for
  notes/feedback change detection
- tests: mapTags coverage for x, granola, greenhouse, fathom, rootly
diff --git a/apps/sim/connectors/ashby/ashby.ts b/apps/sim/connectors/ashby/ashby.ts
@@ -298,7 +298,32 @@ function renderFeedbackValue(value: unknown): string {
 
 /**
  * Stable, metadata-based content hash for a candidate document. Identical between the
- * listing stub and the fully-fetched document so unchanged candidates are skipped.
+ * listing stub and the fully-fetched document so unchanged candidates are skipped,
+ * which keeps the `getDocument` re-hydration (notes + feedback fetches) cheap: the
+ * sync engine only re-hydrates a deferred stub when this hash differs from the stored
+ * document's hash (see `lib/knowledge/connectors/sync-engine.ts`).
+ *
+ * Known limitation — notes/feedback freshness depends on `candidate.updatedAt`.
+ * Candidate notes (`candidate.listNotes`) and interview feedback
+ * (`applicationFeedback.list`) are separate Ashby objects, not candidate fields. This
+ * hash is derived solely from the candidate's own `updatedAt`, so a new note or newly
+ * submitted feedback is only re-synced if Ashby advances `candidate.updatedAt` as a
+ * side effect of that write.
+ *
+ * As of this writing Ashby's public API docs do not specify what counts as a
+ * "modification" for `candidate.updatedAt` or for `candidate.list` syncToken
+ * incremental sync, and no third-party ATS-integration vendor (Merge, Nango, Knit)
+ * documents it either — so this behavior is unverified. If Ashby does NOT touch
+ * `candidate.updatedAt` on note/feedback writes, those additions will not be picked up
+ * until some other candidate field changes; a forced full sync re-hydrates everything
+ * regardless. No cheaper listing-time signal exists to fold into this hash: the
+ * `candidate.list` object exposes no note/feedback count, and syncToken carries the
+ * same unspecified change semantics as `updatedAt`.
+ *
+ * Refs:
+ * - https://developers.ashbyhq.com/reference/candidatelist
+ * - https://developers.ashbyhq.com/reference/candidatecreatenote
+ * - https://developers.ashbyhq.com/docs/pagination-and-incremental-sync
  */
 function buildContentHash(id: string, updatedAt: string | null): string {
   return `ashby:${id}:${updatedAt ?? ''}`
diff --git a/apps/sim/connectors/gitlab/gitlab.ts b/apps/sim/connectors/gitlab/gitlab.ts
@@ -470,15 +470,18 @@ async function fetchProject(
 }
 
 /**
- * Encodes the listing cursor. The cursor packs the resource phase (wiki ➜ issues)
- * and the issues page number so a single sync walks wikis first, then paginates
- * issues via the X-Next-Page header.
+ * Encodes the listing cursor. The cursor packs the resource phase (repo ➜ wiki ➜
+ * issues) and a per-phase continuation token so a single sync walks the phases in
+ * order. The repository-tree and issues phases both use GitLab keyset pagination
+ * and store the full `rel="next"` URL from the Link header to fetch verbatim.
  */
 interface CursorState {
   phase: SyncPhase
   issuePage: number
   /** Full `rel="next"` URL for the repository-tree keyset page to fetch next. */
   fileNextUrl?: string
+  /** Full `rel="next"` URL for the issues keyset page to fetch next. */
+  issueNextUrl?: string
 }
 
 function encodeCursor(state: CursorState): string {
@@ -492,6 +495,7 @@ function decodeCursor(cursor: string | undefined, initialPhase: SyncPhase): Curs
       phase: SyncPhase
       issuePage: number
       fileNextUrl: string
+      issueNextUrl: string
     }>
     const phase: SyncPhase =
       parsed.phase === 'repo' || parsed.phase === 'issues' || parsed.phase === 'wiki'
@@ -501,6 +505,7 @@ function decodeCursor(cursor: string | undefined, initialPhase: SyncPhase): Curs
       phase,
       issuePage: Number(parsed.issuePage) > 0 ? Number(parsed.issuePage) : 1,
       fileNextUrl: typeof parsed.fileNextUrl === 'string' ? parsed.fileNextUrl : undefined,
+      issueNextUrl: typeof parsed.issueNextUrl === 'string' ? parsed.issueNextUrl : undefined,
     }
   } catch {
     return { phase: initialPhase, issuePage: 1 }
@@ -859,9 +864,9 @@ export const gitlabConnector: ConnectorConfig = {
     if (state.phase === 'issues') {
       const params = new URLSearchParams({
         per_page: String(PAGE_SIZE),
-        page: String(state.issuePage),
         order_by: 'updated_at',
         sort: 'desc',
+        pagination: 'keyset',
       })
       if (lastSyncAt) params.set('updated_after', lastSyncAt.toISOString())
       const issueState =
@@ -874,11 +879,15 @@ export const gitlabConnector: ConnectorConfig = {
         typeof sourceConfig.issueMilestone === 'string' ? sourceConfig.issueMilestone.trim() : ''
       if (issueMilestone) params.set('milestone', issueMilestone)
 
-      const url = `${apiBase}/projects/${encodedProject}/issues?${params.toString()}`
+      if (state.issueNextUrl && !isSameOrigin(state.issueNextUrl, apiBase)) {
+        throw new Error('GitLab pagination cursor points to an unexpected host')
+      }
+      const url =
+        state.issueNextUrl ?? `${apiBase}/projects/${encodedProject}/issues?${params.toString()}`
       logger.info('Listing GitLab issues', {
         host,
         project: encodedProject,
-        page: state.issuePage,
+        continued: Boolean(state.issueNextUrl),
         incremental: Boolean(lastSyncAt),
       })
 
@@ -909,18 +918,18 @@ export const gitlabConnector: ConnectorConfig = {
         maxItems,
         syncContext
       )
+      if (hitLimit) return { documents: capped, hasMore: false }
 
-      const nextPageHeader = response.headers.get('x-next-page')?.trim()
-      const nextPage = nextPageHeader ? Number(nextPageHeader) : 0
-      const hasMorePages = !hitLimit && Number.isFinite(nextPage) && nextPage > 0
-
-      return {
-        documents: capped,
-        nextCursor: hasMorePages
-          ? encodeCursor({ phase: 'issues', issuePage: nextPage })
-          : undefined,
-        hasMore: hasMorePages,
+      const nextLink = parseNextLink(response.headers.get('link'))
+      if (nextLink) {
+        return {
+          documents: capped,
+          nextCursor: encodeCursor({ phase: 'issues', issuePage: 1, issueNextUrl: nextLink }),
+          hasMore: true,
+        }
       }
+
+      return { documents: capped, hasMore: false }
     }
 
     return { documents: [], hasMore: false }
diff --git a/apps/sim/connectors/gong/gong.ts b/apps/sim/connectors/gong/gong.ts
@@ -417,20 +417,32 @@ export const gongConnector: ConnectorConfig = {
 
     const prevFetched = (syncContext?.totalDocsFetched as number) ?? 0
     let documents = allDocuments
+    let capDroppedDocs = false
     if (maxCalls > 0) {
       const remaining = Math.max(0, maxCalls - prevFetched)
       if (allDocuments.length > remaining) {
         documents = allDocuments.slice(0, remaining)
+        capDroppedDocs = true
       }
     }
 
     const totalFetched = prevFetched + documents.length
     if (syncContext) syncContext.totalDocsFetched = totalFetched
     const hitLimit = maxCalls > 0 && totalFetched >= maxCalls
-    if (hitLimit && syncContext) syncContext.listingCapped = true
-
     const hasMore = !hitLimit && Boolean(nextPageCursor)
 
+    /**
+     * Only flag the listing as capped when the `maxCalls` limit actually
+     * truncated calls that still exist in the source — either by dropping calls
+     * from the current page or by stopping while another page remains. Reaching
+     * the limit exactly at source exhaustion (no dropped calls, no further
+     * cursor) yields a complete listing, so deletion reconciliation must still
+     * run for calls removed in Gong.
+     */
+    if (syncContext && (capDroppedDocs || (hitLimit && Boolean(nextPageCursor)))) {
+      syncContext.listingCapped = true
+    }
+
     return {
       documents,
       nextCursor: hasMore ? nextPageCursor : undefined,
diff --git a/apps/sim/connectors/google-docs/google-docs.ts b/apps/sim/connectors/google-docs/google-docs.ts
@@ -235,13 +235,23 @@ export const googleDocsConnector: ConnectorConfig = {
     const data = await response.json()
     const files = (data.files || []) as DriveFile[]
 
+    /**
+     * Drive sets `incompleteSearch` when it could not search every corpus (it
+     * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`).
+     * A partial listing drops still-existing docs, so reconciliation must be
+     * suppressed to avoid hard-deleting valid documents.
+     */
+    const incompleteSearch = data.incompleteSearch === true
+
     const maxDocs = sourceConfig.maxDocs ? Number(sourceConfig.maxDocs) : 0
     const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
 
     let documents = files.map(fileToStub)
+    let slicedSome = false
     if (maxDocs > 0) {
       const remaining = maxDocs - previouslyFetched
       if (documents.length > remaining) {
+        slicedSome = true
         documents = documents.slice(0, remaining)
       }
     }
@@ -252,6 +262,19 @@ export const googleDocsConnector: ConnectorConfig = {
 
     const nextPageToken = data.nextPageToken as string | undefined
 
+    /**
+     * Mark the listing as incomplete so the sync engine skips deletion
+     * reconciliation when this page does not represent the full source set:
+     * - `slicedSome`: the page held more docs than the `maxDocs` cap allowed.
+     * - `hitLimit` with a next page: the cap was reached while more pages remain.
+     * - `incompleteSearch`: Drive could not search every corpus, so the page is
+     *   partial and may omit still-existing docs.
+     * Reconciliation against any of these would hard-delete valid documents.
+     */
+    if (syncContext && (slicedSome || (hitLimit && Boolean(nextPageToken)) || incompleteSearch)) {
+      syncContext.listingCapped = true
+    }
+
     return {
       documents,
       nextCursor: hitLimit ? undefined : nextPageToken,
diff --git a/apps/sim/connectors/google-drive/google-drive.ts b/apps/sim/connectors/google-drive/google-drive.ts
@@ -268,14 +268,22 @@ export const googleDriveConnector: ConnectorConfig = {
     const data = await response.json()
     const files = (data.files || []) as DriveFile[]
 
+    /**
+     * Drive sets `incompleteSearch` when it could not search every corpus (it
+     * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`).
+     * A partial listing drops still-existing files, so reconciliation must be
+     * suppressed to avoid hard-deleting valid documents.
+     */
+    const incompleteSearch = data.incompleteSearch === true
+
     const documents = files
       .filter((f) => isGoogleWorkspaceFile(f.mimeType) || isSupportedTextFile(f.mimeType))
       .map(fileToStub)
 
     const totalFetched = previouslyFetched + documents.length
     if (syncContext) syncContext.totalDocsFetched = totalFetched
     const hitLimit = maxFiles > 0 && totalFetched >= maxFiles
-    if (hitLimit && syncContext) syncContext.listingCapped = true
+    if (syncContext && (hitLimit || incompleteSearch)) syncContext.listingCapped = true
 
     const nextPageToken = data.nextPageToken as string | undefined
 
diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts
@@ -573,6 +573,14 @@ export const googleFormsConnector: ConnectorConfig = {
     const data = await response.json()
     let files = (data.files || []) as DriveFormFile[]
 
+    /**
+     * Drive sets `incompleteSearch` when it could not search every corpus (it
+     * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`).
+     * A partial listing drops still-existing forms, so reconciliation must be
+     * suppressed to avoid hard-deleting valid documents.
+     */
+    const incompleteSearch = data.incompleteSearch === true
+
     let slicedSome = false
     if (maxForms > 0) {
       const remaining = maxForms - previouslyFetched
@@ -633,11 +641,16 @@ export const googleFormsConnector: ConnectorConfig = {
      * - `hitLimit` with a next page: the cap was reached while more pages of
      *   forms remain in the source.
      * - `skippedOnError`: a transient error dropped a still-present form.
+     * - `incompleteSearch`: Drive could not search every corpus, so the page
+     *   itself is partial and may omit still-existing forms.
      * Deleting any of those would wipe valid documents from the knowledge base.
      * When the cap merely coincides with source exhaustion (no slice, no next
      * page), reconciliation stays enabled so deleted forms are cleaned up.
      */
-    if (syncContext && (slicedSome || (hitLimit && Boolean(nextPageToken)) || skippedOnError)) {
+    if (
+      syncContext &&
+      (slicedSome || (hitLimit && Boolean(nextPageToken)) || skippedOnError || incompleteSearch)
+    ) {
       syncContext.listingCapped = true
     }
 
diff --git a/apps/sim/connectors/grain/grain.ts b/apps/sim/connectors/grain/grain.ts
@@ -471,10 +471,11 @@ export const grainConnector: ConnectorConfig = {
     try {
       if (!externalId) return null
 
-      const recording = await fetchRecording(accessToken, externalId)
+      const [recording, segments] = await Promise.all([
+        fetchRecording(accessToken, externalId),
+        fetchTranscript(accessToken, externalId),
+      ])
       if (!recording) return null
-
-      const segments = await fetchTranscript(accessToken, externalId)
       if (!segments) return null
 
       const hasTranscript = segments.some((segment) => segment.text?.trim())
diff --git a/apps/sim/connectors/jsm/jsm.ts b/apps/sim/connectors/jsm/jsm.ts
@@ -338,6 +338,13 @@ export const jsmConnector: ConnectorConfig = {
       'read:request:jira-service-management',
       'read:request.comment:jira-service-management',
       'read:request.status:jira-service-management',
+      /**
+       * Requests embed a `reporter` user object whose `displayName` is surfaced
+       * in document content and the Reporter tag. Atlassian only populates
+       * embedded user data when the user-read scope is granted, so request it
+       * here. Present in the `jira` OAuth provider config as `read:jira-user`.
+       */
+      'read:jira-user',
       'offline_access',
     ],
   },
diff --git a/apps/sim/connectors/mapTags.test.ts b/apps/sim/connectors/mapTags.test.ts
diff --git a/apps/sim/connectors/registry.ts b/apps/sim/connectors/registry.ts