From b8f0dd598d579d645f3f945bb60cf46609096bb4 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 11:33:40 -0700 Subject: [PATCH 01/16] feat(connectors): add 7 knowledge base connectors (Google Forms, Typeform, Azure DevOps, YouTube, JSM, S3, Sentry) --- .claude/commands/add-connector.md | 19 + .claude/commands/validate-connector.md | 7 + .../docs/en/knowledgebase/connectors.mdx | 29 +- .../content/docs/en/mothership/knowledge.mdx | 2 +- .../connectors/azure-devops/azure-devops.ts | 1674 +++++++++++++++++ apps/sim/connectors/azure-devops/index.ts | 1 + .../connectors/google-forms/google-forms.ts | 771 ++++++++ apps/sim/connectors/google-forms/index.ts | 1 + apps/sim/connectors/jsm/index.ts | 1 + apps/sim/connectors/jsm/jsm.ts | 674 +++++++ apps/sim/connectors/registry.ts | 14 + apps/sim/connectors/s3/index.ts | 1 + apps/sim/connectors/s3/s3.ts | 721 +++++++ apps/sim/connectors/sentry/index.ts | 1 + apps/sim/connectors/sentry/sentry.ts | 732 +++++++ apps/sim/connectors/typeform/index.ts | 1 + apps/sim/connectors/typeform/typeform.ts | 596 ++++++ apps/sim/connectors/x/index.ts | 1 + apps/sim/connectors/x/x.ts | 628 +++++++ apps/sim/connectors/youtube/index.ts | 1 + apps/sim/connectors/youtube/youtube.ts | 650 +++++++ 21 files changed, 6515 insertions(+), 10 deletions(-) create mode 100644 apps/sim/connectors/azure-devops/azure-devops.ts create mode 100644 apps/sim/connectors/azure-devops/index.ts create mode 100644 apps/sim/connectors/google-forms/google-forms.ts create mode 100644 apps/sim/connectors/google-forms/index.ts create mode 100644 apps/sim/connectors/jsm/index.ts create mode 100644 apps/sim/connectors/jsm/jsm.ts create mode 100644 apps/sim/connectors/s3/index.ts create mode 100644 apps/sim/connectors/s3/s3.ts create mode 100644 apps/sim/connectors/sentry/index.ts create mode 100644 apps/sim/connectors/sentry/sentry.ts create mode 100644 apps/sim/connectors/typeform/index.ts create mode 100644 apps/sim/connectors/typeform/typeform.ts create mode 
100644 apps/sim/connectors/x/index.ts create mode 100644 apps/sim/connectors/x/x.ts create mode 100644 apps/sim/connectors/youtube/index.ts create mode 100644 apps/sim/connectors/youtube/youtube.ts diff --git a/.claude/commands/add-connector.md b/.claude/commands/add-connector.md index 22c8c52e1c8..81823675a72 100644 --- a/.claude/commands/add-connector.md +++ b/.claude/commands/add-connector.md @@ -463,6 +463,24 @@ const response = await fetchWithRetry(url, { ... }, VALIDATE_RETRY_OPTIONS) If `ExternalDocument.sourceUrl` is set, the sync engine stores it on the document record. Always construct the full URL (not a relative path). +## Capped or Incomplete Listings — `syncContext.listingCapped` (REQUIRED) + +If `listDocuments` can ever return **less than the full source set** on a non-incremental sync — a `maxItems`/`maxDocuments`-style cap, or a transient per-item error that drops a still-existing document from the listing — it MUST set `syncContext.listingCapped = true` when that happens. + +The sync engine reconciles deletions by comparing the full listing against stored documents: anything not seen is **hard-deleted** (sync-engine.ts, gated on `!syncContext?.listingCapped`). A truncated listing without this flag deletes every real document beyond the cap. This was the single most common bug found when auditing connectors — do not omit it. + +```typescript +if (hitLimit && syncContext) { + syncContext.listingCapped = true +} +``` + +Rules: +- Set it when a user-configured cap truncates the listing while more documents exist +- Set it when a thrown error caused a still-present document to be skipped during listing +- Do NOT set it when the source is genuinely exhausted (deleted documents must still reconcile) +- Do NOT set it for intentional scope filters (e.g. a date cutoff) — out-of-scope documents should be reconciled normally + ## Sync Engine Behavior (Do Not Modify) The sync engine (`lib/knowledge/connectors/sync-engine.ts`) is connector-agnostic. 
It: @@ -515,6 +533,7 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { - `dependsOn` references selector field IDs (not `canonicalParamId`) - Dependency `canonicalParamId` values exist in `SELECTOR_CONTEXT_FIELDS` - [ ] `listDocuments` handles pagination with metadata-based content hashes +- [ ] `syncContext.listingCapped = true` set whenever the listing is truncated (max-items cap or transient per-item error) — required to prevent the engine's deletion reconciliation from removing unseen documents - [ ] `contentDeferred: true` used if content requires per-doc API calls (file download, export, blocks fetch) - [ ] `contentHash` is metadata-based (not content-based) and identical between stub and `getDocument` - [ ] `sourceUrl` set on each ExternalDocument (full URL, not relative) diff --git a/.claude/commands/validate-connector.md b/.claude/commands/validate-connector.md index adcbf61b12b..3aa5da34f93 100644 --- a/.claude/commands/validate-connector.md +++ b/.claude/commands/validate-connector.md @@ -135,6 +135,13 @@ For each API endpoint the connector calls: - [ ] No off-by-one errors in pagination tracking - [ ] The connector does NOT hit known API pagination limits silently (e.g., HubSpot search 10k cap) +### Deletion-Reconciliation Safety (`listingCapped`) — CRITICAL +The sync engine hard-deletes any stored document absent from a full listing. Audit every path where `listDocuments` can return less than the full source set: +- [ ] `syncContext.listingCapped = true` is set when a `maxItems`-style cap truncates the listing while more documents exist +- [ ] `listingCapped` is set when a transient per-item error drops a still-existing document from the listing +- [ ] `listingCapped` is NOT set when the source is genuinely exhausted (deleted documents must reconcile) or for intentional scope filters (date cutoffs) +This is the most common connector bug class — verify it explicitly against `sync-engine.ts`'s reconciliation gate. 
+ ### Pagination State Across Pages - [ ] `syncContext` is used to cache state across pages (user names, field maps, instance URLs, portal IDs, etc.) - [ ] Cached state in `syncContext` is correctly initialized on first page and reused on subsequent pages diff --git a/apps/docs/content/docs/en/knowledgebase/connectors.mdx b/apps/docs/content/docs/en/knowledgebase/connectors.mdx index 88a62383027..2f9de16cfa2 100644 --- a/apps/docs/content/docs/en/knowledgebase/connectors.mdx +++ b/apps/docs/content/docs/en/knowledgebase/connectors.mdx @@ -14,21 +14,23 @@ Connectors continuously sync documents from external services into your knowledg Connect Source picker showing a searchable list of available connectors including Airtable, Asana, Confluence, Discord, Dropbox, Evernote, Fireflies, GitHub, and Gmail -Sim ships with 30 built-in connectors: +Sim ships with 49 built-in connectors: | Category | Connectors | |----------|-----------| -| **Productivity** | Notion, Confluence, Asana, Linear, Jira, Google Calendar, Google Sheets | -| **Cloud Storage** | Google Drive, Dropbox, OneDrive, SharePoint | -| **Documents** | Google Docs, WordPress, Webflow | -| **Development** | GitHub | -| **Communication** | Slack, Discord, Microsoft Teams, Reddit | +| **Productivity** | Notion, Confluence, Asana, Linear, Jira, Jira Service Management, Monday, Google Calendar, Google Sheets, Google Forms, Typeform | +| **Cloud Storage** | Google Drive, Dropbox, OneDrive, SharePoint, Amazon S3 | +| **Documents** | Google Docs, WordPress, Webflow, DocuSign | +| **Development** | GitHub, GitLab, Azure DevOps, Sentry | +| **Communication** | Slack, Discord, Microsoft Teams, Reddit, YouTube | | **Email** | Gmail, Outlook | | **CRM** | HubSpot, Salesforce | | **Support** | Intercom, ServiceNow, Zendesk | +| **Incident Management** | incident.io, Rootly | | **Data** | Airtable | | **Note-taking** | Evernote, Obsidian | -| **Meetings** | Fireflies | +| **Meetings** | Zoom, Gong, Grain, Granola, Fathom, 
Fireflies | +| **Recruiting** | Greenhouse, Ashby | ## Adding a Connector @@ -41,13 +43,18 @@ From inside a knowledge base, click **+ New connector** in the top right to open Most connectors use **OAuth** — select an existing credential from the dropdown or click **Connect new account** to authorize through the service. Tokens are refreshed automatically. -A few connectors use **API keys** instead: +Other connectors use **API keys** or **personal access tokens** instead. The setup modal tells you which credential each connector expects — for example: | Connector | Where to get the key | |-----------|---------------------| | **Evernote** | Developer Token (starts with `S=`) from your Evernote account settings | | **Obsidian** | Install the [Local REST API](https://github.com/coddingtonbear/obsidian-local-rest-api) plugin, then copy the key from its settings | | **Fireflies** | Generate from the Integrations page in your Fireflies account | +| **Typeform** | Personal access token from your Typeform account settings | +| **Azure DevOps** | Personal access token with Wiki (Read), Work Items (Read), and Code (Read) scopes | +| **YouTube** | YouTube Data API key from the Google Cloud Console | +| **Amazon S3** | Secret Access Key (the Access Key ID, region, and bucket are entered as config fields) | +| **Sentry** | Auth token with `project:read` and `event:read` scopes | If you rotate an API key in the external service, update it in Sim as well — OAuth tokens refresh automatically, but API keys do not. @@ -63,6 +70,10 @@ Each connector has source-specific fields that control what gets synced. 
Example - **Notion** — sync an entire workspace, a specific database, or a single page tree - **GitHub** — specify a repository, branch, and optional file extension filter - **Confluence** — enter your Atlassian domain and optionally filter by space key or content type +- **Azure DevOps** — choose what to sync (wiki pages, work items, repository files, or all), with optional work item type/state filters, a custom WIQL query, and repository/branch/path filters +- **Amazon S3** — point at a bucket with an optional key prefix and a customizable file extension allowlist; S3-compatible stores (Cloudflare R2, MinIO) are supported via a custom endpoint +- **YouTube** — sync a channel (by `@handle` or ID) or playlist, with an optional published-after date filter and the option to exclude Shorts +- **Sentry** — filter issues by search query (e.g. `is:unresolved`), environment, and time window; self-hosted Sentry is supported via a custom host - **Obsidian** — provide your vault URL (`https://127.0.0.1:27124` by default) and optionally restrict to a folder path - **Fireflies** — optionally filter by host email or cap the number of transcripts synced @@ -188,5 +199,5 @@ You can add as many connectors as you need to a single knowledge base. Each mana { question: "What happens when I delete a connector?", answer: "The connector is removed and future syncs stop. You're given the option to also delete all documents that were synced by that connector. If you don't check that option, they stay in the knowledge base as-is." }, { question: "What does the Disabled status mean?", answer: "After 10 consecutive full-sync failures, the connector is automatically disabled to stop retrying. Reconnect the OAuth account or click Resume to re-enable it." }, { question: "Do metadata tags count against a limit?", answer: "Yes. Tag slots are shared across all documents in a knowledge base — 17 slots total. 
Multiple connectors draw from the same pool, so plan accordingly if several connectors each auto-populate tags." }, - { question: "Do I need to re-authenticate connectors?", answer: "OAuth connectors refresh tokens automatically. API key connectors (Evernote, Obsidian, Fireflies) need manual updates if you rotate the key in the external service." }, + { question: "Do I need to re-authenticate connectors?", answer: "OAuth connectors refresh tokens automatically. API key and personal access token connectors need manual updates if you rotate the credential in the external service." }, ]} /> diff --git a/apps/docs/content/docs/en/mothership/knowledge.mdx b/apps/docs/content/docs/en/mothership/knowledge.mdx index ab17e6e6a78..008c050b5c2 100644 --- a/apps/docs/content/docs/en/mothership/knowledge.mdx +++ b/apps/docs/content/docs/en/mothership/knowledge.mdx @@ -49,7 +49,7 @@ For knowledge bases that should stay current automatically, connectors sync cont Connectors are configured through the knowledge base settings, not through Mothership chat. Once connected, all synced content is immediately searchable by Mothership and by any Agent block with the knowledge base attached. -Sim ships with 30 built-in connectors, including Notion, Google Drive, Slack, GitHub, Confluence, HubSpot, Salesforce, Gmail, and more. +Sim ships with 49 built-in connectors, including Notion, Google Drive, Slack, GitHub, Confluence, HubSpot, Salesforce, Gmail, and more. 
Examples of what you can sync: diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts new file mode 100644 index 00000000000..2f99f9922f5 --- /dev/null +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -0,0 +1,1674 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { AzureDevOpsIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils' + +const logger = createLogger('AzureDevOpsConnector') + +const ADO_BASE_URL = 'https://dev.azure.com' +const WIKI_API_VERSION = '7.1' +const WIKIS_LIST_API_VERSION = '7.1' +const WIQL_API_VERSION = '7.1' +const WORKITEMS_API_VERSION = '7.1' +const PROJECT_API_VERSION = '7.1' +const GIT_API_VERSION = '7.1' + +/** Page size for the wiki `pagesbatch` endpoint. */ +const WIKI_PAGE_BATCH_SIZE = 100 +/** Page size for the WIQL → workitemsbatch listing pipeline. ADO caps a batch at 200 ids. */ +const WORK_ITEM_BATCH_SIZE = 200 +/** Concurrency for per-page wiki ETag lookups during listing. */ +const WIKI_ETAG_CONCURRENCY = 5 +/** Page size for paginating repository-file stubs out of the in-memory tree. */ +const FILE_BATCH_SIZE = 100 +/** + * Max repository file size to index. The Items list API does not return file + * size, so this cap is enforced at content-fetch time in getDocument via the + * decoded byte length. Larger files are skipped. + */ +const MAX_FILE_SIZE = 10 * 1024 * 1024 +/** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */ +const BINARY_SNIFF_BYTES = 8000 +/** + * WIQL returns at most 20,000 work item references. 
We cap `$top` at this bound
+ * so the connector never silently relies on truncated results; users who need
+ * more should narrow the query via the work-item filters.
+ */
+const WIQL_MAX_RESULTS = 20000
+
+/**
+ * externalId discriminators. Wiki pages are addressed by `wiki:{wikiId}:{path}`,
+ * work items by `wi:{id}`, and repository files by `file:{repoId}:{path}`.
+ */
+const FILE_PREFIX = 'file:'
+
+type ContentType = 'wiki' | 'workitems' | 'files' | 'both' | 'all'
+
+/** Listing phases, walked in order: wiki ➜ work items ➜ repository files. */
+type SyncPhase = 'wiki' | 'workitems' | 'file'
+
+/**
+ * Lists the sync phases enabled by a content-type choice: `both` enables wiki
+ * and work items, `all` enables every phase, and each single-kind value enables
+ * only its own phase. Phase order is fixed (wiki ➜ workitems ➜ file) so the
+ * phase-encoded cursor and the maxItems phase-boundary guard compose
+ * deterministically.
+ */
+function activePhases(contentType: ContentType): SyncPhase[] {
+  const everything = contentType === 'all'
+  const wikiAndWorkItems = everything || contentType === 'both'
+  // Candidates are declared in walk order; filtering preserves that order.
+  const candidates: Array<[SyncPhase, boolean]> = [
+    ['wiki', wikiAndWorkItems || contentType === 'wiki'],
+    ['workitems', wikiAndWorkItems || contentType === 'workitems'],
+    ['file', everything || contentType === 'files'],
+  ]
+  return candidates.filter(([, enabled]) => enabled).map(([phase]) => phase)
+}
+
+/**
+ * Finds the phase that runs after `current` for the given content type.
+ * Yields undefined when `current` is the last active phase or is not active
+ * at all for that content type.
+ */
+function nextPhase(current: SyncPhase, contentType: ContentType): SyncPhase | undefined {
+  const ordered = activePhases(contentType)
+  const position = ordered.indexOf(current)
+  if (position === -1) return undefined
+  // Indexing one past the end yields undefined, which is the "no next phase" answer.
+  return ordered[position + 1]
+}
+
+/**
+ * Produces the `Authorization` header value for an Azure DevOps PAT: HTTP Basic
+ * credentials with an empty username and the token as the password.
+ */
+function patAuthHeader(accessToken: string): string {
+  const credentials = Buffer.from(`:${accessToken}`).toString('base64')
+  return `Basic ${credentials}`
+}
+
+/**
+ * Normalizes the configured content type, defaulting to wiki pages.
+ */
+function parseContentType(value: unknown): ContentType {
+  if (value === 'workitems' || value === 'files' || value === 'both' || value === 'all') {
+    return value
+  }
+  return 'wiki'
+}
+
+/**
+ * Heuristic binary detection: a NUL byte within the first BINARY_SNIFF_BYTES
+ * bytes marks the file as binary, matching `git diff` / `git grep` semantics.
+ */
+function isBinaryBuffer(buf: Buffer): boolean {
+  const len = Math.min(buf.length, BINARY_SNIFF_BYTES)
+  for (let i = 0; i < len; i++) {
+    if (buf[i] === 0) return true
+  }
+  return false
+}
+
+/**
+ * Parses a comma-separated extension filter into a normalized set (lowercased,
+ * leading dot; a glob-style `*.md` entry is accepted as `.md`). Returns null
+ * when no filter is configured (accept all files).
+ */
+function parseExtensions(raw: unknown): Set<string> | null {
+  const trimmed = typeof raw === 'string' ? raw.trim() : ''
+  if (!trimmed) return null
+  const exts = trimmed
+    .split(',')
+    // Drop a leading `*` from `*.ext` so the entry does not become `.*.ext`, which can never match.
+    .map((e) => e.trim().toLowerCase().replace(/^\*(?=\.)/, ''))
+    .filter(Boolean)
+    .map((e) => (e.startsWith('.') ? e : `.${e}`))
+  return exts.length > 0 ? new Set(exts) : null
+}
+
+/**
+ * Returns true when the file path matches the extension filter (or no filter set).
+ * The extension is everything from the last `.` in the path, lowercased; a path
+ * without any `.` never matches a configured filter.
+ */
+function matchesExtension(filePath: string, extSet: Set<string> | null): boolean {
+  if (!extSet) return true
+  const lastDot = filePath.lastIndexOf('.')
+  if (lastDot === -1) return false
+  return extSet.has(filePath.slice(lastDot).toLowerCase())
+}
+
+/**
+ * Strips the `refs/heads/` prefix from a default-branch ref so it can be used as
+ * a `versionDescriptor.version` branch name.
+ */
+function stripRefsHeads(ref: string): string {
+  return ref.replace(/^refs\/heads\//, '')
+}
+
+/**
+ * Reads a trimmed string config value, returning '' when absent.
+ */
+function readString(value: unknown): string {
+  return typeof value === 'string' ? value.trim() : ''
+}
+
+/**
+ * Escapes a value for safe interpolation into a single-quoted WIQL string literal.
+ * WIQL escapes an embedded single quote by doubling it.
+ */
+function escapeWiql(value: string): string {
+  return value.split("'").join("''")
+}
+
+/**
+ * Formats the externalId of a work item as `wi:{id}`.
+ */
+function workItemExternalId(id: number): string {
+  return ['wi', id].join(':')
+}
+
+/**
+ * Formats the externalId of a wiki page as `wiki:{wikiId}:{pagePath}`.
+ */
+function wikiPageExternalId(wikiId: string, pagePath: string): string {
+  return ['wiki', wikiId, pagePath].join(':')
+}
+
+/**
+ * Splits a wiki externalId into its wiki ID and page path. The split happens at
+ * the first `:` after the `wiki:` prefix, so the page path may itself contain
+ * colons. Returns null when the prefix or the separator is missing.
+ */
+function parseWikiExternalId(externalId: string): { wikiId: string; pagePath: string } | null {
+  const prefix = 'wiki:'
+  if (!externalId.startsWith(prefix)) return null
+  const remainder = externalId.slice(prefix.length)
+  const colonAt = remainder.indexOf(':')
+  return colonAt === -1
+    ? null
+    : { wikiId: remainder.slice(0, colonAt), pagePath: remainder.slice(colonAt + 1) }
+}
+
+/**
+ * Formats the externalId of a repository file as `file:{repoId}:{path}`. The
+ * path is used exactly as given (leading slash included, as returned by the
+ * Items API).
+ */
+function fileExternalId(repoId: string, path: string): string {
+  return [FILE_PREFIX, repoId, ':', path].join('')
+}
+
+/**
+ * Splits a file externalId into its repository ID and path, cutting at the
+ * first `:` after the file prefix. Returns null when the externalId is not a
+ * file ID or has no separator.
+ */
+function parseFileExternalId(externalId: string): { repoId: string; path: string } | null {
+  if (!externalId.startsWith(FILE_PREFIX)) return null
+  const remainder = externalId.slice(FILE_PREFIX.length)
+  const colonAt = remainder.indexOf(':')
+  return colonAt === -1
+    ? null
+    : { repoId: remainder.slice(0, colonAt), path: remainder.slice(colonAt + 1) }
+}
+
+/**
+ * Builds the change-detection hash for a repository file. The git blob objectId
+ * is content-addressable, so it changes exactly when the file content changes —
+ * and it is available both on the tree listing (`objectId`) and the file fetch
+ * (`objectId`), so the stub and hydrated document hash identically without a
+ * content fetch during listing.
+ */
+function buildFileContentHash(repoId: string, objectId: string): string {
+  return `ado:file:${repoId}:${objectId}`
+}
+
+interface WikiV2 {
+  id: string
+  name: string
+  remoteUrl?: string
+  type?: string
+}
+
+interface GitRepository {
+  id: string
+  name: string
+  defaultBranch?: string
+  isDisabled?: boolean
+  remoteUrl?: string
+  webUrl?: string
+}
+
+interface GitItem {
+  objectId: string
+  gitObjectType?: string
+  path: string
+  isFolder?: boolean
+  content?: string
+  contentMetadata?: {
+    isBinary?: boolean
+    fileName?: string
+    encoding?: number
+  }
+}
+
+/**
+ * A repository file flattened across all in-scope repositories, carrying enough
+ * context to build its stub and source URL during offset-based pagination.
+ */
+interface RepoFileEntry {
+  repoId: string
+  repoName: string
+  repoWebUrl?: string
+  branch: string
+  item: GitItem
+}
+
+interface WikiPageDetail {
+  id: number
+  path: string
+}
+
+interface WorkItemRef {
+  id: number
+}
+
+interface RawWorkItem {
+  id: number
+  rev?: number
+  url?: string
+  /** Field values keyed by reference name (e.g. `System.Title`); value types vary per field. */
+  fields?: Record<string, unknown>
+}
+
+/**
+ * Resolves the change-detection revision for a work item. ADO returns the
+ * revision as the top-level `rev` property on each batch item; `System.Rev` is
+ * not guaranteed to be echoed in the requested `fields`, so `rev` is the
+ * authoritative source. Falls back to a numeric `System.Rev` field, then a
+ * non-empty `System.ChangedDate` string, and finally the constant `'0'`.
+ */
+function resolveWorkItemRev(raw: RawWorkItem, fields: Record<string, unknown>): string {
+  if (typeof raw.rev === 'number') return String(raw.rev)
+  const fieldRev = fields['System.Rev']
+  if (typeof fieldRev === 'number') return String(fieldRev)
+  const changed = fields['System.ChangedDate']
+  if (typeof changed === 'string' && changed) return changed
+  return '0'
+}
+
+/**
+ * Fetches the list of wikis in the configured project. Returns an empty list on
+ * 401/403/404 so a missing or inaccessible wiki feature degrades gracefully
+ * rather than aborting the sync.
+ */
+async function listWikis(
+  accessToken: string,
+  organization: string,
+  project: string,
+  retryOptions?: Parameters<typeof fetchWithRetry>[2],
+  syncContext?: Record<string, unknown>
+): Promise<WikiV2[]> {
+  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wiki/wikis?api-version=${WIKIS_LIST_API_VERSION}`
+  const response = await fetchWithRetry(
+    url,
+    {
+      method: 'GET',
+      headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
+    },
+    retryOptions
+  )
+  if (!response.ok) {
+    if (response.status === 401 || response.status === 403 || response.status === 404) {
+      logger.warn('Azure DevOps wikis unavailable; skipping wiki listing', {
+        organization,
+        project,
+        status: response.status,
+      })
+      // An auth failure hides wikis that still exist. Mark the listing as incomplete so
+      // deletion reconciliation does not hard-delete their previously synced pages.
+      if (syncContext && response.status !== 404) syncContext.listingCapped = true
+      return []
+    }
+    const errorText = await response.text().catch(() => '')
+    logger.error('Failed to list Azure DevOps wikis', { status: response.status, error: errorText })
+    throw new Error(`Failed to list wikis: ${response.status}`)
+  }
+  const data = await response.json()
+  return (data.value as WikiV2[] | undefined) ?? []
+}
+
+/**
+ * Resolves the wikis for the project, caching them on the sync context so a
+ * single sync (and its deferred getDocument calls) reuse one listing. The sync
+ * context is also handed to `listWikis` so an auth failure can flag the listing
+ * as incomplete (`listingCapped`).
+ */
+async function resolveWikis(
+  accessToken: string,
+  organization: string,
+  project: string,
+  syncContext?: Record<string, unknown>
+): Promise<WikiV2[]> {
+  const cached = syncContext?.wikis as WikiV2[] | undefined
+  if (cached) return cached
+  const wikis = await listWikis(accessToken, organization, project, undefined, syncContext)
+  if (syncContext) syncContext.wikis = wikis
+  return wikis
+}
+
+/**
+ * Returns true when the wiki should be included given an optional wiki filter
+ * (matched case-insensitively against the wiki id or name).
+ */
+function wikiMatchesFilter(wiki: WikiV2, filter: string): boolean {
+  if (!filter) return true
+  const needle = filter.toLowerCase()
+  return wiki.id.toLowerCase() === needle || (wiki.name ?? '').toLowerCase() === needle
+}
+
+/**
+ * Fetches the ETag for a single wiki page without downloading its content.
+ * The ETag changes whenever the page is edited, making it a reliable
+ * metadata-only change-detection hash for the deferred-content pattern.
+ * Returns null when the request fails (any non-OK status) or no ETag header is
+ * present; double quotes are stripped from the header value.
+ */
+async function fetchWikiPageETag(
+  accessToken: string,
+  organization: string,
+  project: string,
+  wikiId: string,
+  pagePath: string
+): Promise<string | null> {
+  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wiki/wikis/${encodeURIComponent(wikiId)}/pages?path=${encodeURIComponent(pagePath)}&api-version=${WIKI_API_VERSION}`
+  const response = await fetchWithRetry(url, {
+    method: 'GET',
+    headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
+  })
+  if (!response.ok) {
+    if (response.status === 404) return null
+    logger.warn('Failed to fetch wiki page ETag', { pagePath, status: response.status })
+    return null
+  }
+  const etag = response.headers.get('etag')
+  return etag ? etag.replace(/"/g, '') : null
+}
+
+/**
+ * Builds a wiki page stub. The contentHash is derived from the page ETag
+ * (falls back to the page id when no ETag is available), guaranteeing the
+ * hash is identical between listing and content fetch.
+ */
+function wikiPageToStub(
+  organization: string,
+  project: string,
+  wiki: WikiV2,
+  page: WikiPageDetail,
+  etag: string | null
+): ExternalDocument {
+  // Title is the last non-empty path segment, then the raw path, then a fixed placeholder.
+  const title = page.path.split('/').filter(Boolean).pop() || page.path || 'Untitled'
+  const sourceUrl = wiki.remoteUrl
+    ? `${wiki.remoteUrl}?pagePath=${encodeURIComponent(page.path)}`
+    : undefined
+  return {
+    externalId: wikiPageExternalId(wiki.id, page.path),
+    title,
+    content: '',
+    contentDeferred: true,
+    mimeType: 'text/plain',
+    sourceUrl,
+    contentHash: `ado:wiki:${wiki.id}:${page.path}:${etag ?? page.id}`,
+    metadata: {
+      kind: 'wiki',
+      organization,
+      project,
+      wikiId: wiki.id,
+      wikiName: wiki.name,
+      pageId: page.id,
+      pagePath: page.path,
+    },
+  }
+}
+
+/**
+ * Builds a work item document. Work items are returned inline (not deferred)
+ * because the batch fetch already includes all field content. The contentHash
+ * uses the work item revision, which increments on every change. HTML-bearing
+ * fields (description, repro steps, acceptance criteria) are stripped to text.
+ */
+function workItemToDocument(
+  organization: string,
+  project: string,
+  raw: RawWorkItem
+): ExternalDocument {
+  const fields = raw.fields ?? {}
+  const title = (fields['System.Title'] as string | undefined) ?? `Work Item ${raw.id}`
+  const workItemType = (fields['System.WorkItemType'] as string | undefined) ?? ''
+  const state = (fields['System.State'] as string | undefined) ?? ''
+  const rev = resolveWorkItemRev(raw, fields)
+  const changedDate = (fields['System.ChangedDate'] as string | undefined) ?? ''
+  const areaPath = (fields['System.AreaPath'] as string | undefined) ?? ''
+  const iterationPath = (fields['System.IterationPath'] as string | undefined) ?? ''
+  // `System.Tags` is split on `;` into individual, trimmed tag names.
+  const rawTags = (fields['System.Tags'] as string | undefined) ?? ''
+  const tags = rawTags
+    .split(';')
+    .map((t) => t.trim())
+    .filter(Boolean)
+  const description = htmlToPlainText((fields['System.Description'] as string | undefined) ?? '')
+  const reproSteps = htmlToPlainText(
+    (fields['Microsoft.VSTS.TCM.ReproSteps'] as string | undefined) ?? ''
+  )
+  const acceptanceCriteria = htmlToPlainText(
+    (fields['Microsoft.VSTS.Common.AcceptanceCriteria'] as string | undefined) ?? ''
+  )
+
+  const contentParts: string[] = [`Title: ${title}`, `Type: ${workItemType}`, `State: ${state}`]
+  if (tags.length > 0) contentParts.push(`Tags: ${tags.join(', ')}`)
+  if (description) contentParts.push('', 'Description:', description)
+  if (reproSteps) contentParts.push('', 'Repro Steps:', reproSteps)
+  if (acceptanceCriteria) contentParts.push('', 'Acceptance Criteria:', acceptanceCriteria)
+
+  return {
+    externalId: workItemExternalId(raw.id),
+    title: `#${raw.id}: ${title}`,
+    content: contentParts.join('\n'),
+    contentDeferred: false,
+    mimeType: 'text/plain',
+    sourceUrl: `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_workitems/edit/${raw.id}`,
+    contentHash: `ado:wi:${raw.id}:${rev}`,
+    metadata: {
+      kind: 'workitem',
+      organization,
+      project,
+      workItemId: raw.id,
+      workItemType,
+      state,
+      areaPath,
+      iterationPath,
+      tags,
+      changedDate,
+      rev,
+    },
+  }
+}
+
+/**
+ * Work-item filter values read from sourceConfig. Empty strings and an empty
+ * `tags` array mean "no filter" for that dimension.
+ */
+interface WorkItemFilters {
+  workItemType: string
+  state: string
+  areaPath: string
+  tags: string[]
+  customWiql: string
+}
+
+/**
+ * Reads the work-item filter configuration from sourceConfig. `workItemTags` is
+ * a comma-separated list; every other value is a trimmed string ('' when absent).
+ */
+function readWorkItemFilters(sourceConfig: Record<string, unknown>): WorkItemFilters {
+  const tagsRaw = readString(sourceConfig.workItemTags)
+  const tags = tagsRaw
+    ? tagsRaw
+        .split(',')
+        .map((t) => t.trim())
+        .filter(Boolean)
+    : []
+  return {
+    workItemType: readString(sourceConfig.workItemType),
+    state: readString(sourceConfig.state),
+    areaPath: readString(sourceConfig.areaPath),
+    tags,
+    customWiql: readString(sourceConfig.customWiql),
+  }
+}
+
+/**
+ * Builds the WIQL query for the configured work-item filters. User-supplied
+ * values are escaped against WIQL string-literal injection. When a custom WIQL
+ * query is provided it is used verbatim and the structured filters are ignored.
+ * `lastSyncAt` narrows results to items changed since the previous sync.
+ */
+function buildWiql(filters: WorkItemFilters, lastSyncAt?: Date): string {
+  if (filters.customWiql) return filters.customWiql
+
+  const clauses: string[] = ['[System.TeamProject] = @project']
+  if (filters.workItemType) {
+    clauses.push(`[System.WorkItemType] = '${escapeWiql(filters.workItemType)}'`)
+  }
+  if (filters.state) {
+    clauses.push(`[System.State] = '${escapeWiql(filters.state)}'`)
+  }
+  if (filters.areaPath) {
+    clauses.push(`[System.AreaPath] UNDER '${escapeWiql(filters.areaPath)}'`)
+  }
+  for (const tag of filters.tags) {
+    clauses.push(`[System.Tags] CONTAINS '${escapeWiql(tag)}'`)
+  }
+  if (lastSyncAt) {
+    // The literal carries a time component, so the query must run with time precision
+    // (queryWorkItemIds sends `timePrecision=true`).
+    clauses.push(`[System.ChangedDate] >= '${lastSyncAt.toISOString()}'`)
+  }
+
+  return `SELECT [System.Id] FROM workitems WHERE ${clauses.join(' AND ')} ORDER BY [System.ChangedDate] DESC`
+}
+
+/**
+ * Runs a WIQL query for work items in the project and returns their IDs.
+ * WIQL itself is not paginated and returns at most 20,000 ids; pagination
+ * happens over the resulting ID list via the workitemsbatch endpoint.
+ *
+ * The request is sent with `timePrecision=true` because the incremental-sync
+ * clause compares `System.ChangedDate` against a full ISO timestamp, which the
+ * service rejects under the default date-only precision.
+ *
+ * Throws when the service responds with a non-OK status. Hitting the result cap
+ * is only logged; the truncated ID list is still returned.
+ */
+async function queryWorkItemIds(
+  accessToken: string,
+  organization: string,
+  project: string,
+  wiql: string
+): Promise<number[]> {
+  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wit/wiql?$top=${WIQL_MAX_RESULTS}&timePrecision=true&api-version=${WIQL_API_VERSION}`
+  const response = await fetchWithRetry(url, {
+    method: 'POST',
+    headers: {
+      Accept: 'application/json',
+      'Content-Type': 'application/json',
+      Authorization: patAuthHeader(accessToken),
+    },
+    body: JSON.stringify({ query: wiql }),
+  })
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => '')
+    logger.error('Failed to query Azure DevOps work items', {
+      status: response.status,
+      error: errorText,
+    })
+    throw new Error(`Failed to query work items: ${response.status}`)
+  }
+  const data = await response.json()
+  const refs = (data.workItems as WorkItemRef[] | undefined) ?? []
+  if (refs.length >= WIQL_MAX_RESULTS) {
+    logger.warn('WIQL result hit the 20,000-item cap; narrow work-item filters to sync all items', {
+      organization,
+      project,
+    })
+  }
+  return refs.map((ref) => ref.id)
+}
+
+/**
+ * Fetches full field details for a batch of work item IDs (max 200 per call).
+ * `errorPolicy: 'Omit'` keeps the batch resilient: a single inaccessible or
+ * deleted id does not fail the whole call. Such ids can come back as `null`
+ * placeholders in the response array, so those entries are filtered out and the
+ * result contains only real work items. Returns [] for an empty id list without
+ * calling the API; throws on a non-OK response.
+ */
+async function fetchWorkItemsBatch(
+  accessToken: string,
+  organization: string,
+  project: string,
+  ids: number[]
+): Promise<RawWorkItem[]> {
+  if (ids.length === 0) return []
+  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wit/workitemsbatch?api-version=${WORKITEMS_API_VERSION}`
+  const response = await fetchWithRetry(url, {
+    method: 'POST',
+    headers: {
+      Accept: 'application/json',
+      'Content-Type': 'application/json',
+      Authorization: patAuthHeader(accessToken),
+    },
+    body: JSON.stringify({
+      ids,
+      errorPolicy: 'Omit',
+      fields: [
+        'System.Id',
+        'System.Title',
+        'System.WorkItemType',
+        'System.State',
+        'System.AreaPath',
+        'System.IterationPath',
+        'System.ChangedDate',
+        'System.Tags',
+        'System.Description',
+        'Microsoft.VSTS.TCM.ReproSteps',
+        'Microsoft.VSTS.Common.AcceptanceCriteria',
+      ],
+    }),
+  })
+  if (!response.ok) {
+    const errorText = await response.text().catch(() => '')
+    logger.error('Failed to fetch Azure DevOps work items batch', {
+      status: response.status,
+      error: errorText,
+    })
+    throw new Error(`Failed to fetch work items batch: ${response.status}`)
+  }
+  const data = await response.json()
+  const items = (data.value as (RawWorkItem | null)[] | undefined) ?? []
+  // Omitted ids may be represented as null entries; never hand those to callers.
+  return items.filter((item): item is RawWorkItem => item != null)
+}
+
+/**
+ * Reads the repository-file filter configuration from sourceConfig.
/**
 * Repository-file filter configuration, as read from sourceConfig.
 */
interface FileFilters {
  /** Repository name or id to restrict the sync to; empty means every repository. */
  repositoryName: string
  /** Branch override; empty means each repository's default branch is used. */
  branch: string
  /** Path prefix a file must start with; empty disables the prefix filter. */
  pathPrefix: string
  /** Parsed `fileExtensions` setting as produced by `parseExtensions`. */
  extensions: Set<string> | null
}

/**
 * Reads the repository-file filter configuration from sourceConfig.
 *
 * @param sourceConfig - Raw connector configuration values.
 * @returns The normalized file filters.
 */
function readFileFilters(sourceConfig: Record<string, unknown>): FileFilters {
  const rawPrefix = readString(sourceConfig.pathPrefix)
  return {
    repositoryName: readString(sourceConfig.repositoryName),
    branch: readString(sourceConfig.branch),
    pathPrefix: rawPrefix,
    extensions: parseExtensions(sourceConfig.fileExtensions),
  }
}

/**
 * Lists the project's git repositories. Returns an empty list on 401/403/404 so
 * a project without Git or without repo access degrades gracefully instead of
 * aborting the sync.
 *
 * @param accessToken - Personal Access Token used to build the auth header.
 * @param organization - Azure DevOps organization name.
 * @param project - Project name or id.
 * @param retryOptions - Optional retry settings forwarded to `fetchWithRetry`.
 * @returns The repositories reported by the API, or `[]` on 401/403/404.
 * @throws Error for any other non-2xx status.
 */
async function listRepositories(
  accessToken: string,
  organization: string,
  project: string,
  retryOptions?: Parameters<typeof fetchWithRetry>[2]
): Promise<GitRepository[]> {
  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories?api-version=${GIT_API_VERSION}`
  const response = await fetchWithRetry(
    url,
    {
      method: 'GET',
      headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
    },
    retryOptions
  )
  if (!response.ok) {
    if (response.status === 401 || response.status === 403 || response.status === 404) {
      logger.warn('Azure DevOps repositories unavailable; skipping file listing', {
        organization,
        project,
        status: response.status,
      })
      return []
    }
    const errorText = await response.text().catch(() => '')
    logger.error('Failed to list Azure DevOps repositories', {
      status: response.status,
      error: errorText,
    })
    throw new Error(`Failed to list repositories: ${response.status}`)
  }
  const data = await response.json()
  return (data.value as GitRepository[] | undefined) ?? []
}
/**
 * Resolves the in-scope repositories for the project, caching the unfiltered
 * listing on the sync context (`syncContext.repositories`) so a single sync
 * reuses one listing. Disabled repositories and, when a filter is set,
 * non-matching repositories are excluded from the returned list.
 *
 * @param repositoryFilter - Repository name or id, matched case-insensitively;
 *   an empty string keeps every enabled repository.
 * @param syncContext - Optional per-sync scratch object used as the cache.
 * @returns The enabled repositories that match the filter.
 */
async function resolveRepositories(
  accessToken: string,
  organization: string,
  project: string,
  repositoryFilter: string,
  syncContext?: Record<string, unknown>
): Promise<GitRepository[]> {
  const cached = syncContext?.repositories as GitRepository[] | undefined
  const all = cached ?? (await listRepositories(accessToken, organization, project))
  if (syncContext && !cached) syncContext.repositories = all

  const needle = repositoryFilter.toLowerCase()
  return all.filter((repo) => {
    if (repo.isDisabled) return false
    if (!needle) return true
    return repo.id.toLowerCase() === needle || (repo.name ?? '').toLowerCase() === needle
  })
}

/**
 * Lists every blob in a repository at the given branch via the non-paginated
 * Items list API (recursionLevel=Full). Returns an empty list on 401/403/404 so
 * a single inaccessible or empty repo does not abort the sync.
 *
 * @param repoId - Repository id.
 * @param branch - Branch name to list.
 * @returns Items that are blobs, are not folders, and carry a path.
 * @throws Error for any non-2xx status other than 401/403/404.
 */
async function listRepositoryBlobs(
  accessToken: string,
  organization: string,
  project: string,
  repoId: string,
  branch: string
): Promise<GitItem[]> {
  const params = new URLSearchParams({
    recursionLevel: 'Full',
    'versionDescriptor.version': branch,
    'versionDescriptor.versionType': 'Branch',
    'api-version': GIT_API_VERSION,
  })
  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories/${encodeURIComponent(repoId)}/items?${params.toString()}`
  const response = await fetchWithRetry(url, {
    method: 'GET',
    headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
  })
  if (!response.ok) {
    if (response.status === 401 || response.status === 403 || response.status === 404) {
      logger.warn('Azure DevOps repository items unavailable; skipping repository', {
        repoId,
        branch,
        status: response.status,
      })
      return []
    }
    const errorText = await response.text().catch(() => '')
    logger.error('Failed to list Azure DevOps repository items', {
      repoId,
      branch,
      status: response.status,
      error: errorText,
    })
    throw new Error(`Failed to list repository items: ${response.status}`)
  }
  const data = await response.json()
  const items = (data.value as GitItem[] | undefined) ?? []
  return items.filter((item) => item.gitObjectType === 'blob' && !item.isFolder && item.path)
}

/**
 * Builds the web UI URL for a repository file at a given branch. Azure DevOps
 * file links use `{repoWebUrl}?path={path}&version=GB{branch}` (GB = Git Branch).
 *
 * @param repoWebUrl - The repository's web URL; when missing no link can be built.
 * @param branch - Branch name (URL-encoded into the `version` parameter).
 * @param path - File path inside the repository (URL-encoded).
 * @returns The file URL, or `undefined` when `repoWebUrl` is empty or undefined.
 */
function buildFileSourceUrl(
  repoWebUrl: string | undefined,
  branch: string,
  path: string
): string | undefined {
  if (!repoWebUrl) return undefined
  return `${repoWebUrl}?path=${encodeURIComponent(path)}&version=GB${encodeURIComponent(branch)}`
}

/**
 * Builds a deferred stub for a repository file. Content is empty and fetched
 * lazily via getDocument for new/changed files only. The contentHash is derived
 * from the repository id and the git blob objectId, so the stub and the hydrated
 * document produce the same hash.
 *
 * @param entry - The repository file (repository info, branch, and git item).
 * @returns A content-deferred document whose title is the last path segment.
 */
function fileToStub(organization: string, project: string, entry: RepoFileEntry): ExternalDocument {
  const path = entry.item.path
  // Fall back to the full path when it has no non-empty segment.
  const title = path.split('/').filter(Boolean).pop() || path
  return {
    externalId: fileExternalId(entry.repoId, path),
    title,
    content: '',
    contentDeferred: true,
    mimeType: 'text/plain',
    sourceUrl: buildFileSourceUrl(entry.repoWebUrl, entry.branch, path),
    contentHash: buildFileContentHash(entry.repoId, entry.item.objectId),
    metadata: {
      kind: 'file',
      organization,
      project,
      repository: entry.repoName,
      repositoryId: entry.repoId,
      branch: entry.branch,
      path,
    },
  }
}
/**
 * Resolves the flattened, filtered list of repository files for the configured
 * scope. Repositories are listed once, each is walked via the recursive Items
 * API, and blobs are filtered by path prefix and extension. The result is cached
 * on syncContext (`syncContext.repoFiles`) so offset-based pagination and the
 * maxItems cap apply over a stable list across pages.
 *
 * @param filters - Repository, branch, path-prefix, and extension filters.
 * @param syncContext - Optional per-sync scratch object used as the cache.
 * @returns One entry per in-scope file, in repository then listing order.
 */
async function resolveRepoFiles(
  accessToken: string,
  organization: string,
  project: string,
  filters: FileFilters,
  syncContext?: Record<string, unknown>
): Promise<RepoFileEntry[]> {
  const cached = syncContext?.repoFiles as RepoFileEntry[] | undefined
  if (cached) return cached

  const repositories = await resolveRepositories(
    accessToken,
    organization,
    project,
    filters.repositoryName,
    syncContext
  )

  // Item paths are compared with a leading slash, so add one to the configured prefix.
  const normalizedPrefix =
    filters.pathPrefix && !filters.pathPrefix.startsWith('/')
      ? `/${filters.pathPrefix}`
      : filters.pathPrefix

  const entries: RepoFileEntry[] = []
  for (const repo of repositories) {
    // The configured branch wins; otherwise use the repository's default branch.
    const branch = filters.branch || stripRefsHeads(repo.defaultBranch ?? '')
    if (!branch) {
      logger.warn('Skipping Azure DevOps repository with no default branch', {
        repoId: repo.id,
        repoName: repo.name,
      })
      continue
    }
    const blobs = await listRepositoryBlobs(accessToken, organization, project, repo.id, branch)
    for (const item of blobs) {
      if (normalizedPrefix && !item.path.startsWith(normalizedPrefix)) continue
      if (!matchesExtension(item.path, filters.extensions)) continue
      entries.push({
        repoId: repo.id,
        repoName: repo.name,
        repoWebUrl: repo.webUrl,
        branch,
        item,
      })
    }
  }

  if (syncContext) syncContext.repoFiles = entries
  return entries
}
/**
 * Lists a single batch of repository-file stubs. The full filtered file list is
 * resolved once and cached on syncContext; the cursor is an offset into that
 * list, of the form `file|{offset}`.
 *
 * @param maxItems - Optional cap on documents per sync (0 disables the cap).
 * @param cursor - Cursor from the previous page; a missing or unparsable offset
 *   starts at 0.
 * @returns The batch of stubs plus the cursor for the next batch, if any.
 */
async function listRepoFiles(
  accessToken: string,
  organization: string,
  project: string,
  filters: FileFilters,
  maxItems: number,
  cursor: string | undefined,
  syncContext: Record<string, unknown> | undefined
): Promise<ExternalDocumentList> {
  const entries = await resolveRepoFiles(accessToken, organization, project, filters, syncContext)

  if (entries.length === 0) {
    return { documents: [], hasMore: false }
  }

  let offset = 0
  if (cursor) {
    const parts = cursor.split('|')
    offset = Number(parts[1]) || 0
  }

  const chunk = entries.slice(offset, offset + FILE_BATCH_SIZE)
  const documents = chunk.map((entry) => fileToStub(organization, project, entry))

  const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext)

  const nextOffset = offset + FILE_BATCH_SIZE
  const hasMore = !hitLimit && nextOffset < entries.length

  return {
    documents: capped,
    nextCursor: hasMore ? `file|${nextOffset}` : undefined,
    hasMore,
  }
}

/**
 * Resolves the branch to fetch a single repository file from in getDocument. Uses
 * the configured branch override when set, otherwise the repository's default
 * branch (resolved from the cached or freshly-listed repository record).
 *
 * With an override, the repository record is looked up in the cache only, so
 * `repo` is undefined when `syncContext.repositories` has not been populated.
 *
 * @returns The branch name (empty when it cannot be resolved) and the matching
 *   repository record when one is known.
 */
async function resolveFileBranch(
  accessToken: string,
  organization: string,
  project: string,
  repoId: string,
  branchOverride: string,
  syncContext?: Record<string, unknown>
): Promise<{ branch: string; repo?: GitRepository }> {
  if (branchOverride) {
    const repos = (syncContext?.repositories as GitRepository[] | undefined) ?? []
    return { branch: branchOverride, repo: repos.find((r) => r.id === repoId) }
  }
  const repos =
    (syncContext?.repositories as GitRepository[] | undefined) ??
    (await listRepositories(accessToken, organization, project))
  if (syncContext && !syncContext.repositories) syncContext.repositories = repos
  const repo = repos.find((r) => r.id === repoId)
  return { branch: stripRefsHeads(repo?.defaultBranch ?? ''), repo }
}

/**
 * Fetches and hydrates a single repository file by its externalId. Re-fetches the
 * item with content, rebuilds the objectId-based hash identically to the stub,
 * and skips binary, oversized, or empty files.
 *
 * @param externalId - File external id as produced during listing.
 * @param branchOverride - Configured branch, or an empty string to use the
 *   repository's default branch.
 * @returns The hydrated document, or `null` when the id cannot be parsed, the
 *   branch cannot be resolved, the API returns 404, or the file is skipped.
 * @throws Error for any non-2xx status other than 404.
 */
async function getFileDocument(
  accessToken: string,
  organization: string,
  project: string,
  externalId: string,
  branchOverride: string,
  syncContext?: Record<string, unknown>
): Promise<ExternalDocument | null> {
  const parsed = parseFileExternalId(externalId)
  if (!parsed) return null
  const { repoId, path } = parsed

  const { branch, repo } = await resolveFileBranch(
    accessToken,
    organization,
    project,
    repoId,
    branchOverride,
    syncContext
  )
  if (!branch) {
    logger.warn('Cannot resolve branch for Azure DevOps file', { externalId })
    return null
  }

  const params = new URLSearchParams({
    path,
    'versionDescriptor.version': branch,
    'versionDescriptor.versionType': 'Branch',
    includeContent: 'true',
    includeContentMetadata: 'true',
    $format: 'json',
    'api-version': GIT_API_VERSION,
  })
  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories/${encodeURIComponent(repoId)}/items?${params.toString()}`
  const response = await fetchWithRetry(url, {
    method: 'GET',
    headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
  })

  if (!response.ok) {
    if (response.status === 404) return null
    throw new Error(`Failed to fetch repository file: ${response.status}`)
  }

  const item = (await response.json()) as GitItem
  if (!item.objectId) return null
  if (item.contentMetadata?.isBinary) {
    logger.info('Skipping binary Azure DevOps file', { path })
    return null
  }

  const raw = typeof item.content === 'string' ? item.content : ''
  const buffer = Buffer.from(raw, 'utf8')
  // Second binary check on the actual bytes, in case the metadata flag is absent.
  if (isBinaryBuffer(buffer)) {
    logger.info('Skipping binary Azure DevOps file', { path })
    return null
  }
  if (buffer.byteLength > MAX_FILE_SIZE) {
    logger.info('Skipping oversized Azure DevOps file', { path, size: buffer.byteLength })
    return null
  }

  const content = buffer.toString('utf8')
  if (!content.trim()) return null

  const title = path.split('/').filter(Boolean).pop() || path
  return {
    externalId,
    title,
    content,
    contentDeferred: false,
    mimeType: 'text/plain',
    sourceUrl: buildFileSourceUrl(repo?.webUrl, branch, path),
    contentHash: buildFileContentHash(repoId, item.objectId),
    metadata: {
      kind: 'file',
      organization,
      project,
      repository: repo?.name ?? '',
      repositoryId: repoId,
      branch,
      path,
      size: buffer.byteLength,
    },
  }
}

/**
 * Applies the optional maxItems cap to a batch, tracking the running total in
 * syncContext and flagging `listingCapped` when the cap is hit. The sync engine
 * reads `listingCapped` to suppress deletion reconciliation on a truncated
 * listing — without it, a capped full sync would wrongly delete every source
 * document beyond the cap.
 *
 * The cap is floored to a whole number first. A fractional cap such as 2.5 could
 * otherwise never be reached exactly, so the listing would keep paging with
 * empty batches and `listingCapped` would never be set. Values that floor to
 * zero or below, and NaN, disable the cap.
 *
 * @param documents - The batch that was just listed.
 * @param maxItems - Maximum number of documents per sync; <= 0 or NaN means unlimited.
 * @param syncContext - Per-sync scratch object holding `totalDocsFetched`; without
 *   it the running total restarts at 0 on every call.
 * @returns The (possibly truncated) batch and whether the cap has been reached.
 */
function applyMaxItemsCap(
  documents: ExternalDocument[],
  maxItems: number,
  syncContext: Record<string, unknown> | undefined
): { documents: ExternalDocument[]; capped: boolean } {
  const limit = Math.floor(maxItems)
  // `!(limit > 0)` also catches NaN, which compares false against everything.
  if (!(limit > 0)) return { documents, capped: false }
  const prevTotal = (syncContext?.totalDocsFetched as number | undefined) ?? 0
  const remaining = Math.max(0, limit - prevTotal)
  const sliced = documents.length > remaining ? documents.slice(0, remaining) : documents
  const newTotal = prevTotal + sliced.length
  if (syncContext) syncContext.totalDocsFetched = newTotal
  const capped = newTotal >= limit
  if (capped && syncContext) syncContext.listingCapped = true
  return { documents: sliced, capped }
}

/**
 * Lists a single batch of wiki pages across the project's wikis (optionally
 * filtered to one wiki). Uses a compound cursor of the form
 * `wiki|{wikiIndex}|{continuationToken}` so each wiki's `pagesbatch` pagination
 * is tracked independently.
 *
 * @param wikiFilter - Wiki name or id to restrict to, as understood by
 *   `wikiMatchesFilter`.
 * @param maxItems - Optional cap on documents per sync (0 disables the cap).
 * @param cursor - Cursor from the previous page, if any.
 * @returns The batch of wiki page stubs plus the cursor for the next batch.
 * @throws Error when the pagesbatch request returns a non-2xx status.
 */
async function listWikiPages(
  accessToken: string,
  organization: string,
  project: string,
  wikiFilter: string,
  maxItems: number,
  cursor: string | undefined,
  syncContext?: Record<string, unknown>
): Promise<ExternalDocumentList> {
  const allWikis = await resolveWikis(accessToken, organization, project, syncContext)
  const wikis = allWikis.filter((w) => wikiMatchesFilter(w, wikiFilter))

  if (wikis.length === 0) {
    return { documents: [], hasMore: false }
  }

  let wikiIndex = 0
  let continuationToken: string | undefined
  if (cursor) {
    // The continuation token is opaque and may contain `|`; keep everything after
    // the second separator intact instead of truncating it with a naive split.
    const firstSep = cursor.indexOf('|')
    const secondSep = firstSep === -1 ? -1 : cursor.indexOf('|', firstSep + 1)
    if (secondSep !== -1) {
      wikiIndex = Number(cursor.slice(firstSep + 1, secondSep)) || 0
      const token = cursor.slice(secondSep + 1)
      continuationToken = token || undefined
    }
  }

  if (wikiIndex >= wikis.length) {
    return { documents: [], hasMore: false }
  }

  const wiki = wikis[wikiIndex]
  const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wiki/wikis/${encodeURIComponent(wiki.id)}/pagesbatch?api-version=${WIKI_API_VERSION}`
  const body: Record<string, unknown> = { top: WIKI_PAGE_BATCH_SIZE }
  if (continuationToken) body.continuationToken = continuationToken

  const response = await fetchWithRetry(url, {
    method: 'POST',
    headers: {
      Accept: 'application/json',
      'Content-Type': 'application/json',
      Authorization: patAuthHeader(accessToken),
    },
    body: JSON.stringify(body),
  })
  if (!response.ok) {
    const errorText = await response.text().catch(() => '')
    logger.error('Failed to list Azure DevOps wiki pages', {
      wikiId: wiki.id,
      status: response.status,
      error: errorText,
    })
    throw new Error(`Failed to list wiki pages: ${response.status}`)
  }

  const data = await response.json()
  const pages = (data.value as WikiPageDetail[] | undefined) ?? []
  const nextContinuation = response.headers.get('x-ms-continuationtoken') || undefined

  // Fetch ETags in bounded-size groups so at most WIKI_ETAG_CONCURRENCY requests
  // are in flight at once.
  const documents: ExternalDocument[] = []
  for (let i = 0; i < pages.length; i += WIKI_ETAG_CONCURRENCY) {
    const batch = pages.slice(i, i + WIKI_ETAG_CONCURRENCY)
    const stubs = await Promise.all(
      batch.map(async (page) => {
        const etag = await fetchWikiPageETag(accessToken, organization, project, wiki.id, page.path)
        return wikiPageToStub(organization, project, wiki, page, etag)
      })
    )
    documents.push(...stubs)
  }

  const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext)
  if (hitLimit) {
    return { documents: capped, hasMore: false }
  }

  let nextCursor: string | undefined
  let hasMore: boolean
  if (nextContinuation) {
    // More pages remain in the current wiki.
    nextCursor = `wiki|${wikiIndex}|${nextContinuation}`
    hasMore = true
  } else if (wikiIndex + 1 < wikis.length) {
    // Current wiki exhausted; continue with the next one from its first page.
    nextCursor = `wiki|${wikiIndex + 1}|`
    hasMore = true
  } else {
    hasMore = false
  }

  return { documents: capped, nextCursor, hasMore }
}
/**
 * Lists a single batch of work items. The full ID list is resolved once via WIQL
 * and cached on the sync context; the cursor is an offset into that list, of the
 * form `wi|{offset}`.
 *
 * When the WIQL result reaches `WIQL_MAX_RESULTS` the listing is treated as
 * truncated and `syncContext.listingCapped` is set, so the sync engine does not
 * hard-delete work items that merely fell outside the returned window.
 *
 * @param maxItems - Optional cap on documents per sync (0 disables the cap).
 * @param cursor - Cursor from the previous page; a missing or unparsable offset
 *   starts at 0.
 * @param lastSyncAt - Forwarded to `buildWiql` for incremental syncs.
 * @returns The batch of work item documents plus the cursor for the next batch.
 */
async function listWorkItems(
  accessToken: string,
  organization: string,
  project: string,
  filters: WorkItemFilters,
  maxItems: number,
  cursor: string | undefined,
  syncContext: Record<string, unknown> | undefined,
  lastSyncAt: Date | undefined
): Promise<ExternalDocumentList> {
  let ids = syncContext?.workItemIds as number[] | undefined
  if (!ids) {
    const wiql = buildWiql(filters, lastSyncAt)
    ids = await queryWorkItemIds(accessToken, organization, project, wiql)
    if (syncContext) {
      syncContext.workItemIds = ids
      // A result at the WIQL maximum is an incomplete listing: suppress deletions.
      if (ids.length >= WIQL_MAX_RESULTS) syncContext.listingCapped = true
    }
  }

  if (ids.length === 0) {
    return { documents: [], hasMore: false }
  }

  let offset = 0
  if (cursor) {
    const parts = cursor.split('|')
    offset = Number(parts[1]) || 0
  }

  const chunk = ids.slice(offset, offset + WORK_ITEM_BATCH_SIZE)
  const raw = await fetchWorkItemsBatch(accessToken, organization, project, chunk)
  const documents = raw.map((item) => workItemToDocument(organization, project, item))

  const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext)

  const nextOffset = offset + WORK_ITEM_BATCH_SIZE
  const hasMore = !hitLimit && nextOffset < ids.length

  return {
    documents: capped,
    nextCursor: hasMore ? `wi|${nextOffset}` : undefined,
    hasMore,
  }
}

/**
 * Knowledge-base connector for Azure DevOps: syncs wiki pages, work items, and
 * repository files from a single project, authenticated with a Personal Access
 * Token.
 */
export const azureDevopsConnector: ConnectorConfig = {
  id: 'azure_devops',
  name: 'Azure DevOps',
  description:
    'Sync wiki pages, work items, and repository files from an Azure DevOps project into your knowledge base',
  version: '1.1.0',
  icon: AzureDevOpsIcon,

  auth: {
    mode: 'apiKey',
    label: 'Personal Access Token',
    placeholder: 'Enter your Azure DevOps PAT (scopes: Wiki Read, Work Items Read, Code Read)',
  },

  /**
   * Incremental sync applies to work items only, via a `System.ChangedDate`
   * WIQL filter derived from lastSyncAt. Wiki pages have no change timestamp on
   * listing, so they are always re-listed and reconciled by ETag content hash.
   * Repository files are likewise always re-listed in full and reconciled by the
   * git blob objectId hash — a commit-diff incremental path is intentionally
   * avoided to match the github/gitlab full-listing approach, keeping change
   * detection correct without tracking per-branch commit state. Unchanged
   * documents are skipped without a content fetch in every case.
   */
  supportsIncrementalSync: true,

  configFields: [
    {
      id: 'organization',
      title: 'Organization',
      type: 'short-input',
      placeholder: 'e.g. my-org',
      required: true,
    },
    {
      id: 'project',
      title: 'Project',
      type: 'short-input',
      placeholder: 'e.g. my-project',
      required: true,
    },
    {
      id: 'contentType',
      title: 'Content',
      type: 'dropdown',
      required: false,
      options: [
        { label: 'Wiki pages only', id: 'wiki' },
        { label: 'Work items only', id: 'workitems' },
        { label: 'Repository files only', id: 'files' },
        { label: 'Wiki pages and work items', id: 'both' },
        { label: 'Wiki, work items, and files', id: 'all' },
      ],
      description: 'Which content to index from the project.',
    },
    {
      id: 'wikiName',
      title: 'Wiki',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'Wiki name or ID (all wikis if blank)',
      description:
        'Restrict syncing to a single wiki by name or ID. Applies only when syncing wiki pages.',
    },
    {
      id: 'workItemType',
      title: 'Work Item Type',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. Bug, Task, User Story',
      description: 'Only sync work items of this type. Applies only when syncing work items.',
    },
    {
      id: 'state',
      title: 'State',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. Active, Closed',
      description: 'Only sync work items in this state. Applies only when syncing work items.',
    },
    {
      id: 'areaPath',
      title: 'Area Path',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. MyProject\\Team A',
      description:
        'Only sync work items under this area path (and its children). Applies only when syncing work items.',
    },
    {
      id: 'workItemTags',
      title: 'Tags',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. customer, urgent (comma-separated)',
      description:
        'Only sync work items containing all of these tags (comma-separated). Applies only when syncing work items.',
    },
    {
      id: 'customWiql',
      title: 'Custom WIQL Query',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'SELECT [System.Id] FROM workitems WHERE ...',
      description:
        'Advanced: a full WIQL query selecting [System.Id]. Overrides the type, state, area path, and tag filters when set.',
    },
    {
      id: 'repositoryName',
      title: 'Repository',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'Repository name or ID (all repos if blank)',
      description:
        'Restrict syncing to a single repository by name or ID. Applies only when syncing repository files.',
    },
    {
      id: 'branch',
      title: 'Branch',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: "Each repo's default branch",
      description:
        'Branch to sync repository files from. Defaults to each repository’s default branch. Applies only when syncing repository files.',
    },
    {
      id: 'pathPrefix',
      title: 'Path Filter',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. docs/, src/',
      description:
        'Only sync repository files under this path prefix. Applies only when syncing repository files.',
    },
    {
      id: 'fileExtensions',
      title: 'File Extensions',
      type: 'short-input',
      required: false,
      mode: 'advanced',
      placeholder: 'e.g. .md, .txt, .ts',
      description:
        'Only sync repository files with these extensions (comma-separated). Leave blank for all text files. Applies only when syncing repository files.',
    },
    {
      id: 'maxItems',
      title: 'Max Items',
      type: 'short-input',
      required: false,
      placeholder: 'e.g. 500 (default: unlimited)',
    },
  ],

  /**
   * Lists one page of documents, walking the active phases in order and handing
   * back the owning phase's cursor whenever that phase has more to list.
   */
  listDocuments: async (
    accessToken: string,
    sourceConfig: Record<string, unknown>,
    cursor?: string,
    syncContext?: Record<string, unknown>,
    lastSyncAt?: Date
  ): Promise<ExternalDocumentList> => {
    const organization = readString(sourceConfig.organization)
    const project = readString(sourceConfig.project)
    const contentType = parseContentType(sourceConfig.contentType)
    const wikiFilter = readString(sourceConfig.wikiName)
    const filters = readWorkItemFilters(sourceConfig)
    const fileFilters = readFileFilters(sourceConfig)
    // Normalise the cap to a whole number (0 = unlimited). A fractional or
    // non-numeric value would otherwise never compare equal to the running total.
    const requestedMax = Number(sourceConfig.maxItems)
    const maxItems =
      Number.isFinite(requestedMax) && requestedMax >= 1 ? Math.floor(requestedMax) : 0

    if (!organization || !project) {
      throw new Error('Organization and project are required')
    }

    const phases = activePhases(contentType)
    if (phases.length === 0) return { documents: [], hasMore: false }

    /**
     * Resolves which phase a cursor belongs to. Phases run in a fixed order
     * (wiki ➜ workitems ➜ file) and each phase owns a cursor prefix
     * (`wiki|`, `wi|`, `file|`). A missing cursor starts at the first active phase.
     */
    const cursorPhase: SyncPhase = cursor?.startsWith('wi|')
      ? 'workitems'
      : cursor?.startsWith('file|')
        ? 'file'
        : 'wiki'
    const phase = phases.includes(cursorPhase) ? cursorPhase : phases[0]

    /** Lists a single batch for the given phase. The cursor is passed only when it belongs to that phase. */
    const runPhase = (
      target: SyncPhase,
      phaseCursor: string | undefined
    ): Promise<ExternalDocumentList> => {
      if (target === 'wiki') {
        return listWikiPages(
          accessToken,
          organization,
          project,
          wikiFilter,
          maxItems,
          phaseCursor,
          syncContext
        )
      }
      if (target === 'workitems') {
        return listWorkItems(
          accessToken,
          organization,
          project,
          filters,
          maxItems,
          phaseCursor,
          syncContext,
          lastSyncAt
        )
      }
      return listRepoFiles(
        accessToken,
        organization,
        project,
        fileFilters,
        maxItems,
        phaseCursor,
        syncContext
      )
    }

    /** True once the maxItems cap has been reached during this sync run. */
    const capReached = () =>
      maxItems > 0 && ((syncContext?.totalDocsFetched as number | undefined) ?? 0) >= maxItems

    /**
     * Walks phases starting at `phase`, accumulating documents. Within a phase,
     * pagination is driven by that phase's own cursor; when a phase is exhausted
     * the walker advances to the next active phase (resetting its cursor). The
     * maxItems cap is honored at phase boundaries so the cap is never exceeded
     * across phases.
     */
    let current: SyncPhase | undefined = phase
    // Only hand the cursor to the phase that owns it: if the cursor's phase is no
    // longer active, the fallback phase must start from its own beginning rather
    // than misread a foreign offset.
    let phaseCursor: string | undefined = phase === cursorPhase ? cursor : undefined
    const documents: ExternalDocument[] = []

    while (current) {
      const result = await runPhase(current, phaseCursor)
      documents.push(...result.documents)

      if (result.hasMore) {
        return { documents, nextCursor: result.nextCursor, hasMore: true }
      }
      if (capReached()) {
        return { documents, hasMore: false }
      }
      current = nextPhase(current, contentType)
      phaseCursor = undefined
    }

    return { documents, hasMore: false }
  },

  /**
   * Hydrates a single deferred document (repository file or wiki page) by its
   * external id. Returns null when the document cannot be resolved or is empty.
   */
  getDocument: async (
    accessToken: string,
    sourceConfig: Record<string, unknown>,
    externalId: string,
    syncContext?: Record<string, unknown>
  ): Promise<ExternalDocument | null> => {
    const organization = readString(sourceConfig.organization)
    const project = readString(sourceConfig.project)
    if (!organization || !project) {
      throw new Error('Organization and project are required')
    }

    /**
     * Repository files are deferred and re-fetched here. Work items are returned
     * inline during listing, so getDocument is otherwise only invoked for
     * deferred wiki pages. Unknown IDs return null defensively.
     */
    if (externalId.startsWith(FILE_PREFIX)) {
      try {
        return await getFileDocument(
          accessToken,
          organization,
          project,
          externalId,
          readString(sourceConfig.branch),
          syncContext
        )
      } catch (error) {
        logger.warn(`Failed to fetch Azure DevOps file ${externalId}`, {
          error: toError(error).message,
        })
        return null
      }
    }

    const parsed = parseWikiExternalId(externalId)
    if (!parsed) return null

    const { wikiId, pagePath } = parsed

    // Wiki metadata only enriches the document; a failure here is logged and ignored.
    let wikiName: string | undefined
    let remoteUrl: string | undefined
    try {
      const wikis = await resolveWikis(accessToken, organization, project, syncContext)
      const wiki = wikis.find((w) => w.id === wikiId)
      wikiName = wiki?.name
      remoteUrl = wiki?.remoteUrl
    } catch (error) {
      logger.warn('Failed to resolve wiki metadata for page', {
        externalId,
        error: toError(error).message,
      })
    }

    const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wiki/wikis/${encodeURIComponent(wikiId)}/pages?path=${encodeURIComponent(pagePath)}&includeContent=true&api-version=${WIKI_API_VERSION}`
    const response = await fetchWithRetry(url, {
      method: 'GET',
      headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
    })

    if (!response.ok) {
      if (response.status === 404) return null
      throw new Error(`Failed to fetch wiki page: ${response.status}`)
    }

    const etag = response.headers.get('etag')
    const data = await response.json()
    const content = (data.content as string | undefined) ?? ''
    if (!content.trim()) return null

    const pageId = (data.id as number | undefined) ?? 0
    const title = pagePath.split('/').filter(Boolean).pop() || pagePath || 'Untitled'
    const sourceUrl = remoteUrl
      ? `${remoteUrl}?pagePath=${encodeURIComponent(pagePath)}`
      : ((data.remoteUrl as string | undefined) ?? undefined)

    return {
      externalId,
      title,
      content,
      contentDeferred: false,
      mimeType: 'text/plain',
      sourceUrl,
      // Prefer the ETag (quotes stripped); fall back to the page id when it is absent.
      contentHash: `ado:wiki:${wikiId}:${pagePath}:${etag ? etag.replace(/"/g, '') : pageId}`,
      metadata: {
        kind: 'wiki',
        organization,
        project,
        wikiId,
        wikiName,
        pageId,
        pagePath,
      },
    }
  },

  /**
   * Validates the configuration: required fields, the maxItems and custom WIQL
   * formats, project access with the supplied token, and — when repository files
   * are in scope — repository availability.
   */
  validateConfig: async (
    accessToken: string,
    sourceConfig: Record<string, unknown>
  ): Promise<{ valid: boolean; error?: string }> => {
    const organization = readString(sourceConfig.organization)
    const project = readString(sourceConfig.project)

    if (!organization || !project) {
      return { valid: false, error: 'Organization and project are required' }
    }

    const maxItems = sourceConfig.maxItems as string | undefined
    if (maxItems && (Number.isNaN(Number(maxItems)) || Number(maxItems) <= 0)) {
      return { valid: false, error: 'Max items must be a positive number' }
    }

    const customWiql = readString(sourceConfig.customWiql)
    if (customWiql && !/from\s+workitems/i.test(customWiql)) {
      return {
        valid: false,
        error: 'Custom WIQL query must select work items (e.g. "... FROM workitems WHERE ...")',
      }
    }

    const contentType = parseContentType(sourceConfig.contentType)
    const repositoryFilter = readString(sourceConfig.repositoryName)

    try {
      const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/_apis/projects/${encodeURIComponent(project)}?api-version=${PROJECT_API_VERSION}`
      const response = await fetchWithRetry(
        url,
        {
          method: 'GET',
          headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) },
        },
        VALIDATE_RETRY_OPTIONS
      )

      if (response.status === 401 || response.status === 403) {
        return { valid: false, error: 'Invalid or unauthorized Personal Access Token' }
      }
      if (response.status === 404) {
        return {
          valid: false,
          error: `Project "${project}" not found in organization "${organization}"`,
        }
      }
      if (!response.ok) {
        return { valid: false, error: `Cannot access project: ${response.status}` }
      }

      if (activePhases(contentType).includes('file')) {
        const repos = await listRepositories(
          accessToken,
          organization,
          project,
          VALIDATE_RETRY_OPTIONS
        )
        if (repositoryFilter) {
          const needle = repositoryFilter.toLowerCase()
          const match = repos.find(
            (r) => r.id.toLowerCase() === needle || (r.name ?? '').toLowerCase() === needle
          )
          if (!match) {
            return {
              valid: false,
              error: `Repository "${repositoryFilter}" not found in project "${project}"`,
            }
          }
          if (match.isDisabled) {
            return {
              valid: false,
              error: `Repository "${repositoryFilter}" is disabled`,
            }
          }
        } else if (repos.length === 0) {
          // Missing repositories are fatal only when files are the sole content type.
          if (contentType === 'files') {
            return {
              valid: false,
              error: `No accessible Git repositories found in project "${project}"`,
            }
          }
          logger.warn('No accessible repositories; repository files will be skipped', {
            organization,
            project,
          })
        }
      }

      return { valid: true }
    } catch (error) {
      return { valid: false, error: getErrorMessage(error, 'Failed to validate configuration') }
    }
  },

  tagDefinitions: [
    { id: 'kind', displayName: 'Type', fieldType: 'text' },
    { id: 'wikiName', displayName: 'Wiki', fieldType: 'text' },
    { id: 'workItemType', displayName: 'Work Item Type', fieldType: 'text' },
    { id: 'state', displayName: 'State', fieldType: 'text' },
    { id: 'areaPath', displayName: 'Area Path', fieldType: 'text' },
    { id: 'tags', displayName: 'Tags', fieldType: 'text' },
    { id: 'repository', displayName: 'Repository', fieldType: 'text' },
    { id: 'path', displayName: 'File Path', fieldType: 'text' },
    { id: 'changedDate', displayName: 'Changed Date', fieldType: 'date' },
  ],

  /**
   * Maps document metadata to tag slots. `kind` applies to every document.
   * `wikiName` is wiki-only; `workItemType`/`state`/`areaPath`/`tags`/`changedDate`
   * are work-item-only; `repository`/`path` are file-only. Each document type leaves
   * the others' fields empty and the type/empty guards below skip them.
   */
  mapTags: (metadata: Record<string, unknown>): Record<string, unknown> => {
    const result: Record<string, unknown> = {}

    if (typeof metadata.kind === 'string') result.kind = metadata.kind
    if (typeof metadata.wikiName === 'string' && metadata.wikiName) {
      result.wikiName = metadata.wikiName
    }
    if (typeof metadata.workItemType === 'string' && metadata.workItemType) {
      result.workItemType = metadata.workItemType
    }
    if (typeof metadata.state === 'string' && metadata.state) result.state = metadata.state
    if (typeof metadata.areaPath === 'string' && metadata.areaPath)
      result.areaPath = metadata.areaPath

    if (typeof metadata.repository === 'string' && metadata.repository) {
      result.repository = metadata.repository
    }
    if (typeof metadata.path === 'string' && metadata.path) result.path = metadata.path

    const tags = joinTagArray(metadata.tags)
    if (tags) result.tags = tags

    const changedDate = parseTagDate(metadata.changedDate)
    if (changedDate) result.changedDate = changedDate

    return result
  },
}
createLogger('GoogleFormsConnector') + +const DRIVE_API_BASE = 'https://www.googleapis.com/drive/v3' +const FORMS_API_BASE = 'https://forms.googleapis.com/v1' +const FORM_MIME_TYPE = 'application/vnd.google-apps.form' +const FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder' + +/** + * Hard cap on the number of responses appended to a single form document. + * Keeps individual documents within a reasonable size for embedding/indexing. + */ +const MAX_RESPONSES_PER_FORM = 500 + +/** + * Drive API page size when listing forms. The Drive API caps pageSize at 100. + */ +const DRIVE_PAGE_SIZE = 100 + +/** + * Maximum responses returned per Forms API page (API caps and defaults to 5000). + */ +const RESPONSES_PAGE_SIZE = 5000 + +/** + * Number of forms whose change indicators are fetched concurrently during + * listing. Keeps the Forms API call volume bounded while still parallelizing. + */ +const LIST_CONCURRENCY = 4 + +/** + * Content scope for a form document. `both` indexes the form's questions and its + * submitted responses; `structure` indexes only the questions (no response reads, + * so the responses scope is never exercised for that connector instance). + */ +type ContentScope = 'both' | 'structure' + +/** + * Resolves the content scope from sourceConfig, defaulting to `both`. + */ +function resolveContentScope(value: unknown): ContentScope { + return value === 'structure' ? 'structure' : 'both' +} + +/** + * Represents a Google Drive file entry for a form, returned by the Drive API. + */ +interface DriveFormFile { + id: string + name: string + mimeType: string + modifiedTime?: string + createdTime?: string + webViewLink?: string + owners?: { displayName?: string; emailAddress?: string }[] + trashed?: boolean +} + +/** + * A single answer entry inside a response answer container. + */ +interface FormTextAnswer { + value?: string +} + +/** + * A single question's answers within a form response. 
/**
 * A single question's answers within a form response. The Forms API keys the
 * `answers` map by questionId and stores text values under
 * `textAnswers.answers[].value`.
 */
interface FormAnswer {
  questionId?: string
  textAnswers?: { answers?: FormTextAnswer[] }
}

/**
 * A single submitted response to a form.
 */
interface FormResponse {
  responseId?: string
  createTime?: string
  lastSubmittedTime?: string
  respondentEmail?: string
  /** Answers keyed by questionId. */
  answers?: Record<string, FormAnswer>
}

/**
 * Paginated response list from the Forms API.
 */
interface FormResponseList {
  responses?: FormResponse[]
  nextPageToken?: string
}

/**
 * A question item within a form's structure.
 */
interface FormQuestionItem {
  question?: {
    questionId?: string
    required?: boolean
  }
}

/**
 * A single structural item within a form (question, section, image, etc.).
 */
interface FormItem {
  itemId?: string
  title?: string
  description?: string
  questionItem?: FormQuestionItem
}

/**
 * The form structure returned by the Forms API `forms.get` endpoint.
 */
interface FormStructure {
  formId?: string
  info?: {
    title?: string
    description?: string
    documentTitle?: string
  }
  items?: FormItem[]
  revisionId?: string
  responderUri?: string
}

/**
 * Lightweight metadata captured during listing, sufficient to build a stub
 * and detect changes without downloading the full form content.
 */
interface FormStubInput {
  file: DriveFormFile
  formTitle?: string
  revisionId?: string
  latestResponseTime?: string
  contentScope: ContentScope
}

/**
 * Parses an optional positive-integer config value.
 *
 * Only numbers and numeric strings are accepted; fractional values are floored.
 * Anything else — unset, empty, non-numeric, zero, negative, non-finite
 * (`Infinity`, `NaN`) or a non-string/number type — yields 0, which the callers
 * in this file treat as "no cap configured".
 *
 * @param value - Raw value read from `sourceConfig`.
 * @returns The floored positive integer, or 0 when the value is unset/invalid.
 */
function parsePositiveInt(value: unknown): number {
  // Number() would happily coerce booleans, arrays and dates; a cap must come
  // from an actual number or numeric string.
  if (typeof value !== 'number' && typeof value !== 'string') return 0
  const num = Number(value)
  // isFinite rejects NaN and ±Infinity so an "infinite" cap cannot leak through.
  return Number.isFinite(num) && num > 0 ? Math.floor(num) : 0
}
/**
 * Maps `items` through an async `worker` with a bounded number of calls in
 * flight, preserving input order in the returned results.
 *
 * A pool of runner loops share one cursor; each runner claims the next index,
 * awaits the worker, and stores the result at that index. If the worker rejects,
 * the returned promise rejects with that error.
 *
 * @param items - Inputs to process.
 * @param limit - Maximum concurrent worker calls. Values below 1 (or `NaN`) are
 *   treated as 1 so that every item is still processed.
 * @param worker - Async function applied to each item and its index.
 * @returns Results positioned at the same index as their input.
 */
async function mapWithConcurrency<T, R>(
  items: T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> {
  const results = new Array<R>(items.length)
  let next = 0

  async function run(): Promise<void> {
    while (next < items.length) {
      // Claim the index synchronously before awaiting so no two runners share it.
      const current = next++
      results[current] = await worker(items[current], current)
    }
  }

  // With zero runners the loop never starts and `results` would come back as an
  // array of holes; always start at least one runner.
  const requested = Number.isNaN(limit) ? 1 : Math.floor(limit)
  const runnerCount = Math.min(Math.max(1, requested), items.length)

  await Promise.all(Array.from({ length: runnerCount }, run))
  return results
}

/**
 * Fetches the form structure via the Forms API.
 *
 * @returns The parsed structure, or `null` when the API answers 404.
 * @throws Error for any other non-OK status.
 */
async function fetchFormStructure(
  accessToken: string,
  formId: string
): Promise<FormStructure | null> {
  const url = `${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}`
  const response = await fetchWithRetry(url, {
    method: 'GET',
    headers: {
      Authorization: `Bearer ${accessToken}`,
      Accept: 'application/json',
    },
  })

  if (!response.ok) {
    if (response.status === 404) return null
    throw new Error(`Failed to fetch form structure ${formId}: ${response.status}`)
  }

  return (await response.json()) as FormStructure
}

/**
 * Result of fetching a form's responses: the collected responses (capped at
 * `MAX_RESPONSES_PER_FORM` for rendering) plus the greatest submission timestamp
 * across the first response page.
 *
 * `latestSubmittedTime` is tracked separately from the capped `responses` so the
 * content hash computed in getDocument stays identical to the one computed during
 * listing, which scans the same first page via `fetchLatestResponseTime`. If it
 * were derived from the capped slice alone, a form with more than
 * `MAX_RESPONSES_PER_FORM` responses could hash differently between the two paths
 * and re-sync on every run.
 */
interface FetchedResponses {
  responses: FormResponse[]
  latestSubmittedTime?: string
}
/**
 * Pages through a form's responses and keeps at most `MAX_RESPONSES_PER_FORM`
 * of them, in the order the API returns them.
 *
 * `latestSubmittedTime` is computed from the whole first page (requested with
 * `RESPONSES_PAGE_SIZE`), not from the retained slice, because
 * `fetchLatestResponseTime` derives the listing-time change indicator from that
 * same first page. Paging stops as soon as the retention cap is reached or the
 * API returns no `nextPageToken`.
 *
 * @throws Error when any page request returns a non-OK status.
 */
async function fetchFormResponses(accessToken: string, formId: string): Promise<FetchedResponses> {
  const endpoint = `${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}/responses`
  const kept: FormResponse[] = []
  let newest: string | undefined
  let token: string | undefined

  for (let pageIndex = 0; ; pageIndex += 1) {
    const pageUrl = new URL(endpoint)
    pageUrl.searchParams.set('pageSize', String(RESPONSES_PAGE_SIZE))
    if (token) pageUrl.searchParams.set('pageToken', token)

    const res = await fetchWithRetry(pageUrl.toString(), {
      method: 'GET',
      headers: {
        Authorization: `Bearer ${accessToken}`,
        Accept: 'application/json',
      },
    })
    if (!res.ok) {
      throw new Error(`Failed to list responses for form ${formId}: ${res.status}`)
    }

    const page = (await res.json()) as FormResponseList
    const batch = page.responses ?? []

    // The change indicator only ever looks at the first page.
    if (pageIndex === 0) newest = latestResponseTime(batch)

    // Take only as many entries as still fit under the retention cap.
    const room = Math.max(0, MAX_RESPONSES_PER_FORM - kept.length)
    kept.push(...batch.slice(0, room))

    if (kept.length >= MAX_RESPONSES_PER_FORM || !page.nextPageToken) break
    token = page.nextPageToken
  }

  return { responses: kept, latestSubmittedTime: newest }
}
/**
 * Reads the latest response submission time for change detection without
 * retaining the responses themselves.
 *
 * Only the first page of the responses listing is requested (up to
 * `RESPONSES_PAGE_SIZE` entries) and no `nextPageToken` is followed, so the
 * result is the greatest timestamp on that page, not across every response.
 *
 * @returns The latest submission timestamp on the first page, or `undefined`
 *   when the page has no responses or the listing request fails.
 */
async function fetchLatestResponseTime(
  accessToken: string,
  formId: string
): Promise<string | undefined> {
  const url = new URL(`${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}/responses`)
  url.searchParams.set('pageSize', String(RESPONSES_PAGE_SIZE))

  const response = await fetchWithRetry(url.toString(), {
    method: 'GET',
    headers: {
      Authorization: `Bearer ${accessToken}`,
      Accept: 'application/json',
    },
  })

  if (!response.ok) {
    /**
     * Treat response-listing failures as "no responses" for hashing purposes
     * so a transient error never silently drops the form from the sync.
     */
    logger.warn(`Failed to read responses for change detection on form ${formId}`, {
      status: response.status,
    })
    return undefined
  }

  const data = (await response.json()) as FormResponseList
  return latestResponseTime(data.responses ?? [])
}

/**
 * Returns true when `candidate` is chronologically later than `current`.
 *
 * RFC 3339 timestamps may carry a varying number of fractional-second digits,
 * and a plain string comparison then misorders them (`…:00Z` sorts above
 * `…:00.500Z` because `'Z' > '.'`). Instants are compared by parsed
 * milliseconds first, then by zero-padded fractional digits to break
 * sub-millisecond ties. If either value cannot be parsed, falls back to the
 * lexicographic comparison.
 */
function isLaterTimestamp(candidate: string, current: string): boolean {
  if (!current) return true
  const candidateMs = Date.parse(candidate)
  const currentMs = Date.parse(current)
  if (Number.isNaN(candidateMs) || Number.isNaN(currentMs)) return candidate > current
  if (candidateMs !== currentMs) return candidateMs > currentMs
  const fraction = (ts: string): string => (/\.(\d+)/.exec(ts)?.[1] ?? '').padEnd(9, '0')
  return fraction(candidate) > fraction(current)
}

/**
 * Returns the greatest submission timestamp across the given responses, or
 * undefined when the list is empty or no response carries a timestamp.
 *
 * Each response contributes `lastSubmittedTime`, falling back to `createTime`.
 * The winning value is returned exactly as received (it is embedded in the
 * content hash), only the comparison is chronological.
 */
function latestResponseTime(responses: FormResponse[]): string | undefined {
  let latest = ''
  for (const r of responses) {
    const t = r.lastSubmittedTime || r.createTime || ''
    if (t && isLaterTimestamp(t, latest)) latest = t
  }
  return latest || undefined
}

/**
 * Builds the content hash for a form. The hash must change when either the form
 * structure (revisionId) or, when responses are indexed, the set of responses
 * (latest submission time) changes. Drive `modifiedTime` alone is insufficient
 * because new response submissions do not update the form's Drive modifiedTime.
 * The content scope is part of the hash so that toggling response indexing
 * forces a re-sync of every document.
 */
function formContentHash(input: FormStubInput): string {
  // In `structure` scope responses are not indexed, so they must not affect the hash.
  const responsePart = input.contentScope === 'both' ? (input.latestResponseTime ?? '') : 'none'
  return `gforms:${input.file.id}:${input.contentScope}:${input.revisionId ?? ''}:${responsePart}`
}
/**
 * Builds the lightweight listing stub for a form from its Drive file entry and
 * change indicators. The stub carries empty `content` with
 * `contentDeferred: true`; `getDocument` reuses this stub and fills in the
 * rendered content.
 *
 * The title prefers the trimmed Forms title, then the Drive file name, then
 * `'Untitled Form'`. `latestResponseTime` is only exposed in metadata when
 * responses are in scope.
 */
function formToStub(input: FormStubInput): ExternalDocument {
  const driveFile = input.file
  const resolvedTitle = input.formTitle?.trim() || driveFile.name || 'Untitled Form'
  const includesResponses = input.contentScope === 'both'
  const ownerLabels = driveFile.owners
    ?.map((owner) => owner.displayName || owner.emailAddress)
    .filter(Boolean)

  return {
    externalId: driveFile.id,
    title: resolvedTitle,
    content: '',
    contentDeferred: true,
    mimeType: 'text/plain',
    sourceUrl: driveFile.webViewLink || `https://docs.google.com/forms/d/${driveFile.id}/edit`,
    contentHash: formContentHash(input),
    metadata: {
      formTitle: resolvedTitle,
      modifiedTime: driveFile.modifiedTime,
      createdTime: driveFile.createdTime,
      latestResponseTime: includesResponses ? input.latestResponseTime : undefined,
      owners: ownerLabels,
    },
  }
}

/**
 * Joins the non-blank text values of one question's answer with `', '`.
 * Returns an empty string when the answer is missing or has no usable values.
 */
function extractAnswerText(answer: FormAnswer | undefined): string {
  const collected: string[] = []
  for (const entry of answer?.textAnswers?.answers ?? []) {
    const text = entry.value
    if (typeof text === 'string' && text.trim().length > 0) collected.push(text)
  }
  return collected.join(', ')
}

/**
 * Indexes item titles by questionId so responses can be rendered with readable
 * question labels instead of opaque IDs. Items lacking either a questionId or a
 * title are left out.
 */
function buildQuestionTitleMap(form: FormStructure): Map<string, string> {
  const titlesById = new Map<string, string>()
  const entries = form.items ?? []
  entries.forEach((entry) => {
    const id = entry.questionItem?.question?.questionId
    if (id && entry.title) titlesById.set(id, entry.title)
  })
  return titlesById
}
/**
 * Renders a form as a plain-text document made of blank-line-separated
 * sections: an optional `# title` heading, the trimmed description, a
 * `## Questions` list (one `- title` bullet per titled item, marked
 * ` (required)` where applicable, followed by the item's description line), and
 * — when `responses` is non-empty — a `## Responses (n)` section with one
 * `### Response i` block per response.
 *
 * Within a response, answers are listed in the iteration order of its `answers`
 * map and labelled with the question title when one is known, otherwise with
 * the raw questionId; answers with no text are omitted.
 */
function renderFormDocument(form: FormStructure, responses: FormResponse[]): string {
  const sections: string[] = []
  const info = form.info

  const heading = info?.title || info?.documentTitle
  if (heading) sections.push(`# ${heading}`)

  const summary = info?.description?.trim()
  if (summary) sections.push(summary)

  const labelsById = buildQuestionTitleMap(form)

  const questionLines = (form.items ?? []).flatMap((item) => {
    const label = item.title?.trim()
    if (!label) return []
    const suffix = item.questionItem?.question?.required ? ' (required)' : ''
    const lines = [`- ${label}${suffix}`]
    const detail = item.description?.trim()
    if (detail) lines.push(` ${detail}`)
    return lines
  })
  if (questionLines.length > 0) {
    sections.push('## Questions', questionLines.join('\n'))
  }

  if (responses.length > 0) {
    sections.push(`## Responses (${responses.length})`)
    let position = 0
    for (const response of responses) {
      position += 1
      const submittedAt = response.lastSubmittedTime || response.createTime
      const lines = [
        submittedAt ? `### Response ${position} — ${submittedAt}` : `### Response ${position}`,
      ]
      if (response.respondentEmail) lines.push(`Respondent: ${response.respondentEmail}`)
      for (const [questionId, answer] of Object.entries(response.answers ?? {})) {
        const text = extractAnswerText(answer)
        if (text) lines.push(`${labelsById.get(questionId) || questionId}: ${text}`)
      }
      sections.push(lines.join('\n'))
    }
  }

  return sections.join('\n\n').trim()
}
+ */ +function buildDriveQuery(folderId?: string): string { + const parts = ['trashed = false', `mimeType = '${FORM_MIME_TYPE}'`] + if (folderId?.trim()) { + const escaped = folderId.trim().replace(/\\/g, '\\\\').replace(/'/g, "\\'") + parts.push(`'${escaped}' in parents`) + } + return parts.join(' and ') +} + +export const googleFormsConnector: ConnectorConfig = { + id: 'google_forms', + name: 'Google Forms', + description: 'Sync Google Forms questions and responses into your knowledge base', + version: '1.0.0', + icon: GoogleFormsIcon, + + auth: { + mode: 'oauth', + provider: 'google-forms', + requiredScopes: [ + 'https://www.googleapis.com/auth/drive', + 'https://www.googleapis.com/auth/forms.body', + 'https://www.googleapis.com/auth/forms.responses.readonly', + ], + }, + + configFields: [ + { + id: 'folderId', + title: 'Folder ID', + type: 'short-input', + placeholder: 'e.g. 1aBcDeFgHiJkLmNoPqRsTuVwXyZ (optional)', + required: false, + description: 'Only sync forms inside this Drive folder. Leave blank to sync all forms.', + }, + { + id: 'contentScope', + title: 'Content', + type: 'dropdown', + required: false, + options: [ + { label: 'Questions & responses', id: 'both' }, + { label: 'Questions only', id: 'structure' }, + ], + description: 'Whether to index submitted responses alongside each form’s questions.', + }, + { + id: 'maxForms', + title: 'Max Forms', + type: 'short-input', + required: false, + placeholder: 'e.g. 100 (default: unlimited)', + }, + { + id: 'maxResponsesPerForm', + title: 'Max Responses Per Form', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: `e.g. 100 (default: ${MAX_RESPONSES_PER_FORM})`, + description: 'Cap on responses indexed per form. 
Applies only when indexing responses.', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const maxForms = parsePositiveInt(sourceConfig.maxForms) + const contentScope = resolveContentScope(sourceConfig.contentScope) + const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0 + + if (maxForms > 0 && previouslyFetched >= maxForms) { + return { documents: [], hasMore: false } + } + + const folderId = sourceConfig.folderId as string | undefined + const queryParams = new URLSearchParams({ + q: buildDriveQuery(folderId), + pageSize: String(DRIVE_PAGE_SIZE), + orderBy: 'modifiedTime desc', + fields: 'nextPageToken,files(id,name,mimeType,modifiedTime,createdTime,webViewLink,owners)', + supportsAllDrives: 'true', + includeItemsFromAllDrives: 'true', + }) + if (cursor) queryParams.set('pageToken', cursor) + + const url = `${DRIVE_API_BASE}/files?${queryParams.toString()}` + + logger.info('Listing Google Forms', { + folderId: folderId?.trim() || 'all', + contentScope, + cursor: cursor ?? 'initial', + }) + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!response.ok) { + const errorText = await response.text() + logger.error('Failed to list Google Forms', { status: response.status, error: errorText }) + throw new Error(`Failed to list Google Forms: ${response.status}`) + } + + const data = await response.json() + let files = (data.files || []) as DriveFormFile[] + + if (maxForms > 0) { + const remaining = maxForms - previouslyFetched + if (files.length > remaining) files = files.slice(0, remaining) + } + + /** + * Build stubs with metadata-based change indicators. 
Each form needs its + * revisionId (structure changes) and, when responses are indexed, the latest + * response time (new submissions) so the sync engine can detect changes + * without downloading full content. Forms are processed with bounded + * concurrency; a transient per-form failure is skipped rather than aborting + * the whole page, but it is recorded so the listing is marked incomplete. + */ + let skippedOnError = false + const stubs = await mapWithConcurrency(files, LIST_CONCURRENCY, async (file) => { + try { + const form = await fetchFormStructure(accessToken, file.id) + if (!form) return null + const latest = + contentScope === 'both' ? await fetchLatestResponseTime(accessToken, file.id) : undefined + return formToStub({ + file, + formTitle: form.info?.title || form.info?.documentTitle, + revisionId: form.revisionId, + latestResponseTime: latest, + contentScope, + }) + } catch (error) { + skippedOnError = true + logger.warn(`Skipping form during listing: ${file.name} (${file.id})`, { + error: toError(error).message, + }) + return null + } + }) + + const documents = stubs.filter((s): s is ExternalDocument => s !== null) + + const totalFetched = previouslyFetched + documents.length + if (syncContext) syncContext.totalDocsFetched = totalFetched + const hitLimit = maxForms > 0 && totalFetched >= maxForms + + const nextPageToken = data.nextPageToken as string | undefined + + /** + * Mark the listing as incomplete so the sync engine skips deletion + * reconciliation. This applies when the `maxForms` cap truncates results + * (forms beyond the cap are not absent from the source) or when a transient + * error caused a still-present form to be dropped from this page — deleting + * those would wipe valid documents from the knowledge base. + */ + if (syncContext && (hitLimit || skippedOnError)) { + syncContext.listingCapped = true + } + + return { + documents, + nextCursor: hitLimit ? undefined : nextPageToken, + hasMore: hitLimit ? 
false : Boolean(nextPageToken), + } + }, + + getDocument: async ( + accessToken: string, + sourceConfig: Record, + externalId: string + ): Promise => { + const contentScope = resolveContentScope(sourceConfig.contentScope) + const fields = 'id,name,mimeType,modifiedTime,createdTime,webViewLink,owners,trashed' + const metadataUrl = `${DRIVE_API_BASE}/files/${encodeURIComponent(externalId)}?fields=${encodeURIComponent(fields)}&supportsAllDrives=true` + + const metadataResponse = await fetchWithRetry(metadataUrl, { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!metadataResponse.ok) { + if (metadataResponse.status === 404) return null + throw new Error(`Failed to get form metadata: ${metadataResponse.status}`) + } + + const file = (await metadataResponse.json()) as DriveFormFile + + if (file.trashed) return null + if (file.mimeType !== FORM_MIME_TYPE) return null + + try { + const form = await fetchFormStructure(accessToken, file.id) + if (!form) return null + + const maxResponses = parsePositiveInt(sourceConfig.maxResponsesPerForm) + const fetched = + contentScope === 'both' + ? await fetchFormResponses(accessToken, file.id) + : { responses: [], latestSubmittedTime: undefined } + const responses = fetched.responses + const cappedResponses = + maxResponses > 0 && responses.length > maxResponses + ? 
responses.slice(0, maxResponses) + : responses + + const content = renderFormDocument(form, cappedResponses) + if (!content.trim()) return null + + const stub = formToStub({ + file, + formTitle: form.info?.title || form.info?.documentTitle, + revisionId: form.revisionId, + latestResponseTime: fetched.latestSubmittedTime, + contentScope, + }) + return { ...stub, content, contentDeferred: false } + } catch (error) { + logger.warn(`Failed to fetch content for form: ${file.name} (${file.id})`, { + error: toError(error).message, + }) + return null + } + }, + + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const folderId = sourceConfig.folderId as string | undefined + const maxForms = sourceConfig.maxForms as string | undefined + const maxResponsesPerForm = sourceConfig.maxResponsesPerForm as string | undefined + + if (maxForms && (Number.isNaN(Number(maxForms)) || Number(maxForms) <= 0)) { + return { valid: false, error: 'Max forms must be a positive number' } + } + + if ( + maxResponsesPerForm && + (Number.isNaN(Number(maxResponsesPerForm)) || Number(maxResponsesPerForm) <= 0) + ) { + return { valid: false, error: 'Max responses per form must be a positive number' } + } + + try { + if (folderId?.trim()) { + const url = `${DRIVE_API_BASE}/files/${encodeURIComponent(folderId.trim())}?fields=id,name,mimeType&supportsAllDrives=true` + const response = await fetchWithRetry( + url, + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }, + VALIDATE_RETRY_OPTIONS + ) + + if (!response.ok) { + if (response.status === 404) { + return { valid: false, error: 'Folder not found. Check the folder ID and permissions.' 
} + } + return { valid: false, error: `Failed to access folder: ${response.status}` } + } + + const folder = await response.json() + if (folder.mimeType !== FOLDER_MIME_TYPE) { + return { valid: false, error: 'The provided ID is not a folder' } + } + } else { + const url = `${DRIVE_API_BASE}/files?pageSize=1&q=${encodeURIComponent(`mimeType = '${FORM_MIME_TYPE}'`)}&fields=files(id)&supportsAllDrives=true&includeItemsFromAllDrives=true` + const response = await fetchWithRetry( + url, + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }, + VALIDATE_RETRY_OPTIONS + ) + + if (!response.ok) { + return { valid: false, error: `Failed to access Google Forms: ${response.status}` } + } + } + + return { valid: true } + } catch (error) { + return { valid: false, error: getErrorMessage(error, 'Failed to validate configuration') } + } + }, + + tagDefinitions: [ + { id: 'formTitle', displayName: 'Form Title', fieldType: 'text' }, + { id: 'owners', displayName: 'Owner', fieldType: 'text' }, + { id: 'lastModified', displayName: 'Last Modified', fieldType: 'date' }, + { id: 'lastResponse', displayName: 'Last Response', fieldType: 'date' }, + ], + + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.formTitle === 'string' && metadata.formTitle.trim()) { + result.formTitle = metadata.formTitle.trim() + } + + const owners = joinTagArray(metadata.owners) + if (owners) result.owners = owners + + const lastModified = parseTagDate(metadata.modifiedTime) + if (lastModified) result.lastModified = lastModified + + const lastResponse = parseTagDate(metadata.latestResponseTime) + if (lastResponse) result.lastResponse = lastResponse + + return result + }, +} diff --git a/apps/sim/connectors/google-forms/index.ts b/apps/sim/connectors/google-forms/index.ts new file mode 100644 index 00000000000..4a11928e0c8 --- /dev/null +++ b/apps/sim/connectors/google-forms/index.ts @@ -0,0 +1 @@ +export { 
googleFormsConnector } from '@/connectors/google-forms/google-forms' diff --git a/apps/sim/connectors/jsm/index.ts b/apps/sim/connectors/jsm/index.ts new file mode 100644 index 00000000000..ba6fe961592 --- /dev/null +++ b/apps/sim/connectors/jsm/index.ts @@ -0,0 +1 @@ +export { jsmConnector } from '@/connectors/jsm/jsm' diff --git a/apps/sim/connectors/jsm/jsm.ts b/apps/sim/connectors/jsm/jsm.ts new file mode 100644 index 00000000000..83af8ad1d27 --- /dev/null +++ b/apps/sim/connectors/jsm/jsm.ts @@ -0,0 +1,674 @@ +import { createLogger } from '@sim/logger' +import { toError } from '@sim/utils/errors' +import { JiraServiceManagementIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { parseTagDate } from '@/connectors/utils' +import { extractAdfText, getJiraCloudId } from '@/tools/jira/utils' +import { getJsmApiBaseUrl, getJsmHeaders } from '@/tools/jsm/utils' + +const logger = createLogger('JsmConnector') + +const PAGE_SIZE = 50 + +/** + * Allowed `requestStatus` filter values for `GET /rest/servicedeskapi/request`. + * When omitted, the JSM API defaults to `ALL_REQUESTS`. + */ +const VALID_REQUEST_STATUS = ['OPEN_REQUESTS', 'CLOSED_REQUESTS', 'ALL_REQUESTS'] as const +type JsmRequestStatus = (typeof VALID_REQUEST_STATUS)[number] + +/** + * Allowed `requestOwnership` filter values for `GET /rest/servicedeskapi/request`. + * + * This param scopes results to the OAuth user's relationship to each request. When + * omitted, the JSM API defaults to `OWNED_REQUESTS` — i.e. only requests the + * authenticated user reported. For a knowledge-base sync the user almost always + * wants every request in the service desk, so the connector defaults this to + * `ALL_REQUESTS` (which the JSM API treats as "owned + participated") rather than + * relying on the API's narrower default. 
+ */ +const VALID_REQUEST_OWNERSHIP = ['OWNED_REQUESTS', 'PARTICIPATED_REQUESTS', 'ALL_REQUESTS'] as const +type JsmRequestOwnership = (typeof VALID_REQUEST_OWNERSHIP)[number] + +/** + * Which comments to include in synced documents. + */ +const VALID_COMMENT_SCOPE = ['none', 'public', 'all'] as const +type JsmCommentScope = (typeof VALID_COMMENT_SCOPE)[number] + +/** + * A JSM date object as returned by the Service Desk REST API. The same shape is + * used for `createdDate`, `currentStatus.statusDate`, and comment `created`. + */ +interface JsmDate { + iso8601?: string + friendly?: string + epochMillis?: number +} + +/** + * Subset of a JSM customer request returned by `GET /request` and + * `GET /request/{issueIdOrKey}`. Only the fields the connector reads are typed. + */ +interface JsmRequest { + issueId?: string + issueKey?: string + requestTypeId?: string + serviceDeskId?: string + createdDate?: JsmDate + currentStatus?: { + status?: string + statusCategory?: string + statusDate?: JsmDate + } + reporter?: { + displayName?: string + emailAddress?: string + } + requestFieldValues?: Array<{ + fieldId?: string + label?: string + value?: unknown + renderedValue?: unknown + }> + _links?: { + web?: string + } +} + +/** + * A single comment on a JSM request. The JSM API returns the comment `body` as a + * plain string containing Jira wiki markup (not an ADF document), so no rich-text + * extraction is required. + */ +interface JsmComment { + id?: string + body?: string + public?: boolean + author?: { + displayName?: string + } + created?: JsmDate +} + +/** + * Paginated envelope shared by every JSM Service Desk list endpoint. + */ +interface JsmPage { + values?: T[] + size?: number + isLastPage?: boolean +} + +/** + * Reads the resolved sync options off the raw `sourceConfig`, normalizing + * enum-like fields to their valid set and clamping the numeric cap. Centralized + * so `listDocuments`, `getDocument`, and `validateConfig` agree on defaults. 
/**
 * Normalizes the raw `sourceConfig` into the sync options used by the connector.
 *
 * - `requestStatus` / `requestOwnership` fall back to `'ALL_REQUESTS'` when the
 *   configured value is not in the corresponding allowed list.
 * - `commentScope` is read from `sourceConfig.comments` and falls back to
 *   `'public'`.
 * - `requestTypeId` / `searchTerm` are trimmed strings, or `''` for non-strings.
 * - `maxRequests` is a floored positive finite number, or 0 when unset/invalid.
 */
function resolveOptions(sourceConfig: Record<string, unknown>): {
  requestStatus: JsmRequestStatus
  requestOwnership: JsmRequestOwnership
  requestTypeId: string
  searchTerm: string
  commentScope: JsmCommentScope
  maxRequests: number
} {
  // Accept the configured value only when it is one of the allowed literals.
  const oneOf = <T extends string>(allowed: readonly T[], candidate: unknown, fallback: T): T =>
    (allowed as readonly unknown[]).includes(candidate) ? (candidate as T) : fallback

  const trimmedString = (candidate: unknown): string =>
    typeof candidate === 'string' ? candidate.trim() : ''

  const rawMax = sourceConfig.maxRequests
  const numericMax = rawMax ? Number(rawMax) : 0
  const cap = Number.isFinite(numericMax) && numericMax > 0 ? Math.floor(numericMax) : 0

  return {
    requestStatus: oneOf(VALID_REQUEST_STATUS, sourceConfig.requestStatus, 'ALL_REQUESTS'),
    requestOwnership: oneOf(VALID_REQUEST_OWNERSHIP, sourceConfig.requestOwnership, 'ALL_REQUESTS'),
    requestTypeId: trimmedString(sourceConfig.requestTypeId),
    searchTerm: trimmedString(sourceConfig.searchTerm),
    commentScope: oneOf(VALID_COMMENT_SCOPE, sourceConfig.comments, 'public'),
    maxRequests: cap,
  }
}
+ */ +function getFieldText(request: JsmRequest, fieldId: string): string { + const field = request.requestFieldValues?.find((f) => f.fieldId === fieldId) + if (!field) return '' + const { value } = field + if (typeof value === 'string') return value + if (value && typeof value === 'object') { + const adf = extractAdfText(value) + if (adf) return adf + } + return '' +} + +/** + * Resolves the best available "change indicator" timestamp for a request. + * + * The JSM list endpoint does NOT return an updated/last-modified field — only + * `createdDate` and `currentStatus.statusDate` are present. We use + * `statusDate` (the time the request last changed status) when available, and + * fall back to `createdDate`. This is the change signal encoded into the + * contentHash. Note: edits that do not change status (e.g. a new comment) are + * not reflected here, so such changes may not trigger a re-sync. + */ +function getChangeIndicator(request: JsmRequest): string { + const statusDate = request.currentStatus?.statusDate + if (statusDate?.epochMillis != null) return String(statusDate.epochMillis) + if (statusDate?.iso8601) return statusDate.iso8601 + const created = request.createdDate + if (created?.epochMillis != null) return String(created.epochMillis) + if (created?.iso8601) return created.iso8601 + return '' +} + +/** + * Builds a stub ExternalDocument from a request returned by the list endpoint. + * Content is deferred — description and comments require a per-request API call + * fetched lazily in `getDocument`. The contentHash is metadata-only so it is + * identical whether produced here or in `getDocument`. + */ +function requestToStub(request: JsmRequest, domain: string): ExternalDocument { + const issueId = String(request.issueId ?? '') + const issueKey = request.issueKey ?? 
issueId + const summary = getFieldText(request, 'summary') || 'Untitled' + const status = request.currentStatus?.status + + const bareDomain = domain + .trim() + .replace(/^https?:\/\//i, '') + .replace(/\/+$/, '') + + return { + externalId: issueId, + title: `${issueKey}: ${summary}`, + content: '', + contentDeferred: true, + mimeType: 'text/plain', + sourceUrl: request._links?.web || `https://${bareDomain}/browse/${issueKey}`, + contentHash: `jsm:${issueId}:${getChangeIndicator(request)}`, + metadata: { + issueKey, + requestTypeId: request.requestTypeId, + serviceDeskId: request.serviceDeskId, + status, + reporter: request.reporter?.displayName, + created: request.createdDate?.iso8601, + /** + * The list endpoint has no true "last updated" field; `statusDate` is the + * closest available signal (time of last status change). Mapped to the + * `updated` tag and documented as such. + */ + statusDate: request.currentStatus?.statusDate?.iso8601, + }, + } +} + +/** + * Renders a readable plain-text document from a fully-fetched request and its + * comments. Includes summary, description, reporter, status, and comment thread. + */ +function buildContent(request: JsmRequest, comments: JsmComment[]): string { + const parts: string[] = [] + + const summary = getFieldText(request, 'summary') + if (summary) parts.push(summary) + + const description = getFieldText(request, 'description') + if (description) parts.push(description) + + const status = request.currentStatus?.status + if (status) parts.push(`Status: ${status}`) + + const reporter = request.reporter?.displayName + if (reporter) parts.push(`Reporter: ${reporter}`) + + if (comments.length > 0) { + parts.push('Comments:') + for (const comment of comments) { + const body = (comment.body ?? '').trim() + if (!body) continue + const author = comment.author?.displayName + parts.push(author ? 
`${author}: ${body}` : body) + } + } + + return parts.join('\n\n').trim() +} + +/** + * Resolves and caches the Jira cloud ID for a domain across a sync run. + */ +async function resolveCloudId( + domain: string, + accessToken: string, + syncContext?: Record +): Promise { + const cached = syncContext?.cloudId as string | undefined + if (cached) return cached + const cloudId = await getJiraCloudId(domain, accessToken) + if (syncContext) syncContext.cloudId = cloudId + return cloudId +} + +/** + * Fetches comments for a request, following offset pagination until the API + * signals `isLastPage`. When `publicOnly` is true the `public=true` filter is + * applied so internal/agent-only comments are excluded. + */ +async function fetchComments( + baseUrl: string, + accessToken: string, + issueIdOrKey: string, + publicOnly: boolean +): Promise { + const comments: JsmComment[] = [] + let start = 0 + + while (true) { + const params = new URLSearchParams({ + start: String(start), + limit: String(PAGE_SIZE), + }) + /** + * The JSM comment endpoint exposes `public` and `internal` as independent + * inclusion filters that both default to `true`. Requesting public-only + * therefore requires explicitly disabling `internal` — passing `public=true` + * alone would still return agent-only/internal comments. + */ + if (publicOnly) { + params.append('public', 'true') + params.append('internal', 'false') + } + const url = `${baseUrl}/request/${encodeURIComponent(issueIdOrKey)}/comment?${params.toString()}` + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: getJsmHeaders(accessToken), + }) + + if (!response.ok) { + logger.warn('Failed to fetch JSM comments', { + issueIdOrKey, + status: response.status, + }) + break + } + + const data = (await response.json()) as JsmPage + const values = data.values ?? 
[] + comments.push(...values) + + if (data.isLastPage || values.length === 0) break + start += values.length + } + + return comments +} + +export const jsmConnector: ConnectorConfig = { + id: 'jsm', + name: 'Jira Service Management', + description: 'Sync service desk requests from Jira Service Management into your knowledge base', + version: '1.0.0', + icon: JiraServiceManagementIcon, + + auth: { + mode: 'oauth', + provider: 'jira', + requiredScopes: [ + 'read:servicedesk:jira-service-management', + 'read:request:jira-service-management', + 'read:request.comment:jira-service-management', + 'read:request.status:jira-service-management', + 'offline_access', + ], + }, + + configFields: [ + { + id: 'domain', + title: 'Jira Domain', + type: 'short-input', + placeholder: 'yoursite.atlassian.net', + required: true, + }, + { + id: 'serviceDeskSelector', + title: 'Service Desk', + type: 'selector', + selectorKey: 'jsm.serviceDesks', + canonicalParamId: 'serviceDeskId', + mode: 'basic', + dependsOn: ['domain'], + placeholder: 'Select a service desk', + required: true, + }, + { + id: 'serviceDeskId', + title: 'Service Desk ID', + type: 'short-input', + canonicalParamId: 'serviceDeskId', + mode: 'advanced', + placeholder: 'e.g. 1, 2', + required: true, + }, + { + id: 'requestTypeSelector', + title: 'Request Type', + type: 'selector', + selectorKey: 'jsm.requestTypes', + canonicalParamId: 'requestTypeId', + mode: 'basic', + dependsOn: ['domain', 'serviceDeskSelector'], + placeholder: 'All request types', + required: false, + }, + { + id: 'requestTypeId', + title: 'Request Type ID', + type: 'short-input', + canonicalParamId: 'requestTypeId', + mode: 'advanced', + placeholder: 'e.g. 
10 (leave blank for all)', + required: false, + }, + { + id: 'requestStatus', + title: 'Request Status', + type: 'dropdown', + required: false, + options: [ + { label: 'All requests', id: 'ALL_REQUESTS' }, + { label: 'Open requests', id: 'OPEN_REQUESTS' }, + { label: 'Closed requests', id: 'CLOSED_REQUESTS' }, + ], + }, + { + id: 'requestOwnership', + title: 'Request Ownership', + type: 'dropdown', + required: false, + description: + 'Which requests the connected account can see. "Owned + participated" is the broadest scope a customer token can sync.', + options: [ + { label: 'Owned + participated', id: 'ALL_REQUESTS' }, + { label: 'Owned only', id: 'OWNED_REQUESTS' }, + { label: 'Participated only', id: 'PARTICIPATED_REQUESTS' }, + ], + }, + { + id: 'comments', + title: 'Include Comments', + type: 'dropdown', + required: false, + description: 'Comments require an extra API call per request during sync.', + options: [ + { label: 'Public comments only', id: 'public' }, + { label: 'All comments (incl. internal)', id: 'all' }, + { label: 'No comments', id: 'none' }, + ], + }, + { + id: 'searchTerm', + title: 'Search Filter', + type: 'short-input', + required: false, + placeholder: 'e.g. password reset (optional)', + }, + { + id: 'maxRequests', + title: 'Max Requests', + type: 'short-input', + required: false, + placeholder: 'e.g. 
500 (default: unlimited)', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const domain = sourceConfig.domain as string + const serviceDeskId = sourceConfig.serviceDeskId as string + + if (!domain || !serviceDeskId) { + throw new Error('Domain and service desk ID are required') + } + + const { requestStatus, requestOwnership, requestTypeId, searchTerm, maxRequests } = + resolveOptions(sourceConfig) + + const cloudId = await resolveCloudId(domain, accessToken, syncContext) + const baseUrl = getJsmApiBaseUrl(cloudId) + + /** + * `start|collected` is encoded in the cursor so the maxRequests cap holds + * across pages even if syncContext is not threaded through by the caller. + */ + let start = 0 + let collectedSoFar = (syncContext?.collectedCount as number | undefined) ?? 0 + if (cursor) { + const sep = cursor.indexOf('|') + if (sep > 0) { + const parsedStart = Number(cursor.slice(0, sep)) + const parsedCount = Number(cursor.slice(sep + 1)) + if (Number.isFinite(parsedStart) && parsedStart >= 0) start = parsedStart + if (Number.isFinite(parsedCount) && parsedCount >= 0) collectedSoFar = parsedCount + } else { + const parsedStart = Number(cursor) + if (Number.isFinite(parsedStart) && parsedStart >= 0) start = parsedStart + } + } + + const remaining = maxRequests > 0 ? 
Math.max(0, maxRequests - collectedSoFar) : PAGE_SIZE + if (maxRequests > 0 && remaining === 0) { + return { documents: [], hasMore: false } + } + + const params = new URLSearchParams({ + serviceDeskId, + requestStatus, + start: String(start), + limit: String(Math.min(PAGE_SIZE, remaining)), + }) + params.append('requestOwnership', requestOwnership) + if (requestTypeId) params.append('requestTypeId', requestTypeId) + if (searchTerm) params.append('searchTerm', searchTerm) + + const url = `${baseUrl}/request?${params.toString()}` + + logger.info('Listing JSM requests', { + serviceDeskId, + requestStatus, + requestOwnership, + hasCursor: Boolean(cursor), + }) + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: getJsmHeaders(accessToken), + }) + + if (!response.ok) { + const errorText = await response.text() + logger.error('Failed to list JSM requests', { status: response.status, error: errorText }) + throw new Error(`Failed to list JSM requests: ${response.status}`) + } + + const data = (await response.json()) as JsmPage + let requests = data.values ?? [] + + if (maxRequests > 0 && requests.length > remaining) { + requests = requests.slice(0, remaining) + } + + const documents = requests.map((request) => requestToStub(request, domain)) + + const newCollected = collectedSoFar + requests.length + if (syncContext) syncContext.collectedCount = newCollected + + const reachedCap = maxRequests > 0 && newCollected >= maxRequests + + /** + * When `maxRequests` truncates the listing before the source is exhausted, + * flag the run as capped so the sync engine skips deletion reconciliation — + * otherwise unseen requests beyond the cap would be deleted on every sync. + */ + if (reachedCap && !data.isLastPage && syncContext) { + syncContext.listingCapped = true + } + + const hasMore = !data.isLastPage && requests.length > 0 && !reachedCap + const nextStart = start + requests.length + + return { + documents, + nextCursor: hasMore ? 
`${nextStart}|${newCollected}` : undefined, + hasMore, + } + }, + + getDocument: async ( + accessToken: string, + sourceConfig: Record, + externalId: string, + syncContext?: Record + ): Promise => { + const domain = sourceConfig.domain as string + const { commentScope } = resolveOptions(sourceConfig) + const cloudId = await resolveCloudId(domain, accessToken, syncContext) + const baseUrl = getJsmApiBaseUrl(cloudId) + + const requestUrl = `${baseUrl}/request/${encodeURIComponent(externalId)}?expand=status` + const response = await fetchWithRetry(requestUrl, { + method: 'GET', + headers: getJsmHeaders(accessToken), + }) + + if (!response.ok) { + if (response.status === 404) return null + if (response.status === 401 || response.status === 403) { + logger.warn('Access denied fetching JSM request', { externalId, status: response.status }) + return null + } + throw new Error(`Failed to get JSM request: ${response.status}`) + } + + const request = (await response.json()) as JsmRequest + + const comments = + commentScope === 'none' + ? 
[] + : await fetchComments(baseUrl, accessToken, externalId, commentScope === 'public') + + const stub = requestToStub(request, domain) + const content = buildContent(request, comments) + + return { + ...stub, + content, + contentDeferred: false, + } + }, + + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const domain = sourceConfig.domain as string + const serviceDeskId = sourceConfig.serviceDeskId as string + + if (!domain || !serviceDeskId) { + return { valid: false, error: 'Domain and service desk ID are required' } + } + + if (sourceConfig.maxRequests) { + const max = Number(sourceConfig.maxRequests) + if (Number.isNaN(max) || max <= 0) { + return { valid: false, error: 'Max requests must be a positive number' } + } + } + + try { + const cloudId = await getJiraCloudId(domain, accessToken, VALIDATE_RETRY_OPTIONS) + const baseUrl = getJsmApiBaseUrl(cloudId) + const url = `${baseUrl}/servicedesk/${encodeURIComponent(serviceDeskId)}` + + const response = await fetchWithRetry( + url, + { + method: 'GET', + headers: getJsmHeaders(accessToken), + }, + VALIDATE_RETRY_OPTIONS + ) + + if (!response.ok) { + if (response.status === 404) { + return { valid: false, error: `Service desk "${serviceDeskId}" not found` } + } + if (response.status === 401 || response.status === 403) { + return { + valid: false, + error: 'Access denied. 
Check the connected account has access to this service desk.', + } + } + const errorText = await response.text() + return { valid: false, error: `Failed to validate: ${response.status} - ${errorText}` } + } + + return { valid: true } + } catch (error) { + return { valid: false, error: toError(error).message || 'Failed to validate configuration' } + } + }, + + tagDefinitions: [ + { id: 'status', displayName: 'Status', fieldType: 'text' }, + { id: 'requestTypeId', displayName: 'Request Type', fieldType: 'text' }, + { id: 'reporter', displayName: 'Reporter', fieldType: 'text' }, + { id: 'created', displayName: 'Created', fieldType: 'date' }, + { id: 'updated', displayName: 'Last Status Change', fieldType: 'date' }, + ], + + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.status === 'string') result.status = metadata.status + if (typeof metadata.requestTypeId === 'string') result.requestTypeId = metadata.requestTypeId + if (typeof metadata.reporter === 'string') result.reporter = metadata.reporter + + const created = parseTagDate(metadata.created) + if (created) result.created = created + + /** + * The list endpoint exposes no true last-updated field; `statusDate` (time + * of last status change) is the closest available signal and surfaces under + * the "Last Status Change" tag. 
+ */ + const statusDate = parseTagDate(metadata.statusDate) + if (statusDate) result.updated = statusDate + + return result + }, +} diff --git a/apps/sim/connectors/registry.ts b/apps/sim/connectors/registry.ts index a0d468417a9..33eb7a5e0e8 100644 --- a/apps/sim/connectors/registry.ts +++ b/apps/sim/connectors/registry.ts @@ -1,6 +1,7 @@ import { airtableConnector } from '@/connectors/airtable' import { asanaConnector } from '@/connectors/asana' import { ashbyConnector } from '@/connectors/ashby' +import { azureDevopsConnector } from '@/connectors/azure-devops' import { confluenceConnector } from '@/connectors/confluence' import { discordConnector } from '@/connectors/discord' import { docusignConnector } from '@/connectors/docusign' @@ -15,6 +16,7 @@ import { gongConnector } from '@/connectors/gong' import { googleCalendarConnector } from '@/connectors/google-calendar' import { googleDocsConnector } from '@/connectors/google-docs' import { googleDriveConnector } from '@/connectors/google-drive' +import { googleFormsConnector } from '@/connectors/google-forms' import { googleSheetsConnector } from '@/connectors/google-sheets' import { grainConnector } from '@/connectors/grain' import { granolaConnector } from '@/connectors/granola' @@ -23,6 +25,7 @@ import { hubspotConnector } from '@/connectors/hubspot' import { incidentioConnector } from '@/connectors/incidentio' import { intercomConnector } from '@/connectors/intercom' import { jiraConnector } from '@/connectors/jira' +import { jsmConnector } from '@/connectors/jsm' import { linearConnector } from '@/connectors/linear' import { microsoftTeamsConnector } from '@/connectors/microsoft-teams' import { mondayConnector } from '@/connectors/monday' @@ -32,13 +35,17 @@ import { onedriveConnector } from '@/connectors/onedrive' import { outlookConnector } from '@/connectors/outlook' import { redditConnector } from '@/connectors/reddit' import { rootlyConnector } from '@/connectors/rootly' +import { s3Connector } from 
'@/connectors/s3' import { salesforceConnector } from '@/connectors/salesforce' +import { sentryConnector } from '@/connectors/sentry' import { servicenowConnector } from '@/connectors/servicenow' import { sharepointConnector } from '@/connectors/sharepoint' import { slackConnector } from '@/connectors/slack' +import { typeformConnector } from '@/connectors/typeform' import type { ConnectorRegistry } from '@/connectors/types' import { webflowConnector } from '@/connectors/webflow' import { wordpressConnector } from '@/connectors/wordpress' +import { youtubeConnector } from '@/connectors/youtube' import { zendeskConnector } from '@/connectors/zendesk' import { zoomConnector } from '@/connectors/zoom' @@ -46,6 +53,7 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { airtable: airtableConnector, asana: asanaConnector, ashby: ashbyConnector, + azure_devops: azureDevopsConnector, confluence: confluenceConnector, discord: discordConnector, docusign: docusignConnector, @@ -60,6 +68,7 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { google_calendar: googleCalendarConnector, google_docs: googleDocsConnector, google_drive: googleDriveConnector, + google_forms: googleFormsConnector, google_sheets: googleSheetsConnector, grain: grainConnector, granola: granolaConnector, @@ -68,6 +77,7 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { incidentio: incidentioConnector, intercom: intercomConnector, jira: jiraConnector, + jsm: jsmConnector, linear: linearConnector, microsoft_teams: microsoftTeamsConnector, monday: mondayConnector, @@ -77,12 +87,16 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { outlook: outlookConnector, reddit: redditConnector, rootly: rootlyConnector, + s3: s3Connector, salesforce: salesforceConnector, + sentry: sentryConnector, servicenow: servicenowConnector, sharepoint: sharepointConnector, slack: slackConnector, + typeform: typeformConnector, webflow: webflowConnector, wordpress: wordpressConnector, + youtube: 
youtubeConnector, zendesk: zendeskConnector, zoom: zoomConnector, } diff --git a/apps/sim/connectors/s3/index.ts b/apps/sim/connectors/s3/index.ts new file mode 100644 index 00000000000..c61a9d64c58 --- /dev/null +++ b/apps/sim/connectors/s3/index.ts @@ -0,0 +1 @@ +export { s3Connector } from '@/connectors/s3/s3' diff --git a/apps/sim/connectors/s3/s3.ts b/apps/sim/connectors/s3/s3.ts new file mode 100644 index 00000000000..058969f9b42 --- /dev/null +++ b/apps/sim/connectors/s3/s3.ts @@ -0,0 +1,721 @@ +import crypto from 'crypto' +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { S3Icon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { parseTagDate } from '@/connectors/utils' +import { encodeS3PathComponent, getSignatureKey } from '@/tools/s3/utils' + +const logger = createLogger('S3Connector') + +/** Maximum object size to sync. Larger objects are skipped during listing. */ +const MAX_FILE_SIZE = 10 * 1024 * 1024 // 10 MB + +/** Number of objects requested per ListObjectsV2 page (S3 caps at 1000). */ +const LIST_MAX_KEYS = 1000 + +/** + * Default set of file extensions considered safely text-extractable. Objects + * with any other extension (or no extension) are skipped, since their content + * cannot be reliably decoded to plain text. Users can override this list via + * the `extensions` config field. + */ +const DEFAULT_EXTENSIONS = new Set([ + 'txt', + 'md', + 'markdown', + 'csv', + 'tsv', + 'json', + 'jsonl', + 'ndjson', + 'html', + 'htm', + 'xml', + 'yaml', + 'yml', + 'log', + 'rtf', +]) + +/** + * A single object entry parsed out of a ListObjectsV2 XML response. 
/**
 * A single object entry parsed out of a ListObjectsV2 XML response.
 */
interface S3ObjectEntry {
  key: string
  etag: string
  lastModified: string
  size: number
}

/**
 * A parsed custom S3-compatible endpoint (Cloudflare R2, MinIO, etc.).
 *
 * `host` is the bare hostname, `hostHeader` is the value used both as the wire
 * `Host` header and in the SigV4 canonical headers — it includes the port when
 * a non-default port is configured (e.g. `localhost:9000`). When the endpoint
 * uses the scheme's default port (443 for https, 80 for http) the port is
 * omitted from `hostHeader`, matching what the HTTP client sends on the wire.
 */
interface S3Endpoint {
  scheme: 'http' | 'https'
  host: string
  hostHeader: string
}

/**
 * AWS credentials and target resource resolved from sourceConfig + access token.
 *
 * When `endpoint` is present the connector targets an S3-compatible store using
 * path-style addressing (`{endpoint}/{bucket}/{key}`). When absent it targets
 * AWS S3 using virtual-hosted-style addressing
 * (`{bucket}.s3.{region}.amazonaws.com`), preserving the original behavior.
 */
interface S3Context {
  accessKeyId: string
  secretAccessKey: string
  region: string
  bucket: string
  endpoint?: S3Endpoint
}

/**
 * Parses the comma-separated `extensions` config override into a normalized set
 * (lowercased, no leading dot). Returns the built-in default set when the
 * override is empty or contains no usable entries.
 */
function resolveExtensions(raw: unknown): Set<string> {
  if (typeof raw !== 'string') return DEFAULT_EXTENSIONS
  const exts = raw
    .split(',')
    .map((e) => e.trim().toLowerCase().replace(/^\./, ''))
    .filter(Boolean)
  return exts.length > 0 ? new Set(exts) : DEFAULT_EXTENSIONS
}

/**
 * Extracts the lowercased file extension from an object key, or '' if none.
 * Only the last path segment is inspected; a leading dot (dotfile) or a
 * trailing dot does not count as an extension.
 */
function getExtension(key: string): string {
  const lastSegment = key.split('/').pop() ?? ''
  const dotIndex = lastSegment.lastIndexOf('.')
  if (dotIndex <= 0 || dotIndex === lastSegment.length - 1) return ''
  return lastSegment.slice(dotIndex + 1).toLowerCase()
}

/**
 * Returns true when the object key ends in one of the allowed text extensions.
 */
function isSupportedKey(key: string, allowedExtensions: Set<string>): boolean {
  return allowedExtensions.has(getExtension(key))
}

/**
 * Returns true when the host is a loopback address for which plain `http://`
 * is tolerated (local MinIO development). Any other host must use `https://` so
 * that credentials are never transmitted over cleartext.
 */
function isLoopbackHost(host: string): boolean {
  // `URL.hostname` keeps the square brackets around IPv6 literals.
  const bare = host.replace(/^\[|\]$/g, '')
  return bare === 'localhost' || bare === '127.0.0.1' || bare === '::1'
}

/**
 * Parses and validates a custom S3-compatible endpoint string.
 *
 * Accepts a full origin such as `https://accountid.r2.cloudflarestorage.com` or
 * `http://localhost:9000`. Trailing slashes are stripped. Throws when the value
 * is not a valid URL, carries a path/query/fragment beyond `/` (which would
 * corrupt the path-style canonical URI), or uses plain `http://` against a
 * non-loopback host.
 *
 * The returned `hostHeader` includes the port only when it differs from the
 * scheme default, matching the `Host` header the HTTP client emits — this keeps
 * the SigV4 canonical Host byte-identical to the wire Host.
 */
function parseEndpoint(raw: string): S3Endpoint {
  let url: URL
  try {
    url = new URL(raw)
  } catch {
    throw new Error('Endpoint must be a valid URL, e.g. https://accountid.r2.cloudflarestorage.com')
  }

  if (url.protocol !== 'https:' && url.protocol !== 'http:') {
    throw new Error('Endpoint must use http:// or https://')
  }
  const scheme = url.protocol === 'https:' ? 'https' : 'http'

  if (url.username || url.password) {
    throw new Error('Endpoint must not contain credentials')
  }
  if (url.search || url.hash) {
    throw new Error('Endpoint must not contain a query string or fragment')
  }
  const path = url.pathname.replace(/\/+$/, '')
  if (path !== '') {
    throw new Error('Endpoint must not contain a path — provide only the host, e.g. https://host')
  }

  const host = url.hostname
  if (!host) throw new Error('Endpoint is missing a host')
  if (scheme === 'http' && !isLoopbackHost(host)) {
    throw new Error(
      'Plain http:// endpoints are only allowed for localhost — use https:// otherwise'
    )
  }

  const defaultPort = scheme === 'https' ? '443' : '80'
  const port = url.port && url.port !== defaultPort ? url.port : ''
  const hostHeader = port ? `${host}:${port}` : host

  return { scheme, host, hostHeader }
}

/**
 * Returns `value` trimmed when it is a string, otherwise ''. Config values
 * arrive untyped, so a non-string must read as "missing" rather than crash
 * on `.trim()`.
 */
function readString(value: unknown): string {
  return typeof value === 'string' ? value.trim() : ''
}

/**
 * Resolves AWS credentials and the target bucket from the connector's
 * sourceConfig and the encrypted secret (delivered as accessToken). When an
 * `endpoint` is configured it is parsed/validated into an {@link S3Endpoint} so
 * the connector targets an S3-compatible store via path-style addressing.
 *
 * Throws a descriptive "Missing …" error when the access key ID, secret,
 * region or bucket is absent, blank, or not a string, and rethrows any
 * {@link parseEndpoint} validation error.
 */
function resolveContext(accessToken: string, sourceConfig: Record<string, unknown>): S3Context {
  const accessKeyId = readString(sourceConfig.accessKeyId)
  const region = readString(sourceConfig.region)
  const bucket = readString(sourceConfig.bucket)
  const secretAccessKey = readString(accessToken)
  const rawEndpoint = readString(sourceConfig.endpoint)

  if (!accessKeyId) throw new Error('Missing AWS Access Key ID')
  if (!secretAccessKey) throw new Error('Missing AWS Secret Access Key')
  if (!region) throw new Error('Missing AWS region')
  if (!bucket) throw new Error('Missing S3 bucket name')

  const endpoint = rawEndpoint ? parseEndpoint(rawEndpoint) : undefined

  return { accessKeyId, secretAccessKey, region, bucket, endpoint }
}

/**
 * Returns the SigV4 canonical Host header for the request. For AWS this is the
 * virtual-hosted-style host; for a custom endpoint it is the endpoint host
 * (with port when non-default).
 */
function resolveHost(ctx: S3Context): string {
  return ctx.endpoint ? ctx.endpoint.hostHeader : `${ctx.bucket}.s3.${ctx.region}.amazonaws.com`
}

/**
 * Returns the request scheme: always `https` for AWS, or the endpoint scheme
 * (which may be `http` for local MinIO) for a custom endpoint.
 */
function resolveScheme(ctx: S3Context): string {
  return ctx.endpoint ? ctx.endpoint.scheme : 'https'
}

/**
 * Builds the canonical URI for an object key.
 *
 * AWS (virtual-hosted-style): `/{key}` — the bucket lives in the host.
 * Custom endpoint (path-style): `/{bucket}/{key}` — the bucket is the first
 * path segment. Both the bucket and key are percent-encoded per AWS UriEncode
 * rules while preserving `/` separators via {@link encodeS3PathComponent}.
 */
function buildObjectPath(ctx: S3Context, key: string): string {
  const encodedKey = encodeS3PathComponent(key)
  return ctx.endpoint ? `/${encodeS3PathComponent(ctx.bucket)}/${encodedKey}` : `/${encodedKey}`
}

/**
 * Builds the canonical URI for a bucket-level (ListObjectsV2) request.
 *
 * AWS (virtual-hosted-style): `/`.
 * Custom endpoint (path-style): `/{bucket}/`.
 */
function buildBucketPath(ctx: S3Context): string {
  return ctx.endpoint ? `/${encodeS3PathComponent(ctx.bucket)}/` : '/'
}

/**
 * Builds the full request URL from the canonical path and an optional canonical
 * query string. The path passed here is the same canonical, percent-encoded
 * string used to compute the SigV4 signature, so the signed URI and the wire
 * URI are byte-identical.
 */
function buildUrl(ctx: S3Context, encodedPath: string, canonicalQueryString: string): string {
  const base = `${resolveScheme(ctx)}://${resolveHost(ctx)}${encodedPath}`
  return canonicalQueryString ? `${base}?${canonicalQueryString}` : base
}

/**
 * Builds SigV4 request headers for an S3 REST call.
 *
 * `canonicalQueryString` must be the already-sorted, percent-encoded query
 * string (empty for GetObject) — the caller builds the request URL from this
 * exact same string so the signed query and the wire query are byte-identical
 * (the classic continuation-token signing mismatch cannot occur here).
 * `encodedPath` is the canonical URI path starting with '/' (virtual-hosted
 * `/{key}` for AWS, path-style `/{bucket}/{key}` for custom endpoints). The
 * canonical Host header is resolved via {@link resolveHost} and includes the
 * port for non-default custom-endpoint ports, exactly matching the wire Host.
 * Reuses {@link getSignatureKey} from the s3 tool utilities.
 *
 * The signed headers embed `x-amz-date` and are reused verbatim across
 * `fetchWithRetry` attempts. S3 allows a 15-minute clock-skew window; the
 * retry helper's worst-case total backoff (~31s default, ~10s in validate) is
 * far inside that window, so a stale timestamp never triggers
 * RequestTimeTooSkewed.
 */
function buildSignedHeaders(
  ctx: S3Context,
  method: 'GET',
  encodedPath: string,
  canonicalQueryString: string
): Record<string, string> {
  const date = new Date()
  // ISO 8601 basic format: strip '-', ':' and the milliseconds.
  const amzDate = date.toISOString().replace(/[:-]|\.\d{3}/g, '')
  const dateStamp = amzDate.slice(0, 8)

  const host = resolveHost(ctx)
  // Every request signed here has an empty body.
  const payloadHash = crypto.createHash('sha256').update('').digest('hex')

  const canonicalHeaders =
    `host:${host}\n` + `x-amz-content-sha256:${payloadHash}\n` + `x-amz-date:${amzDate}\n`
  const signedHeaders = 'host;x-amz-content-sha256;x-amz-date'

  const canonicalRequest = `${method}\n${encodedPath}\n${canonicalQueryString}\n${canonicalHeaders}\n${signedHeaders}\n${payloadHash}`

  const algorithm = 'AWS4-HMAC-SHA256'
  const credentialScope = `${dateStamp}/${ctx.region}/s3/aws4_request`
  const stringToSign = `${algorithm}\n${amzDate}\n${credentialScope}\n${crypto
    .createHash('sha256')
    .update(canonicalRequest)
    .digest('hex')}`

  const signingKey = getSignatureKey(ctx.secretAccessKey, dateStamp, ctx.region, 's3')
  const signature = crypto.createHmac('sha256', signingKey).update(stringToSign).digest('hex')

  const authorizationHeader = `${algorithm} Credential=${ctx.accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`

  return {
    Host: host,
    'X-Amz-Content-Sha256': payloadHash,
    'X-Amz-Date': amzDate,
    Authorization: authorizationHeader,
  }
}

/**
 * Percent-encodes a query parameter name or value per AWS SigV4 canonical rules
 * (every byte except the unreserved set `A-Za-z0-9-_.~` is encoded).
 * `encodeURIComponent` leaves `!'()*` unencoded, so those are encoded here.
 */
function encodeQueryValue(value: string): string {
  return encodeURIComponent(value).replace(
    /[!'()*]/g,
    (c) => `%${c.charCodeAt(0).toString(16).toUpperCase()}`
  )
}

/**
 * Builds the canonical (sorted, percent-encoded) query string for a
 * ListObjectsV2 request.
 */
Keys are sorted lexicographically after encoding and + * each name/value pair is encoded individually. + */ +function buildListQueryString(params: Record): string { + return Object.keys(params) + .sort() + .map((key) => `${encodeQueryValue(key)}=${encodeQueryValue(params[key])}`) + .join('&') +} + +/** + * Decodes XML entities found in S3 response text values. `&` is decoded + * last so sequences like `&lt;` resolve to `<` rather than `<`. + */ +function decodeXmlEntities(value: string): string { + return value + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/&/g, '&') +} + +/** + * Normalizes an ETag from either a ListObjectsV2 XML `` element or a + * GetObject response header into a stable bare token used in the content hash. + * + * Strips surrounding double quotes and a leading weak-validator prefix (`W/`). + * AWS S3 always returns strong, quoted ETags (including the multipart `-N` + * suffix) identically from List and Get, but S3-compatible stores (MinIO, R2) + * are not contractually bound to that and could emit a weak ETag on one path + * and a strong one on the other. Normalizing both ends keeps the + * `s3:{key}:{etag}` hash invariant between the listing stub and the hydrated + * document so unchanged objects are not re-uploaded every sync. + */ +function normalizeEtag(raw: string): string { + return raw.replace(/^W\//, '').replace(/"/g, '') +} + +/** + * Decodes a URL-encoded object key returned when `encoding-type=url` is set. + * Falls back to the raw value if decoding fails (malformed percent sequence). + */ +function decodeObjectKey(value: string): string { + try { + return decodeURIComponent(value) + } catch { + return value + } +} + +/** + * Extracts the text content of the first matching XML tag within a fragment. + */ +function extractTag(fragment: string, tag: string): string | undefined { + const match = fragment.match(new RegExp(`<${tag}>([\\s\\S]*?)`)) + return match ? 
decodeXmlEntities(match[1]) : undefined +} + +/** + * Parses a ListObjectsV2 XML response into object entries plus pagination state. + * + * The request is always made with `encoding-type=url`, so the per-`Key` values + * are percent-encoded in the XML (safe for the regex parser even when keys + * contain XML-hostile bytes such as `&`, `<`, or ASCII control characters). + * Each `Key` is XML-entity-decoded then URL-decoded back to its true value. + * `NextContinuationToken` is opaque and is not affected by `encoding-type`, so + * it is used verbatim. + */ +function parseListResponse(xml: string): { + objects: S3ObjectEntry[] + isTruncated: boolean + nextContinuationToken?: string +} { + const objects: S3ObjectEntry[] = [] + + for (const match of xml.matchAll(/([\s\S]*?)<\/Contents>/g)) { + const block = match[1] + const rawKey = extractTag(block, 'Key') + if (!rawKey) continue + const key = decodeObjectKey(rawKey) + + const etag = normalizeEtag(extractTag(block, 'ETag') ?? '') + const lastModified = extractTag(block, 'LastModified') ?? '' + const size = Number(extractTag(block, 'Size') ?? '0') + + objects.push({ key, etag, lastModified, size: Number.isNaN(size) ? 0 : size }) + } + + const isTruncated = extractTag(xml, 'IsTruncated') === 'true' + const nextContinuationToken = extractTag(xml, 'NextContinuationToken') + + return { objects, isTruncated, nextContinuationToken } +} + +/** + * Builds a metadata stub for an S3 object. The content hash combines the key + * and ETag — S3's ETag changes whenever object content changes, making it an + * ideal change indicator. Used by both listDocuments and getDocument to + * guarantee identical hashes. + */ +function objectToStub(ctx: S3Context, entry: S3ObjectEntry): ExternalDocument { + const title = entry.key.split('/').pop() || entry.key + const prefix = entry.key.includes('/') ? 
entry.key.slice(0, entry.key.lastIndexOf('/')) : '' + + return { + externalId: entry.key, + title, + content: '', + contentDeferred: true, + mimeType: 'text/plain', + sourceUrl: buildUrl(ctx, buildObjectPath(ctx, entry.key), ''), + contentHash: `s3:${entry.key}:${entry.etag}`, + metadata: { + key: entry.key, + prefix, + etag: entry.etag, + lastModified: entry.lastModified, + fileSize: entry.size, + }, + } +} + +/** + * Performs a single ListObjectsV2 page request and returns the parsed result. + * Throws when the response status is not OK, including the response body in the message. + */ +async function listObjectsPage( + ctx: S3Context, + prefix: string, + continuationToken: string | undefined, + retryOptions?: Parameters<typeof fetchWithRetry>[2], + maxKeys: number = LIST_MAX_KEYS +): Promise<{ objects: S3ObjectEntry[]; isTruncated: boolean; nextContinuationToken?: string }> { + const queryParams: Record<string, string> = { + 'list-type': '2', + 'encoding-type': 'url', + 'max-keys': String(maxKeys), + } + if (prefix) queryParams.prefix = prefix + if (continuationToken) queryParams['continuation-token'] = continuationToken + + const canonicalQueryString = buildListQueryString(queryParams) + const bucketPath = buildBucketPath(ctx) + const headers = buildSignedHeaders(ctx, 'GET', bucketPath, canonicalQueryString) + + const url = buildUrl(ctx, bucketPath, canonicalQueryString) + + const response = await fetchWithRetry(url, { method: 'GET', headers }, retryOptions) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`S3 ListObjectsV2 failed: ${response.status} ${errorText}`) + } + + const xml = await response.text() + return parseListResponse(xml) +} + +export const s3Connector: ConnectorConfig = { + id: 's3', + name: 'Amazon S3', + description: + 'Sync text-based objects from Amazon S3 or any S3-compatible store (Cloudflare R2, MinIO) into your knowledge base', + version: '1.1.0', + icon: S3Icon, + + auth: { + mode: 'apiKey', + label: 'Secret Access Key', + placeholder: 'Enter your AWS Secret Access Key', + }, + + configFields: [ + { + id: 'accessKeyId', + title:
'Access Key ID', + type: 'short-input', + placeholder: 'e.g. AKIAIOSFODNN7EXAMPLE', + required: true, + }, + { + id: 'region', + title: 'Region', + type: 'short-input', + placeholder: 'e.g. us-east-1 (use auto for Cloudflare R2)', + required: true, + description: + 'AWS region for the bucket. For Cloudflare R2 use "auto"; for MinIO use the region the server is configured with (commonly us-east-1).', + }, + { + id: 'bucket', + title: 'Bucket', + type: 'short-input', + placeholder: 'e.g. my-bucket', + required: true, + }, + { + id: 'endpoint', + title: 'Custom Endpoint', + type: 'short-input', + placeholder: 'https://accountid.r2.cloudflarestorage.com (optional — leave empty for AWS S3)', + required: false, + description: + 'S3-compatible endpoint for Cloudflare R2, MinIO, etc. Leave empty for AWS S3. Uses path-style addressing. Plain http:// is only allowed for localhost.', + }, + { + id: 'prefix', + title: 'Prefix', + type: 'short-input', + placeholder: 'e.g. docs/ (optional)', + required: false, + description: 'Only sync objects whose key starts with this prefix', + }, + { + id: 'extensions', + title: 'File Extensions', + type: 'short-input', + placeholder: 'e.g. txt, md, csv (optional)', + required: false, + description: + 'Comma-separated list of file extensions to sync. Leave blank to use the built-in text formats.', + }, + { + id: 'maxObjects', + title: 'Max Objects', + type: 'short-input', + required: false, + placeholder: 'e.g. 500 (default: unlimited)', + description: 'Stop syncing after this many objects', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const ctx = resolveContext(accessToken, sourceConfig) + const prefix = ((sourceConfig.prefix as string) ?? '').trim() + const allowedExtensions = resolveExtensions(sourceConfig.extensions) + + const maxObjects = sourceConfig.maxObjects ? 
Number(sourceConfig.maxObjects) : 0 + const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0 + + if (maxObjects > 0 && previouslyFetched >= maxObjects) { + return { documents: [], hasMore: false } + } + + logger.info('Listing S3 objects', { bucket: ctx.bucket, prefix, cursor: cursor ?? 'initial' }) + + const { objects, isTruncated, nextContinuationToken } = await listObjectsPage( + ctx, + prefix, + cursor + ) + + let documents = objects + .filter((entry) => isSupportedKey(entry.key, allowedExtensions)) + .filter((entry) => entry.size > 0 && entry.size <= MAX_FILE_SIZE) + .map((entry) => objectToStub(ctx, entry)) + + /* Tracks whether the maxObjects cap discarded matching objects from this page. */ + let droppedByCap = false + if (maxObjects > 0) { + const remaining = maxObjects - previouslyFetched + if (documents.length > remaining) { + documents = documents.slice(0, remaining) + droppedByCap = true + } + } + + const totalFetched = previouslyFetched + documents.length + if (syncContext) syncContext.totalDocsFetched = totalFetched + const hitLimit = maxObjects > 0 && totalFetched >= maxObjects + /* + * Only flag the listing as capped when objects were actually left behind (dropped + * from this page, or further pages remain). If the cap is reached exactly as the + * bucket is exhausted the listing is complete and deletions must still reconcile. + */ + if (hitLimit && syncContext && (droppedByCap || isTruncated)) syncContext.listingCapped = true + + return { + documents, + nextCursor: hitLimit ? undefined : isTruncated ? nextContinuationToken : undefined, + hasMore: hitLimit ? false : isTruncated && Boolean(nextContinuationToken), + } + }, + + getDocument: async ( + accessToken: string, + sourceConfig: Record<string, unknown>, + externalId: string + ): Promise<ExternalDocument | null> => { + const ctx = resolveContext(accessToken, sourceConfig) + const key = externalId + + try { + const encodedPath = buildObjectPath(ctx, key) + const headers = buildSignedHeaders(ctx, 'GET', encodedPath, '') + const url = buildUrl(ctx, encodedPath, '') + + const response = await fetchWithRetry(url, { method: 'GET', headers }) + + if (response.status === 404) return null + if (!response.ok) { + const errorText = await response.text() + throw new Error(`S3 GetObject failed: ${response.status} ${errorText}`) + } + + const etag = normalizeEtag(response.headers.get('etag') ??
'') + const lastModified = response.headers.get('last-modified') ?? '' + const contentLength = Number(response.headers.get('content-length') ?? '0') + + if (contentLength > MAX_FILE_SIZE) { + logger.warn('Skipping oversized S3 object', { key, size: contentLength }) + return null + } + + const content = await response.text() + if (!content.trim()) return null + + const entry: S3ObjectEntry = { + key, + etag, + lastModified, + size: Number.isNaN(contentLength) ? 0 : contentLength, + } + const stub = objectToStub(ctx, entry) + return { ...stub, content, contentDeferred: false } + } catch (error) { + logger.warn('Failed to get S3 object', { key, error: toError(error).message }) + return null + } + }, + + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + let ctx: S3Context + try { + ctx = resolveContext(accessToken, sourceConfig) + } catch (error) { + return { valid: false, error: getErrorMessage(error, 'Invalid configuration') } + } + + const maxObjects = sourceConfig.maxObjects as string | undefined + if (maxObjects && (Number.isNaN(Number(maxObjects)) || Number(maxObjects) <= 0)) { + return { valid: false, error: 'Max objects must be a positive number' } + } + + const prefix = ((sourceConfig.prefix as string) ?? '').trim() + + try { + await listObjectsPage(ctx, prefix, undefined, VALIDATE_RETRY_OPTIONS, 1) + return { valid: true } + } catch (error) { + const message = getErrorMessage(error, 'Failed to validate configuration') + const lower = message.toLowerCase() + if ( + lower.includes('permanentredirect') || + lower.includes('authorizationheadermalformed') || + lower.includes(' 301 ') + ) { + return { + valid: false, + error: + 'Wrong region for this bucket. 
Update the region to match where the bucket lives (or use "auto" for Cloudflare R2).', + } + } + if (lower.includes('403') || lower.includes('accessdenied') || lower.includes('signature')) { + return { + valid: false, + error: 'Access denied. Check the access key, secret key, and bucket permissions.', + } + } + if (lower.includes('404') || lower.includes('nosuchbucket')) { + return { valid: false, error: 'Bucket not found. Check the bucket name and region.' } + } + return { valid: false, error: message } + } + }, + + tagDefinitions: [ + { id: 'prefix', displayName: 'Folder', fieldType: 'text' }, + { id: 'fileSize', displayName: 'Size (bytes)', fieldType: 'number' }, + { id: 'lastModified', displayName: 'Last Modified', fieldType: 'date' }, + ], + + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.prefix === 'string' && metadata.prefix.length > 0) { + result.prefix = metadata.prefix + } + + if (metadata.fileSize != null) { + const num = Number(metadata.fileSize) + if (!Number.isNaN(num)) result.fileSize = num + } + + const lastModified = parseTagDate(metadata.lastModified) + if (lastModified) result.lastModified = lastModified + + return result + }, +} diff --git a/apps/sim/connectors/sentry/index.ts b/apps/sim/connectors/sentry/index.ts new file mode 100644 index 00000000000..caeb2e10bec --- /dev/null +++ b/apps/sim/connectors/sentry/index.ts @@ -0,0 +1 @@ +export { sentryConnector } from '@/connectors/sentry/sentry' diff --git a/apps/sim/connectors/sentry/sentry.ts b/apps/sim/connectors/sentry/sentry.ts new file mode 100644 index 00000000000..eb17f6e8115 --- /dev/null +++ b/apps/sim/connectors/sentry/sentry.ts @@ -0,0 +1,732 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { SentryIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, 
ExternalDocumentList } from '@/connectors/types' +import { parseTagDate } from '@/connectors/utils' + +const logger = createLogger('SentryConnector') + +const DEFAULT_HOST = 'sentry.io' +const ISSUES_PER_PAGE = 100 + +/** + * Default issue search query. + * + * Reconciliation semantics: the sync engine hard-deletes any previously-synced + * document whose `externalId` is absent from a full (non-capped) listing pass. + * With the default `is:unresolved` query this means an issue that is resolved, + * ignored/muted, or aged out of the query window will fall out of the listing + * and be removed from the knowledge base on the next full sync. That is the + * intended semantic — the KB tracks the *currently matching* issue set, not a + * permanent archive. Users who want resolved issues retained should widen the + * query (e.g. drop `is:unresolved`). When `maxIssues` caps the listing, the + * engine sets `listingCapped` and skips deletion, so capped runs never remove + * unseen issues. + */ +const DEFAULT_QUERY = 'is:unresolved' + +/** + * Allowed `statsPeriod` values for the project issues list endpoint. Sentry's + * project issues endpoint only honors `24h` (default) or `14d` for its timeline + * stats; an empty value disables the stats window. Other periods (e.g. `90d`) + * are accepted by the organization issues endpoint but not this one, so they are + * rejected during validation to avoid a silently-ignored filter. + */ +const ALLOWED_STATS_PERIODS = new Set(['24h', '14d']) + +/** + * Metadata block on a Sentry issue, carrying the human-readable error type/value. + */ +interface SentryIssueMetadata { + type?: string + value?: string + function?: string + title?: string +} + +/** + * A single issue (error group) returned by the issues list/detail endpoints. 
+ */ +interface SentryIssue { + id: string + shortId?: string + title?: string + culprit?: string | null + permalink?: string + logger?: string | null + level?: string + status?: string + platform?: string | null + type?: string | null + metadata?: SentryIssueMetadata + /** Sentry returns the event count as a string (e.g. "12"), not a number. */ + count?: string + userCount?: number + firstSeen?: string + lastSeen?: string +} + +/** + * One entry inside a Sentry event. Entries carry the structured payload (exception, + * breadcrumbs, request, message) keyed by `type`, with the shape under `data` varying + * per entry type. + */ +interface SentryEventEntry { + type?: string + data?: unknown +} + +/** + * A key/value tag pair attached to a Sentry event. + */ +interface SentryEventTag { + key?: string + value?: string +} + +/** + * The latest event for an issue, used to enrich the synced document with the concrete + * message, exception detail, and tags from the most recent occurrence. + */ +interface SentryEvent { + id?: string + eventID?: string + message?: string + title?: string + culprit?: string | null + platform?: string | null + dateCreated?: string + metadata?: SentryIssueMetadata + entries?: SentryEventEntry[] + tags?: SentryEventTag[] +} + +/** + * The shape of an exception entry's `data` payload: a list of exception values, each + * with a type, message, and an optional rendered stack frame list. + */ +interface SentryExceptionData { + values?: { + type?: string + value?: string + module?: string + stacktrace?: { + frames?: { + filename?: string + function?: string + lineNo?: number + module?: string + }[] + } + }[] +} + +/** + * Resolved connector source configuration after normalization. + */ +interface SentrySourceConfig { + /** Bare host (no protocol, no trailing slash), e.g. `sentry.io` or a self-hosted host. */ + host: string + /** REST API base, e.g. `https://sentry.io/api/0`. 
*/ + apiBase: string + organization: string + project: string + query: string + statsPeriod: string + environment: string + maxIssues: number +} + +/** + * Normalizes the host config value: trims whitespace, strips any protocol prefix, + * trailing slashes, and a pasted `/api` or `/api/0` suffix (the connector appends + * `/api/0` itself), and falls back to sentry.io when empty. Genuine path prefixes + * (e.g. `company.com/sentry` for subpath self-hosted installs) are preserved. + */ +function normalizeHost(rawHost: unknown): string { + const host = typeof rawHost === 'string' ? rawHost.trim() : '' + if (!host) return DEFAULT_HOST + return host + .replace(/^https?:\/\//i, '') + .replace(/\/+$/, '') + .replace(/\/api(\/0)?$/i, '') + .replace(/\/+$/, '') + .trim() +} + +/** + * Reads and normalizes the connector source configuration once per call. + */ +function readSourceConfig(sourceConfig: Record): SentrySourceConfig { + const host = normalizeHost(sourceConfig.baseUrl) + const organization = + typeof sourceConfig.organization === 'string' ? sourceConfig.organization.trim() : '' + const project = typeof sourceConfig.project === 'string' ? sourceConfig.project.trim() : '' + const query = + typeof sourceConfig.query === 'string' && sourceConfig.query.trim() + ? sourceConfig.query.trim() + : DEFAULT_QUERY + const statsPeriod = + typeof sourceConfig.statsPeriod === 'string' ? sourceConfig.statsPeriod.trim() : '' + const environment = + typeof sourceConfig.environment === 'string' ? sourceConfig.environment.trim() : '' + const maxIssues = sourceConfig.maxIssues ? Number(sourceConfig.maxIssues) : 0 + + return { + host, + apiBase: `https://${host}/api/0`, + organization, + project, + query, + statsPeriod, + environment, + maxIssues, + } +} + +/** + * Builds the standard JSON request headers carrying the Sentry auth token. 
+ */ +function authHeaders(accessToken: string): Record { + return { + Authorization: `Bearer ${accessToken}`, + 'Content-Type': 'application/json', + } +} + +/** + * Reads the `cursor` of the `rel="next"` link from a Sentry `Link` header. + * + * Sentry paginates via the `Link` header: each link is annotated with `rel`, + * `results`, and `cursor` attributes, e.g. + * `; rel="next"; results="true"; cursor="0:100:0"`. + * A further page exists only when the `next` link reports `results="true"`; when it + * reports `results="false"` (or the header is absent) the cursor points at an empty + * page and pagination must stop. The cursor is read from the `cursor="…"` attribute, + * which is the canonical token Sentry expects echoed back on the next request. + */ +function parseNextCursor(linkHeader: string | null): string | undefined { + if (!linkHeader) return undefined + + for (const part of linkHeader.split(',')) { + if (!/rel="next"/.test(part)) continue + if (!/results="true"/.test(part)) return undefined + const cursorMatch = part.match(/cursor="([^"]*)"/) + if (cursorMatch) return cursorMatch[1] + return undefined + } + + return undefined +} + +/** + * Builds the metadata-based content hash for an issue. + * + * The hash combines the issue id, its status, and `lastSeen`. `lastSeen` advances every + * time a new event lands on the group, which is exactly when the latest-event content can + * change — so it captures content freshness without hashing the downloaded body. `status` + * is included so resolve/ignore transitions also re-sync. `count` is deliberately omitted: + * it changes on every single occurrence and would churn the document on each event even + * when `lastSeen` already moved, providing no extra signal over `lastSeen`. + * + * The hash is derived purely from issue metadata present on both the list stub and the + * getDocument detail fetch, so both paths produce an identical hash for the same issue + * snapshot. 
If a fresh event lands between listing and hydration, `lastSeen` advances and + * getDocument computes a newer hash; the sync engine stores that newer hash, which the next + * list pass reproduces — so the document converges without churn. + */ +function buildContentHash(issue: SentryIssue): string { + return `sentry:${issue.id}:${issue.status ?? ''}:${issue.lastSeen ?? ''}` +} + +/** + * Builds the document title, preferring the issue title and falling back to the + * metadata type/value or short id. + */ +function buildTitle(issue: SentryIssue): string { + const title = issue.title?.trim() + if (title) return title + + const metaType = issue.metadata?.type?.trim() + const metaValue = issue.metadata?.value?.trim() + if (metaType && metaValue) return `${metaType}: ${metaValue}` + return metaType || issue.shortId || `Issue ${issue.id}` +} + +/** + * Collects the source-specific metadata fed to mapTags. Shared between the list stub and + * getDocument so tag values stay consistent regardless of which path produced the doc. + */ +function buildMetadata(issue: SentryIssue): Record { + return { + level: issue.level, + status: issue.status, + firstSeen: issue.firstSeen, + lastSeen: issue.lastSeen, + count: issue.count != null ? Number(issue.count) : undefined, + } +} + +/** + * Creates a lightweight document stub from a list entry. No per-issue API calls — the + * latest-event content is deferred to getDocument and only fetched for new/changed issues. + */ +function issueToStub(issue: SentryIssue): ExternalDocument { + return { + externalId: issue.id, + title: buildTitle(issue), + content: '', + contentDeferred: true, + mimeType: 'text/plain', + sourceUrl: issue.permalink || undefined, + contentHash: buildContentHash(issue), + metadata: buildMetadata(issue), + } +} + +/** + * Renders the exception entry of a latest event into readable lines: each exception's + * type/value plus a compact, top-down stack frame list. 
+ */ +function formatException(data: SentryExceptionData): string[] { + const lines: string[] = [] + + for (const value of data.values ?? []) { + const header = [value.type, value.value].filter(Boolean).join(': ') + if (header) lines.push(header) + + const frames = value.stacktrace?.frames ?? [] + for (const frame of frames.slice().reverse()) { + const location = [frame.module || frame.filename, frame.function].filter(Boolean).join(' in ') + const lineNo = frame.lineNo != null ? `:${frame.lineNo}` : '' + if (location) lines.push(` at ${location}${lineNo}`) + } + } + + return lines +} + +/** + * Formats an issue and its latest event into a single plain-text document covering the + * title, culprit, counts, the latest event's message/exception, and event tags. + */ +function formatIssueContent(issue: SentryIssue, event: SentryEvent | null): string { + const parts: string[] = [] + + parts.push(`Issue: ${buildTitle(issue)}`) + if (issue.shortId) parts.push(`Short ID: ${issue.shortId}`) + if (issue.culprit) parts.push(`Culprit: ${issue.culprit}`) + if (issue.level) parts.push(`Level: ${issue.level}`) + if (issue.status) parts.push(`Status: ${issue.status}`) + if (issue.platform) parts.push(`Platform: ${issue.platform}`) + if (issue.count) parts.push(`Events: ${issue.count}`) + if (issue.userCount != null) parts.push(`Users affected: ${issue.userCount}`) + if (issue.firstSeen) parts.push(`First seen: ${issue.firstSeen}`) + if (issue.lastSeen) parts.push(`Last seen: ${issue.lastSeen}`) + + if (event) { + const message = event.message?.trim() || event.title?.trim() + if (message) { + parts.push('') + parts.push('--- Latest Event ---') + if (event.dateCreated) parts.push(`Occurred: ${event.dateCreated}`) + parts.push(message) + } + + const exceptionEntry = event.entries?.find((entry) => entry.type === 'exception') + if (exceptionEntry?.data) { + const exceptionLines = formatException(exceptionEntry.data as SentryExceptionData) + if (exceptionLines.length > 0) { + 
parts.push('') + parts.push('--- Exception ---') + parts.push(...exceptionLines) + } + } + + const tagLines = (event.tags ?? []) + .map((tag) => (tag.key && tag.value ? `${tag.key}: ${tag.value}` : undefined)) + .filter((line): line is string => Boolean(line)) + if (tagLines.length > 0) { + parts.push('') + parts.push('--- Tags ---') + parts.push(...tagLines) + } + } + + return parts.join('\n').trim() +} + +/** + * Fetches the latest event for an issue. Returns null when the issue has no events or the + * request fails, so the document still syncs with its list-level summary. + * + * Uses the organization-scoped event endpoint + * `/api/0/organizations/{org}/issues/{id}/events/latest/`, which is the documented path + * and works for both sentry.io and self-hosted installs. + */ +async function fetchLatestEvent( + apiBase: string, + organization: string, + accessToken: string, + issueId: string +): Promise { + const url = `${apiBase}/organizations/${encodeURIComponent(organization)}/issues/${encodeURIComponent(issueId)}/events/latest/` + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: authHeaders(accessToken), + }) + + if (!response.ok) { + if (response.status !== 404) { + logger.warn('Failed to fetch latest Sentry event', { issueId, status: response.status }) + } + return null + } + + return (await response.json()) as SentryEvent +} + +export const sentryConnector: ConnectorConfig = { + id: 'sentry', + name: 'Sentry', + description: 'Sync issues and errors from Sentry into your knowledge base', + version: '1.0.0', + icon: SentryIcon, + + auth: { + mode: 'apiKey', + label: 'Auth Token', + placeholder: 'Enter your Sentry auth token', + }, + + configFields: [ + { + id: 'baseUrl', + title: 'Sentry URL', + type: 'short-input', + placeholder: 'sentry.io', + required: false, + mode: 'advanced', + description: + 'Host of your Sentry install. Leave blank for sentry.io. Set this for self-hosted Sentry (e.g. 
sentry.mycompany.com).', + }, + { + id: 'organization', + title: 'Organization Slug', + type: 'short-input', + placeholder: 'e.g. my-org', + required: true, + description: 'The slug of your Sentry organization.', + }, + { + id: 'project', + title: 'Project Slug', + type: 'short-input', + placeholder: 'e.g. my-project', + required: true, + description: 'The slug of the project whose issues should be synced.', + }, + { + id: 'query', + title: 'Search Query', + type: 'short-input', + placeholder: `e.g. ${DEFAULT_QUERY}`, + required: false, + description: + 'Sentry search query to filter issues (e.g. "is:unresolved level:error environment:production"). Defaults to "is:unresolved".', + }, + { + id: 'environment', + title: 'Environment', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. production', + description: 'Only sync issues seen in this environment. Leave blank for all environments.', + }, + { + id: 'statsPeriod', + title: 'Stats Period', + type: 'dropdown', + required: false, + mode: 'advanced', + options: [ + { label: 'Sentry default (24h)', id: '' }, + { label: 'Last 24 hours', id: '24h' }, + { label: 'Last 14 days', id: '14d' }, + ], + description: 'Time window for the issue stats Sentry computes on the project issues list.', + }, + { + id: 'maxIssues', + title: 'Max Issues', + type: 'short-input', + required: false, + placeholder: 'e.g. 500 (default: unlimited)', + description: 'Cap the number of issues synced. Leave empty to sync all matching issues.', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const { apiBase, organization, project, query, statsPeriod, environment, maxIssues } = + readSourceConfig(sourceConfig) + + if (!organization || !project) { + throw new Error('Organization and project slugs are required') + } + + /* + * Uses the project issues list endpoint + * `/api/0/projects/{org}/{project}/issues/`. 
This endpoint is deprecated in favor of + * `/api/0/organizations/{org}/issues/?project=`, but the organization endpoint + * filters by numeric project ID rather than slug — a UX regression for a connector + * keyed on the human-readable project slug. The project endpoint remains functional + * and slug-addressable, so it is retained deliberately for the listing path. Issue + * detail and latest-event fetches use the organization-scoped paths. + */ + const url = new URL( + `${apiBase}/projects/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/issues/` + ) + url.searchParams.set('query', query) + url.searchParams.set('limit', String(ISSUES_PER_PAGE)) + if (statsPeriod) url.searchParams.set('statsPeriod', statsPeriod) + if (environment) url.searchParams.set('environment', environment) + if (cursor) url.searchParams.set('cursor', cursor) + + logger.info('Listing Sentry issues', { + organization, + project, + cursor: cursor ?? 'initial', + maxIssues, + }) + + const response = await fetchWithRetry(url.toString(), { + method: 'GET', + headers: authHeaders(accessToken), + }) + + if (!response.ok) { + const errorText = await response.text().catch(() => '') + logger.error('Failed to list Sentry issues', { + status: response.status, + error: errorText.slice(0, 500), + }) + throw new Error(`Failed to list Sentry issues: ${response.status}`) + } + + const issues = ((await response.json()) as SentryIssue[]).filter((issue) => Boolean(issue.id)) + + const prevFetched = (syncContext?.totalDocsFetched as number) ?? 
0 + let documents = issues.map(issueToStub) + /* Tracks whether the maxIssues cap discarded issues from this page. */ + let droppedByCap = false + if (maxIssues > 0) { + const remaining = Math.max(0, maxIssues - prevFetched) + if (documents.length > remaining) { + documents = documents.slice(0, remaining) + droppedByCap = true + } + } + + const totalFetched = prevFetched + documents.length + if (syncContext) syncContext.totalDocsFetched = totalFetched + const hitLimit = maxIssues > 0 && totalFetched >= maxIssues + + const nextCursor = parseNextCursor(response.headers.get('Link')) + /* + * Only flag the listing as capped when issues were actually left behind (dropped + * from this page, or a further page exists). If the cap is reached exactly as the + * result set is exhausted the listing is complete and deletions must still reconcile. + */ + if (hitLimit && syncContext && (droppedByCap || Boolean(nextCursor))) { + syncContext.listingCapped = true + } + const hasMore = !hitLimit && Boolean(nextCursor) + + return { + documents, + nextCursor: hasMore ? nextCursor : undefined, + hasMore, + } + }, + + getDocument: async ( + accessToken: string, + sourceConfig: Record<string, unknown>, + externalId: string + ): Promise<ExternalDocument | null> => { + try { + if (!externalId) return null + + const { apiBase, organization } = readSourceConfig(sourceConfig) + if (!organization) return null + + const url = `${apiBase}/organizations/${encodeURIComponent(organization)}/issues/${encodeURIComponent(externalId)}/` + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: authHeaders(accessToken), + }) + + if (!response.ok) { + if (response.status === 404 || response.status === 410) return null + throw new Error(`Failed to fetch Sentry issue: ${response.status}`) + } + + const issue = (await response.json()) as SentryIssue + if (!issue?.id) return null + + const event = await fetchLatestEvent(apiBase, organization, accessToken, issue.id) + const content = formatIssueContent(issue, event) + if (!content.trim()) return null + + return { + externalId: issue.id, + title: buildTitle(issue), + content, + contentDeferred: false, + mimeType: 'text/plain', + sourceUrl: issue.permalink || undefined, + contentHash: buildContentHash(issue), + metadata: buildMetadata(issue), + } + } catch (error) { + logger.warn('Failed to get Sentry issue', { + externalId, + error: toError(error).message, + }) + return null + } + }, + + validateConfig: async (
+ accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const { apiBase, organization, project, statsPeriod, maxIssues, host } = + readSourceConfig(sourceConfig) + + if (!organization) { + return { valid: false, error: 'Organization slug is required' } + } + if (!project) { + return { valid: false, error: 'Project slug is required' } + } + + if (statsPeriod && !ALLOWED_STATS_PERIODS.has(statsPeriod)) { + return { valid: false, error: 'Stats period must be 24h or 14d' } + } + + const rawMax = sourceConfig.maxIssues as string | undefined + if (rawMax && (Number.isNaN(maxIssues) || maxIssues < 0)) { + return { valid: false, error: 'Max issues must be a non-negative number' } + } + + try { + /* + * Probe the project detail endpoint first. This exercises the `project:read` + * scope and the project-scoped path style, and gives a precise "not found" + * message when the org or project slug is wrong. + */ + const projectResponse = await fetchWithRetry( + `${apiBase}/projects/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/`, + { + method: 'GET', + headers: authHeaders(accessToken), + }, + VALIDATE_RETRY_OPTIONS + ) + + if (!projectResponse.ok) { + if (projectResponse.status === 401 || projectResponse.status === 403) { + return { valid: false, error: 'Invalid auth token or insufficient permissions' } + } + if (projectResponse.status === 404) { + return { + valid: false, + error: `Organization or project not found on ${host}`, + } + } + const errorText = await projectResponse.text().catch(() => '') + return { + valid: false, + error: `Sentry access failed: ${projectResponse.status}${errorText ? ` — ${errorText.slice(0, 200)}` : ''}`, + } + } + + /* + * Probe the issues-list endpoint with a single-result page. The project + * detail probe above only proves `project:read`, but every sync operation — + * `listDocuments` and the org-scoped `getDocument`/latest-event hydration — + * needs `event:read`. 
A token scoped to `project:read` only would pass the + * first probe yet fail at hydration time, so this second probe forces a + * misconfigured token to fail fast at save time. It is slug-addressable and + * cheap (one issue, no stats window). + */ + const issuesProbeUrl = new URL( + `${apiBase}/projects/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/issues/` + ) + issuesProbeUrl.searchParams.set('query', DEFAULT_QUERY) + issuesProbeUrl.searchParams.set('limit', '1') + + const issuesResponse = await fetchWithRetry( + issuesProbeUrl.toString(), + { + method: 'GET', + headers: authHeaders(accessToken), + }, + VALIDATE_RETRY_OPTIONS + ) + + if (!issuesResponse.ok) { + if (issuesResponse.status === 401 || issuesResponse.status === 403) { + return { + valid: false, + error: + 'Auth token cannot read issues. The token needs the "event:read" scope (in addition to "project:read").', + } + } + const errorText = await issuesResponse.text().catch(() => '') + return { + valid: false, + error: `Sentry issue access failed: ${issuesResponse.status}${errorText ? 
` — ${errorText.slice(0, 200)}` : ''}`, + } + } + + return { valid: true } + } catch (error) { + const message = getErrorMessage(error, 'Failed to validate configuration') + return { valid: false, error: message } + } + }, + + tagDefinitions: [ + { id: 'level', displayName: 'Level', fieldType: 'text' }, + { id: 'status', displayName: 'Status', fieldType: 'text' }, + { id: 'count', displayName: 'Event Count', fieldType: 'number' }, + { id: 'firstSeen', displayName: 'First Seen', fieldType: 'date' }, + { id: 'lastSeen', displayName: 'Last Seen', fieldType: 'date' }, + ], + + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.level === 'string' && metadata.level.trim()) { + result.level = metadata.level + } + + if (typeof metadata.status === 'string' && metadata.status.trim()) { + result.status = metadata.status + } + + if (metadata.count != null) { + const num = Number(metadata.count) + if (!Number.isNaN(num)) result.count = num + } + + const firstSeen = parseTagDate(metadata.firstSeen) + if (firstSeen) result.firstSeen = firstSeen + + const lastSeen = parseTagDate(metadata.lastSeen) + if (lastSeen) result.lastSeen = lastSeen + + return result + }, +} diff --git a/apps/sim/connectors/typeform/index.ts b/apps/sim/connectors/typeform/index.ts new file mode 100644 index 00000000000..031ca7f4f62 --- /dev/null +++ b/apps/sim/connectors/typeform/index.ts @@ -0,0 +1 @@ +export { typeformConnector } from '@/connectors/typeform/typeform' diff --git a/apps/sim/connectors/typeform/typeform.ts b/apps/sim/connectors/typeform/typeform.ts new file mode 100644 index 00000000000..0df206348bb --- /dev/null +++ b/apps/sim/connectors/typeform/typeform.ts @@ -0,0 +1,596 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { TypeformIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, 
ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { parseTagDate } from '@/connectors/utils' + +const logger = createLogger('TypeformConnector') + +const TYPEFORM_API_BASE = 'https://api.typeform.com' +/** Typeform allows page_size up to 1000; 100 keeps per-batch memory bounded. */ +const RESPONSES_PER_PAGE = 100 + +/** + * Allowed `response_type` filter values per the Responses API. `completed` is the + * API default, so omitting the parameter returns completed responses only. `all` is + * a connector-local sentinel that is expanded into an explicit + * `started,partial,completed` filter so every response type is returned. + */ +type ResponseTypeChoice = 'completed' | 'partial' | 'all' + +/** + * A single field definition from the Typeform form structure. + */ +interface TypeformField { + id: string + ref?: string + title?: string + type?: string +} + +/** + * The relevant subset of a Typeform form definition. + */ +interface TypeformFormDefinition { + id: string + title?: string + fields?: TypeformField[] + _links?: { display?: string } +} + +/** + * A single answer within a Typeform response. Only the value-bearing keys for + * each answer `type` are declared explicitly; the remainder are optional. + */ +interface TypeformAnswer { + field?: { id?: string; type?: string; ref?: string } + type?: string + text?: string + email?: string + url?: string + phone_number?: string + file_url?: string + number?: number + boolean?: boolean + date?: string + choice?: { label?: string; other?: string } + choices?: { labels?: string[]; other?: string } + payment?: { amount?: string; last4?: string; name?: string; success?: boolean } +} + +/** + * A single Typeform response item. + * + * `token` is the cursor field consumed by the `before`/`after` query params, while + * `response_id` is the identifier consumed by `included_response_ids`. They are + * distinct values, so both are tracked: the externalId is keyed off `response_id` + * (used by getDocument), the pagination cursor off `token`.
+ */ +interface TypeformResponseItem { + response_id?: string + token: string + landing_id?: string + landed_at?: string + submitted_at?: string + metadata?: { + platform?: string + browser?: string + referer?: string + } + answers?: TypeformAnswer[] | null + hidden?: Record | null +} + +/** + * Reads the `response_type` choice from sourceConfig, defaulting to `completed`. + * Any value other than `partial` or `all` (including a missing or non-string + * value) resolves to `completed`. + */ +function getResponseTypeChoice(sourceConfig: Record): ResponseTypeChoice { + const value = + typeof sourceConfig.responseType === 'string' ? sourceConfig.responseType.trim() : '' + if (value === 'partial' || value === 'all') return value + return 'completed' +} + +/** + * Appends the `response_type` filter to a query string for a given choice. The + * parameter is always sent explicitly because the endpoint defaults to + * completed-only when it is omitted: `completed` requests finished submissions, + * `partial` requests both partial and completed so partially-answered submissions + * are included alongside finished ones, and `all` additionally requests `started` + * responses. `params` is mutated in place. + */ +function appendResponseType(params: URLSearchParams, choice: ResponseTypeChoice): void { + if (choice === 'completed') params.append('response_type', 'completed') + else if (choice === 'partial') params.append('response_type', 'partial,completed') + else params.append('response_type', 'started,partial,completed') +} + +/** + * Renders a single answer's value into a human-readable string. + */ +function renderAnswerValue(answer: TypeformAnswer): string { + switch (answer.type) { + case 'text': + return answer.text ?? '' + case 'email': + return answer.email ?? '' + case 'url': + return answer.url ?? '' + case 'phone_number': + return answer.phone_number ?? '' + case 'file_url': + return answer.file_url ?? '' + case 'number': + return answer.number != null ? String(answer.number) : '' + case 'boolean': + return answer.boolean != null ? (answer.boolean ? 'Yes' : 'No') : '' + case 'date': + return answer.date ?? '' + case 'choice': { + const parts = [answer.choice?.label, answer.choice?.other].filter(Boolean) + return parts.join(', ') + } + case 'choices': { + const labels = Array.isArray(answer.choices?.labels) ?
(answer.choices?.labels ?? []) : [] + const parts = [...labels] + if (answer.choices?.other) parts.push(answer.choices.other) + return parts.join(', ') + } + case 'payment': + return answer.payment?.amount != null ? String(answer.payment.amount) : '' + default: + return '' + } +} + +/** + * Builds a map of field id to its human-readable question title from a form definition. + * A field without a title maps to its own id; a field without an id is skipped. + */ +function buildFieldTitleMap(form: TypeformFormDefinition): Map { + const map = new Map() + for (const field of form.fields ?? []) { + if (field.id) map.set(field.id, field.title || field.id) + } + return map +} + +/** + * Renders a Typeform response as readable "Question: Answer" plain text. + * + * Layout: optional `Form:` and `Submitted:` header lines, a blank line, then one + * line per answer. The question label is the mapped field title, falling back to + * the raw field id and finally to the literal `Answer`. Non-empty hidden fields are + * appended under a `--- Hidden Fields ---` separator. + */ +function renderResponseContent( + form: TypeformFormDefinition, + response: TypeformResponseItem, + fieldTitles: Map +): string { + const parts: string[] = [] + + if (form.title) parts.push(`Form: ${form.title}`) + if (response.submitted_at) parts.push(`Submitted: ${response.submitted_at}`) + parts.push('') + + const answers = Array.isArray(response.answers) ? response.answers : [] + for (const answer of answers) { + const fieldId = answer.field?.id + const question = (fieldId && fieldTitles.get(fieldId)) || fieldId || 'Answer' + const value = renderAnswerValue(answer) + parts.push(`${question}: ${value}`) + } + + if (response.hidden && Object.keys(response.hidden).length > 0) { + parts.push('') + parts.push('--- Hidden Fields ---') + for (const [key, val] of Object.entries(response.hidden)) { + parts.push(`${key}: ${String(val)}`) + } + } + + return parts.join('\n') +} + +/** + * Derives the stable external identifier for a response. Prefers `response_id` + * (the identifier `included_response_ids` filters on, so getDocument can fetch the + * exact response) and falls back to `token` when `response_id` is absent.
+ */ +function getResponseExternalId(response: TypeformResponseItem): string { + return response.response_id || response.token +} + +/** + * Produces the metadata-based content hash for a response, shaped as + * `typeform:<externalId>:<indicator>`. Responses are immutable + * once submitted, so `submitted_at` is a stable change key. For not-yet-submitted + * (started/partial) responses, `landed_at` is used as the fallback indicator. + * NOTE(review): `landed_at` presumably stays fixed while a partial response gains + * answers, so such edits would not change the hash — confirm against the API. + */ +function getResponseContentHash(response: TypeformResponseItem): string { + const indicator = response.submitted_at || response.landed_at || '' + return `typeform:${getResponseExternalId(response)}:${indicator}` +} + +/** + * Builds a full ExternalDocument from a rendered response. + * + * Content is rendered inline (`contentDeferred: false`). The title combines the + * form title (or `Typeform`) with the submitted time, else the landed time, else + * the external id. `sourceUrl` is the form's `_links.display` value, so it is a + * form-level link rather than a per-response one. + */ +function responseToDocument( + form: TypeformFormDefinition, + response: TypeformResponseItem, + fieldTitles: Map +): ExternalDocument { + const externalId = getResponseExternalId(response) + const submittedAt = response.submitted_at + const displayUrl = form._links?.display + + return { + externalId, + title: `${form.title || 'Typeform'} — ${submittedAt || response.landed_at || externalId}`, + content: renderResponseContent(form, response, fieldTitles), + contentDeferred: false, + mimeType: 'text/plain', + sourceUrl: displayUrl || undefined, + contentHash: getResponseContentHash(response), + metadata: { + formId: form.id, + formTitle: form.title, + submittedAt, + landedAt: response.landed_at, + platform: response.metadata?.platform, + }, + } +} + +/** + * Fetches a form definition, caching it in syncContext keyed by form id so a + * single sync run fetches each form's structure only once.
+ */ +async function getFormDefinition( + accessToken: string, + formId: string, + syncContext?: Record, + retryOptions?: Parameters[2] +): Promise { + const cacheKey = `form:${formId}` + const cached = syncContext?.[cacheKey] as TypeformFormDefinition | undefined + if (cached) return cached + + const response = await fetchWithRetry( + `${TYPEFORM_API_BASE}/forms/${encodeURIComponent(formId)}`, + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }, + retryOptions + ) + + if (!response.ok) { + throw new Error(`Failed to fetch Typeform form ${formId}: ${response.status}`) + } + + const form = (await response.json()) as TypeformFormDefinition + if (syncContext) syncContext[cacheKey] = form + return form +} + +export const typeformConnector: ConnectorConfig = { + id: 'typeform', + name: 'Typeform', + description: 'Sync form responses from Typeform into your knowledge base', + version: '1.0.0', + icon: TypeformIcon, + + auth: { + mode: 'apiKey', + label: 'Personal Access Token', + placeholder: 'Enter your Typeform personal access token', + }, + + /** + * Incremental sync narrows the listing to responses submitted after the last + * sync via the `since` filter (inclusive, matched against `submitted_at` for + * completed responses). Responses are immutable, so reconciliation by content + * hash skips anything already indexed. + */ + supportsIncrementalSync: true, + + configFields: [ + { + id: 'formId', + title: 'Form ID', + type: 'short-input', + placeholder: 'e.g. abc123XYZ', + required: true, + description: 'The Typeform form whose responses you want to sync', + }, + { + id: 'responseType', + title: 'Responses', + type: 'dropdown', + required: false, + options: [ + { label: 'Completed only', id: 'completed' }, + { label: 'Partial & completed', id: 'partial' }, + { label: 'All (including started)', id: 'all' }, + ], + description: 'Which responses to sync by completion status.
Defaults to completed only.', + }, + { + id: 'since', + title: 'Submitted After', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. 2024-01-01T00:00:00Z', + description: 'Only sync responses submitted on or after this date (ISO 8601, UTC).', + }, + { + id: 'until', + title: 'Submitted Before', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. 2024-12-31T23:59:59Z', + description: 'Only sync responses submitted on or before this date (ISO 8601, UTC).', + }, + { + id: 'query', + title: 'Search Filter', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. acme', + description: + 'Only sync responses containing this text in any answer, hidden field, or variable.', + }, + { + id: 'maxResponses', + title: 'Max Responses', + type: 'short-input', + required: false, + placeholder: 'e.g. 500 (default: unlimited)', + }, + ], + + /** + * Lists one page of responses for the configured form as inline-content + * documents. Paging uses the `before` token cursor; the optional + * `maxResponses` cap is tracked across pages via `syncContext.totalDocsFetched`. + * Throws when the form id is missing or the listing request fails. + */ + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record, + lastSyncAt?: Date + ): Promise => { + const formId = (sourceConfig.formId as string)?.trim() + if (!formId) { + throw new Error('Form ID is required') + } + + /** + * Normalize the cap to a whole number. A fractional cap can never be met + * exactly by the running total, so pagination would walk past it with empty + * trimmed pages and never flag the listing as capped. Unset, non-numeric, or + * non-positive values mean "no cap". + */ + const parsedMax = Number(sourceConfig.maxResponses) + const maxResponses = Number.isFinite(parsedMax) && parsedMax > 0 ? Math.ceil(parsedMax) : 0 + + const form = await getFormDefinition(accessToken, formId, syncContext) + const fieldTitles = buildFieldTitleMap(form) + + const queryParams = new URLSearchParams() + queryParams.append('page_size', String(RESPONSES_PER_PAGE)) + appendResponseType(queryParams, getResponseTypeChoice(sourceConfig)) + + const since = typeof sourceConfig.since === 'string' ? sourceConfig.since.trim() : '' + const until = typeof sourceConfig.until === 'string' ? sourceConfig.until.trim() : '' + const search = typeof sourceConfig.query === 'string' ?
sourceConfig.query.trim() : '' + if (until) queryParams.append('until', until) + if (search) queryParams.append('query', search) + + /** + * `since` from the user config wins; otherwise incremental sync derives it + * from lastSyncAt. `since` narrows the set by submission date while `before` + * (token paging) walks it newest-to-oldest; the two compose — only `sort` is + * mutually exclusive with `before`/`after`, which this connector never sets. + */ + if (since) queryParams.append('since', since) + else if (lastSyncAt) queryParams.append('since', lastSyncAt.toISOString()) + + if (cursor) { + queryParams.append('before', cursor) + } + + const url = `${TYPEFORM_API_BASE}/forms/${encodeURIComponent(formId)}/responses?${queryParams.toString()}` + + logger.info('Listing Typeform responses', { + formId, + before: cursor, + incremental: Boolean(lastSyncAt), + }) + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!response.ok) { + const errorText = await response.text().catch(() => '') + logger.error('Failed to list Typeform responses', { + formId, + status: response.status, + error: errorText.slice(0, 500), + }) + throw new Error(`Failed to list Typeform responses: ${response.status}`) + } + + const data = (await response.json()) as { items?: TypeformResponseItem[] } + const items = Array.isArray(data.items) ? data.items.filter((item) => item?.token) : [] + + const prevTotal = (syncContext?.totalDocsFetched as number) ?? 0 + + /** + * Trim the page to the remaining `maxResponses` budget so the cap is honored + * exactly rather than overshooting by up to a full page. The `before` cursor + * is still derived from the untrimmed page below, but it is unused once the + * cap is hit because `hasMore` becomes false.
+ */ + let cappedItems = items + if (maxResponses > 0) { + const remaining = Math.max(0, maxResponses - prevTotal) + if (items.length > remaining) cappedItems = items.slice(0, remaining) + } + + const documents: ExternalDocument[] = cappedItems.map((item) => + responseToDocument(form, item, fieldTitles) + ) + + const totalFetched = prevTotal + documents.length + if (syncContext) syncContext.totalDocsFetched = totalFetched + const hitLimit = maxResponses > 0 && totalFetched >= maxResponses + + /** + * The `before` cursor is the response `token` (not `response_id`). Each full + * page advances to the oldest token seen so the next request pages strictly + * older responses. A short page or a missing token ends pagination, which also + * guards against an infinite loop if the API ever repeats a cursor. + */ + const lastItem = items[items.length - 1] + const nextCursor = lastItem?.token + const moreMayExist = items.length === RESPONSES_PER_PAGE && Boolean(nextCursor) + + /** + * Signal a truncated listing so the engine skips deletion reconciliation: + * a capped page does not represent the full set of responses, and deleting + * everything past the cap would wipe still-present docs from the KB. The flag + * is only raised when the cap actually cut the listing short — items were + * trimmed from this page, or a full page suggests older responses remain. When + * the cap is reached on the page that also exhausts the source, the listing is + * complete and deleted responses must still reconcile. + */ + const truncatedByCap = cappedItems.length < items.length + if (hitLimit && (truncatedByCap || moreMayExist) && syncContext) { + syncContext.listingCapped = true + } + + const hasMore = !hitLimit && moreMayExist + + return { + documents, + nextCursor: hasMore ? nextCursor : undefined, + hasMore, + } + }, + + /** + * Fetches a single response by external id. Returns null when the form id or + * external id is missing, when no matching response comes back, or when the + * request fails (the failure is logged as a warning). + */ + getDocument: async ( + accessToken: string, + sourceConfig: Record, + externalId: string, + syncContext?: Record + ): Promise => { + const formId = (sourceConfig.formId as string)?.trim() + if (!formId || !externalId) return null + + try { + const form = await getFormDefinition(accessToken, formId, syncContext) + const fieldTitles = buildFieldTitleMap(form) + + /** + * `included_response_ids` filters by `response_id`, matching the externalId + * minted in listDocuments.
The configured response_type is forwarded so a + * partial response stays fetchable (the endpoint defaults to completed-only, + * which would otherwise exclude it). + */ + const params = new URLSearchParams() + params.append('included_response_ids', externalId) + appendResponseType(params, getResponseTypeChoice(sourceConfig)) + + const url = `${TYPEFORM_API_BASE}/forms/${encodeURIComponent(formId)}/responses?${params.toString()}` + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!response.ok) { + if (response.status === 404) return null + throw new Error(`Failed to fetch Typeform response ${externalId}: ${response.status}`) + } + + const data = (await response.json()) as { items?: TypeformResponseItem[] } + const item = Array.isArray(data.items) + ? data.items.find((candidate) => getResponseExternalId(candidate) === externalId) + : undefined + if (!item) return null + + return responseToDocument(form, item, fieldTitles) + } catch (error) { + logger.warn('Failed to get Typeform response', { + externalId, + error: toError(error).message, + }) + return null + } + }, + + /** + * Validates the config fields locally (form id, cap, date filters and their + * ordering), then confirms the form is reachable with the supplied token. + * Never throws; failures are reported through the returned `error`. + */ + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const formId = (sourceConfig.formId as string)?.trim() + if (!formId) { + return { valid: false, error: 'Form ID is required' } + } + + const maxResponses = sourceConfig.maxResponses as string | undefined + if (maxResponses && (Number.isNaN(Number(maxResponses)) || Number(maxResponses) <= 0)) { + return { valid: false, error: 'Max responses must be a positive number' } + } + + const since = typeof sourceConfig.since === 'string' ? sourceConfig.since.trim() : '' + if (since && Number.isNaN(new Date(since).getTime())) { + return { valid: false, error: '"Submitted After" must be a valid ISO 8601 date' } + } + + const until = typeof sourceConfig.until === 'string' ?
sourceConfig.until.trim() : '' + if (until && Number.isNaN(new Date(until).getTime())) { + return { valid: false, error: '"Submitted Before" must be a valid ISO 8601 date' } + } + + if (since && until && new Date(since).getTime() > new Date(until).getTime()) { + return { valid: false, error: '"Submitted After" must not be later than "Submitted Before"' } + } + + try { + const response = await fetchWithRetry( + `${TYPEFORM_API_BASE}/forms/${encodeURIComponent(formId)}`, + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }, + VALIDATE_RETRY_OPTIONS + ) + + if (response.status === 401 || response.status === 403) { + return { valid: false, error: 'Invalid or unauthorized Typeform personal access token' } + } + if (response.status === 404) { + return { valid: false, error: `Form not found: ${formId}` } + } + if (!response.ok) { + return { valid: false, error: `Failed to validate Typeform form: ${response.status}` } + } + + return { valid: true } + } catch (error) { + return { valid: false, error: getErrorMessage(error, 'Failed to validate configuration') } + } + }, + + tagDefinitions: [ + { id: 'formTitle', displayName: 'Form Title', fieldType: 'text' }, + { id: 'platform', displayName: 'Platform', fieldType: 'text' }, + { id: 'submittedAt', displayName: 'Submitted At', fieldType: 'date' }, + ], + + /** + * Maps document metadata onto the declared tags. Empty or non-string text + * values are skipped, as is a `submittedAt` for which `parseTagDate` returns a + * falsy result. + */ + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.formTitle === 'string' && metadata.formTitle) { + result.formTitle = metadata.formTitle + } + + if (typeof metadata.platform === 'string' && metadata.platform) { + result.platform = metadata.platform + } + + const submittedAt = parseTagDate(metadata.submittedAt) + if (submittedAt) result.submittedAt = submittedAt + + return result + }, +} diff --git a/apps/sim/connectors/x/index.ts b/apps/sim/connectors/x/index.ts new file mode 100644 index 00000000000..3760c8b2985 --- /dev/null +++ b/apps/sim/connectors/x/index.ts @@ -0,0
+1 @@ +export { xConnector } from '@/connectors/x/x' diff --git a/apps/sim/connectors/x/x.ts b/apps/sim/connectors/x/x.ts new file mode 100644 index 00000000000..0b52253f12e --- /dev/null +++ b/apps/sim/connectors/x/x.ts @@ -0,0 +1,628 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { xIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { parseMultiValue, parseTagDate } from '@/connectors/utils' + +const logger = createLogger('XConnector') + +/** Base URL that every X API v2 request path is appended to. */ +const X_API_BASE = 'https://api.x.com/2' +/** Cap on synced posts applied when `maxPosts` is not configured. */ +const DEFAULT_MAX_POSTS = 200 +/** Max page size accepted by the timeline, mentions, bookmarks, and likes endpoints. */ +const POSTS_PER_PAGE = 100 +/** + * Minimum `max_results` accepted by the user-tweets, mentions, and liked-tweets + * endpoints. The bookmarks endpoint is the sole exception and accepts a minimum of 1. + */ +const MIN_PAGE_SIZE = 5 +/** + * `edit_history_tweet_ids` is requested explicitly (it is not a default field) so the + * content hash can key on edit-history length and detect edits. + */ +const TWEET_FIELDS = 'created_at,public_metrics,text,edit_history_tweet_ids' + +/** + * Sync mode determines which timeline the connector reads. + * - `me`: the authenticated user's own posts (GET /2/users/:id/tweets) + * - `user`: another account's posts by username (GET /2/users/:id/tweets) + * - `mentions`: posts mentioning the authenticated user (GET /2/users/:id/mentions) + * - `bookmarks`: the authenticated user's bookmarks (GET /2/users/:id/bookmarks) + * - `likes`: posts the authenticated user has liked (GET /2/users/:id/liked_tweets) + */ +type SyncMode = 'me' | 'user' | 'mentions' | 'bookmarks' | 'likes' + +/** Modes whose endpoint supports the `exclude=retweets,replies` parameter.
*/ +const EXCLUDE_CAPABLE_MODES: ReadonlySet = new Set(['me', 'user']) +/** Modes whose endpoint supports the `start_time` / `end_time` parameters. */ +const DATE_RANGE_CAPABLE_MODES: ReadonlySet = new Set([ + 'me', + 'user', + 'mentions', +]) + +/** Engagement counters carried in a tweet's `public_metrics` field. */ +interface XPublicMetrics { + retweet_count?: number + reply_count?: number + like_count?: number + quote_count?: number +} + +/** The subset of tweet fields this connector reads. */ +interface XTweet { + id: string + text: string + created_at?: string + author_id?: string + public_metrics?: XPublicMetrics + edit_history_tweet_ids?: string[] +} + +/** A user object, as returned under `includes.users` or by a user lookup. */ +interface XUser { + id: string + name?: string + username?: string +} + +/** Response envelope of a listing endpoint: tweets, expanded users, paging meta, and errors. */ +interface XListResponse { + data?: XTweet[] + includes?: { users?: XUser[] } + meta?: { next_token?: string; result_count?: number } + errors?: Array<{ detail?: string; title?: string }> +} + +/** Response envelope carrying a single tweet rather than a list. */ +interface XSingleResponse { + data?: XTweet + includes?: { users?: XUser[] } + errors?: Array<{ detail?: string; title?: string }> +} + +/** + * Resolves the configured sync mode, defaulting to the authenticated user's + * own posts. Any unrecognized `syncMode` value also resolves to `me`. + */ +function resolveSyncMode(sourceConfig: Record): SyncMode { + const mode = sourceConfig.syncMode + if (mode === 'user' || mode === 'mentions' || mode === 'bookmarks' || mode === 'likes') { + return mode + } + return 'me' +} + +/** + * Reads a boolean toggle from a dropdown config field that stores 'true' / 'false' + * strings. Real booleans are accepted as well. Falls back to `defaultValue` when + * unset or unrecognized. + */ +function readBooleanOption(value: unknown, defaultValue: boolean): boolean { + if (value === 'true' || value === true) return true + if (value === 'false' || value === false) return false + return defaultValue +} + +/** + * Parses the configured usernames into a normalized, deduplicated handle list. + * + * Handles are lowercased and stripped of a leading `@` before deduplication so + * that `jack`, `@jack`, and `Jack` collapse to a single entry — avoiding a + * duplicate user-id lookup and a redundant `userIndex` slot in the packed + * pagination cursor.
Both `validateConfig` and `listDocuments` call this so the + * cursor's `userIndex` stays aligned to the same array across pages. + * First-seen order is preserved and empty handles are dropped. + */ +function parseUsernames(value: unknown): string[] { + const seen = new Set() + const out: string[] = [] + for (const raw of parseMultiValue(value)) { + const handle = raw.replace(/^@/, '').toLowerCase() + if (!handle || seen.has(handle)) continue + seen.add(handle) + out.push(handle) + } + return out +} + +/** + * Reads and trims a string config field, returning undefined when blank. + * Non-string values also return undefined. + */ +function readTrimmed(value: unknown): string | undefined { + if (typeof value !== 'string') return undefined + const trimmed = value.trim() + return trimmed.length > 0 ? trimmed : undefined +} + +/** + * Performs an authenticated GET against the X API v2 and returns the parsed JSON. + * `path` is appended to the API base and `params` become the query string. Throws + * on a non-OK status, with the status line and response body in the message. + */ +async function xApiGet( + path: string, + accessToken: string, + params?: Record, + retryOptions?: Parameters[2] +): Promise { + const queryParams = params ? `?${new URLSearchParams(params).toString()}` : '' + const url = `${X_API_BASE}${path}${queryParams}` + + const response = await fetchWithRetry( + url, + { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + 'Content-Type': 'application/json', + }, + }, + retryOptions + ) + + if (!response.ok) { + const body = await response.text().catch(() => '') + throw new Error(`X API HTTP error: ${response.status} ${response.statusText} ${body}`.trim()) + } + + return response.json() +} + +/** + * Resolves the authenticated user's numeric ID via GET /2/users/me. + * Throws when the response carries no id. + */ +async function resolveMyUserId( + accessToken: string, + retryOptions?: Parameters[2] +): Promise { + const data = (await xApiGet('/users/me', accessToken, undefined, retryOptions)) as { + data?: { id?: string } + } + const id = data.data?.id + if (!id) throw new Error('Failed to resolve authenticated user ID') + return id +} + +/** + * Resolves a public username to its numeric user ID via + * GET /2/users/by/username/:username.
+ */ +async function resolveUsernameId( + accessToken: string, + username: string, + retryOptions?: Parameters[2] +): Promise { + const handle = username.trim().replace(/^@/, '') + const data = (await xApiGet( + `/users/by/username/${encodeURIComponent(handle)}`, + accessToken, + undefined, + retryOptions + )) as { data?: { id?: string }; errors?: Array<{ detail?: string }> } + const id = data.data?.id + if (!id) { + throw new Error(data.errors?.[0]?.detail || `User @${handle} not found`) + } + return id +} + +/** + * Builds a deterministic, metadata-based content hash for a tweet, shaped as + * `x:<tweet id>:<change indicator>`. + * + * Tweets are immutable outside the brief post-publish edit window; an edit + * appends a new ID to `edit_history_tweet_ids`. We therefore key the hash on + * the edit-history length when present (so edits are detected as changes), and + * fall back to `created_at` when the field is absent. + */ +function tweetContentHash(tweet: XTweet): string { + const historyLength = Array.isArray(tweet.edit_history_tweet_ids) + ? tweet.edit_history_tweet_ids.length + : undefined + const changeIndicator = historyLength ?? tweet.created_at ?? '' + return `x:${tweet.id}:${changeIndicator}` +} + +/** + * Builds the canonical source URL for a tweet. When the author's username is + * unknown, falls back to the username-agnostic permalink which X redirects. + */ +function tweetSourceUrl(tweetId: string, username?: string): string { + if (username) return `https://x.com/${username}/status/${tweetId}` + return `https://x.com/i/web/status/${tweetId}` +} + +/** + * Derives a short title from the tweet text (first line, truncated). + * A first line longer than 80 characters is cut to 77 characters plus `...`; + * a blank first line yields the literal `Tweet`. + */ +function tweetTitle(text: string): string { + const firstLine = text.split('\n')[0].trim() + if (!firstLine) return 'Tweet' + return firstLine.length > 80 ? `${firstLine.slice(0, 77)}...` : firstLine +} + +/** + * Converts a tweet (and its resolved author) into an ExternalDocument with + * inline content — the list API returns full text, so no deferral is needed.
+ * + * The author is the actual tweet author resolved from the `author_id` expansion, + * not the credential owner — important for bookmarks and likes, where most posts + * belong to other accounts. Missing engagement counters are recorded as 0. + */ +function tweetToDocument(tweet: XTweet, author?: XUser): ExternalDocument { + const metrics = tweet.public_metrics ?? {} + return { + externalId: tweet.id, + title: tweetTitle(tweet.text), + content: tweet.text, + mimeType: 'text/plain', + sourceUrl: tweetSourceUrl(tweet.id, author?.username), + contentHash: tweetContentHash(tweet), + metadata: { + author: author?.username ?? author?.name ?? undefined, + authorName: author?.name ?? undefined, + createdAt: tweet.created_at ?? undefined, + likeCount: metrics.like_count ?? 0, + retweetCount: metrics.retweet_count ?? 0, + replyCount: metrics.reply_count ?? 0, + quoteCount: metrics.quote_count ?? 0, + }, + } +} + +/** + * Maps tweets from a list response to documents, joining each tweet to its + * author via the `includes.users` expansion (matched on `author_id`). + * A tweet whose author is not in the expansion is mapped without author data; + * a response with no `data` yields an empty list. + */ +function mapTweets(response: XListResponse): ExternalDocument[] { + const usersById = new Map() + for (const user of response.includes?.users ?? []) { + usersById.set(user.id, user) + } + const tweets = response.data ?? [] + return tweets.map((tweet) => tweetToDocument(tweet, usersById.get(tweet.author_id ?? ''))) +} + +/** + * Returns the API path for a given mode and resolved user ID. + * `me` and `user` both fall through to the user-tweets path. + */ +function listPathForMode(mode: SyncMode, userId: string): string { + switch (mode) { + case 'bookmarks': + return `/users/${userId}/bookmarks` + case 'likes': + return `/users/${userId}/liked_tweets` + case 'mentions': + return `/users/${userId}/mentions` + default: + return `/users/${userId}/tweets` + } +} + +/** + * Builds the query string for the active listing endpoint. `pageSize` is the + * per-request `max_results`, already clamped to the endpoint's valid range and + * to any remaining cap.
`exclude` and date-range params are only attached for + * the modes whose endpoint supports them. Replies and retweets are excluded + * unless `includeReplies` / `includeRetweets` opt in, and `cursor` (when given) + * is sent as `pagination_token`. + */ +function buildListParams( + sourceConfig: Record, + mode: SyncMode, + pageSize: number, + cursor?: string +): Record { + const params: Record = { + max_results: String(pageSize), + 'tweet.fields': TWEET_FIELDS, + expansions: 'author_id', + 'user.fields': 'name,username', + } + + if (EXCLUDE_CAPABLE_MODES.has(mode)) { + const includeReplies = readBooleanOption(sourceConfig.includeReplies, false) + const includeRetweets = readBooleanOption(sourceConfig.includeRetweets, false) + const exclude: string[] = [] + if (!includeRetweets) exclude.push('retweets') + if (!includeReplies) exclude.push('replies') + if (exclude.length > 0) params.exclude = exclude.join(',') + } + + if (DATE_RANGE_CAPABLE_MODES.has(mode)) { + const startTime = readTrimmed(sourceConfig.startTime) + const endTime = readTrimmed(sourceConfig.endTime) + if (startTime) params.start_time = startTime + if (endTime) params.end_time = endTime + } + + if (cursor) params.pagination_token = cursor + return params +} + +/** + * Clamps the requested page size to the endpoint's valid range and to the number + * of posts still needed under the cap. The user-tweets, mentions, and liked-tweets + * endpoints require `max_results` ≥ 5; only bookmarks accepts ≥ 1. We always request + * at least the endpoint minimum (over-fetch on the final page is trimmed afterward). + * A non-positive `remaining` requests a full page. + */ +function resolvePageSize(mode: SyncMode, remaining: number): number { + const floor = mode === 'bookmarks' ?
1 : MIN_PAGE_SIZE + if (remaining <= 0) return POSTS_PER_PAGE + return Math.max(floor, Math.min(POSTS_PER_PAGE, remaining)) +} + +export const xConnector: ConnectorConfig = { + id: 'x', + name: 'X', + description: 'Sync posts from X (formerly Twitter) into your knowledge base', + version: '1.0.0', + icon: xIcon, + + auth: { + mode: 'oauth', + provider: 'x', + requiredScopes: ['tweet.read', 'users.read', 'bookmark.read', 'like.read', 'offline.access'], + }, + + configFields: [ + { + id: 'syncMode', + title: 'Sync Mode', + type: 'dropdown', + required: false, + description: 'Which posts to sync into the knowledge base', + options: [ + { label: 'My posts', id: 'me' }, + { label: 'Another user', id: 'user' }, + { label: 'My mentions', id: 'mentions' }, + { label: 'My bookmarks', id: 'bookmarks' }, + { label: 'My likes', id: 'likes' }, + ], + }, + { + id: 'username', + title: 'Username(s)', + type: 'short-input', + required: false, + multi: true, + placeholder: 'e.g. jack, xdevelopers (required for "Another user")', + description: + 'One or more X usernames to sync posts from (comma-separated). Only used when Sync Mode is "Another user".', + }, + { + id: 'includeReplies', + title: 'Include Replies', + type: 'dropdown', + required: false, + options: [ + { label: 'Exclude replies', id: 'false' }, + { label: 'Include replies', id: 'true' }, + ], + description: 'Whether to include reply posts. Applies to "My posts" and "Another user".', + }, + { + id: 'includeRetweets', + title: 'Include Retweets', + type: 'dropdown', + required: false, + options: [ + { label: 'Exclude retweets', id: 'false' }, + { label: 'Include retweets', id: 'true' }, + ], + description: 'Whether to include retweets. Applies to "My posts" and "Another user".', + }, + { + id: 'startTime', + title: 'Start Time', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. 2024-01-01T00:00:00Z', + description: + 'Oldest post time (ISO 8601 UTC). 
Applies to posts and mentions; ignored for bookmarks and likes.', + }, + { + id: 'endTime', + title: 'End Time', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. 2024-12-31T23:59:59Z', + description: + 'Newest post time (ISO 8601 UTC). Applies to posts and mentions; ignored for bookmarks and likes.', + }, + { + id: 'maxPosts', + title: 'Max Posts', + type: 'short-input', + required: false, + placeholder: `e.g. 100 (default: ${DEFAULT_MAX_POSTS})`, + description: + 'Maximum number of posts to sync (across all configured users). Posts beyond this limit are not deleted from the knowledge base; X also only exposes a limited recent window (≈3,200 timeline posts, ≈800 bookmarks), so posts that age out of that window are removed on the next sync.', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const mode = resolveSyncMode(sourceConfig) + const maxPosts = sourceConfig.maxPosts ? Number(sourceConfig.maxPosts) : DEFAULT_MAX_POSTS + + const collectedSoFar = (syncContext?.collected as number) ?? 0 + if (maxPosts > 0 && collectedSoFar >= maxPosts) { + return { documents: [], hasMore: false } + } + + // For the multi-username "user" mode, walk one username per cursor cycle. The + // cursor packs the username index and that user's pagination token; the shared + // cap is enforced across all users via syncContext.collected. + const usernames = mode === 'user' ? parseUsernames(sourceConfig.username) : [] + if (mode === 'user' && usernames.length === 0) { + throw new Error('Username is required when Sync Mode is "Another user"') + } + + let userIndex = 0 + let pageToken = cursor + if (mode === 'user' && cursor) { + const sep = cursor.indexOf(':') + if (sep >= 0) { + userIndex = Number(cursor.slice(0, sep)) || 0 + const token = cursor.slice(sep + 1) + pageToken = token.length > 0 ? token : undefined + } + } + + // Resolve the target user ID. 
For `user` mode it depends on the current index + // (resolved per page, cheap); for self-modes it is cached on syncContext. + let userId: string + if (mode === 'user') { + userId = await resolveUsernameId(accessToken, usernames[userIndex]) + } else { + userId = (syncContext?.userId as string | undefined) ?? (await resolveMyUserId(accessToken)) + if (syncContext) syncContext.userId = userId + } + + const remaining = maxPosts > 0 ? maxPosts - collectedSoFar : 0 + const pageSize = resolvePageSize(mode, remaining) + const path = listPathForMode(mode, userId) + const params = buildListParams(sourceConfig, mode, pageSize, pageToken) + + logger.info('Syncing X posts', { mode, userId, userIndex, maxPosts }) + + const response = (await xApiGet(path, accessToken, params)) as XListResponse + if (response.errors?.length && !response.data) { + throw new Error(response.errors[0]?.detail || response.errors[0]?.title || 'X API error') + } + + let documents = mapTweets(response) + + if (maxPosts > 0 && collectedSoFar + documents.length > maxPosts) { + documents = documents.slice(0, maxPosts - collectedSoFar) + } + const newCollected = collectedSoFar + documents.length + if (syncContext) syncContext.collected = newCollected + + const capReached = maxPosts > 0 && newCollected >= maxPosts + const nextToken = response.meta?.next_token + + // Advance pagination: continue the current user's pages, else move to the next + // username (user mode), else stop. + if (capReached) { + // The `maxPosts` cap was reached, so the listing may be incomplete: + // older previously-synced posts may still exist beyond the `maxPosts` cap. + // Flag the sync as capped so the engine skips deletion reconciliation and + // does not delete posts that simply fell outside this run's window. + // A forced full sync bypasses this guard and reconciles normally. 
+ if (syncContext) syncContext.listingCapped = true + return { documents, hasMore: false } + } + + if (mode === 'user') { + if (nextToken) { + return { documents, nextCursor: `${userIndex}:${nextToken}`, hasMore: true } + } + const nextUserIndex = userIndex + 1 + if (nextUserIndex < usernames.length) { + return { documents, nextCursor: `${nextUserIndex}:`, hasMore: true } + } + return { documents, hasMore: false } + } + + return { + documents, + nextCursor: nextToken ?? undefined, + hasMore: Boolean(nextToken), + } + }, + + getDocument: async ( + accessToken: string, + _sourceConfig: Record, + externalId: string + ): Promise => { + try { + const response = (await xApiGet(`/tweets/${encodeURIComponent(externalId)}`, accessToken, { + 'tweet.fields': TWEET_FIELDS, + expansions: 'author_id', + 'user.fields': 'name,username', + })) as XSingleResponse + + const tweet = response.data + if (!tweet) return null + + const author = response.includes?.users?.find((u) => u.id === tweet.author_id) + return tweetToDocument(tweet, author) + } catch (error) { + logger.warn('Failed to get X tweet document', { + externalId, + error: toError(error).message, + }) + return null + } + }, + + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const mode = resolveSyncMode(sourceConfig) + const usernames = mode === 'user' ? 
parseUsernames(sourceConfig.username) : [] + const maxPosts = sourceConfig.maxPosts as string | undefined + + if (mode === 'user' && usernames.length === 0) { + return { valid: false, error: 'Username is required when Sync Mode is "Another user"' } + } + + if (maxPosts && (Number.isNaN(Number(maxPosts)) || Number(maxPosts) <= 0)) { + return { valid: false, error: 'Max posts must be a positive number' } + } + + const startTime = readTrimmed(sourceConfig.startTime) + if (startTime && Number.isNaN(new Date(startTime).getTime())) { + return { valid: false, error: 'Start Time must be a valid ISO 8601 timestamp' } + } + const endTime = readTrimmed(sourceConfig.endTime) + if (endTime && Number.isNaN(new Date(endTime).getTime())) { + return { valid: false, error: 'End Time must be a valid ISO 8601 timestamp' } + } + + try { + await resolveMyUserId(accessToken, VALIDATE_RETRY_OPTIONS) + + if (mode === 'user') { + for (const username of usernames) { + await resolveUsernameId(accessToken, username, VALIDATE_RETRY_OPTIONS) + } + } + + return { valid: true } + } catch (error) { + return { valid: false, error: getErrorMessage(error, 'Failed to validate configuration') } + } + }, + + tagDefinitions: [ + { id: 'author', displayName: 'Author', fieldType: 'text' }, + { id: 'createdAt', displayName: 'Created Date', fieldType: 'date' }, + { id: 'likeCount', displayName: 'Like Count', fieldType: 'number' }, + { id: 'retweetCount', displayName: 'Retweet Count', fieldType: 'number' }, + ], + + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.author === 'string') { + result.author = metadata.author + } + + const createdAt = parseTagDate(metadata.createdAt) + if (createdAt) { + result.createdAt = createdAt + } + + if (metadata.likeCount != null) { + const num = Number(metadata.likeCount) + if (!Number.isNaN(num)) result.likeCount = num + } + + if (metadata.retweetCount != null) { + const num = Number(metadata.retweetCount) + if 
(!Number.isNaN(num)) result.retweetCount = num + } + + return result + }, +} diff --git a/apps/sim/connectors/youtube/index.ts b/apps/sim/connectors/youtube/index.ts new file mode 100644 index 00000000000..a7d08fd17b3 --- /dev/null +++ b/apps/sim/connectors/youtube/index.ts @@ -0,0 +1 @@ +export { youtubeConnector } from '@/connectors/youtube/youtube' diff --git a/apps/sim/connectors/youtube/youtube.ts b/apps/sim/connectors/youtube/youtube.ts new file mode 100644 index 00000000000..a682ddeea23 --- /dev/null +++ b/apps/sim/connectors/youtube/youtube.ts @@ -0,0 +1,650 @@ +import { createLogger } from '@sim/logger' +import { getErrorMessage, toError } from '@sim/utils/errors' +import { YouTubeIcon } from '@/components/icons' +import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' +import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' +import { joinTagArray, parseTagDate } from '@/connectors/utils' + +const logger = createLogger('YouTubeConnector') + +const YOUTUBE_API_BASE = 'https://www.googleapis.com/youtube/v3' + +/** Max videos fetched per `playlistItems.list` page (YouTube hard limit is 50). */ +const PAGE_SIZE = 50 + +/** Videos shorter than this (seconds) are treated as Shorts when the exclude filter is on. */ +const SHORTS_MAX_DURATION_SECONDS = 60 + +/** + * Minimal `playlistItems.list` item shape we consume. + * `contentDetails.videoId` is the stable video identifier; `snippet.resourceId.videoId` + * is used as a fallback for older API responses. + * + * `snippet.publishedAt` is the time the item was ADDED to the playlist, whereas + * `contentDetails.videoPublishedAt` is the time the VIDEO was published to YouTube. + * These differ for hand-curated playlists, so only `videoPublishedAt` is used for the + * change-detection hash (it matches `videos.list` `snippet.publishedAt`). 
+ */ +interface PlaylistItem { + contentDetails?: { videoId?: string; videoPublishedAt?: string } + snippet?: { + title?: string + publishedAt?: string + channelTitle?: string + videoOwnerChannelTitle?: string + resourceId?: { videoId?: string } + } +} + +/** + * Minimal `videos.list` item shape we consume in `getDocument`. + */ +interface VideoItem { + id?: string + snippet?: { + title?: string + description?: string + publishedAt?: string + channelTitle?: string + tags?: string[] + categoryId?: string + } + contentDetails?: { duration?: string } + status?: { privacyStatus?: string } +} + +/** + * Resolves the API key from the access token the sync engine provides. + * In `apiKey` mode the engine decrypts the stored key and passes it as `accessToken`. + */ +function getApiKey(accessToken: string): string { + return accessToken.trim() +} + +/** + * Builds the change-detection hash for a video. + * + * The hash is keyed on the video's own publish time (`videos.list` `snippet.publishedAt` + * / playlistItem `contentDetails.videoPublishedAt`), which is identical on both the + * listing stub and the hydrated document — guaranteeing the stub/getDocument hash + * invariant. The playlist-item "added at" time (`snippet.publishedAt`) is deliberately + * NOT used, since `getDocument` (via `videos.list`) cannot reproduce it. + * + * YouTube exposes no field that reliably changes when a video's title/description is + * edited, so edits to already-synced videos are not detected — only new videos are + * picked up. This is a known limitation of the API-key data surface. + */ +function buildContentHash(videoId: string, videoPublishedAt: string): string { + return `youtube:${videoId}:${videoPublishedAt}` +} + +/** + * Parses an ISO 8601 duration (e.g. `PT1M30S`, `PT2H`, `P1DT2H`) into total seconds. + * Returns null when the value is missing or unparseable. 
+ */ +function parseIso8601Duration(value: string | undefined): number | null { + if (!value) return null + const match = value.match(/^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?$/) + if (!match) return null + const [, days, hours, minutes, seconds] = match + if (!days && !hours && !minutes && !seconds) return null + return ( + Number(days ?? 0) * 86400 + + Number(hours ?? 0) * 3600 + + Number(minutes ?? 0) * 60 + + Number(seconds ?? 0) + ) +} + +/** + * Resolves a channel reference to its "uploads" playlist ID via `channels.list`. + * + * Accepts a `UC…` channel ID, an `@handle` (resolved with `forHandle`), or a legacy + * username (resolved with `forUsername`). Returns null when the channel is missing or + * has no uploads playlist. + */ +async function resolveUploadsPlaylistId( + apiKey: string, + channelRef: string, + retryOptions?: Parameters[2] +): Promise { + const ref = channelRef.trim() + if (!ref) return null + + const params = new URLSearchParams({ part: 'contentDetails', key: apiKey }) + if (ref.startsWith('@')) { + params.set('forHandle', ref) + } else if (/^UC[\w-]{20,}$/.test(ref)) { + params.set('id', ref) + } else { + params.set('forUsername', ref) + } + + const url = `${YOUTUBE_API_BASE}/channels?${params.toString()}` + + const response = await fetchWithRetry( + url, + { method: 'GET', headers: { Accept: 'application/json' } }, + retryOptions + ) + + if (!response.ok) { + const errorText = await response.text().catch(() => '') + logger.error('Failed to resolve channel uploads playlist', { + channelRef: ref, + status: response.status, + error: errorText.slice(0, 500), + }) + throw new Error(`Failed to resolve channel: ${response.status}`) + } + + const data = await response.json() + const items = (data.items ?? []) as Array<{ + contentDetails?: { relatedPlaylists?: { uploads?: string } } + }> + return items[0]?.contentDetails?.relatedPlaylists?.uploads ?? 
null +} + +/** + * Resolves the effective playlist ID to sync from sourceConfig, and whether the source + * is a channel's reverse-chronological uploads playlist (which enables early-stop for + * the `publishedAfter` filter). A `playlistId` takes precedence over a `channelId`. + */ +async function resolvePlaylistId( + apiKey: string, + sourceConfig: Record, + retryOptions?: Parameters[2] +): Promise<{ playlistId: string | null; isUploadsPlaylist: boolean }> { + const playlistId = (sourceConfig.playlistId as string | undefined)?.trim() + if (playlistId) return { playlistId, isUploadsPlaylist: false } + + const channelId = (sourceConfig.channelId as string | undefined)?.trim() + if (channelId) { + const resolved = await resolveUploadsPlaylistId(apiKey, channelId, retryOptions) + return { playlistId: resolved, isUploadsPlaylist: resolved != null } + } + + return { playlistId: null, isUploadsPlaylist: false } +} + +/** + * Extracts the video ID from a playlist item, preferring the stable + * `contentDetails.videoId` over the legacy `snippet.resourceId.videoId`. + */ +function getVideoId(item: PlaylistItem): string { + return item.contentDetails?.videoId ?? item.snippet?.resourceId?.videoId ?? '' +} + +/** + * Reads the optional `publishedAfter` cutoff from sourceConfig as a timestamp (ms), + * or null when unset/invalid. + */ +function getPublishedAfter(sourceConfig: Record): number | null { + const raw = (sourceConfig.publishedAfter as string | undefined)?.trim() + if (!raw) return null + const ms = new Date(raw).getTime() + return Number.isNaN(ms) ? null : ms +} + +/** + * Builds a metadata-only stub from a playlist item. + * + * Duration/tags/category are not available on `playlistItems.list` — they are populated + * during hydration in `getDocument` via `videos.list`. The content hash uses the video's + * publish time only, so it is identical between this stub and the hydrated document. 
+ */ +function itemToStub(item: PlaylistItem): ExternalDocument | null { + const videoId = getVideoId(item) + if (!videoId) return null + + const snippet = item.snippet ?? {} + const videoPublishedAt = item.contentDetails?.videoPublishedAt ?? '' + const channelTitle = snippet.videoOwnerChannelTitle ?? snippet.channelTitle ?? '' + + return { + externalId: videoId, + title: snippet.title || 'Untitled', + content: '', + contentDeferred: true, + mimeType: 'text/plain', + sourceUrl: `https://www.youtube.com/watch?v=${videoId}`, + contentHash: buildContentHash(videoId, videoPublishedAt), + metadata: { + channelTitle, + publishedAt: videoPublishedAt, + }, + } +} + +/** + * Batch-fetches full video resources for the given IDs via `videos.list`. + * + * `videos.list` accepts up to 50 comma-separated IDs and costs a flat 1 quota unit per + * call regardless of ID count, so a single call covers a full `playlistItems.list` page. + * Videos that are private, deleted, or region-blocked are simply absent from the response. + */ +async function fetchVideosByIds( + apiKey: string, + videoIds: string[], + retryOptions?: Parameters[2] +): Promise> { + const result = new Map() + if (videoIds.length === 0) return result + + const url = `${YOUTUBE_API_BASE}/videos?part=snippet,contentDetails,status&id=${encodeURIComponent( + videoIds.join(',') + )}&key=${encodeURIComponent(apiKey)}` + + const response = await fetchWithRetry( + url, + { method: 'GET', headers: { Accept: 'application/json' } }, + retryOptions + ) + + if (!response.ok) { + const errorText = await response.text().catch(() => '') + logger.error('Failed to batch-fetch YouTube videos', { + count: videoIds.length, + status: response.status, + error: errorText.slice(0, 500), + }) + throw new Error(`Failed to batch-fetch YouTube videos: ${response.status}`) + } + + const data = await response.json() + const items = (data.items ?? 
[]) as VideoItem[] + for (const item of items) { + if (item.id) result.set(item.id, item) + } + return result +} + +/** + * Builds the full document for a video, combining title and description as plain-text + * content. Returns null for private/deleted videos (unlisted videos are kept), and (when configured) for + * Shorts shorter than 60 seconds. + * + * Captions/transcripts are intentionally not fetched: `captions.download` requires OAuth + * as the video owner, which the API-key auth surface cannot provide. Content is therefore + * the video title plus description only. + */ +function videoToDocument(video: VideoItem, excludeShorts: boolean): ExternalDocument | null { + const videoId = video.id + if (!videoId) return null + + const privacyStatus = video.status?.privacyStatus + if (privacyStatus && privacyStatus !== 'public' && privacyStatus !== 'unlisted') { + return null + } + + if (excludeShorts) { + const seconds = parseIso8601Duration(video.contentDetails?.duration) + if (seconds != null && seconds > 0 && seconds < SHORTS_MAX_DURATION_SECONDS) { + return null + } + } + + const snippet = video.snippet ?? {} + const title = snippet.title || 'Untitled' + const description = snippet.description ?? '' + const publishedAt = snippet.publishedAt ?? '' + const content = description.trim() ? `${title}\n\n${description}` : title + const tags = Array.isArray(snippet.tags) ? snippet.tags : [] + + return { + externalId: videoId, + title, + content, + contentDeferred: false, + mimeType: 'text/plain', + sourceUrl: `https://www.youtube.com/watch?v=${videoId}`, + contentHash: buildContentHash(videoId, publishedAt), + metadata: { + channelTitle: snippet.channelTitle ?? '', + publishedAt, + duration: video.contentDetails?.duration ?? '', + categoryId: snippet.categoryId ?? 
'', + tags, + }, + } +} + +export const youtubeConnector: ConnectorConfig = { + id: 'youtube', + name: 'YouTube', + description: 'Sync videos from a YouTube channel or playlist into your knowledge base', + version: '1.0.0', + icon: YouTubeIcon, + + auth: { + mode: 'apiKey', + label: 'YouTube Data API Key', + placeholder: 'Enter your YouTube Data API v3 key', + }, + + configFields: [ + { + id: 'channelId', + title: 'Channel', + type: 'short-input', + placeholder: 'e.g. @mkbhd or UCXXXXXXXXXXXXXXXXXXXXXX', + required: false, + description: + 'Channel handle (@name), channel ID (starts with "UC"), or legacy username. Syncs the channel\'s uploaded videos.', + }, + { + id: 'playlistId', + title: 'Playlist ID', + type: 'short-input', + placeholder: 'e.g. PLXXXXXXXXXXXXXXXX', + required: false, + description: 'Playlist ID. Takes precedence over Channel when both are set.', + }, + { + id: 'publishedAfter', + title: 'Published After', + type: 'short-input', + required: false, + mode: 'advanced', + placeholder: 'e.g. 2024-01-01', + description: + 'Only sync videos published on or after this date (ISO 8601, e.g. 2024-01-01). Applies to the video publish date.', + }, + { + id: 'excludeShorts', + title: 'Exclude Shorts', + type: 'dropdown', + required: false, + mode: 'advanced', + options: [ + { label: 'Include Shorts', id: 'false' }, + { label: 'Exclude Shorts (< 60s)', id: 'true' }, + ], + description: 'Skip videos shorter than 60 seconds (Shorts).', + }, + { + id: 'maxVideos', + title: 'Max Videos', + type: 'short-input', + required: false, + placeholder: 'e.g. 500 (default: unlimited)', + }, + ], + + listDocuments: async ( + accessToken: string, + sourceConfig: Record, + cursor?: string, + syncContext?: Record + ): Promise => { + const apiKey = getApiKey(accessToken) + + const maxVideos = sourceConfig.maxVideos ? Number(sourceConfig.maxVideos) : 0 + const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 
0 + + if (maxVideos > 0 && previouslyFetched >= maxVideos) { + return { documents: [], hasMore: false } + } + + const cachedPlaylistId = syncContext?.resolvedPlaylistId as string | undefined + let playlistId: string | null = cachedPlaylistId ?? null + let isUploadsPlaylist = (syncContext?.isUploadsPlaylist as boolean | undefined) ?? false + + if (!playlistId) { + const resolved = await resolvePlaylistId(apiKey, sourceConfig) + playlistId = resolved.playlistId + isUploadsPlaylist = resolved.isUploadsPlaylist + if (syncContext) { + if (playlistId) syncContext.resolvedPlaylistId = playlistId + syncContext.isUploadsPlaylist = isUploadsPlaylist + } + } + + if (!playlistId) { + throw new Error('No playlistId or channelId configured, or channel has no uploads playlist') + } + + const publishedAfter = getPublishedAfter(sourceConfig) + + const remaining = maxVideos > 0 ? maxVideos - previouslyFetched : 0 + const effectivePageSize = maxVideos > 0 ? Math.min(PAGE_SIZE, remaining) : PAGE_SIZE + + const queryParams = new URLSearchParams({ + part: 'snippet,contentDetails', + playlistId, + maxResults: String(effectivePageSize), + key: apiKey, + }) + if (cursor) queryParams.set('pageToken', cursor) + + const url = `${YOUTUBE_API_BASE}/playlistItems?${queryParams.toString()}` + + logger.info('Listing YouTube playlist items', { playlistId, cursor: cursor ?? 'initial' }) + + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { Accept: 'application/json' }, + }) + + if (!response.ok) { + const errorText = await response.text().catch(() => '') + logger.error('Failed to list YouTube playlist items', { + playlistId, + status: response.status, + error: errorText.slice(0, 500), + }) + throw new Error(`Failed to list YouTube playlist items: ${response.status}`) + } + + const data = await response.json() + const items = (data.items ?? []) as PlaylistItem[] + const excludeShorts = String(sourceConfig.excludeShorts ?? 
'') === 'true' + + const keptItems: PlaylistItem[] = [] + let stopEarly = false + + for (const item of items) { + if (!getVideoId(item)) continue + + if (publishedAfter != null) { + const videoPublishedAt = item.contentDetails?.videoPublishedAt + const ms = videoPublishedAt ? new Date(videoPublishedAt).getTime() : Number.NaN + if (!Number.isNaN(ms) && ms < publishedAfter) { + // Uploads playlists are reverse-chronological by publish date, so once we + // cross the cutoff no later item can qualify — stop paginating. For arbitrary + // playlists we only filter per-item (order is not guaranteed). + if (isUploadsPlaylist) { + stopEarly = true + break + } + continue + } + } + + keptItems.push(item) + } + + let documents: ExternalDocument[] = [] + + if (excludeShorts && keptItems.length > 0) { + // When excluding Shorts we must know each video's duration, which is not exposed on + // `playlistItems.list`. Resolve it here with a single batched `videos.list` call + // (1 quota unit per page) and emit FULLY-HYDRATED documents. This is deliberate: + // emitting deferred stubs for Shorts would make every excluded Short re-list as a + // brand-new doc on every sync (it is never persisted), re-hydrating to null forever. + // Filtering at listing time bounds the cost to one batched call per page per sync. + const videoMap = await fetchVideosByIds(apiKey, keptItems.map(getVideoId)) + for (const item of keptItems) { + const video = videoMap.get(getVideoId(item)) + // Absent from `videos.list` => private/deleted/region-blocked. Drop it instead of + // emitting a stub that would re-hydrate to null on every sync. 
+ if (!video) continue + const doc = videoToDocument(video, true) + if (doc) documents.push(doc) + } + } else { + for (const item of keptItems) { + const stub = itemToStub(item) + if (stub) documents.push(stub) + } + } + + const totalFetched = previouslyFetched + documents.length + if (syncContext) syncContext.totalDocsFetched = totalFetched + + const hitMax = maxVideos > 0 && totalFetched >= maxVideos + if (hitMax && maxVideos > 0) { + const overflow = totalFetched - maxVideos + if (overflow > 0) documents = documents.slice(0, documents.length - overflow) + if (syncContext) syncContext.totalDocsFetched = maxVideos + } + + const nextPageToken = data.nextPageToken as string | undefined + + // When the `maxVideos` cap stops the listing before the source is exhausted, mark the + // listing as capped so the sync engine does not delete still-present-but-unlisted + // videos from the knowledge base. `stopEarly` (publishedAfter cutoff) is NOT a cap — + // every remaining video is older than the cutoff and intentionally out of scope, so + // those should reconcile (delete) normally. + if (hitMax && Boolean(nextPageToken) && syncContext) { + syncContext.listingCapped = true + } + + const hasMore = !hitMax && !stopEarly && Boolean(nextPageToken) + + return { + documents, + nextCursor: hasMore ? nextPageToken : undefined, + hasMore, + } + }, + + getDocument: async ( + accessToken: string, + sourceConfig: Record, + externalId: string + ): Promise => { + const apiKey = getApiKey(accessToken) + const excludeShorts = String(sourceConfig.excludeShorts ?? 
'') === 'true' + + const url = `${YOUTUBE_API_BASE}/videos?part=snippet,contentDetails,status&id=${encodeURIComponent(externalId)}&key=${encodeURIComponent(apiKey)}` + + try { + const response = await fetchWithRetry(url, { + method: 'GET', + headers: { Accept: 'application/json' }, + }) + + if (!response.ok) { + if (response.status === 403 || response.status === 404) return null + throw new Error(`Failed to get YouTube video: ${response.status}`) + } + + const data = await response.json() + const items = (data.items ?? []) as VideoItem[] + const video = items[0] + + // An empty items array means the video is deleted, private, or region-blocked. + if (!video) return null + + return videoToDocument(video, excludeShorts) + } catch (error) { + logger.warn(`Failed to fetch YouTube video ${externalId}`, { error: toError(error).message }) + return null + } + }, + + validateConfig: async ( + accessToken: string, + sourceConfig: Record + ): Promise<{ valid: boolean; error?: string }> => { + const apiKey = getApiKey(accessToken) + if (!apiKey) { + return { valid: false, error: 'A YouTube Data API key is required' } + } + + const channelId = (sourceConfig.channelId as string | undefined)?.trim() + const playlistId = (sourceConfig.playlistId as string | undefined)?.trim() + if (!channelId && !playlistId) { + return { valid: false, error: 'Provide a channel or a playlistId' } + } + + const maxVideos = sourceConfig.maxVideos as string | undefined + if (maxVideos && (Number.isNaN(Number(maxVideos)) || Number(maxVideos) <= 0)) { + return { valid: false, error: 'Max videos must be a positive number' } + } + + const publishedAfterRaw = (sourceConfig.publishedAfter as string | undefined)?.trim() + if (publishedAfterRaw && Number.isNaN(new Date(publishedAfterRaw).getTime())) { + return { valid: false, error: 'Published After must be a valid date (e.g. 2024-01-01)' } + } + + try { + const resolvedPlaylistId = playlistId + ? 
playlistId + : await resolveUploadsPlaylistId(apiKey, channelId as string, VALIDATE_RETRY_OPTIONS) + + if (!resolvedPlaylistId) { + return { valid: false, error: 'Channel not found or has no uploaded videos' } + } + + const url = `${YOUTUBE_API_BASE}/playlistItems?part=id&maxResults=1&playlistId=${encodeURIComponent(resolvedPlaylistId)}&key=${encodeURIComponent(apiKey)}` + const response = await fetchWithRetry( + url, + { method: 'GET', headers: { Accept: 'application/json' } }, + VALIDATE_RETRY_OPTIONS + ) + + if (!response.ok) { + if (response.status === 403) { + return { + valid: false, + error: + 'API key rejected. Check that the key is valid, has no HTTP referrer/IP restrictions (server-side use requires an unrestricted or IP-allowed key), and that your daily quota is not exhausted.', + } + } + if (response.status === 404) { + return { valid: false, error: 'Playlist not found. Check the playlist or channel ID.' } + } + return { valid: false, error: `Failed to access YouTube: ${response.status}` } + } + + return { valid: true } + } catch (error) { + return { valid: false, error: getErrorMessage(error, 'Failed to validate configuration') } + } + }, + + tagDefinitions: [ + { id: 'channelTitle', displayName: 'Channel', fieldType: 'text' }, + { id: 'publishedAt', displayName: 'Published Date', fieldType: 'date' }, + { id: 'duration', displayName: 'Duration', fieldType: 'text' }, + { id: 'tags', displayName: 'Tags', fieldType: 'text' }, + ], + + /** + * Maps document metadata to tag slots. `duration` and `tags` are only present after + * hydration in `getDocument`; on the listing stub they are absent and simply skipped + * by the guards below. The sync engine only runs `mapTags` on add/update (after + * hydration), so durations/tags are populated when tags are actually written. 
+ */ + mapTags: (metadata: Record): Record => { + const result: Record = {} + + if (typeof metadata.channelTitle === 'string' && metadata.channelTitle.trim()) { + result.channelTitle = metadata.channelTitle + } + + const publishedAt = parseTagDate(metadata.publishedAt) + if (publishedAt) result.publishedAt = publishedAt + + if (typeof metadata.duration === 'string' && metadata.duration.trim()) { + result.duration = metadata.duration + } + + const tags = joinTagArray(metadata.tags) + if (tags) result.tags = tags + + return result + }, +} From 4d84cb60e3591114d1a4dfc2eff5380a00682a73 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 11:41:15 -0700 Subject: [PATCH 02/16] fix(connectors): tighten listingCapped semantics per review (WIQL cap, batch omissions, cap-vs-exhaustion) --- apps/docs/app/global.css | 7 +++ .../connectors/azure-devops/azure-devops.ts | 62 +++++++++++++++---- .../connectors/google-forms/google-forms.ts | 16 +++-- apps/sim/connectors/s3/s3.ts | 5 +- apps/sim/connectors/sentry/sentry.ts | 6 +- apps/sim/connectors/typeform/typeform.ts | 27 +++++--- 6 files changed, 95 insertions(+), 28 deletions(-) diff --git a/apps/docs/app/global.css b/apps/docs/app/global.css index 2bb74df043e..d0645dc8046 100644 --- a/apps/docs/app/global.css +++ b/apps/docs/app/global.css @@ -510,6 +510,13 @@ figure[data-rehype-pretty-code-figure], max-width: 480px !important; } +/* Search dialog overlay + panel must cover the sticky navbar — both default to z-50, + and the navbar wins the tie by DOM order, leaving it unblurred above the overlay */ +.bg-fd-overlay, +[role="dialog"][data-state] { + z-index: 60 !important; +} + pre { font-size: 0.875rem; line-height: 1.7; diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 2f99f9922f5..77383ee28e4 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -871,9 +871,14 @@ async function 
listRepoFiles( const chunk = entries.slice(offset, offset + FILE_BATCH_SIZE) const documents = chunk.map((entry) => fileToStub(organization, project, entry)) - const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext) - const nextOffset = offset + FILE_BATCH_SIZE + const { documents: capped, capped: hitLimit } = applyMaxItemsCap( + documents, + maxItems, + syncContext, + nextOffset < entries.length + ) + const hasMore = !hitLimit && nextOffset < entries.length return { @@ -1003,24 +1008,32 @@ async function getFileDocument( /** * Applies the optional maxItems cap to a batch, tracking the running total in - * syncContext and flagging `listingCapped` when the cap is hit. The sync engine - * reads `listingCapped` to suppress deletion reconciliation on a truncated - * listing — without it, a capped full sync would wrongly delete every source - * document beyond the cap. + * syncContext and flagging `listingCapped` when the cap actually truncated the + * listing. The sync engine reads `listingCapped` to suppress deletion + * reconciliation on a truncated listing — without it, a capped full sync would + * wrongly delete every source document beyond the cap. + * + * `moreAvailable` tells the helper whether the current phase has further items + * beyond this page. The flag is only set when documents were actually dropped + * (this page was sliced, or more pages exist) — when the cap merely coincides + * with source exhaustion, reconciliation stays enabled so deleted source + * documents are still cleaned up. */ function applyMaxItemsCap( documents: ExternalDocument[], maxItems: number, - syncContext: Record | undefined + syncContext: Record | undefined, + moreAvailable: boolean ): { documents: ExternalDocument[]; capped: boolean } { if (maxItems <= 0) return { documents, capped: false } const prevTotal = (syncContext?.totalDocsFetched as number) ?? 
0 const remaining = Math.max(0, maxItems - prevTotal) - const sliced = documents.length > remaining ? documents.slice(0, remaining) : documents + const slicedSome = documents.length > remaining + const sliced = slicedSome ? documents.slice(0, remaining) : documents const newTotal = prevTotal + sliced.length if (syncContext) syncContext.totalDocsFetched = newTotal const capped = newTotal >= maxItems - if (capped && syncContext) syncContext.listingCapped = true + if (capped && (slicedSome || moreAvailable) && syncContext) syncContext.listingCapped = true return { documents: sliced, capped } } @@ -1104,7 +1117,12 @@ async function listWikiPages( documents.push(...stubs) } - const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext) + const { documents: capped, capped: hitLimit } = applyMaxItemsCap( + documents, + maxItems, + syncContext, + Boolean(nextContinuation) || wikiIndex + 1 < wikis.length + ) if (hitLimit) { return { documents: capped, hasMore: false } } @@ -1145,6 +1163,10 @@ async function listWorkItems( if (syncContext) syncContext.workItemIds = ids } + if (ids.length >= WIQL_MAX_RESULTS && syncContext) { + syncContext.listingCapped = true + } + if (ids.length === 0) { return { documents: [], hasMore: false } } @@ -1157,11 +1179,23 @@ async function listWorkItems( const chunk = ids.slice(offset, offset + WORK_ITEM_BATCH_SIZE) const raw = await fetchWorkItemsBatch(accessToken, organization, project, chunk) + if (raw.length < chunk.length && syncContext) { + syncContext.listingCapped = true + logger.warn( + 'workitemsbatch omitted ids that WIQL returned; flagging listing as incomplete so reconciliation skips deletion', + { requested: chunk.length, returned: raw.length, organization, project } + ) + } const documents = raw.map((item) => workItemToDocument(organization, project, item)) - const { documents: capped, capped: hitLimit } = applyMaxItemsCap(documents, maxItems, syncContext) - const nextOffset = offset + 
WORK_ITEM_BATCH_SIZE + const { documents: capped, capped: hitLimit } = applyMaxItemsCap( + documents, + maxItems, + syncContext, + nextOffset < ids.length + ) + const hasMore = !hitLimit && nextOffset < ids.length return { @@ -1426,6 +1460,10 @@ export const azureDevopsConnector: ConnectorConfig = { return { documents, nextCursor: result.nextCursor, hasMore: true } } if (capReached()) { + const remainingPhase = nextPhase(current, contentType) + if (remainingPhase && syncContext) { + syncContext.listingCapped = true + } return { documents, hasMore: false } } current = nextPhase(current, contentType) diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index 547b81e2cfd..560510ee312 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -550,9 +550,13 @@ export const googleFormsConnector: ConnectorConfig = { const data = await response.json() let files = (data.files || []) as DriveFormFile[] + let slicedSome = false if (maxForms > 0) { const remaining = maxForms - previouslyFetched - if (files.length > remaining) files = files.slice(0, remaining) + if (files.length > remaining) { + slicedSome = true + files = files.slice(0, remaining) + } } /** @@ -597,11 +601,13 @@ export const googleFormsConnector: ConnectorConfig = { /** * Mark the listing as incomplete so the sync engine skips deletion * reconciliation. This applies when the `maxForms` cap truncates results - * (forms beyond the cap are not absent from the source) or when a transient - * error caused a still-present form to be dropped from this page — deleting - * those would wipe valid documents from the knowledge base. + * while more forms exist (forms beyond the cap are not absent from the + * source) or when a transient error caused a still-present form to be + * dropped from this page — deleting those would wipe valid documents from + * the knowledge base. 
When the cap merely coincides with source exhaustion + * (no next page), reconciliation stays enabled so deleted forms are cleaned up. */ - if (syncContext && (hitLimit || skippedOnError)) { + if (syncContext && ((hitLimit && (slicedSome || Boolean(nextPageToken))) || skippedOnError)) { syncContext.listingCapped = true } diff --git a/apps/sim/connectors/s3/s3.ts b/apps/sim/connectors/s3/s3.ts index 058969f9b42..b5036e84999 100644 --- a/apps/sim/connectors/s3/s3.ts +++ b/apps/sim/connectors/s3/s3.ts @@ -581,9 +581,11 @@ export const s3Connector: ConnectorConfig = { .filter((entry) => entry.size > 0 && entry.size <= MAX_FILE_SIZE) .map((entry) => objectToStub(ctx, entry)) + let slicedSome = false if (maxObjects > 0) { const remaining = maxObjects - previouslyFetched if (documents.length > remaining) { + slicedSome = true documents = documents.slice(0, remaining) } } @@ -591,7 +593,8 @@ export const s3Connector: ConnectorConfig = { const totalFetched = previouslyFetched + documents.length if (syncContext) syncContext.totalDocsFetched = totalFetched const hitLimit = maxObjects > 0 && totalFetched >= maxObjects - if (hitLimit && syncContext) syncContext.listingCapped = true + const moreAvailable = slicedSome || (isTruncated && Boolean(nextContinuationToken)) + if (hitLimit && moreAvailable && syncContext) syncContext.listingCapped = true return { documents, diff --git a/apps/sim/connectors/sentry/sentry.ts b/apps/sim/connectors/sentry/sentry.ts index eb17f6e8115..55838ecda5b 100644 --- a/apps/sim/connectors/sentry/sentry.ts +++ b/apps/sim/connectors/sentry/sentry.ts @@ -525,9 +525,11 @@ export const sentryConnector: ConnectorConfig = { const prevFetched = (syncContext?.totalDocsFetched as number) ?? 
0 let documents = issues.map(issueToStub) + let slicedSome = false if (maxIssues > 0) { const remaining = Math.max(0, maxIssues - prevFetched) if (documents.length > remaining) { + slicedSome = true documents = documents.slice(0, remaining) } } @@ -535,9 +537,11 @@ export const sentryConnector: ConnectorConfig = { const totalFetched = prevFetched + documents.length if (syncContext) syncContext.totalDocsFetched = totalFetched const hitLimit = maxIssues > 0 && totalFetched >= maxIssues - if (hitLimit && syncContext) syncContext.listingCapped = true const nextCursor = parseNextCursor(response.headers.get('Link')) + if (hitLimit && (slicedSome || Boolean(nextCursor)) && syncContext) { + syncContext.listingCapped = true + } const hasMore = !hitLimit && Boolean(nextCursor) return { diff --git a/apps/sim/connectors/typeform/typeform.ts b/apps/sim/connectors/typeform/typeform.ts index 0df206348bb..d4fd3d2c02b 100644 --- a/apps/sim/connectors/typeform/typeform.ts +++ b/apps/sim/connectors/typeform/typeform.ts @@ -424,9 +424,13 @@ export const typeformConnector: ConnectorConfig = { * cap is hit because `hasMore` becomes false. */ let cappedItems = items + let slicedSome = false if (maxResponses > 0) { const remaining = Math.max(0, maxResponses - prevTotal) - if (items.length > remaining) cappedItems = items.slice(0, remaining) + if (items.length > remaining) { + slicedSome = true + cappedItems = items.slice(0, remaining) + } } const documents: ExternalDocument[] = cappedItems.map((item) => @@ -437,13 +441,6 @@ export const typeformConnector: ConnectorConfig = { if (syncContext) syncContext.totalDocsFetched = totalFetched const hitLimit = maxResponses > 0 && totalFetched >= maxResponses - /** - * Signal a truncated listing so the engine skips deletion reconciliation: - * a capped page does not represent the full set of responses, and deleting - * everything past the cap would wipe still-present docs from the KB. 
- */ - if (hitLimit && syncContext) syncContext.listingCapped = true - /** * The `before` cursor is the response `token` (not `response_id`). Each full * page advances to the oldest token seen so the next request pages strictly @@ -452,7 +449,19 @@ export const typeformConnector: ConnectorConfig = { */ const lastItem = items[items.length - 1] const nextCursor = lastItem?.token - const hasMore = !hitLimit && items.length === RESPONSES_PER_PAGE && Boolean(nextCursor) + const sourceHasMore = items.length === RESPONSES_PER_PAGE && Boolean(nextCursor) + + /** + * Signal a truncated listing so the engine skips deletion reconciliation — + * but only when the cap actually dropped responses (this page was sliced, or + * the source had more pages). If the cap merely coincides with source + * exhaustion, reconciliation stays enabled so deleted responses are cleaned up. + */ + if (hitLimit && (slicedSome || sourceHasMore) && syncContext) { + syncContext.listingCapped = true + } + + const hasMore = !hitLimit && sourceHasMore return { documents, From 597d408b91ab314d4fe7d88153259790d63fd47d Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 11:51:10 -0700 Subject: [PATCH 03/16] fix(connectors): google-forms listingCapped must fire on slice regardless of hitLimit (404-null-filter gap) --- .../connectors/google-forms/google-forms.ts | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index 560510ee312..b82b8b19b14 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -600,14 +600,20 @@ export const googleFormsConnector: ConnectorConfig = { /** * Mark the listing as incomplete so the sync engine skips deletion - * reconciliation. 
This applies when the `maxForms` cap truncates results - * while more forms exist (forms beyond the cap are not absent from the - * source) or when a transient error caused a still-present form to be - * dropped from this page — deleting those would wipe valid documents from - * the knowledge base. When the cap merely coincides with source exhaustion - * (no next page), reconciliation stays enabled so deleted forms are cleaned up. + * reconciliation. Three cases drop still-existing forms from the listing: + * - `slicedSome`: this page held more forms than the `maxForms` cap allowed, + * so forms beyond the slice were truncated. This is independent of + * `hitLimit`, which counts successfully fetched stubs and can fall below + * the cap when 404s or errors null out items even though real forms were + * sliced off. + * - `hitLimit` with a next page: the cap was reached while more pages of + * forms remain in the source. + * - `skippedOnError`: a transient error dropped a still-present form. + * Deleting any of those would wipe valid documents from the knowledge base. + * When the cap merely coincides with source exhaustion (no slice, no next + * page), reconciliation stays enabled so deleted forms are cleaned up. 
*/ - if (syncContext && ((hitLimit && (slicedSome || Boolean(nextPageToken))) || skippedOnError)) { + if (syncContext && (slicedSome || (hitLimit && Boolean(nextPageToken)) || skippedOnError)) { syncContext.listingCapped = true } From f87d05d55249901c84fba03efd1a84124b733f47 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 11:57:24 -0700 Subject: [PATCH 04/16] fix(connectors): s3 streaming size cap for chunked responses without content-length --- apps/sim/connectors/s3/s3.ts | 47 ++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/apps/sim/connectors/s3/s3.ts b/apps/sim/connectors/s3/s3.ts index b5036e84999..17e28387bae 100644 --- a/apps/sim/connectors/s3/s3.ts +++ b/apps/sim/connectors/s3/s3.ts @@ -329,6 +329,36 @@ function buildListQueryString(params: Record): string { .join('&') } +/** + * Reads a response body as UTF-8 text while enforcing a hard byte cap. The + * declared `content-length` header cannot be trusted as the sole guard: + * S3-compatible stores (MinIO, Cloudflare R2) may use chunked transfer + * encoding and omit the header entirely. Bytes are accumulated from the + * stream and reading aborts as soon as the cap is exceeded, so an oversized + * body is never fully buffered. Returns null when the cap is exceeded. + */ +async function readBodyWithLimit(response: Response, maxBytes: number): Promise { + if (!response.body) { + const text = await response.text() + return Buffer.byteLength(text) > maxBytes ? null : text + } + + const reader = response.body.getReader() + const chunks: Uint8Array[] = [] + let total = 0 + while (true) { + const { done, value } = await reader.read() + if (done) break + total += value.byteLength + if (total > maxBytes) { + await reader.cancel().catch(() => {}) + return null + } + chunks.push(value) + } + return Buffer.concat(chunks).toString('utf-8') +} + /** * Decodes XML entities found in S3 response text values. 
`&amp;` is decoded last so sequences like `&amp;lt;` resolve to `&lt;` rather than `<`. @@ -626,21 +656,28 @@ export const s3Connector: ConnectorConfig = { const etag = normalizeEtag(response.headers.get('etag') ?? '') const lastModified = response.headers.get('last-modified') ?? '' - const contentLength = Number(response.headers.get('content-length') ?? '0') + const declaredLength = Number(response.headers.get('content-length') ?? '') - if (contentLength > MAX_FILE_SIZE) { - logger.warn('Skipping oversized S3 object', { key, size: contentLength }) + if (declaredLength > MAX_FILE_SIZE) { + logger.warn('Skipping oversized S3 object', { key, size: declaredLength }) return null } - const content = await response.text() + const content = await readBodyWithLimit(response, MAX_FILE_SIZE) + if (content === null) { + logger.warn('Skipping oversized S3 object (size cap exceeded while streaming)', { key }) + return null + } if (!content.trim()) return null const entry: S3ObjectEntry = { key, etag, lastModified, - size: Number.isNaN(contentLength) ? 0 : contentLength, + size: + Number.isNaN(declaredLength) || declaredLength <= 0 + ? 
Buffer.byteLength(content) + : declaredLength, } const stub = objectToStub(ctx, entry) return { ...stub, content, contentDeferred: false } From 6b668551b62a0efaf7ea7525a86342791f20bac0 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 12:06:56 -0700 Subject: [PATCH 05/16] fix(connectors): ado byte-exact file content fetch, google-forms hash-poisoning on listing failure --- .../connectors/azure-devops/azure-devops.ts | 42 ++++++++++++++----- .../connectors/google-forms/google-forms.ts | 18 +++++--- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 77383ee28e4..7e4df0590d8 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -943,35 +943,57 @@ async function getFileDocument( return null } - const params = new URLSearchParams({ + const metadataParams = new URLSearchParams({ path, 'versionDescriptor.version': branch, 'versionDescriptor.versionType': 'Branch', - includeContent: 'true', includeContentMetadata: 'true', $format: 'json', 'api-version': GIT_API_VERSION, }) - const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories/${encodeURIComponent(repoId)}/items?${params.toString()}` - const response = await fetchWithRetry(url, { + const metadataUrl = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories/${encodeURIComponent(repoId)}/items?${metadataParams.toString()}` + const metadataResponse = await fetchWithRetry(metadataUrl, { method: 'GET', headers: { Accept: 'application/json', Authorization: patAuthHeader(accessToken) }, }) - if (!response.ok) { - if (response.status === 404) return null - throw new Error(`Failed to fetch repository file: ${response.status}`) + if (!metadataResponse.ok) { + if (metadataResponse.status === 404) return null + throw new 
Error(`Failed to fetch repository file metadata: ${metadataResponse.status}`) } - const item = (await response.json()) as GitItem + const item = (await metadataResponse.json()) as GitItem if (!item.objectId) return null if (item.contentMetadata?.isBinary) { logger.info('Skipping binary Azure DevOps file', { path }) return null } - const raw = typeof item.content === 'string' ? item.content : '' - const buffer = Buffer.from(raw, 'utf8') + /** + * Content is fetched as raw bytes (Accept: application/octet-stream) rather + * than via `includeContent=true` JSON. The JSON `content` field's encoding is + * ambiguous (the API may deliver base64 or codepage-transcoded text per + * `ItemContentType`), whereas the octet-stream response is the byte-exact git + * blob, which is then binary-sniffed and decoded as UTF-8. + */ + const contentParams = new URLSearchParams({ + path, + 'versionDescriptor.version': branch, + 'versionDescriptor.versionType': 'Branch', + 'api-version': GIT_API_VERSION, + }) + const contentUrl = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories/${encodeURIComponent(repoId)}/items?${contentParams.toString()}` + const contentResponse = await fetchWithRetry(contentUrl, { + method: 'GET', + headers: { Accept: 'application/octet-stream', Authorization: patAuthHeader(accessToken) }, + }) + + if (!contentResponse.ok) { + if (contentResponse.status === 404) return null + throw new Error(`Failed to fetch repository file content: ${contentResponse.status}`) + } + + const buffer = Buffer.from(await contentResponse.arrayBuffer()) if (isBinaryBuffer(buffer)) { logger.info('Skipping binary Azure DevOps file', { path }) return null diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index b82b8b19b14..83342bf07a6 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -275,6 +275,8 @@ async 
function fetchFormResponses(accessToken: string, formId: string): Promise< * Reads the latest response submission time for change detection without * retaining every response. Returns the greatest `lastSubmittedTime` (falling * back to `createTime`) across all responses, or undefined when there are none. + * Throws on a failed read so the caller skips the form for this run instead of + * computing a hash from incomplete data. */ async function fetchLatestResponseTime( accessToken: string, @@ -293,13 +295,17 @@ async function fetchLatestResponseTime( if (!response.ok) { /** - * Treat response-listing failures as "no responses" for hashing purposes - * so a transient error never silently drops the form from the sync. + * Propagate the failure rather than hashing with an empty response segment. + * A swallowed error here would poison the stub's content hash (listing + * would hash "no responses" while getDocument hashes the real latest + * submission time), making the form re-process on every sync. Throwing lets + * the per-form catch in listDocuments skip the form for this run and set + * `skippedOnError` → `listingCapped`, so the form is neither deleted nor + * hashed incorrectly. 
*/ - logger.warn(`Failed to read responses for change detection on form ${formId}`, { - status: response.status, - }) - return undefined + throw new Error( + `Failed to read responses for change detection on form ${formId}: ${response.status}` + ) } const data = (await response.json()) as FormResponseList From 8355b80699795d51f1fedc0eba7efab03336af78 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 12:19:54 -0700 Subject: [PATCH 06/16] fix(connectors): ado auth-failure deletion guard, jsm last-page slice flag, google-forms response cap in hash --- .../connectors/azure-devops/azure-devops.ts | 51 ++++++++++++++++--- .../connectors/google-forms/google-forms.ts | 26 ++++++++-- apps/sim/connectors/jsm/jsm.ts | 8 ++- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 7e4df0590d8..218e22d7628 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -287,7 +287,8 @@ async function listWikis( accessToken: string, organization: string, project: string, - retryOptions?: Parameters[2] + retryOptions?: Parameters[2], + syncContext?: Record ): Promise { const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wiki/wikis?api-version=${WIKIS_LIST_API_VERSION}` const response = await fetchWithRetry( @@ -300,6 +301,15 @@ async function listWikis( ) if (!response.ok) { if (response.status === 401 || response.status === 403 || response.status === 404) { + /** + * 401/403 mean the wikis still exist but this PAT cannot read them right + * now — flag the listing as incomplete so reconciliation does not delete + * previously synced wiki pages. A 404 means the wiki feature/content is + * genuinely absent, so reconciliation stays enabled. 
+ */ + if ((response.status === 401 || response.status === 403) && syncContext) { + syncContext.listingCapped = true + } logger.warn('Azure DevOps wikis unavailable; skipping wiki listing', { organization, project, @@ -327,7 +337,7 @@ async function resolveWikis( ): Promise { const cached = syncContext?.wikis as WikiV2[] | undefined if (cached) return cached - const wikis = await listWikis(accessToken, organization, project) + const wikis = await listWikis(accessToken, organization, project, undefined, syncContext) if (syncContext) syncContext.wikis = wikis return wikis } @@ -642,7 +652,8 @@ async function listRepositories( accessToken: string, organization: string, project: string, - retryOptions?: Parameters[2] + retryOptions?: Parameters[2], + syncContext?: Record ): Promise { const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/git/repositories?api-version=${GIT_API_VERSION}` const response = await fetchWithRetry( @@ -655,6 +666,15 @@ async function listRepositories( ) if (!response.ok) { if (response.status === 401 || response.status === 403 || response.status === 404) { + /** + * 401/403 mean repositories still exist but this PAT cannot read them + * right now — flag the listing as incomplete so reconciliation does not + * delete previously synced repository files. A 404 means the Git feature + * is genuinely absent, so reconciliation stays enabled. + */ + if ((response.status === 401 || response.status === 403) && syncContext) { + syncContext.listingCapped = true + } logger.warn('Azure DevOps repositories unavailable; skipping file listing', { organization, project, @@ -686,7 +706,8 @@ async function resolveRepositories( syncContext?: Record ): Promise { const cached = syncContext?.repositories as GitRepository[] | undefined - const all = cached ?? (await listRepositories(accessToken, organization, project)) + const all = + cached ?? 
(await listRepositories(accessToken, organization, project, undefined, syncContext)) if (syncContext && !cached) syncContext.repositories = all const needle = repositoryFilter.toLowerCase() @@ -707,7 +728,8 @@ async function listRepositoryBlobs( organization: string, project: string, repoId: string, - branch: string + branch: string, + syncContext?: Record ): Promise { const params = new URLSearchParams({ recursionLevel: 'Full', @@ -722,6 +744,16 @@ async function listRepositoryBlobs( }) if (!response.ok) { if (response.status === 401 || response.status === 403 || response.status === 404) { + /** + * 401/403 mean the repository's files still exist but this PAT cannot + * read them right now — flag the listing as incomplete so reconciliation + * does not delete previously synced files. A 404 means the branch/repo + * content is genuinely absent (empty repo, deleted branch), so + * reconciliation stays enabled. + */ + if ((response.status === 401 || response.status === 403) && syncContext) { + syncContext.listingCapped = true + } logger.warn('Azure DevOps repository items unavailable; skipping repository', { repoId, branch, @@ -824,7 +856,14 @@ async function resolveRepoFiles( }) continue } - const blobs = await listRepositoryBlobs(accessToken, organization, project, repo.id, branch) + const blobs = await listRepositoryBlobs( + accessToken, + organization, + project, + repo.id, + branch, + syncContext + ) for (const item of blobs) { if (normalizedPrefix && !item.path.startsWith(normalizedPrefix)) continue if (!matchesExtension(item.path, filters.extensions)) continue diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index 83342bf07a6..02f187caaf6 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -143,6 +143,18 @@ interface FormStubInput { revisionId?: string latestResponseTime?: string contentScope: ContentScope + responseCap: number +} 
+ +/** + * Resolves the effective per-form response cap applied when rendering content: + * the user-configured `maxResponsesPerForm` clamped to the hard + * `MAX_RESPONSES_PER_FORM` ceiling. Part of the content hash so changing the + * cap re-syncs every form (the rendered content depends on it). + */ +function resolveResponseCap(sourceConfig: Record): number { + const configured = parsePositiveInt(sourceConfig.maxResponsesPerForm) + return configured > 0 ? Math.min(configured, MAX_RESPONSES_PER_FORM) : MAX_RESPONSES_PER_FORM } /** @@ -334,7 +346,10 @@ function latestResponseTime(responses: FormResponse[]): string | undefined { * forces a re-sync of every document. */ function formContentHash(input: FormStubInput): string { - const responsePart = input.contentScope === 'both' ? (input.latestResponseTime ?? '') : 'none' + const responsePart = + input.contentScope === 'both' + ? `${input.latestResponseTime ?? ''}:${input.responseCap}` + : 'none' return `gforms:${input.file.id}:${input.contentScope}:${input.revisionId ?? ''}:${responsePart}` } @@ -514,6 +529,7 @@ export const googleFormsConnector: ConnectorConfig = { ): Promise => { const maxForms = parsePositiveInt(sourceConfig.maxForms) const contentScope = resolveContentScope(sourceConfig.contentScope) + const responseCap = resolveResponseCap(sourceConfig) const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0 if (maxForms > 0 && previouslyFetched >= maxForms) { @@ -586,6 +602,7 @@ export const googleFormsConnector: ConnectorConfig = { revisionId: form.revisionId, latestResponseTime: latest, contentScope, + responseCap, }) } catch (error) { skippedOnError = true @@ -661,16 +678,14 @@ export const googleFormsConnector: ConnectorConfig = { const form = await fetchFormStructure(accessToken, file.id) if (!form) return null - const maxResponses = parsePositiveInt(sourceConfig.maxResponsesPerForm) + const responseCap = resolveResponseCap(sourceConfig) const fetched = contentScope === 'both' ? 
await fetchFormResponses(accessToken, file.id) : { responses: [], latestSubmittedTime: undefined } const responses = fetched.responses const cappedResponses = - maxResponses > 0 && responses.length > maxResponses - ? responses.slice(0, maxResponses) - : responses + responses.length > responseCap ? responses.slice(0, responseCap) : responses const content = renderFormDocument(form, cappedResponses) if (!content.trim()) return null @@ -681,6 +696,7 @@ export const googleFormsConnector: ConnectorConfig = { revisionId: form.revisionId, latestResponseTime: fetched.latestSubmittedTime, contentScope, + responseCap, }) return { ...stub, content, contentDeferred: false } } catch (error) { diff --git a/apps/sim/connectors/jsm/jsm.ts b/apps/sim/connectors/jsm/jsm.ts index 83af8ad1d27..9a7f28341cc 100644 --- a/apps/sim/connectors/jsm/jsm.ts +++ b/apps/sim/connectors/jsm/jsm.ts @@ -518,7 +518,9 @@ export const jsmConnector: ConnectorConfig = { const data = (await response.json()) as JsmPage let requests = data.values ?? [] + let slicedSome = false if (maxRequests > 0 && requests.length > remaining) { + slicedSome = true requests = requests.slice(0, remaining) } @@ -533,8 +535,12 @@ export const jsmConnector: ConnectorConfig = { * When `maxRequests` truncates the listing before the source is exhausted, * flag the run as capped so the sync engine skips deletion reconciliation — * otherwise unseen requests beyond the cap would be deleted on every sync. + * `slicedSome` covers truncation on the final page: requests dropped from + * this page still exist even when `isLastPage` is true. (The requested + * `limit` never exceeds the remaining budget, so a slice should be + * impossible — this is defense in depth against the API over-returning.) 
*/ - if (reachedCap && !data.isLastPage && syncContext) { + if (((reachedCap && !data.isLastPage) || slicedSome) && syncContext) { syncContext.listingCapped = true } From 38233962c4ee5a6d6e4d138923ccc78e51ec7f54 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 12:33:15 -0700 Subject: [PATCH 07/16] fix(connectors): shared streaming size-cap reader for ado file hydration (promote from s3) --- .../connectors/azure-devops/azure-devops.ts | 12 +++--- apps/sim/connectors/s3/s3.ts | 37 ++----------------- apps/sim/connectors/utils.ts | 33 +++++++++++++++++ 3 files changed, 43 insertions(+), 39 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 218e22d7628..9a24194517a 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -3,7 +3,7 @@ import { getErrorMessage, toError } from '@sim/utils/errors' import { AzureDevOpsIcon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils' +import { htmlToPlainText, joinTagArray, parseTagDate, readBodyWithLimit } from '@/connectors/utils' const logger = createLogger('AzureDevOpsConnector') @@ -1032,13 +1032,13 @@ async function getFileDocument( throw new Error(`Failed to fetch repository file content: ${contentResponse.status}`) } - const buffer = Buffer.from(await contentResponse.arrayBuffer()) - if (isBinaryBuffer(buffer)) { - logger.info('Skipping binary Azure DevOps file', { path }) + const buffer = await readBodyWithLimit(contentResponse, MAX_FILE_SIZE) + if (buffer === null) { + logger.info('Skipping oversized Azure DevOps file', { path }) return null } - if (buffer.byteLength > MAX_FILE_SIZE) { - logger.info('Skipping oversized Azure DevOps 
file', { path, size: buffer.byteLength }) + if (isBinaryBuffer(buffer)) { + logger.info('Skipping binary Azure DevOps file', { path }) return null } diff --git a/apps/sim/connectors/s3/s3.ts b/apps/sim/connectors/s3/s3.ts index 17e28387bae..0367b3bccba 100644 --- a/apps/sim/connectors/s3/s3.ts +++ b/apps/sim/connectors/s3/s3.ts @@ -4,7 +4,7 @@ import { getErrorMessage, toError } from '@sim/utils/errors' import { S3Icon } from '@/components/icons' import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils' import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types' -import { parseTagDate } from '@/connectors/utils' +import { parseTagDate, readBodyWithLimit } from '@/connectors/utils' import { encodeS3PathComponent, getSignatureKey } from '@/tools/s3/utils' const logger = createLogger('S3Connector') @@ -329,36 +329,6 @@ function buildListQueryString(params: Record): string { .join('&') } -/** - * Reads a response body as UTF-8 text while enforcing a hard byte cap. The - * declared `content-length` header cannot be trusted as the sole guard: - * S3-compatible stores (MinIO, Cloudflare R2) may use chunked transfer - * encoding and omit the header entirely. Bytes are accumulated from the - * stream and reading aborts as soon as the cap is exceeded, so an oversized - * body is never fully buffered. Returns null when the cap is exceeded. - */ -async function readBodyWithLimit(response: Response, maxBytes: number): Promise { - if (!response.body) { - const text = await response.text() - return Buffer.byteLength(text) > maxBytes ? 
null : text - } - - const reader = response.body.getReader() - const chunks: Uint8Array[] = [] - let total = 0 - while (true) { - const { done, value } = await reader.read() - if (done) break - total += value.byteLength - if (total > maxBytes) { - await reader.cancel().catch(() => {}) - return null - } - chunks.push(value) - } - return Buffer.concat(chunks).toString('utf-8') -} - /** * Decodes XML entities found in S3 response text values. `&amp;` is decoded * last so sequences like `&amp;lt;` resolve to `&lt;` rather than `<`. @@ -663,11 +633,12 @@ export const s3Connector: ConnectorConfig = { return null } - const content = await readBodyWithLimit(response, MAX_FILE_SIZE) - if (content === null) { + const body = await readBodyWithLimit(response, MAX_FILE_SIZE) + if (body === null) { logger.warn('Skipping oversized S3 object (size cap exceeded while streaming)', { key }) return null } + const content = body.toString('utf-8') if (!content.trim()) return null const entry: S3ObjectEntry = { diff --git a/apps/sim/connectors/utils.ts b/apps/sim/connectors/utils.ts index 391d3a590f8..cc78c3e680d 100644 --- a/apps/sim/connectors/utils.ts +++ b/apps/sim/connectors/utils.ts @@ -78,3 +78,36 @@ export function parseMultiValue(value: unknown): string[] { } return [] } + +/** + * Reads a response body into a Buffer while enforcing a hard byte cap. The + * declared `content-length` header cannot be trusted as the sole guard — + * chunked transfer encoding may omit it entirely — so bytes are accumulated + * from the stream and reading aborts as soon as the cap is exceeded, ensuring + * an oversized (or hostile) body is never fully buffered into memory. + * Returns null when the cap is exceeded. + */ +export async function readBodyWithLimit( + response: Response, + maxBytes: number +): Promise<Buffer | null> { + if (!response.body) { + const buffer = Buffer.from(await response.arrayBuffer()) + return buffer.byteLength > maxBytes ?
null : buffer + } + + const reader = response.body.getReader() + const chunks: Uint8Array[] = [] + let total = 0 + while (true) { + const { done, value } = await reader.read() + if (done) break + total += value.byteLength + if (total > maxBytes) { + await reader.cancel().catch(() => {}) + return null + } + chunks.push(value) + } + return Buffer.concat(chunks) +} From b24b22337d6ec8770802c3062e470244b08acdf6 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:02:38 -0700 Subject: [PATCH 08/16] fix(knowledge): flag incomplete listings at engine level when pagination is truncated --- apps/sim/lib/knowledge/connectors/sync-engine.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index 7ae8cfe38fa..66663223a1d 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -415,6 +415,21 @@ export async function executeSync( hasMore = page.hasMore } + if (hasMore) { + /** + * Pagination stopped before the source was exhausted — either the + * MAX_PAGES guard tripped or the connector reported hasMore without a + * cursor. The listing is incomplete, so flag it to suppress deletion + * reconciliation; otherwise documents beyond the truncation point would + * be removed even though they still exist in the source. 
+ */ + syncContext.listingCapped = true + logger.warn('Pagination ended before source exhaustion; skipping deletion reconciliation', { + connectorId, + docsSoFar: externalDocs.length, + }) + } + logger.info(`Fetched ${externalDocs.length} documents from ${connectorConfig.name}`, { connectorId, }) From 5f4516b1bc91dbe703ef2099f7f7b0ab937b1e7c Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:15:24 -0700 Subject: [PATCH 09/16] fix(connectors): ado flags listing incomplete when a non-empty repo has no resolvable branch --- apps/sim/connectors/azure-devops/azure-devops.ts | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 9a24194517a..e49e437e697 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -220,6 +220,7 @@ interface GitRepository { isDisabled?: boolean remoteUrl?: string webUrl?: string + size?: number } interface GitItem { @@ -850,9 +851,23 @@ async function resolveRepoFiles( for (const repo of repositories) { const branch = filters.branch || stripRefsHeads(repo.defaultBranch ?? '') if (!branch) { + /** + * No branch override and no resolvable default branch. An empty + * repository (size 0) has nothing to list and nothing previously synced, + * so it is skipped without flagging — flagging here would permanently + * suppress deletion reconciliation for any project containing an empty + * repo. A non-empty repository reaching this branch means content exists + * but its default branch ref is missing/unreadable, so the listing is + * flagged incomplete to protect previously synced files from + * reconciliation deletion. + */ + if ((repo.size ?? 0) > 0 && syncContext) { + syncContext.listingCapped = true + } logger.warn('Skipping Azure DevOps repository with no default branch', { repoId: repo.id, repoName: repo.name, + size: repo.size ?? 
0, }) continue } From 1f1f1af8b2c8ad347fbd936af030a8620ad85c31 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:27:54 -0700 Subject: [PATCH 10/16] fix(knowledge): engine truncation flag is an absolute deletion block (fullSync cannot override); s3 byte-exact size fallback; ado tsdoc accuracy --- .../connectors/azure-devops/azure-devops.ts | 25 ++++++++++++++----- apps/sim/connectors/s3/s3.ts | 4 +-- .../lib/knowledge/connectors/sync-engine.ts | 15 ++++++++++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index e49e437e697..560fc01a6ee 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -25,8 +25,10 @@ const WIKI_ETAG_CONCURRENCY = 5 const FILE_BATCH_SIZE = 100 /** * Max repository file size to index. The Items list API does not return file - * size, so this cap is enforced at content-fetch time in getDocument via the - * decoded byte length. Larger files are skipped. + * size, so this cap is enforced at content-fetch time in getDocument: the raw + * octet-stream body is read through `readBodyWithLimit`, which streams the bytes + * and aborts (returning null) the moment the cap is exceeded. Larger files are + * skipped without being fully buffered. */ const MAX_FILE_SIZE = 10 * 1024 * 1024 /** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */ @@ -197,10 +199,21 @@ function parseFileExternalId(externalId: string): { repoId: string; path: string /** * Builds the change-detection hash for a repository file. The git blob objectId - * is content-addressable, so it changes exactly when the file content changes — - * and it is available both on the tree listing (`objectId`) and the file fetch - * (`objectId`), so the stub and hydrated document hash identically without a - * content fetch during listing. 
+ * is content-addressable, so it changes exactly when the file content changes, + * and it is reported both by the tree listing (`objectId`) and the per-file + * metadata fetch (`objectId`) — so the listing stub and the hydrated document + * normally hash identically without a content fetch during listing. + * + * Hydration in getFileDocument is a two-step fetch against the same branch ref: + * a JSON metadata call yields the objectId used for this hash, then a raw + * octet-stream call yields the content. Both pin to the branch *name*, not a + * commit SHA, so if the branch advances between the two calls the stored hash + * (metadata call's objectId) and the stored content (content call's bytes) can + * be one commit apart. This window is bounded and self-heals: the next listing + * reports the branch's current objectId, which differs from the stored + * one-commit-old hash, queuing an update that re-fetches and re-converges + * content and hash. (A revert to identical bytes yields the identical objectId + * by content-addressing, so the stored content is already correct in that case.) */ function buildFileContentHash(repoId: string, objectId: string): string { return `ado:file:${repoId}:${objectId}` diff --git a/apps/sim/connectors/s3/s3.ts b/apps/sim/connectors/s3/s3.ts index 0367b3bccba..2b2ac1843aa 100644 --- a/apps/sim/connectors/s3/s3.ts +++ b/apps/sim/connectors/s3/s3.ts @@ -646,9 +646,7 @@ export const s3Connector: ConnectorConfig = { etag, lastModified, size: - Number.isNaN(declaredLength) || declaredLength <= 0 - ? Buffer.byteLength(content) - : declaredLength, + Number.isNaN(declaredLength) || declaredLength <= 0 ? 
body.byteLength : declaredLength, } const stub = objectToStub(ctx, entry) return { ...stub, content, contentDeferred: false } diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index 66663223a1d..641a30e628a 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -422,8 +422,15 @@ export async function executeSync( * cursor. The listing is incomplete, so flag it to suppress deletion * reconciliation; otherwise documents beyond the truncation point would * be removed even though they still exist in the source. + * + * `listingTruncated` is distinct from connector-set `listingCapped`: + * a forced fullSync may legitimately override a connector's soft cap + * (the user opted to reconcile the capped scope), but engine-level + * truncation can never be resolved by forcing a fullSync — the next + * fullSync truncates identically — so it is an absolute deletion block. */ syncContext.listingCapped = true + syncContext.listingTruncated = true logger.warn('Pagination ended before source exhaustion; skipping deletion reconciliation', { connectorId, docsSoFar: externalDocs.length, @@ -652,7 +659,13 @@ export async function executeSync( // Reconcile deletions for non-incremental syncs that returned ALL docs. // Skip when listing was capped (maxFiles/maxThreads) — unseen docs may still exist in the source. - if (!isIncremental && (!syncContext?.listingCapped || options?.fullSync)) { + // A forced fullSync overrides connector-set caps, but never engine-level truncation + // (listingTruncated): a truncated listing is incomplete no matter how the sync was triggered. 
+ if ( + !isIncremental && + !syncContext?.listingTruncated && + (!syncContext?.listingCapped || options?.fullSync) + ) { const removedIds = existingDocs .filter((d) => d.externalId && !seenExternalIds.has(d.externalId)) .map((d) => d.id) From 847a4993cbe06ce47565d0ff0ff7edb17864a4a7 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:32:51 -0700 Subject: [PATCH 11/16] improvement(knowledge): extract shouldReconcileDeletions gate as tested pure function, tighten engine comments --- .../knowledge/connectors/sync-engine.test.ts | 40 ++++++++++++++++ .../lib/knowledge/connectors/sync-engine.ts | 47 +++++++++++-------- 2 files changed, 67 insertions(+), 20 deletions(-) diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.test.ts b/apps/sim/lib/knowledge/connectors/sync-engine.test.ts index 6eb786ce7d8..0e6494bbfe4 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.test.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.test.ts @@ -37,6 +37,46 @@ vi.mock('@/connectors/registry', () => ({ }, })) +describe('shouldReconcileDeletions', () => { + it('runs on a clean full listing', async () => { + const { shouldReconcileDeletions } = await import('@/lib/knowledge/connectors/sync-engine') + + expect(shouldReconcileDeletions(false, {}, undefined)).toBe(true) + expect(shouldReconcileDeletions(false, undefined, undefined)).toBe(true) + }) + + it('never runs on incremental syncs', async () => { + const { shouldReconcileDeletions } = await import('@/lib/knowledge/connectors/sync-engine') + + expect(shouldReconcileDeletions(true, {}, undefined)).toBe(false) + expect(shouldReconcileDeletions(true, {}, true)).toBe(false) + expect(shouldReconcileDeletions(true, { listingCapped: true }, true)).toBe(false) + }) + + it('skips when a connector capped the listing', async () => { + const { shouldReconcileDeletions } = await import('@/lib/knowledge/connectors/sync-engine') + + expect(shouldReconcileDeletions(false, { listingCapped: true }, undefined)).toBe(false) 
+ expect(shouldReconcileDeletions(false, { listingCapped: true }, false)).toBe(false) + }) + + it('lets a forced fullSync override a connector cap', async () => { + const { shouldReconcileDeletions } = await import('@/lib/knowledge/connectors/sync-engine') + + expect(shouldReconcileDeletions(false, { listingCapped: true }, true)).toBe(true) + }) + + it('never runs when the engine truncated pagination, even on a forced fullSync', async () => { + const { shouldReconcileDeletions } = await import('@/lib/knowledge/connectors/sync-engine') + + expect(shouldReconcileDeletions(false, { listingTruncated: true }, undefined)).toBe(false) + expect(shouldReconcileDeletions(false, { listingTruncated: true }, true)).toBe(false) + expect( + shouldReconcileDeletions(false, { listingCapped: true, listingTruncated: true }, true) + ).toBe(false) + }) +}) + describe('resolveTagMapping', () => { beforeEach(() => { vi.clearAllMocks() diff --git a/apps/sim/lib/knowledge/connectors/sync-engine.ts b/apps/sim/lib/knowledge/connectors/sync-engine.ts index 641a30e628a..7da5d4b956f 100644 --- a/apps/sim/lib/knowledge/connectors/sync-engine.ts +++ b/apps/sim/lib/knowledge/connectors/sync-engine.ts @@ -128,6 +128,27 @@ async function completeSyncLog( .where(eq(knowledgeConnectorSyncLog.id, syncLogId)) } +/** + * Decides whether deletion reconciliation may run for a sync. 
+ * + * Reconciliation hard-deletes every stored document absent from the listing, + * so it must only run against a complete source set: + * - never on incremental syncs (they list only changed documents) + * - never when the engine truncated pagination (`listingTruncated`) — a forced + * fullSync cannot fix truncation, so it cannot override it + * - not when a connector capped its listing (`listingCapped`), unless a forced + * fullSync deliberately overrides the cap to reconcile the capped scope + */ +export function shouldReconcileDeletions( + isIncremental: boolean | undefined, + syncContext: Record<string, unknown> | undefined, + fullSync: boolean | undefined +): boolean { + if (isIncremental) return false + if (syncContext?.listingTruncated) return false + return !syncContext?.listingCapped || Boolean(fullSync) +} + /** * Resolves tag values from connector metadata using the connector's mapTags function. * Translates semantic keys returned by mapTags to actual DB slots using the @@ -417,17 +438,11 @@ export async function executeSync( if (hasMore) { /** - * Pagination stopped before the source was exhausted — either the - * MAX_PAGES guard tripped or the connector reported hasMore without a - * cursor. The listing is incomplete, so flag it to suppress deletion - * reconciliation; otherwise documents beyond the truncation point would - * be removed even though they still exist in the source. - * - * `listingTruncated` is distinct from connector-set `listingCapped`: - * a forced fullSync may legitimately override a connector's soft cap - * (the user opted to reconcile the capped scope), but engine-level - * truncation can never be resolved by forcing a fullSync — the next - * fullSync truncates identically — so it is an absolute deletion block. + * Pagination stopped before source exhaustion (MAX_PAGES or a missing + * cursor), so the listing is incomplete.
`listingTruncated` blocks + * deletion reconciliation absolutely — unlike connector-set + * `listingCapped`, it cannot be overridden by a forced fullSync, since + * re-running one truncates identically. */ syncContext.listingCapped = true syncContext.listingTruncated = true @@ -657,15 +672,7 @@ export async function executeSync( } } - // Reconcile deletions for non-incremental syncs that returned ALL docs. - // Skip when listing was capped (maxFiles/maxThreads) — unseen docs may still exist in the source. - // A forced fullSync overrides connector-set caps, but never engine-level truncation - // (listingTruncated): a truncated listing is incomplete no matter how the sync was triggered. - if ( - !isIncremental && - !syncContext?.listingTruncated && - (!syncContext?.listingCapped || options?.fullSync) - ) { + if (shouldReconcileDeletions(isIncremental, syncContext, options?.fullSync)) { const removedIds = existingDocs .filter((d) => d.externalId && !seenExternalIds.has(d.externalId)) .map((d) => d.id) From fa665276320a4fb0e3c4709fbc9a8113540a775e Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:44:06 -0700 Subject: [PATCH 12/16] test(connectors): mapTags coverage for the 7 new connectors --- apps/sim/connectors/mapTags.test.ts | 399 ++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) diff --git a/apps/sim/connectors/mapTags.test.ts b/apps/sim/connectors/mapTags.test.ts index 62701d04698..9f043ebabd7 100644 --- a/apps/sim/connectors/mapTags.test.ts +++ b/apps/sim/connectors/mapTags.test.ts @@ -11,6 +11,13 @@ vi.mock('@/components/icons', () => ({ NotionIcon: () => null, GoogleDriveIcon: () => null, AirtableIcon: () => null, + SentryIcon: () => null, + TypeformIcon: () => null, + YouTubeIcon: () => null, + JiraServiceManagementIcon: () => null, + S3Icon: () => null, + GoogleFormsIcon: () => null, + AzureDevOpsIcon: () => null, })) vi.mock('@/lib/knowledge/documents/utils', () => ({ fetchWithRetry: vi.fn(), @@ -18,14 +25,32 @@ 
vi.mock('@/lib/knowledge/documents/utils', () => ({ })) vi.mock('@/tools/jira/utils', () => ({ extractAdfText: vi.fn(), getJiraCloudId: vi.fn() })) vi.mock('@/tools/confluence/utils', () => ({ getConfluenceCloudId: vi.fn() })) +vi.mock('@/tools/jsm/utils', () => ({ + getJsmApiBaseUrl: vi.fn(), + getJsmFormsApiBaseUrl: vi.fn(), + getJsmHeaders: vi.fn(), +})) +vi.mock('@/tools/s3/utils', () => ({ + encodeS3PathComponent: vi.fn(), + getSignatureKey: vi.fn(), + parseS3Uri: vi.fn(), + generatePresignedUrl: vi.fn(), +})) import { airtableConnector } from '@/connectors/airtable/airtable' +import { azureDevopsConnector } from '@/connectors/azure-devops/azure-devops' import { confluenceConnector } from '@/connectors/confluence/confluence' import { githubConnector } from '@/connectors/github/github' import { googleDriveConnector } from '@/connectors/google-drive/google-drive' +import { googleFormsConnector } from '@/connectors/google-forms/google-forms' import { jiraConnector } from '@/connectors/jira/jira' +import { jsmConnector } from '@/connectors/jsm/jsm' import { linearConnector } from '@/connectors/linear/linear' import { notionConnector } from '@/connectors/notion/notion' +import { s3Connector } from '@/connectors/s3/s3' +import { sentryConnector } from '@/connectors/sentry/sentry' +import { typeformConnector } from '@/connectors/typeform/typeform' +import { youtubeConnector } from '@/connectors/youtube/youtube' const ISO_DATE = '2025-06-15T10:30:00.000Z' @@ -388,3 +413,377 @@ describe('Airtable mapTags', () => { expect(result).toEqual({ createdTime: new Date(ISO_DATE) }) }) }) + +describe('Sentry mapTags', () => { + const mapTags = sentryConnector.mapTags! 
+ + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + level: 'error', + status: 'unresolved', + count: 1234, + firstSeen: '2025-01-01T00:00:00.000Z', + lastSeen: ISO_DATE, + }) + + expect(result).toEqual({ + level: 'error', + status: 'unresolved', + count: 1234, + firstSeen: new Date('2025-01-01T00:00:00.000Z'), + lastSeen: new Date(ISO_DATE), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + level: 123, + status: null, + count: 'not-a-number', + firstSeen: 'bad-date', + lastSeen: 99999, + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips blank string fields', () => { + const result = mapTags({ level: ' ', status: '' }) + expect(result).toEqual({}) + }) + + it.concurrent('converts string count to number', () => { + const result = mapTags({ count: '42' }) + expect(result).toEqual({ count: 42 }) + }) + + it.concurrent('maps count of zero', () => { + const result = mapTags({ count: 0 }) + expect(result).toEqual({ count: 0 }) + }) +}) + +describe('Typeform mapTags', () => { + const mapTags = typeformConnector.mapTags! 
+ + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + formTitle: 'Customer Survey', + platform: 'web', + submittedAt: ISO_DATE, + }) + + expect(result).toEqual({ + formTitle: 'Customer Survey', + platform: 'web', + submittedAt: new Date(ISO_DATE), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + formTitle: 123, + platform: null, + submittedAt: 99999, + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips submittedAt when date is invalid', () => { + const result = mapTags({ submittedAt: 'not-a-date' }) + expect(result).toEqual({}) + }) + + it.concurrent('skips empty string fields', () => { + const result = mapTags({ formTitle: '', platform: '' }) + expect(result).toEqual({}) + }) +}) + +describe('YouTube mapTags', () => { + const mapTags = youtubeConnector.mapTags! + + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + channelTitle: 'Tech Channel', + publishedAt: ISO_DATE, + duration: 'PT10M30S', + tags: ['tutorial', 'coding'], + }) + + expect(result).toEqual({ + channelTitle: 'Tech Channel', + publishedAt: new Date(ISO_DATE), + duration: 'PT10M30S', + tags: 'tutorial, coding', + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + channelTitle: 123, + publishedAt: 99999, + duration: null, + tags: 'not-an-array', + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips publishedAt when date is invalid', () => { + const result = mapTags({ publishedAt: 'bad-date' }) + expect(result).toEqual({}) + }) + + it.concurrent('skips blank string fields', () => { + const result = mapTags({ channelTitle: ' ', duration: '' }) + expect(result).toEqual({}) + }) + + it.concurrent('skips tags when 
array is empty', () => { + const result = mapTags({ tags: [] }) + expect(result).toEqual({}) + }) +}) + +describe('JSM mapTags', () => { + const mapTags = jsmConnector.mapTags! + + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + status: 'Waiting for support', + requestTypeId: 'rt-123', + reporter: 'Carol', + created: '2025-01-01T00:00:00.000Z', + statusDate: ISO_DATE, + }) + + expect(result).toEqual({ + status: 'Waiting for support', + requestTypeId: 'rt-123', + reporter: 'Carol', + created: new Date('2025-01-01T00:00:00.000Z'), + updated: new Date(ISO_DATE), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + status: 123, + requestTypeId: null, + reporter: true, + created: 'bad-date', + statusDate: 99999, + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips created when date is invalid', () => { + const result = mapTags({ created: 'not-a-date' }) + expect(result).toEqual({}) + }) + + it.concurrent('maps statusDate to updated key', () => { + const result = mapTags({ statusDate: ISO_DATE }) + expect(result).toEqual({ updated: new Date(ISO_DATE) }) + expect(result).not.toHaveProperty('statusDate') + }) +}) + +describe('S3 mapTags', () => { + const mapTags = s3Connector.mapTags! 
+ + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + prefix: 'documents/reports/', + fileSize: 2048, + lastModified: ISO_DATE, + }) + + expect(result).toEqual({ + prefix: 'documents/reports/', + fileSize: 2048, + lastModified: new Date(ISO_DATE), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + prefix: 123, + fileSize: 'not-a-number', + lastModified: 99999, + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips prefix when empty string', () => { + const result = mapTags({ prefix: '' }) + expect(result).toEqual({}) + }) + + it.concurrent('skips lastModified when date is invalid', () => { + const result = mapTags({ lastModified: 'bad-date' }) + expect(result).toEqual({}) + }) + + it.concurrent('converts string fileSize to number', () => { + const result = mapTags({ fileSize: '512' }) + expect(result).toEqual({ fileSize: 512 }) + }) + + it.concurrent('maps fileSize of zero', () => { + const result = mapTags({ fileSize: 0 }) + expect(result).toEqual({ fileSize: 0 }) + }) +}) + +describe('Google Forms mapTags', () => { + const mapTags = googleFormsConnector.mapTags! 
+ + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + formTitle: 'Feedback Form', + owners: ['Alice', 'Bob'], + modifiedTime: ISO_DATE, + latestResponseTime: '2025-01-01T00:00:00.000Z', + }) + + expect(result).toEqual({ + formTitle: 'Feedback Form', + owners: 'Alice, Bob', + lastModified: new Date(ISO_DATE), + lastResponse: new Date('2025-01-01T00:00:00.000Z'), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + formTitle: 123, + owners: 'not-an-array', + modifiedTime: 99999, + latestResponseTime: false, + }) + expect(result).toEqual({}) + }) + + it.concurrent('trims formTitle in output', () => { + const result = mapTags({ formTitle: ' Feedback Form ' }) + expect(result).toEqual({ formTitle: 'Feedback Form' }) + }) + + it.concurrent('skips formTitle when blank', () => { + const result = mapTags({ formTitle: ' ' }) + expect(result).toEqual({}) + }) + + it.concurrent('skips owners when array is empty', () => { + const result = mapTags({ owners: [] }) + expect(result).toEqual({}) + }) + + it.concurrent('maps modifiedTime to lastModified key', () => { + const result = mapTags({ modifiedTime: ISO_DATE }) + expect(result).toEqual({ lastModified: new Date(ISO_DATE) }) + expect(result).not.toHaveProperty('modifiedTime') + }) + + it.concurrent('maps latestResponseTime to lastResponse key', () => { + const result = mapTags({ latestResponseTime: ISO_DATE }) + expect(result).toEqual({ lastResponse: new Date(ISO_DATE) }) + expect(result).not.toHaveProperty('latestResponseTime') + }) +}) + +describe('Azure DevOps mapTags', () => { + const mapTags = azureDevopsConnector.mapTags! 
+ + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + kind: 'workItem', + wikiName: 'Engineering Wiki', + workItemType: 'Bug', + state: 'Active', + areaPath: 'Project\\Team', + tags: ['frontend', 'urgent'], + repository: 'owner/repo', + path: 'src/index.ts', + changedDate: ISO_DATE, + }) + + expect(result).toEqual({ + kind: 'workItem', + wikiName: 'Engineering Wiki', + workItemType: 'Bug', + state: 'Active', + areaPath: 'Project\\Team', + tags: 'frontend, urgent', + repository: 'owner/repo', + path: 'src/index.ts', + changedDate: new Date(ISO_DATE), + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + kind: 123, + wikiName: null, + workItemType: true, + state: [], + areaPath: 99999, + tags: 'not-an-array', + repository: false, + path: 42, + changedDate: 'bad-date', + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips empty string fields except kind', () => { + const result = mapTags({ + kind: '', + wikiName: '', + workItemType: '', + state: '', + areaPath: '', + repository: '', + path: '', + }) + expect(result).toEqual({ kind: '' }) + }) + + it.concurrent('skips tags when array is empty', () => { + const result = mapTags({ tags: [] }) + expect(result).toEqual({}) + }) + + it.concurrent('skips changedDate when date is invalid', () => { + const result = mapTags({ changedDate: 'garbage' }) + expect(result).toEqual({}) + }) +}) From 9bcd6138b45666a0dff4f2a8b83174d26b66a677 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 13:52:35 -0700 Subject: [PATCH 13/16] fix(connectors): ado probes past the wiql 20k cap before flagging; document custom-wiql full-listing behavior --- .../connectors/azure-devops/azure-devops.ts | 51 +++++++++++++++---- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts 
b/apps/sim/connectors/azure-devops/azure-devops.ts index 560fc01a6ee..07d93ed3b88 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -520,11 +520,17 @@ function readWorkItemFilters(sourceConfig: Record): WorkItemFil /** * Builds the WIQL query for the configured work-item filters. User-supplied - * values are escaped against WIQL string-literal injection. When a custom WIQL - * query is provided it is used verbatim and the structured filters are ignored. - * `lastSyncAt` narrows results to items changed since the previous sync. + * values are escaped against WIQL string-literal injection. `lastSyncAt` + * narrows results to items changed since the previous sync, and `idAfter` + * restricts to items with a greater id (used to probe past the 20,000-item + * WIQL cap). + * + * A custom WIQL query is used verbatim: neither the incremental changed-date + * filter nor the probe condition can be injected into arbitrary user WIQL + * safely, so custom queries always run as full listings on every sync. Change + * detection still short-circuits unchanged items via the content hash. 
+ */ -function buildWiql(filters: WorkItemFilters, lastSyncAt?: Date): string { +function buildWiql(filters: WorkItemFilters, lastSyncAt?: Date, idAfter?: number): string { if (filters.customWiql) return filters.customWiql const clauses: string[] = ['[System.TeamProject] = @project'] @@ -543,6 +549,9 @@ function buildWiql(filters: WorkItemFilters, lastSyncAt?: Date): string { if (lastSyncAt) { clauses.push(`[System.ChangedDate] >= '${lastSyncAt.toISOString()}'`) } + if (idAfter !== undefined) { + clauses.push(`[System.Id] > ${idAfter}`) + } return `SELECT [System.Id] FROM workitems WHERE ${clauses.join(' AND ')} ORDER BY [System.ChangedDate] DESC` } @@ -556,9 +565,10 @@ async function queryWorkItemIds( accessToken: string, organization: string, project: string, - wiql: string + wiql: string, + top: number = WIQL_MAX_RESULTS ): Promise<number[]> { - const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wit/wiql?$top=${WIQL_MAX_RESULTS}&api-version=${WIQL_API_VERSION}` + const url = `${ADO_BASE_URL}/${encodeURIComponent(organization)}/${encodeURIComponent(project)}/_apis/wit/wiql?$top=${top}&api-version=${WIQL_API_VERSION}` const response = await fetchWithRetry(url, { method: 'POST', headers: { @@ -1250,10 +1260,31 @@ async function listWorkItems( const wiql = buildWiql(filters, lastSyncAt) ids = await queryWorkItemIds(accessToken, organization, project, wiql) if (syncContext) syncContext.workItemIds = ids - } - if (ids.length >= WIQL_MAX_RESULTS && syncContext) { - syncContext.listingCapped = true + if (ids.length >= WIQL_MAX_RESULTS && syncContext) { + /** + * The WIQL result filled the 20,000-item cap. Distinguish an exact fit + * from genuine truncation: for structured filters, probe for any + * matching item with an id beyond the largest returned one and only + * flag the listing incomplete when one exists — otherwise deletion + * reconciliation would be disabled forever for a project with exactly + * 20,000 matching items.
Custom WIQL cannot be probed (no safe clause + * injection), so it is flagged conservatively. + */ + let truncated = true + if (!filters.customWiql) { + let maxId = 0 + for (const id of ids) { + if (id > maxId) maxId = id + } + const probeWiql = buildWiql(filters, lastSyncAt, maxId) + const beyond = await queryWorkItemIds(accessToken, organization, project, probeWiql, 1) + truncated = beyond.length > 0 + } + if (truncated) { + syncContext.listingCapped = true + } + } } if (ids.length === 0) { @@ -1405,7 +1436,7 @@ export const azureDevopsConnector: ConnectorConfig = { mode: 'advanced', placeholder: 'SELECT [System.Id] FROM workitems WHERE ...', description: - 'Advanced: a full WIQL query selecting [System.Id]. Overrides the type, state, area path, and tag filters when set.', + 'Advanced: a full WIQL query selecting [System.Id]. Overrides the type, state, area path, and tag filters when set. Custom queries always run as full listings on every sync (the incremental changed-date filter is not applied).', }, { id: 'repositoryName', From 4766fb3104a48518d9a42c35bbbd748b26661a34 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 14:02:06 -0700 Subject: [PATCH 14/16] fix(connectors): ado flags partial repo trees when items listing emits a continuation token --- .../sim/connectors/azure-devops/azure-devops.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index 07d93ed3b88..b1911c75d8f 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -794,6 +794,23 @@ async function listRepositoryBlobs( }) throw new Error(`Failed to list repository items: ${response.status}`) } + /** + * The Items list API documents no pagination, but very large trees may emit + * an `x-ms-continuationtoken` response header. 
No request parameter exists + * to follow it, so when it appears the tree is treated as incomplete: the + * listing is flagged so deletion reconciliation cannot remove files that + * were never returned. + */ + if (response.headers.get('x-ms-continuationtoken')) { + if (syncContext) syncContext.listingCapped = true + logger.warn( + 'Azure DevOps repository tree listing returned a continuation token; partial tree', + { + repoId, + branch, + } + ) + } const data = await response.json() const items = (data.value as GitItem[] | undefined) ?? [] return items.filter((item) => item.gitObjectType === 'blob' && !item.isFolder && item.path) From 3bfec6e7b7e03b225c60ee4e770040d020a49224 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 14:10:03 -0700 Subject: [PATCH 15/16] fix(connectors): ado discards foreign-phase cursors; google-forms scans all response pages for change detection --- .../connectors/azure-devops/azure-devops.ts | 12 ++- .../connectors/google-forms/google-forms.ts | 91 ++++++++++--------- 2 files changed, 56 insertions(+), 47 deletions(-) diff --git a/apps/sim/connectors/azure-devops/azure-devops.ts b/apps/sim/connectors/azure-devops/azure-devops.ts index b1911c75d8f..2c9727866e6 100644 --- a/apps/sim/connectors/azure-devops/azure-devops.ts +++ b/apps/sim/connectors/azure-devops/azure-devops.ts @@ -1536,7 +1536,15 @@ export const azureDevopsConnector: ConnectorConfig = { : cursor?.startsWith('file|') ? 'file' : 'wiki' - const phase = phases.includes(cursorPhase) ? cursorPhase : phases[0] + + /** + * A cursor from a phase that is no longer active (e.g. the content-type + * config changed) is discarded along with its offsets — otherwise another + * phase would misparse its tokens as numeric offsets and skip documents. + */ + const cursorIsActive = phases.includes(cursorPhase) + const phase = cursorIsActive ? cursorPhase : phases[0] + const initialCursor = cursorIsActive ? cursor : undefined /** Lists a single batch for the given phase. 
The cursor is passed only when it belongs to that phase. */ const runPhase = (target: SyncPhase, phaseCursor: string | undefined) => { @@ -1586,7 +1594,7 @@ export const azureDevopsConnector: ConnectorConfig = { * across phases. */ let current: SyncPhase | undefined = phase - let phaseCursor = cursor + let phaseCursor = initialCursor const documents: ExternalDocument[] = [] while (current) { diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index 02f187caaf6..4871cc35ac3 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -218,11 +218,11 @@ async function fetchFormStructure( /** * Result of fetching a form's responses: the collected responses (capped at * `MAX_RESPONSES_PER_FORM` for rendering) plus the greatest submission timestamp - * across the first response page. + * across ALL response pages. * * `latestSubmittedTime` is tracked separately from the capped `responses` so the * content hash computed in getDocument stays identical to the one computed during - * listing, which scans the same first page via `fetchLatestResponseTime`. If it + * listing, which scans the same full set via `fetchLatestResponseTime`. If it * were derived from the capped slice alone, a form with more than * `MAX_RESPONSES_PER_FORM` responses could hash differently between the two paths * and re-sync on every run. @@ -234,18 +234,16 @@ interface FetchedResponses { /** * Fetches form responses, retaining up to `MAX_RESPONSES_PER_FORM` for rendering. - * The latest submission timestamp is derived from the full first page (up to - * `RESPONSES_PAGE_SIZE`) so it matches the change indicator computed during - * listing by `fetchLatestResponseTime`, which reads the same first page. This - * keeps the content hash identical across the listing and getDocument paths even - * when a form has more responses than the render cap. 
Responses are returned in - * the order provided by the API. + * Every page is scanned for the latest submission timestamp even after the + * render cap is reached — the Forms API does not guarantee response order, so + * the newest submission may sit on any page. `fetchLatestResponseTime` scans + * the same full set during listing, keeping the content hash identical across + * the listing and getDocument paths regardless of form size. */ async function fetchFormResponses(accessToken: string, formId: string): Promise { const collected: FormResponse[] = [] - let latestSubmittedTime: string | undefined + let latest = '' let pageToken: string | undefined - let firstPage = true do { const url = new URL(`${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}/responses`) @@ -267,61 +265,64 @@ async function fetchFormResponses(accessToken: string, formId: string): Promise< const data = (await response.json()) as FormResponseList const responses = data.responses ?? [] - if (firstPage) { - latestSubmittedTime = latestResponseTime(responses) - firstPage = false - } + const pageLatest = latestResponseTime(responses) + if (pageLatest && pageLatest > latest) latest = pageLatest for (const r of responses) { if (collected.length >= MAX_RESPONSES_PER_FORM) break collected.push(r) } - pageToken = collected.length >= MAX_RESPONSES_PER_FORM ? undefined : data.nextPageToken + pageToken = data.nextPageToken } while (pageToken) - return { responses: collected, latestSubmittedTime } + return { responses: collected, latestSubmittedTime: latest || undefined } } /** * Reads the latest response submission time for change detection without - * retaining every response. Returns the greatest `lastSubmittedTime` (falling - * back to `createTime`) across all responses, or undefined when there are none. - * Throws on a failed read so the caller skips the form for this run instead of - * computing a hash from incomplete data. + * retaining responses. 
Scans every page — the Forms API does not guarantee + * response order, so the newest submission may sit on any page. Returns the + * greatest `lastSubmittedTime` (falling back to `createTime`), or undefined + * when there are none. Throws on a failed read so the caller skips the form + * for this run instead of computing a hash from incomplete data — a swallowed + * error would poison the stub's content hash and re-process the form on every + * sync, while throwing routes into the per-form catch that sets + * `skippedOnError` → `listingCapped`. */ async function fetchLatestResponseTime( accessToken: string, formId: string ): Promise { - const url = new URL(`${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}/responses`) - url.searchParams.set('pageSize', String(RESPONSES_PAGE_SIZE)) + let latest = '' + let pageToken: string | undefined - const response = await fetchWithRetry(url.toString(), { - method: 'GET', - headers: { - Authorization: `Bearer ${accessToken}`, - Accept: 'application/json', - }, - }) + do { + const url = new URL(`${FORMS_API_BASE}/forms/${encodeURIComponent(formId)}/responses`) + url.searchParams.set('pageSize', String(RESPONSES_PAGE_SIZE)) + if (pageToken) url.searchParams.set('pageToken', pageToken) - if (!response.ok) { - /** - * Propagate the failure rather than hashing with an empty response segment. - * A swallowed error here would poison the stub's content hash (listing - * would hash "no responses" while getDocument hashes the real latest - * submission time), making the form re-process on every sync. Throwing lets - * the per-form catch in listDocuments skip the form for this run and set - * `skippedOnError` → `listingCapped`, so the form is neither deleted nor - * hashed incorrectly. 
- */ - throw new Error( - `Failed to read responses for change detection on form ${formId}: ${response.status}` - ) - } + const response = await fetchWithRetry(url.toString(), { + method: 'GET', + headers: { + Authorization: `Bearer ${accessToken}`, + Accept: 'application/json', + }, + }) + + if (!response.ok) { + throw new Error( + `Failed to read responses for change detection on form ${formId}: ${response.status}` + ) + } - const data = (await response.json()) as FormResponseList - return latestResponseTime(data.responses ?? []) + const data = (await response.json()) as FormResponseList + const pageLatest = latestResponseTime(data.responses ?? []) + if (pageLatest && pageLatest > latest) latest = pageLatest + pageToken = data.nextPageToken + } while (pageToken) + + return latest || undefined } /** From c16e94a3e8f621f8bd563f2a16bc3badc3cd12c8 Mon Sep 17 00:00:00 2001 From: waleed Date: Thu, 4 Jun 2026 14:47:13 -0700 Subject: [PATCH 16/16] fix(connectors): audit fixes across new connectors - registry: register x connector (was dead code, never wired in) - google-docs/google-drive/google-forms: gate deletion reconciliation on Drive incompleteSearch; google-docs also now sets listingCapped on its maxDocs cap path - jsm: add read:jira-user scope so reporter resolves on requests - gong: only set listingCapped on genuine truncation, not exact-cap source exhaustion - gitlab: issues phase switched to keyset pagination (removes ~50k offset ceiling), matching the repo-tree phase - grain: parallelize recording + transcript fetch in getDocument - ashby: document updatedAt-based content-hash limitation for notes/feedback change detection - tests: mapTags coverage for x, granola, greenhouse, fathom, rootly --- apps/sim/connectors/ashby/ashby.ts | 27 +- apps/sim/connectors/gitlab/gitlab.ts | 41 ++- apps/sim/connectors/gong/gong.ts | 16 +- .../sim/connectors/google-docs/google-docs.ts | 23 ++ .../connectors/google-drive/google-drive.ts | 10 +- 
.../connectors/google-forms/google-forms.ts | 15 +- apps/sim/connectors/grain/grain.ts | 7 +- apps/sim/connectors/jsm/jsm.ts | 7 + apps/sim/connectors/mapTags.test.ts | 347 ++++++++++++++++++ apps/sim/connectors/registry.ts | 2 + 10 files changed, 471 insertions(+), 24 deletions(-) diff --git a/apps/sim/connectors/ashby/ashby.ts b/apps/sim/connectors/ashby/ashby.ts index fdf1c21e8b2..d27853a4b02 100644 --- a/apps/sim/connectors/ashby/ashby.ts +++ b/apps/sim/connectors/ashby/ashby.ts @@ -298,7 +298,32 @@ function renderFeedbackValue(value: unknown): string { /** * Stable, metadata-based content hash for a candidate document. Identical between the - * listing stub and the fully-fetched document so unchanged candidates are skipped. + * listing stub and the fully-fetched document so unchanged candidates are skipped, + * which keeps the `getDocument` re-hydration (notes + feedback fetches) cheap: the + * sync engine only re-hydrates a deferred stub when this hash differs from the stored + * document's hash (see `lib/knowledge/connectors/sync-engine.ts`). + * + * Known limitation — notes/feedback freshness depends on `candidate.updatedAt`. + * Candidate notes (`candidate.listNotes`) and interview feedback + * (`applicationFeedback.list`) are separate Ashby objects, not candidate fields. This + * hash is derived solely from the candidate's own `updatedAt`, so a new note or newly + * submitted feedback is only re-synced if Ashby advances `candidate.updatedAt` as a + * side effect of that write. + * + * As of this writing Ashby's public API docs do not specify what counts as a + * "modification" for `candidate.updatedAt` or for `candidate.list` syncToken + * incremental sync, and no third-party ATS-integration vendor (Merge, Nango, Knit) + * documents it either — so this behavior is unverified. 
If Ashby does NOT touch + * `candidate.updatedAt` on note/feedback writes, those additions will not be picked up + * until some other candidate field changes; a forced full sync re-hydrates everything + * regardless. No cheaper listing-time signal exists to fold into this hash: the + * `candidate.list` object exposes no note/feedback count, and syncToken carries the + * same unspecified change semantics as `updatedAt`. + * + * Refs: + * - https://developers.ashbyhq.com/reference/candidatelist + * - https://developers.ashbyhq.com/reference/candidatecreatenote + * - https://developers.ashbyhq.com/docs/pagination-and-incremental-sync */ function buildContentHash(id: string, updatedAt: string | null): string { return `ashby:${id}:${updatedAt ?? ''}` diff --git a/apps/sim/connectors/gitlab/gitlab.ts b/apps/sim/connectors/gitlab/gitlab.ts index 66e71796d60..b41303bdf26 100644 --- a/apps/sim/connectors/gitlab/gitlab.ts +++ b/apps/sim/connectors/gitlab/gitlab.ts @@ -470,15 +470,18 @@ async function fetchProject( } /** - * Encodes the listing cursor. The cursor packs the resource phase (wiki ➜ issues) - * and the issues page number so a single sync walks wikis first, then paginates - * issues via the X-Next-Page header. + * Encodes the listing cursor. The cursor packs the resource phase (repo ➜ wiki ➜ + * issues) and a per-phase continuation token so a single sync walks the phases in + * order. The repository-tree and issues phases both use GitLab keyset pagination + * and store the full `rel="next"` URL from the Link header to fetch verbatim. */ interface CursorState { phase: SyncPhase issuePage: number /** Full `rel="next"` URL for the repository-tree keyset page to fetch next. */ fileNextUrl?: string + /** Full `rel="next"` URL for the issues keyset page to fetch next. 
*/ + issueNextUrl?: string } function encodeCursor(state: CursorState): string { @@ -492,6 +495,7 @@ function decodeCursor(cursor: string | undefined, initialPhase: SyncPhase): Curs phase: SyncPhase issuePage: number fileNextUrl: string + issueNextUrl: string }> const phase: SyncPhase = parsed.phase === 'repo' || parsed.phase === 'issues' || parsed.phase === 'wiki' @@ -501,6 +505,7 @@ function decodeCursor(cursor: string | undefined, initialPhase: SyncPhase): Curs phase, issuePage: Number(parsed.issuePage) > 0 ? Number(parsed.issuePage) : 1, fileNextUrl: typeof parsed.fileNextUrl === 'string' ? parsed.fileNextUrl : undefined, + issueNextUrl: typeof parsed.issueNextUrl === 'string' ? parsed.issueNextUrl : undefined, } } catch { return { phase: initialPhase, issuePage: 1 } @@ -859,9 +864,9 @@ export const gitlabConnector: ConnectorConfig = { if (state.phase === 'issues') { const params = new URLSearchParams({ per_page: String(PAGE_SIZE), - page: String(state.issuePage), order_by: 'updated_at', sort: 'desc', + pagination: 'keyset', }) if (lastSyncAt) params.set('updated_after', lastSyncAt.toISOString()) const issueState = @@ -874,11 +879,15 @@ export const gitlabConnector: ConnectorConfig = { typeof sourceConfig.issueMilestone === 'string' ? sourceConfig.issueMilestone.trim() : '' if (issueMilestone) params.set('milestone', issueMilestone) - const url = `${apiBase}/projects/${encodedProject}/issues?${params.toString()}` + if (state.issueNextUrl && !isSameOrigin(state.issueNextUrl, apiBase)) { + throw new Error('GitLab pagination cursor points to an unexpected host') + } + const url = + state.issueNextUrl ?? 
`${apiBase}/projects/${encodedProject}/issues?${params.toString()}` logger.info('Listing GitLab issues', { host, project: encodedProject, - page: state.issuePage, + continued: Boolean(state.issueNextUrl), incremental: Boolean(lastSyncAt), }) @@ -909,18 +918,18 @@ export const gitlabConnector: ConnectorConfig = { maxItems, syncContext ) + if (hitLimit) return { documents: capped, hasMore: false } - const nextPageHeader = response.headers.get('x-next-page')?.trim() - const nextPage = nextPageHeader ? Number(nextPageHeader) : 0 - const hasMorePages = !hitLimit && Number.isFinite(nextPage) && nextPage > 0 - - return { - documents: capped, - nextCursor: hasMorePages - ? encodeCursor({ phase: 'issues', issuePage: nextPage }) - : undefined, - hasMore: hasMorePages, + const nextLink = parseNextLink(response.headers.get('link')) + if (nextLink) { + return { + documents: capped, + nextCursor: encodeCursor({ phase: 'issues', issuePage: 1, issueNextUrl: nextLink }), + hasMore: true, + } } + + return { documents: capped, hasMore: false } } return { documents: [], hasMore: false } diff --git a/apps/sim/connectors/gong/gong.ts b/apps/sim/connectors/gong/gong.ts index beb7391573c..69ce220c1fd 100644 --- a/apps/sim/connectors/gong/gong.ts +++ b/apps/sim/connectors/gong/gong.ts @@ -417,20 +417,32 @@ export const gongConnector: ConnectorConfig = { const prevFetched = (syncContext?.totalDocsFetched as number) ?? 
0 let documents = allDocuments + let capDroppedDocs = false if (maxCalls > 0) { const remaining = Math.max(0, maxCalls - prevFetched) if (allDocuments.length > remaining) { documents = allDocuments.slice(0, remaining) + capDroppedDocs = true } } const totalFetched = prevFetched + documents.length if (syncContext) syncContext.totalDocsFetched = totalFetched const hitLimit = maxCalls > 0 && totalFetched >= maxCalls - if (hitLimit && syncContext) syncContext.listingCapped = true - const hasMore = !hitLimit && Boolean(nextPageCursor) + /** + * Only flag the listing as capped when the `maxCalls` limit actually + * truncated calls that still exist in the source — either by dropping calls + * from the current page or by stopping while another page remains. Reaching + * the limit exactly at source exhaustion (no dropped calls, no further + * cursor) yields a complete listing, so deletion reconciliation must still + * run for calls removed in Gong. + */ + if (syncContext && (capDroppedDocs || (hitLimit && Boolean(nextPageCursor)))) { + syncContext.listingCapped = true + } + return { documents, nextCursor: hasMore ? nextPageCursor : undefined, diff --git a/apps/sim/connectors/google-docs/google-docs.ts b/apps/sim/connectors/google-docs/google-docs.ts index 0a89ea91650..f125f23a44c 100644 --- a/apps/sim/connectors/google-docs/google-docs.ts +++ b/apps/sim/connectors/google-docs/google-docs.ts @@ -235,13 +235,23 @@ export const googleDocsConnector: ConnectorConfig = { const data = await response.json() const files = (data.files || []) as DriveFile[] + /** + * Drive sets `incompleteSearch` when it could not search every corpus (it + * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`). + * A partial listing drops still-existing docs, so reconciliation must be + * suppressed to avoid hard-deleting valid documents. + */ + const incompleteSearch = data.incompleteSearch === true + const maxDocs = sourceConfig.maxDocs ? 
Number(sourceConfig.maxDocs) : 0 const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0 let documents = files.map(fileToStub) + let slicedSome = false if (maxDocs > 0) { const remaining = maxDocs - previouslyFetched if (documents.length > remaining) { + slicedSome = true documents = documents.slice(0, remaining) } } @@ -252,6 +262,19 @@ export const googleDocsConnector: ConnectorConfig = { const nextPageToken = data.nextPageToken as string | undefined + /** + * Mark the listing as incomplete so the sync engine skips deletion + * reconciliation when this page does not represent the full source set: + * - `slicedSome`: the page held more docs than the `maxDocs` cap allowed. + * - `hitLimit` with a next page: the cap was reached while more pages remain. + * - `incompleteSearch`: Drive could not search every corpus, so the page is + * partial and may omit still-existing docs. + * Reconciliation against any of these would hard-delete valid documents. + */ + if (syncContext && (slicedSome || (hitLimit && Boolean(nextPageToken)) || incompleteSearch)) { + syncContext.listingCapped = true + } + return { documents, nextCursor: hitLimit ? undefined : nextPageToken, diff --git a/apps/sim/connectors/google-drive/google-drive.ts b/apps/sim/connectors/google-drive/google-drive.ts index 0516c47fd7e..41013cdb247 100644 --- a/apps/sim/connectors/google-drive/google-drive.ts +++ b/apps/sim/connectors/google-drive/google-drive.ts @@ -268,6 +268,14 @@ export const googleDriveConnector: ConnectorConfig = { const data = await response.json() const files = (data.files || []) as DriveFile[] + /** + * Drive sets `incompleteSearch` when it could not search every corpus (it + * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`). + * A partial listing drops still-existing files, so reconciliation must be + * suppressed to avoid hard-deleting valid documents. 
+ */ + const incompleteSearch = data.incompleteSearch === true + const documents = files .filter((f) => isGoogleWorkspaceFile(f.mimeType) || isSupportedTextFile(f.mimeType)) .map(fileToStub) @@ -275,7 +283,7 @@ export const googleDriveConnector: ConnectorConfig = { const totalFetched = previouslyFetched + documents.length if (syncContext) syncContext.totalDocsFetched = totalFetched const hitLimit = maxFiles > 0 && totalFetched >= maxFiles - if (hitLimit && syncContext) syncContext.listingCapped = true + if (syncContext && (hitLimit || incompleteSearch)) syncContext.listingCapped = true const nextPageToken = data.nextPageToken as string | undefined diff --git a/apps/sim/connectors/google-forms/google-forms.ts b/apps/sim/connectors/google-forms/google-forms.ts index 4871cc35ac3..d29bf5ef9b3 100644 --- a/apps/sim/connectors/google-forms/google-forms.ts +++ b/apps/sim/connectors/google-forms/google-forms.ts @@ -573,6 +573,14 @@ export const googleFormsConnector: ConnectorConfig = { const data = await response.json() let files = (data.files || []) as DriveFormFile[] + /** + * Drive sets `incompleteSearch` when it could not search every corpus (it + * arises with the `allDrives` scope enabled by `includeItemsFromAllDrives`). + * A partial listing drops still-existing forms, so reconciliation must be + * suppressed to avoid hard-deleting valid documents. + */ + const incompleteSearch = data.incompleteSearch === true + let slicedSome = false if (maxForms > 0) { const remaining = maxForms - previouslyFetched @@ -633,11 +641,16 @@ export const googleFormsConnector: ConnectorConfig = { * - `hitLimit` with a next page: the cap was reached while more pages of * forms remain in the source. * - `skippedOnError`: a transient error dropped a still-present form. + * - `incompleteSearch`: Drive could not search every corpus, so the page + * itself is partial and may omit still-existing forms. * Deleting any of those would wipe valid documents from the knowledge base. 
* When the cap merely coincides with source exhaustion (no slice, no next * page), reconciliation stays enabled so deleted forms are cleaned up. */ - if (syncContext && (slicedSome || (hitLimit && Boolean(nextPageToken)) || skippedOnError)) { + if ( + syncContext && + (slicedSome || (hitLimit && Boolean(nextPageToken)) || skippedOnError || incompleteSearch) + ) { syncContext.listingCapped = true } diff --git a/apps/sim/connectors/grain/grain.ts b/apps/sim/connectors/grain/grain.ts index be05cca037a..ea4243c3213 100644 --- a/apps/sim/connectors/grain/grain.ts +++ b/apps/sim/connectors/grain/grain.ts @@ -471,10 +471,11 @@ export const grainConnector: ConnectorConfig = { try { if (!externalId) return null - const recording = await fetchRecording(accessToken, externalId) + const [recording, segments] = await Promise.all([ + fetchRecording(accessToken, externalId), + fetchTranscript(accessToken, externalId), + ]) if (!recording) return null - - const segments = await fetchTranscript(accessToken, externalId) if (!segments) return null const hasTranscript = segments.some((segment) => segment.text?.trim()) diff --git a/apps/sim/connectors/jsm/jsm.ts b/apps/sim/connectors/jsm/jsm.ts index 9a7f28341cc..9b61d7c9b19 100644 --- a/apps/sim/connectors/jsm/jsm.ts +++ b/apps/sim/connectors/jsm/jsm.ts @@ -338,6 +338,13 @@ export const jsmConnector: ConnectorConfig = { 'read:request:jira-service-management', 'read:request.comment:jira-service-management', 'read:request.status:jira-service-management', + /** + * Requests embed a `reporter` user object whose `displayName` is surfaced + * in document content and the Reporter tag. Atlassian only populates + * embedded user data when the user-read scope is granted, so request it + * here. Present in the `jira` OAuth provider config as `read:jira-user`. 
+ */ + 'read:jira-user', 'offline_access', ], }, diff --git a/apps/sim/connectors/mapTags.test.ts b/apps/sim/connectors/mapTags.test.ts index 9f043ebabd7..956f796cbd1 100644 --- a/apps/sim/connectors/mapTags.test.ts +++ b/apps/sim/connectors/mapTags.test.ts @@ -18,6 +18,11 @@ vi.mock('@/components/icons', () => ({ S3Icon: () => null, GoogleFormsIcon: () => null, AzureDevOpsIcon: () => null, + xIcon: () => null, + GranolaIcon: () => null, + GreenhouseIcon: () => null, + FathomIcon: () => null, + RootlyIcon: () => null, })) vi.mock('@/lib/knowledge/documents/utils', () => ({ fetchWithRetry: vi.fn(), @@ -40,16 +45,21 @@ vi.mock('@/tools/s3/utils', () => ({ import { airtableConnector } from '@/connectors/airtable/airtable' import { azureDevopsConnector } from '@/connectors/azure-devops/azure-devops' import { confluenceConnector } from '@/connectors/confluence/confluence' +import { fathomConnector } from '@/connectors/fathom/fathom' import { githubConnector } from '@/connectors/github/github' import { googleDriveConnector } from '@/connectors/google-drive/google-drive' import { googleFormsConnector } from '@/connectors/google-forms/google-forms' +import { granolaConnector } from '@/connectors/granola/granola' +import { greenhouseConnector } from '@/connectors/greenhouse/greenhouse' import { jiraConnector } from '@/connectors/jira/jira' import { jsmConnector } from '@/connectors/jsm/jsm' import { linearConnector } from '@/connectors/linear/linear' import { notionConnector } from '@/connectors/notion/notion' +import { rootlyConnector } from '@/connectors/rootly/rootly' import { s3Connector } from '@/connectors/s3/s3' import { sentryConnector } from '@/connectors/sentry/sentry' import { typeformConnector } from '@/connectors/typeform/typeform' +import { xConnector } from '@/connectors/x/x' import { youtubeConnector } from '@/connectors/youtube/youtube' const ISO_DATE = '2025-06-15T10:30:00.000Z' @@ -787,3 +797,340 @@ describe('Azure DevOps mapTags', () => { 
expect(result).toEqual({}) }) }) + +describe('X mapTags', () => { + const mapTags = xConnector.mapTags! + + it.concurrent('maps all fields when present', () => { + const result = mapTags({ + author: 'jack', + createdAt: ISO_DATE, + likeCount: 1500, + retweetCount: 300, + }) + + expect(result).toEqual({ + author: 'jack', + createdAt: new Date(ISO_DATE), + likeCount: 1500, + retweetCount: 300, + }) + }) + + it.concurrent('returns empty object for empty metadata', () => { + expect(mapTags({})).toEqual({}) + }) + + it.concurrent('skips fields with wrong types', () => { + const result = mapTags({ + author: 123, + createdAt: 99999, + likeCount: 'not-a-number', + retweetCount: 'nope', + }) + expect(result).toEqual({}) + }) + + it.concurrent('skips createdAt when date is invalid', () => { + const result = mapTags({ createdAt: 'not-a-date' }) + expect(result).toEqual({}) + }) + + it.concurrent('converts string counts to numbers', () => { + const result = mapTags({ likeCount: '42', retweetCount: '7' }) + expect(result).toEqual({ likeCount: 42, retweetCount: 7 }) + }) + + it.concurrent('maps counts of zero', () => { + const result = mapTags({ likeCount: 0, retweetCount: 0 }) + expect(result).toEqual({ likeCount: 0, retweetCount: 0 }) + }) +}) + +describe('Granola mapTags', () => { + const mapTags = granolaConnector.mapTags! 
+
+  it.concurrent('maps all fields when present', () => { // happy path: every recognised key is supplied with a valid value
+    const result = mapTags({
+      title: 'Weekly Sync',
+      owner: 'Alice',
+      attendees: ['Alice', 'Bob'],
+      folders: ['Team', 'Projects'],
+      meeting: 'Q3 Planning',
+      noteDate: ISO_DATE, // ISO_DATE is defined outside this excerpt
+      meetingDate: '2025-01-01T00:00:00.000Z',
+    })
+
+    expect(result).toEqual({
+      title: 'Weekly Sync',
+      owner: 'Alice',
+      attendees: 'Alice, Bob', // array values are expected back as a single ', '-joined string
+      folders: 'Team, Projects',
+      meeting: 'Q3 Planning',
+      noteDate: new Date(ISO_DATE), // date strings are expected back as Date instances
+      meetingDate: new Date('2025-01-01T00:00:00.000Z'),
+    })
+  })
+
+  it.concurrent('returns empty object for empty metadata', () => {
+    expect(mapTags({})).toEqual({})
+  })
+
+  it.concurrent('skips fields with wrong types', () => { // wrongly typed values must be dropped, not coerced
+    const result = mapTags({
+      title: 123,
+      owner: null,
+      attendees: 'not-an-array',
+      folders: 'not-an-array',
+      meeting: true,
+      noteDate: 99999,
+      meetingDate: false,
+    })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('trims text fields in output', () => { // surrounding whitespace must not reach the tag values
+    const result = mapTags({ title: ' Weekly Sync ', owner: ' Alice ', meeting: ' Q3 ' })
+    expect(result).toEqual({ title: 'Weekly Sync', owner: 'Alice', meeting: 'Q3' })
+  })
+
+  it.concurrent('skips blank text fields', () => { // empty and whitespace-only strings count as absent
+    const result = mapTags({ title: ' ', owner: '', meeting: ' ' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips array fields when empty', () => { // an empty array yields no key at all
+    const result = mapTags({ attendees: [], folders: [] })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips noteDate when date is invalid', () => { // an unparseable date string is omitted
+    const result = mapTags({ noteDate: 'not-a-date' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips meetingDate when date is invalid', () => {
+    const result = mapTags({ meetingDate: 'garbage' })
+    expect(result).toEqual({})
+  })
+})
+
+describe('Greenhouse mapTags', () => { // pins the metadata -> tag mapping expected from the Greenhouse connector
+  const mapTags = greenhouseConnector.mapTags! // non-null assertion: the suite assumes this connector provides mapTags
+
+  it.concurrent('maps all fields when present', () => { // happy path: text and number pass through, date strings become Dates
+    const result = mapTags({
+      candidateName: 'Jane Doe',
+      company: 'Acme',
+      title: 'Engineer',
+      recruiter: 'Alice',
+      coordinator: 'Bob',
+      source: 'LinkedIn',
+      applicationCount: 3,
+      updatedAt: ISO_DATE, // ISO_DATE is defined outside this excerpt
+      lastActivity: '2025-01-01T00:00:00.000Z',
+    })
+
+    expect(result).toEqual({
+      candidateName: 'Jane Doe',
+      company: 'Acme',
+      title: 'Engineer',
+      recruiter: 'Alice',
+      coordinator: 'Bob',
+      source: 'LinkedIn',
+      applicationCount: 3,
+      updatedAt: new Date(ISO_DATE),
+      lastActivity: new Date('2025-01-01T00:00:00.000Z'),
+    })
+  })
+
+  it.concurrent('returns empty object for empty metadata', () => {
+    expect(mapTags({})).toEqual({})
+  })
+
+  it.concurrent('skips fields with wrong types', () => { // wrongly typed values must be dropped, not coerced
+    const result = mapTags({
+      candidateName: 123,
+      company: null,
+      title: true,
+      recruiter: [],
+      coordinator: false,
+      source: 99999,
+      applicationCount: 'not-a-number',
+      updatedAt: 12345,
+      lastActivity: 'bad-date', // a string, but not a parseable date: exercises the invalid-value path rather than a type mismatch
+    })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('trims text fields in output', () => { // surrounding whitespace must not reach the tag values
+    const result = mapTags({ candidateName: ' Jane Doe ', company: ' Acme ' })
+    expect(result).toEqual({ candidateName: 'Jane Doe', company: 'Acme' })
+  })
+
+  it.concurrent('skips blank text fields', () => { // empty and whitespace-only strings count as absent
+    const result = mapTags({ candidateName: ' ', company: '', source: ' ' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('maps applicationCount of zero', () => { // 0 is a real value and must not be dropped as falsy
+    const result = mapTags({ applicationCount: 0 })
+    expect(result).toEqual({ applicationCount: 0 })
+  })
+
+  it.concurrent('skips applicationCount when string', () => { // numeric strings are NOT coerced here (the Fathom suite below expects the opposite for durationSeconds)
+    const result = mapTags({ applicationCount: '3' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips updatedAt when date is invalid', () => { // an unparseable date string is omitted
+    const result = mapTags({ updatedAt: 'not-a-date' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips lastActivity when date is invalid', () => {
+    const result = mapTags({ lastActivity: 'garbage' })
+    expect(result).toEqual({})
+  })
+})
+
+describe('Fathom mapTags', () => { // pins the metadata -> tag mapping expected from the Fathom connector
+  const mapTags = fathomConnector.mapTags! // non-null assertion: the suite assumes this connector provides mapTags
+
+  it.concurrent('maps all fields when present', () => { // happy path: text and number pass through, meetingDate becomes a Date
+    const result = mapTags({
+      title: 'Sales Call',
+      recordedByEmail: 'john@example.com',
+      recordedByName: 'John Smith',
+      team: 'Sales',
+      meetingType: 'external',
+      transcriptLanguage: 'en',
+      durationSeconds: 1800,
+      meetingDate: ISO_DATE, // ISO_DATE is defined outside this excerpt
+    })
+
+    expect(result).toEqual({
+      title: 'Sales Call',
+      recordedByEmail: 'john@example.com',
+      recordedByName: 'John Smith',
+      team: 'Sales',
+      meetingType: 'external',
+      transcriptLanguage: 'en',
+      durationSeconds: 1800,
+      meetingDate: new Date(ISO_DATE),
+    })
+  })
+
+  it.concurrent('returns empty object for empty metadata', () => {
+    expect(mapTags({})).toEqual({})
+  })
+
+  it.concurrent('skips fields with wrong types', () => { // wrongly typed values must be dropped
+    const result = mapTags({
+      title: 123,
+      recordedByEmail: null,
+      recordedByName: true,
+      team: [],
+      meetingType: false,
+      transcriptLanguage: 99999,
+      durationSeconds: 'not-a-number', // a non-numeric string is dropped even though numeric strings are converted (see below)
+      meetingDate: 12345,
+    })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips blank string fields', () => { // empty and whitespace-only strings count as absent
+    const result = mapTags({ title: ' ', team: '', transcriptLanguage: ' ' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('converts string durationSeconds to number', () => { // numeric strings ARE coerced for this field
+    const result = mapTags({ durationSeconds: '900' })
+    expect(result).toEqual({ durationSeconds: 900 })
+  })
+
+  it.concurrent('maps durationSeconds of zero', () => { // 0 is a real value and must not be dropped as falsy
+    const result = mapTags({ durationSeconds: 0 })
+    expect(result).toEqual({ durationSeconds: 0 })
+  })
+
+  it.concurrent('skips meetingDate when date is invalid', () => { // an unparseable date string is omitted
+    const result = mapTags({ meetingDate: 'not-a-date' })
+    expect(result).toEqual({})
+  })
+})
+
+describe('Rootly mapTags', () => { // pins the metadata -> tag mapping expected from the Rootly connector
+  const mapTags = rootlyConnector.mapTags! // non-null assertion: the suite assumes this connector provides mapTags
+
+  it.concurrent('maps all fields when present', () => { // happy path, including the severityName -> severity key rename
+    const result = mapTags({
+      status: 'resolved',
+      severityName: 'SEV1',
+      kind: 'incident',
+      services: ['api', 'web'],
+      teams: ['platform'],
+      environments: ['production'],
+      labels: ['platform:osx'],
+      incidentDate: ISO_DATE, // ISO_DATE is defined outside this excerpt
+      resolvedDate: '2025-01-01T00:00:00.000Z',
+    })
+
+    expect(result).toEqual({
+      status: 'resolved',
+      severity: 'SEV1', // output key is `severity`, fed from the `severityName` metadata key
+      kind: 'incident',
+      services: 'api, web', // array values are expected back as a single ', '-joined string
+      teams: 'platform',
+      environments: 'production',
+      labels: 'platform:osx',
+      incidentDate: new Date(ISO_DATE),
+      resolvedDate: new Date('2025-01-01T00:00:00.000Z'),
+    })
+  })
+
+  it.concurrent('returns empty object for empty metadata', () => {
+    expect(mapTags({})).toEqual({})
+  })
+
+  it.concurrent('skips fields with wrong types', () => { // wrongly typed values must be dropped, not coerced
+    const result = mapTags({
+      status: 123,
+      severityName: null,
+      severityLevel: true, // both severity sources are invalid, so no `severity` key is expected
+      kind: [],
+      services: 'not-an-array',
+      teams: 'not-an-array',
+      environments: 'not-an-array',
+      labels: 'not-an-array',
+      incidentDate: 99999,
+      resolvedDate: false,
+    })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('falls back to severityLevel when severityName is absent', () => { // severityLevel alone still populates `severity`
+    const result = mapTags({ severityLevel: 'sev0' })
+    expect(result).toEqual({ severity: 'sev0' })
+  })
+
+  it.concurrent('prefers severityName over severityLevel', () => { // when both are present, severityName wins
+    const result = mapTags({ severityName: 'Critical', severityLevel: 'sev0' })
+    expect(result).toEqual({ severity: 'Critical' })
+  })
+
+  it.concurrent('skips array fields when empty', () => { // an empty array yields no key at all
+    const result = mapTags({ services: [], teams: [], environments: [], labels: [] })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips incidentDate when date is invalid', () => { // an unparseable date string is omitted
+    const result = mapTags({ incidentDate: 'not-a-date' })
+    expect(result).toEqual({})
+  })
+
+  it.concurrent('skips resolvedDate when date is invalid', () => {
+    const result = mapTags({ resolvedDate: 'garbage' })
+    expect(result).toEqual({})
+  })
+})
diff --git
a/apps/sim/connectors/registry.ts b/apps/sim/connectors/registry.ts index 33eb7a5e0e8..9224ac77c61 100644 --- a/apps/sim/connectors/registry.ts +++ b/apps/sim/connectors/registry.ts @@ -45,6 +45,7 @@ import { typeformConnector } from '@/connectors/typeform' import type { ConnectorRegistry } from '@/connectors/types' import { webflowConnector } from '@/connectors/webflow' import { wordpressConnector } from '@/connectors/wordpress' +import { xConnector } from '@/connectors/x' import { youtubeConnector } from '@/connectors/youtube' import { zendeskConnector } from '@/connectors/zendesk' import { zoomConnector } from '@/connectors/zoom' @@ -96,6 +97,7 @@ export const CONNECTOR_REGISTRY: ConnectorRegistry = { typeform: typeformConnector, webflow: webflowConnector, wordpress: wordpressConnector, + x: xConnector, youtube: youtubeConnector, zendesk: zendeskConnector, zoom: zoomConnector,