fix(tables): retry transient DB/Redis failures in cell execution and surface error causes

TheodoreSpeaks · claude · TheodoreSpeaks · commit 91f00bda16d1 · 2026-06-03T13:01:23.000-07:00
Workflow-group-cell runs intermittently failed on trivial DB reads/writes
under heavy fan-out, stranding cells in `running`. Investigation showed the
PlanetScale and ElastiCache backends were healthy at the time — the failures
are transient connection-level faults that the cell (maxAttempts: 1) had no
tolerance for, and the real cause was never logged (Drizzle wraps it as
"Failed query: ..." and the driver cause lives in error.cause).

Resilience:
- Add retryTransient (lib/table/retry-transient.ts): retries only transient
  infra errors (reuses isRetryableInfrastructureError; adds an ioredis
  command-timeout match) with jittered backoff, then rethrows. Fail-fast for
  everything else.
- Wrap the cell's getTableById/getRowById reads, the terminal write
  (cell-write updateRow — idempotent via the executionId guard), and the
  Redis cascade-lock acquire in retryTransient.

Diagnostics:
- Add describeError (lib/core/errors/retryable-infrastructure.ts): walks the
  .cause chain and always returns the underlying driver cause (code/errno/
  syscall + causeChain), including for unclassified errors like AbortError.
- Log `cause` + a `retryable` flag (and aborted/timedOut in the cell's main
  catch) across the cell + finalization error paths, mirroring the existing
  schedule-execution pattern. Logging-only; no behavior change. This lets the
  next recurrence reveal the real cause and whether the retry applies.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/apps/sim/background/workflow-column-execution.ts b/apps/sim/background/workflow-column-execution.ts
@@ -7,12 +7,14 @@ import { generateId } from '@sim/utils/id'
 import { backoffWithJitter } from '@sim/utils/retry'
 import { task } from '@trigger.dev/sdk'
 import { eq } from 'drizzle-orm'
+import { describeError } from '@/lib/core/errors/retryable-infrastructure'
 import { createTimeoutAbortController } from '@/lib/core/execution-limits'
 import { RateLimiter } from '@/lib/core/rate-limiter/rate-limiter'
 import { preprocessExecution } from '@/lib/execution/preprocessing'
 import { withCascadeLock } from '@/lib/table/cascade-lock'
 import { isExecCancelled } from '@/lib/table/deps'
 import { appendTableEvent } from '@/lib/table/events'
+import { isRetryableCellError, retryTransient } from '@/lib/table/retry-transient'
 import type {
   RowData,
   RowExecutionMetadata,
@@ -67,9 +69,15 @@ export async function executeWorkflowGroupCellJob(
     // marked, so stop re-driving this row.
     if (outcome.result === 'blocked') break
     if (signal?.aborted) break
-    const freshTable = await getTableById(tableId)
+    const freshTable = await retryTransient('cascade getTableById', () => getTableById(tableId), {
+      signal,
+    })
     if (!freshTable) break
-    const freshRow = await getRowById(tableId, rowId, workspaceId)
+    const freshRow = await retryTransient(
+      'cascade getRowById',
+      () => getRowById(tableId, rowId, workspaceId),
+      { signal }
+    )
     if (!freshRow) break
     const next = pickNextEligibleGroupForRow(freshTable, freshRow)
     if (!next) break
@@ -113,7 +121,9 @@ export async function runRowCascadeLoop(
   while (true) {
     if (signal?.aborted) break
 
-    const freshTable = await getTableById(tableId)
+    const freshTable = await retryTransient('cascade getTableById', () => getTableById(tableId), {
+      signal,
+    })
     if (!freshTable) {
       logger.warn(`Table ${tableId} vanished mid-cascade`)
       break
@@ -142,7 +152,11 @@ export async function runRowCascadeLoop(
     // would re-pick the still-pending queued marker and spin.
     if (result === 'blocked') return 'blocked'
 
-    const freshRow = await getRowById(tableId, rowId, workspaceId)
+    const freshRow = await retryTransient(
+      'cascade getRowById',
+      () => getRowById(tableId, rowId, workspaceId),
+      { signal }
+    )
     if (!freshRow) break
     const next = pickNextEligibleGroupForRow(freshTable, freshRow, currentGroupId)
     if (!next) break
@@ -597,8 +611,8 @@ async function runWorkflowAndWriteTerminal(
           })
           .catch((err) => {
             logger.warn(
-              `Per-block partial write failed (table=${tableId} row=${rowId} group=${groupId}):`,
-              err
+              `Per-block partial write failed (table=${tableId} row=${rowId} group=${groupId})`,
+              { cause: describeError(err), retryable: isRetryableCellError(err) }
             )
           })
       }
@@ -720,7 +734,14 @@ async function runWorkflowAndWriteTerminal(
       const message = toError(err).message
       logger.error(
         `Workflow group cell execution failed (table=${tableId} row=${rowId} group=${groupId})`,
-        { error: message, executionId }
+        {
+          error: message,
+          executionId,
+          cause: describeError(err),
+          retryable: isRetryableCellError(err),
+          aborted: abortSignal.aborted,
+          timedOut: timeoutController.isTimedOut(),
+        }
       )
       terminalWritten = true
       await writeChain.catch(() => {})
@@ -735,7 +756,11 @@ async function runWorkflowAndWriteTerminal(
           blockErrors,
         })
       } catch (writeErr) {
-        logger.error('Also failed to write error state', { error: toError(writeErr).message })
+        logger.error('Also failed to write error state', {
+          error: toError(writeErr).message,
+          cause: describeError(writeErr),
+          retryable: isRetryableCellError(writeErr),
+        })
       }
       return 'error'
     }
diff --git a/apps/sim/lib/core/errors/retryable-infrastructure.test.ts b/apps/sim/lib/core/errors/retryable-infrastructure.test.ts
@@ -0,0 +1,64 @@
+/**
+ * @vitest-environment node
+ */
+import { describe, expect, it } from 'vitest'
+import {
+  describeError,
+  isRetryableInfrastructureError,
+} from '@/lib/core/errors/retryable-infrastructure'
+
+describe('describeError', () => {
+  it('reports name and message for a plain error, omitting causeChain', () => {
+    const described = describeError(new Error('boom'))
+    expect(described).toEqual({ name: 'Error', message: 'boom' })
+    expect(described.causeChain).toBeUndefined()
+  })
+
+  it('surfaces the deepest cause for a wrapped driver error', () => {
+    const driver = Object.assign(new Error('read ECONNRESET'), {
+      code: 'ECONNRESET',
+      errno: 'ECONNRESET',
+      syscall: 'read',
+    })
+    const wrapped = new Error('Failed query: select ...', { cause: driver })
+
+    const described = describeError(wrapped)
+    expect(described.name).toBe('Error')
+    expect(described.message).toBe('read ECONNRESET')
+    expect(described.code).toBe('ECONNRESET')
+    expect(described.errno).toBe('ECONNRESET')
+    expect(described.syscall).toBe('read')
+    expect(described.causeChain).toEqual([
+      'Error: Failed query: select ...',
+      'Error: read ECONNRESET',
+    ])
+  })
+
+  it('always returns the cause for unclassified errors (AbortError)', () => {
+    const aborted = Object.assign(new Error('The operation was aborted'), { name: 'AbortError' })
+    const described = describeError(aborted)
+
+    expect(described.name).toBe('AbortError')
+    expect(described.message).toBe('The operation was aborted')
+    // The retryable classifier skips it entirely — describeError still surfaces it.
+    expect(isRetryableInfrastructureError(aborted)).toBe(false)
+  })
+
+  it('falls back to a populated description for non-Error input without throwing', () => {
+    expect(describeError('just a string')).toEqual({ name: 'Error', message: 'just a string' })
+    expect(() => describeError({ weird: true })).not.toThrow()
+  })
+
+  it('stops walking the cause chain at depth 10 and does not loop on a cycle', () => {
+    const a = new Error('a')
+    const b = new Error('b')
+    ;(a as Error & { cause?: unknown }).cause = b
+    ;(b as Error & { cause?: unknown }).cause = a
+
+    let described: ReturnType<typeof describeError> | undefined
+    expect(() => {
+      described = describeError(a)
+    }).not.toThrow()
+    expect(described?.causeChain?.length).toBeLessThanOrEqual(10)
+  })
+})
diff --git a/apps/sim/lib/core/errors/retryable-infrastructure.ts b/apps/sim/lib/core/errors/retryable-infrastructure.ts
@@ -1,3 +1,5 @@
+import { toError } from '@sim/utils/errors'
+
 const RETRYABLE_DB_ERROR_CODES = new Set([
   '08000',
   '08001',
@@ -76,3 +78,47 @@ export function describeRetryableInfrastructureError(
 export function isRetryableInfrastructureError(error: unknown): boolean {
   return Boolean(describeRetryableInfrastructureError(error))
 }
+
+export interface DescribedError {
+  name: string
+  message: string
+  code?: string
+  errno?: string
+  syscall?: string
+  /** `"Name: message"` per link in the `.cause` chain, outermost first. Present only when the chain has more than one link. */
+  causeChain?: string[]
+}
+
+/**
+ * Always-on diagnostic view of an error and its `.cause` chain.
+ *
+ * Unlike {@link describeRetryableInfrastructureError} — which returns
+ * `undefined` for errors outside its retryable allowlist — this returns the
+ * underlying cause for ANY error, including `AbortError` and otherwise
+ * unclassified causes. Reports the fields of the DEEPEST `.cause` link, because
+ * a wrapped driver error (e.g. Drizzle's `"Failed query: ..."` wrapping an
+ * `ECONNRESET`) carries the real reason there, not on the outer wrapper.
+ *
+ * `@sim/logger` does not serialize the non-enumerable `Error.prototype.cause`,
+ * so callers must pass the result as an explicit structured log field rather
+ * than relying on the logger to expand a raw error.
+ */
+export function describeError(error: unknown): DescribedError {
+  const chain = getErrorChain(error)
+  if (chain.length === 0) {
+    const normalized = toError(error)
+    return { name: normalized.name, message: normalized.message }
+  }
+  const deepest = chain[chain.length - 1]
+  const code = typeof deepest.code === 'string' ? deepest.code : undefined
+  const errno = typeof deepest.errno === 'string' ? deepest.errno : undefined
+  const syscall = typeof deepest.syscall === 'string' ? deepest.syscall : undefined
+  return {
+    name: deepest.name,
+    message: deepest.message,
+    ...(code ? { code } : {}),
+    ...(errno ? { errno } : {}),
+    ...(syscall ? { syscall } : {}),
+    ...(chain.length > 1 ? { causeChain: chain.map((e) => `${e.name}: ${e.message}`) } : {}),
+  }
+}
diff --git a/apps/sim/lib/logs/execution/logging-session.ts b/apps/sim/lib/logs/execution/logging-session.ts
@@ -3,6 +3,10 @@ import { workflowExecutionLogs } from '@sim/db/schema'
 import { createLogger } from '@sim/logger'
 import { toError } from '@sim/utils/errors'
 import { and, eq, sql } from 'drizzle-orm'
+import {
+  describeError,
+  isRetryableInfrastructureError,
+} from '@/lib/core/errors/retryable-infrastructure'
 import { executionLogger } from '@/lib/logs/execution/logger'
 import {
   calculateCostSummary,
@@ -177,6 +181,8 @@ export class LoggingSession {
     } catch (error) {
       logger.error(`Failed to persist last started block for execution ${this.executionId}:`, {
         error: toError(error).message,
+        cause: describeError(error),
+        retryable: isRetryableInfrastructureError(error),
       })
     }
   }
@@ -193,6 +199,8 @@ export class LoggingSession {
     } catch (error) {
       logger.error(`Failed to persist last completed block for execution ${this.executionId}:`, {
         error: toError(error).message,
+        cause: describeError(error),
+        retryable: isRetryableInfrastructureError(error),
       })
     }
   }
@@ -411,6 +419,8 @@ export class LoggingSession {
         executionId: this.executionId,
         error: toError(error).message,
         stack: error instanceof Error ? error.stack : undefined,
+        cause: describeError(error),
+        retryable: isRetryableInfrastructureError(error),
       })
       throw error
     }
@@ -1057,7 +1067,11 @@ export class LoggingSession {
       this.completionAttemptFailed = true
       logger.error(
         `[${this.requestId || 'unknown'}] Cost-only fallback also failed for execution ${this.executionId}:`,
-        { error: toError(fallbackError).message }
+        {
+          error: toError(fallbackError).message,
+          cause: describeError(fallbackError),
+          retryable: isRetryableInfrastructureError(fallbackError),
+        }
       )
     }
   }
diff --git a/apps/sim/lib/table/cascade-lock.ts b/apps/sim/lib/table/cascade-lock.ts
@@ -1,6 +1,7 @@
 import { createLogger } from '@sim/logger'
 import { toError } from '@sim/utils/errors'
 import { acquireLock, extendLock, releaseLock } from '@/lib/core/config/redis'
+import { retryTransient } from '@/lib/table/retry-transient'
 
 const logger = createLogger('TableCascadeLock')
 
@@ -40,7 +41,11 @@ export async function withCascadeLock<T>(
   fn: () => Promise<T>
 ): Promise<{ status: 'acquired'; result: T } | { status: 'contended' }> {
   const key = cascadeLockKey(tableId, rowId)
-  const acquired = await acquireLock(key, ownerId, LOCK_TTL_SECONDS)
+  // A timed-out/dropped Redis command here throws before the cell is picked up;
+  // retry so a transient Redis blip doesn't fail the run outright.
+  const acquired = await retryTransient('cascade acquireLock', () =>
+    acquireLock(key, ownerId, LOCK_TTL_SECONDS)
+  )
   if (!acquired) return { status: 'contended' }
 
   const heartbeat = setInterval(() => {
diff --git a/apps/sim/lib/table/cell-write.ts b/apps/sim/lib/table/cell-write.ts
@@ -12,6 +12,7 @@
 import { createLogger } from '@sim/logger'
 import { isExecCancelled } from '@/lib/table/deps'
 import { appendTableEvent } from '@/lib/table/events'
+import { retryTransient } from '@/lib/table/retry-transient'
 import type { RowData, RowExecutionMetadata, RowExecutions, WorkflowGroup } from '@/lib/table/types'
 
 const logger = createLogger('WorkflowCellWrite')
@@ -46,12 +47,14 @@ export async function writeWorkflowGroupState(
   const requestId = ctx.requestId ?? `wfgrp-${executionId}`
   const { getTableById, getRowById, updateRow } = await import('@/lib/table/service')
 
-  const table = await getTableById(tableId)
+  const table = await retryTransient('cell-write getTableById', () => getTableById(tableId))
   if (!table) {
     logger.warn(`Table ${tableId} vanished before group state write`)
     return 'wrote'
   }
-  const row = await getRowById(tableId, rowId, workspaceId)
+  const row = await retryTransient('cell-write getRowById', () =>
+    getRowById(tableId, rowId, workspaceId)
+  )
   if (!row) {
     logger.warn(`Row ${rowId} vanished before group state write`)
     return 'wrote'
@@ -99,17 +102,22 @@ export async function writeWorkflowGroupState(
   // task writes (running/completed/error) get the SQL guard so an in-flight
   // partial can't clobber a stop click or a newer run that already committed.
   const cancellationGuard = bypassStaleWorker ? undefined : { groupId, executionId }
-  const result = await updateRow(
-    {
-      tableId,
-      rowId,
-      data: payload.dataPatch ?? {},
-      workspaceId,
-      executionsPatch: { [groupId]: payload.executionState },
-      cancellationGuard,
-    },
-    table,
-    requestId
+  // The executionId/cancellation guard makes this write idempotent — a retry
+  // after a dropped connection re-applies the same terminal state, so retrying
+  // is safe and is what stops a transient blip from stranding the cell.
+  const result = await retryTransient('cell-write updateRow', () =>
+    updateRow(
+      {
+        tableId,
+        rowId,
+        data: payload.dataPatch ?? {},
+        workspaceId,
+        executionsPatch: { [groupId]: payload.executionState },
+        cancellationGuard,
+      },
+      table,
+      requestId
+    )
   )
   if (result === null) {
     logger.info(
diff --git a/apps/sim/lib/table/retry-transient.test.ts b/apps/sim/lib/table/retry-transient.test.ts
diff --git a/apps/sim/lib/table/retry-transient.ts b/apps/sim/lib/table/retry-transient.ts
diff --git a/apps/sim/lib/workflows/executor/pause-persistence.ts b/apps/sim/lib/workflows/executor/pause-persistence.ts