Skip to content

Commit 928bd91

Browse files
fix(background): recategorize user/recovery failures as errors, not trigger faults (#4860)
* fix(webhook): don't fault trigger run on user/workflow execution errors Webhook-triggered executions re-threw every error, so trigger.dev marked the run failed and fired #eng-errors alerts. The vast majority of these are user-caused workflow failures (missing required fields, invalid field references, bad URLs, provider 4xx, expired models, low credit) that are already recorded in the execution logs. Distinguish fault vs error in executeWebhookJobInternal: when the failure was finalized by core (the workflow ran and its failure is logged), complete the run with { success: false } instead of throwing. Errors that were not finalized came from the webhook pipeline itself and still re-throw to fault the run. Await waitForPostExecution first so the finalized flag is reliable. The error is still recorded on the run's OTel span via recordException (no ERROR status, so the run isn't faulted) and remains in the execution logs, so these stay investigable in Tempo/Loki without false alerts. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * fix(schedule): don't fault trigger run on error-recovery failures The schedule task already treats workflow-execution failures as recorded errors rather than trigger faults, but the outermost catch's own recovery code (the infra-retry and releaseClaim calls) was unguarded. A secondary DB blip while releasing the claim re-threw and escaped run(), faulting the trigger.dev run and firing an alert — a double-fault during cleanup. Wrap the recovery path in a try/catch: log and record the exception on the span without re-throwing. The claim expires on its TTL and the next tick re-claims the schedule, so swallowing the cleanup failure is safe. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * test(webhook): assert waitForPostExecution runs on the non-finalized path Guards the race fix on the infra-error path so a future refactor can't silently drop the await. Addresses Greptile review feedback. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent ba2e4cc commit 928bd91

3 files changed

Lines changed: 199 additions & 12 deletions

File tree

apps/sim/background/schedule-execution.ts

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { trace } from '@opentelemetry/api'
12
import {
23
db,
34
jobExecutionLogs,
@@ -943,16 +944,28 @@ export async function executeScheduleJob(payload: ScheduleExecutionPayload) {
943944
)
944945
}
945946
} catch (error: unknown) {
946-
if (isRetryableInfrastructureError(error)) {
947-
await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
948-
return
949-
}
947+
try {
948+
if (isRetryableInfrastructureError(error)) {
949+
await retryScheduleAfterInfraFailure({ payload, requestId, claimedAt, error })
950+
return
951+
}
950952

951-
logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
952-
await releaseClaim(
953-
now,
954-
`Failed to release schedule ${payload.scheduleId} after unhandled error`
955-
)
953+
logger.error(`[${requestId}] Error processing schedule ${payload.scheduleId}`, error)
954+
await releaseClaim(
955+
now,
956+
`Failed to release schedule ${payload.scheduleId} after unhandled error`
957+
)
958+
} catch (recoveryError: unknown) {
959+
// A secondary failure during error recovery (e.g. a transient DB blip while
960+
// releasing the claim or scheduling an infra retry) must not fault the run. The
961+
// claim expires on its TTL and the next tick re-claims the schedule. Record the
962+
// exception on the span so it stays visible in traces without faulting the run.
963+
logger.error(
964+
`[${requestId}] Failed to recover schedule ${payload.scheduleId} after error`,
965+
recoveryError
966+
)
967+
trace.getActiveSpan()?.recordException(toError(recoveryError))
968+
}
956969
}
957970
})
958971
}

apps/sim/background/webhook-execution.test.ts

Lines changed: 156 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,110 @@
22
* @vitest-environment node
33
*/
44

5+
import {
6+
dbChainMock,
7+
dbChainMockFns,
8+
executionPreprocessingMock,
9+
executionPreprocessingMockFns,
10+
loggingSessionMock,
11+
loggingSessionMockFns,
12+
} from '@sim/testing'
513
import { beforeEach, describe, expect, it, vi } from 'vitest'
614

7-
const { mockResolveWebhookRecordProviderConfig } = vi.hoisted(() => ({
15+
const {
16+
mockResolveWebhookRecordProviderConfig,
17+
mockExecuteWorkflowCore,
18+
mockWasExecutionFinalizedByCore,
19+
mockRecordException,
20+
mockGetActiveSpan,
21+
} = vi.hoisted(() => ({
822
mockResolveWebhookRecordProviderConfig: vi.fn(),
23+
mockExecuteWorkflowCore: vi.fn(),
24+
mockWasExecutionFinalizedByCore: vi.fn(),
25+
mockRecordException: vi.fn(),
26+
mockGetActiveSpan: vi.fn(),
927
}))
1028

29+
vi.mock('@opentelemetry/api', () => ({
30+
trace: { getActiveSpan: mockGetActiveSpan },
31+
}))
32+
33+
vi.mock('@sim/db', () => dbChainMock)
34+
vi.mock('@/lib/execution/preprocessing', () => executionPreprocessingMock)
35+
vi.mock('@/lib/logs/execution/logging-session', () => loggingSessionMock)
36+
1137
vi.mock('@/lib/webhooks/env-resolver', () => ({
1238
resolveWebhookRecordProviderConfig: mockResolveWebhookRecordProviderConfig,
1339
}))
1440

15-
import { resolveWebhookExecutionProviderConfig } from './webhook-execution'
41+
vi.mock('@/lib/workflows/executor/execution-core', () => ({
42+
executeWorkflowCore: mockExecuteWorkflowCore,
43+
wasExecutionFinalizedByCore: mockWasExecutionFinalizedByCore,
44+
}))
45+
46+
vi.mock('@/lib/core/idempotency', () => ({
47+
IdempotencyService: { createWebhookIdempotencyKey: vi.fn(() => 'idempotency-key') },
48+
webhookIdempotency: {
49+
executeWithIdempotency: vi.fn(
50+
(_provider: string, _key: string, operation: () => Promise<unknown>) => operation()
51+
),
52+
},
53+
}))
54+
55+
vi.mock('@/lib/workflows/persistence/utils', () => ({
56+
loadDeployedWorkflowState: vi.fn(async () => ({
57+
blocks: {},
58+
edges: [],
59+
loops: {},
60+
parallels: {},
61+
deploymentVersionId: 'deployment-1',
62+
})),
63+
}))
64+
65+
vi.mock('@/lib/webhooks/providers', () => ({
66+
getProviderHandler: vi.fn(() => ({})),
67+
}))
68+
69+
vi.mock('@/lib/logs/execution/trace-spans/trace-spans', () => ({
70+
buildTraceSpans: vi.fn(() => ({ traceSpans: [] })),
71+
}))
72+
73+
vi.mock('@/lib/core/execution-limits', () => ({
74+
createTimeoutAbortController: vi.fn(() => ({
75+
signal: new AbortController().signal,
76+
cleanup: vi.fn(),
77+
isTimedOut: () => false,
78+
timeoutMs: 120_000,
79+
})),
80+
getTimeoutErrorMessage: vi.fn(() => 'timed out'),
81+
}))
82+
83+
vi.mock('@/lib/workflows/executor/pause-persistence', () => ({
84+
handlePostExecutionPauseState: vi.fn(),
85+
}))
86+
87+
vi.mock('@/lib/webhooks/attachment-processor', () => ({
88+
WebhookAttachmentProcessor: class {},
89+
}))
90+
91+
vi.mock('@/app/api/auth/oauth/utils', () => ({
92+
resolveOAuthAccountId: vi.fn(),
93+
}))
94+
95+
vi.mock('@/executor/execution/snapshot', () => ({
96+
ExecutionSnapshot: class {},
97+
}))
98+
99+
vi.mock('@/tools/safe-assign', () => ({ safeAssign: vi.fn() }))
100+
101+
vi.mock('@/blocks', () => ({ getBlock: vi.fn(() => null) }))
102+
103+
vi.mock('@/triggers', () => ({
104+
getTrigger: vi.fn(),
105+
isTriggerValid: vi.fn(() => false),
106+
}))
107+
108+
import { executeWebhookJob, resolveWebhookExecutionProviderConfig } from './webhook-execution'
16109

17110
describe('resolveWebhookExecutionProviderConfig', () => {
18111
beforeEach(() => {
@@ -66,3 +159,64 @@ describe('resolveWebhookExecutionProviderConfig', () => {
66159
)
67160
})
68161
})
162+
163+
describe('executeWebhookJob fault vs error handling', () => {
164+
const payload = {
165+
webhookId: 'webhook-1',
166+
workflowId: 'workflow-1',
167+
userId: 'user-1',
168+
executionId: 'execution-1',
169+
requestId: 'request-1',
170+
provider: 'gmail',
171+
body: { message: 'hello' },
172+
headers: {},
173+
path: '/webhook',
174+
workspaceId: 'workspace-1',
175+
}
176+
177+
beforeEach(() => {
178+
vi.clearAllMocks()
179+
executionPreprocessingMockFns.mockPreprocessExecution.mockResolvedValue({
180+
success: true,
181+
workflowRecord: { workspaceId: 'workspace-1', userId: 'user-1', variables: {} },
182+
executionTimeout: { async: 120_000 },
183+
})
184+
mockResolveWebhookRecordProviderConfig.mockImplementation(async (record) => record)
185+
dbChainMockFns.limit.mockResolvedValue([{ id: 'webhook-1' }])
186+
mockGetActiveSpan.mockReturnValue({ recordException: mockRecordException })
187+
})
188+
189+
it('completes the run (does not throw) when the failure was finalized by core', async () => {
190+
mockExecuteWorkflowCore.mockRejectedValue(
191+
new Error('Gmail 2 is missing required fields: Label')
192+
)
193+
mockWasExecutionFinalizedByCore.mockReturnValue(true)
194+
195+
const result = await executeWebhookJob(payload)
196+
197+
expect(result).toMatchObject({
198+
success: false,
199+
workflowId: 'workflow-1',
200+
executionId: 'execution-1',
201+
provider: 'gmail',
202+
})
203+
expect(loggingSessionMockFns.mockWaitForPostExecution).toHaveBeenCalled()
204+
// User/workflow errors are already recorded by core — the catch must not re-log them.
205+
expect(loggingSessionMockFns.mockSafeCompleteWithError).not.toHaveBeenCalled()
206+
// The error is still recorded on the run span so it stays visible in traces.
207+
expect(mockRecordException).toHaveBeenCalledWith(
208+
expect.objectContaining({ message: 'Gmail 2 is missing required fields: Label' })
209+
)
210+
})
211+
212+
it('faults the run (re-throws) when the failure was not finalized by core', async () => {
213+
mockExecuteWorkflowCore.mockRejectedValue(new Error('Workflow state not found'))
214+
mockWasExecutionFinalizedByCore.mockReturnValue(false)
215+
216+
await expect(executeWebhookJob(payload)).rejects.toThrow('Workflow state not found')
217+
// waitForPostExecution must run on every path so the finalized-by-core signal is always reliable.
218+
expect(loggingSessionMockFns.mockWaitForPostExecution).toHaveBeenCalled()
219+
// Pipeline/infra errors are recorded here before re-throwing to fault the trigger.dev run.
220+
expect(loggingSessionMockFns.mockSafeCompleteWithError).toHaveBeenCalled()
221+
})
222+
})

apps/sim/background/webhook-execution.ts

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { trace } from '@opentelemetry/api'
12
import { db } from '@sim/db'
23
import { account, webhook } from '@sim/db/schema'
34
import { createLogger, runWithRequestContext } from '@sim/logger'
@@ -616,8 +617,27 @@ async function executeWebhookJobInternal(
616617
provider: payload.provider,
617618
})
618619

620+
// The finalized flag is set inside a fire-and-forget post-execution promise; await it so the
621+
// signal is reliable and the failure is fully persisted before we decide fault vs error.
622+
await loggingSession.waitForPostExecution()
623+
624+
// A failure inside workflow execution (block error, provider 4xx, missing required field, etc.)
625+
// is finalized by core and already recorded in the execution logs. That is a user/workflow error,
626+
// not a trigger.dev job fault — complete the run normally so we don't fire a false alert. Errors
627+
// that were not finalized came from the webhook pipeline itself, so we re-throw to fault below.
619628
if (wasExecutionFinalizedByCore(error, executionId)) {
620-
throw error
629+
// Record the exception on the run span so it stays visible in traces without
630+
// marking the span as ERROR — that status is what faults the trigger.dev run.
631+
trace.getActiveSpan()?.recordException(toError(error))
632+
633+
return {
634+
success: false,
635+
workflowId: payload.workflowId,
636+
executionId,
637+
output: hasExecutionResult(error) ? error.executionResult.output : {},
638+
executedAt: new Date().toISOString(),
639+
provider: payload.provider,
640+
}
621641
}
622642

623643
try {

0 commit comments

Comments
 (0)