fix(run-engine): emit runFailed from createFailedTaskRun

d-cs · claude · d-cs · commit 97018b1e65d0 · 2026-05-22T15:53:14.000+01:00
The mollifier drainer's terminal-failure path (Phase 4G) and the
batch-trigger's "queue size limit exceeded" path both call
createFailedTaskRun to write a SYSTEM_FAILURE PG row for runs that
never actually executed. Neither path emitted runFailed afterwards,
so the runEngineHandlers' `runFailed` listener never fired — which
means PerformTaskRunAlertsService never enqueued an alert delivery
job, and customers' configured TASK_RUN alert channels missed the
failure entirely. The row was visible in the dashboard list but
silent for alerting purposes.

Emit runFailed from createFailedTaskRun with `attemptNumber: 0` as
the marker that the run never executed (distinguishes synthesised
terminal failures from runs that exhausted their retries).
PerformTaskRunAlertsService doesn't filter on attemptNumber or
status, so the existing pipeline picks the event up without further
changes. DeliverAlertService dispatches via the channel type
(email/webhook/etc) the same way it does for any other terminal
failure.

Test: a containerTest subscribes to runFailed before calling
createFailedTaskRun and asserts that exactly one event fires with
the expected payload shape. The existing batchTrigger tests still
pass (they didn't assert the negative).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/.server-changes/createFailedTaskRun-emits-runFailed.md b/.server-changes/createFailedTaskRun-emits-runFailed.md
@@ -0,0 +1,6 @@
+---
+area: webapp
+type: fix
+---
+
+`engine.createFailedTaskRun` now emits the `runFailed` event so the alert pipeline picks up the SYSTEM_FAILURE row and the event-store handler writes the completion event into the trace. Affects the mollifier drainer's terminal-failure path (introduced in Phase 4G) and the batch-trigger's "queue size limit exceeded" path. Previously these terminal failures landed in PG silently — visible in the dashboard list but never reaching customers' configured TASK_RUN alert channels. The event payload carries `attemptNumber: 0` as the marker that the run never executed (synthesised terminal failure, not exhausted retries).
diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts
@@ -1132,6 +1132,44 @@ export class RunEngine {
           });
         }
 
+        // Emit `runFailed` so the alert pipeline picks up the
+        // SYSTEM_FAILURE row and the event-store handler writes the
+        // completion event into the trace. Without this the mollifier
+        // drainer's terminal failures (and batch-trigger's
+        // exceed-limit failures) land in PG silently — visible in the
+        // dashboard list but never reaching customers' configured
+        // TASK_RUN alert channels.
+        this.eventBus.emit("runFailed", {
+          time: taskRun.completedAt ?? new Date(),
+          run: {
+            id: taskRun.id,
+            status: taskRun.status,
+            spanId: taskRun.spanId,
+            error,
+            taskEventStore: taskRun.taskEventStore,
+            createdAt: taskRun.createdAt,
+            completedAt: taskRun.completedAt,
+            updatedAt: taskRun.updatedAt,
+            // This row never attempted execution — it's a synthesised
+            // terminal failure. The event payload's `attemptNumber: 0`
+            // is the signal downstream consumers can use to
+            // distinguish a never-ran failure from a run that
+            // exhausted its retries.
+            attemptNumber: 0,
+            usageDurationMs: 0,
+            costInCents: 0,
+          },
+          organization: {
+            id: environment.organization.id,
+          },
+          project: {
+            id: environment.project.id,
+          },
+          environment: {
+            id: environment.id,
+          },
+        });
+
         return taskRun;
       },
       {
diff --git a/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts b/internal-packages/run-engine/src/engine/tests/createFailedTaskRun.test.ts
@@ -0,0 +1,111 @@
+import { containerTest } from "@internal/testcontainers";
+import { trace } from "@internal/tracing";
+import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic";
+import { expect } from "vitest";
+import { RunEngine } from "../index.js";
+import { EventBusEventArgs } from "../eventBus.js";
+import { setupAuthenticatedEnvironment } from "./setup.js";
+
+vi.setConfig({ testTimeout: 60_000 });
+
+describe("RunEngine.createFailedTaskRun", () => {
+  containerTest("emits runFailed so the alert pipeline wakes up", async ({ prisma, redisOptions }) => {
+    // The mollifier drainer (and batch-trigger over-limit path) call
+    // createFailedTaskRun to write a terminal SYSTEM_FAILURE PG row
+    // for runs that never actually executed. Without an explicit
+    // runFailed emit, the row lands silently — the
+    // runEngineHandlers' `runFailed` listener (which enqueues
+    // PerformTaskRunAlertsService) never fires, so customers'
+    // configured TASK_RUN alert channels miss the failure entirely.
+    //
+    // Regression intent: if the emit is removed or moved out of
+    // createFailedTaskRun's success path, this test fails. The
+    // shape assertions pin the fields the alert delivery service
+    // reads from the event payload (run.id, run.status, error,
+    // attemptNumber=0 as the never-ran-marker).
+    const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+    const engine = new RunEngine({
+      prisma,
+      worker: {
+        redis: redisOptions,
+        workers: 1,
+        tasksPerWorker: 10,
+        pollIntervalMs: 100,
+      },
+      queue: {
+        redis: redisOptions,
+        masterQueueConsumersDisabled: true,
+        processWorkerQueueDebounceMs: 50,
+      },
+      runLock: {
+        redis: redisOptions,
+      },
+      machines: {
+        defaultMachine: "small-1x",
+        machines: {
+          "small-1x": {
+            name: "small-1x" as const,
+            cpu: 0.5,
+            memory: 0.5,
+            centsPerMs: 0.0001,
+          },
+        },
+        baseCostInCents: 0.0005,
+      },
+      tracer: trace.getTracer("test", "0.0.0"),
+    });
+
+    try {
+      const failedEvents: EventBusEventArgs<"runFailed">[0][] = [];
+      engine.eventBus.on("runFailed", (event) => {
+        failedEvents.push(event);
+      });
+
+      const friendlyId = generateFriendlyId("run");
+      const taskIdentifier = "drainer-terminal-test";
+
+      const failed = await engine.createFailedTaskRun({
+        friendlyId,
+        environment: {
+          id: authenticatedEnvironment.id,
+          type: authenticatedEnvironment.type,
+          project: { id: authenticatedEnvironment.project.id },
+          organization: { id: authenticatedEnvironment.organization.id },
+        },
+        taskIdentifier,
+        payload: "{}",
+        payloadType: "application/json",
+        error: {
+          type: "STRING_ERROR",
+          raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic",
+        },
+        traceId: "0123456789abcdef0123456789abcdef",
+        spanId: "fedcba9876543210",
+      });
+
+      expect(failed.status).toBe("SYSTEM_FAILURE");
+
+      expect(failedEvents).toHaveLength(1);
+      const event = failedEvents[0];
+      expect(event.run.id).toBe(failed.id);
+      expect(event.run.status).toBe("SYSTEM_FAILURE");
+      expect(event.run.spanId).toBe("fedcba9876543210");
+      // attemptNumber=0 is the marker that the run never executed —
+      // it's a synthesised terminal failure, not an exhausted-retries
+      // failure. Downstream consumers can use this to distinguish.
+      expect(event.run.attemptNumber).toBe(0);
+      expect(event.run.usageDurationMs).toBe(0);
+      expect(event.run.costInCents).toBe(0);
+      expect(event.run.error).toEqual({
+        type: "STRING_ERROR",
+        raw: "Mollifier drainer terminal failure: synthetic engine.trigger panic",
+      });
+      expect(event.organization.id).toBe(authenticatedEnvironment.organization.id);
+      expect(event.project.id).toBe(authenticatedEnvironment.project.id);
+      expect(event.environment.id).toBe(authenticatedEnvironment.id);
+    } finally {
+      await engine.quit();
+    }
+  });
+});