feat(webapp): write SYSTEM_FAILURE PG row when drainer hits a non-retryable error

d-cs · claude · d-cs · commit d4b55c1893df · 2026-05-22T10:07:54.000+01:00
Previously, a non-retryable engine.trigger failure during drain left
the buffer entry as `status: "FAILED"` in Redis with no PG row. The
customer saw the run in their SDK / dashboard listing for ~10 min
(buffer TTL) then it vanished entirely — no audit trail of the
failure. Billing was unaffected (no attempts ever ran) but
observability was zero.

Reuse the engine's existing `createFailedTaskRun` helper (the same one
batch-trigger calls when an item fails to start). It writes a terminal
SYSTEM_FAILURE TaskRun row with the engine.trigger error stored on
`error` and no attempts, and is idempotent on the unique constraint (P2002).

Drainer handler classifies the failure:
- Retryable PG error → rethrow so MollifierDrainer.drainOne requeues
- Non-retryable → createFailedTaskRun, swallow original error so the
  buffer entry is ack'd (PG now has the audit row)
- createFailedTaskRun also fails (PG truly unreachable) → rethrow
  original so drainer falls through to its existing buffer.fail
  terminal-marker path
- Snapshot too malformed to construct the environment block → rethrow
  (defensive — drainer falls through to buffer.fail)

Tests cover each path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts b/apps/webapp/app/v3/mollifier/mollifierDrainerHandler.server.ts
@@ -86,7 +86,77 @@ export function createDrainerHandler(deps: {
         span.setAttribute("mollifier.run_friendly_id", input.runId);
         span.setAttribute("taskRunId", input.runId);
 
-        await deps.engine.trigger(input.payload as any, deps.prisma);
+        try {
+          await deps.engine.trigger(input.payload as any, deps.prisma);
+        } catch (err) {
+          // The retryable-PG class re-throws so the drainer's outer
+          // worker loop can `buffer.requeue` (handled in
+          // `MollifierDrainer.drainOne`). For non-retryable failures we
+          // write a terminal SYSTEM_FAILURE row to PG via the engine's
+          // existing `createFailedTaskRun` (used by batch-trigger for
+          // the same purpose) so the customer sees the run in their
+          // dashboard / SDK instead of silently losing it when the
+          // buffer entry TTLs out. If THAT insert also fails (PG truly
+          // unreachable), rethrow so the drainer's outer catch falls
+          // through to its existing `buffer.fail` terminal-marker path.
+          if (isRetryablePgError(err)) {
+            throw err;
+          }
+          const reason = err instanceof Error ? err.message : String(err);
+          span.setAttribute("mollifier.terminal_failure_reason", reason);
+          const snapshot = input.payload as Record<string, unknown>;
+          const env = snapshot.environment as
+            | {
+                id: string;
+                type: any;
+                project: { id: string };
+                organization: { id: string };
+              }
+            | undefined;
+          if (!env) {
+            // Snapshot too malformed to even construct a TaskRun row.
+            // Drainer's outer catch will buffer.fail this entry.
+            throw err;
+          }
+          try {
+            await deps.engine.createFailedTaskRun({
+              friendlyId: input.runId,
+              environment: env,
+              taskIdentifier: String(snapshot.taskIdentifier ?? ""),
+              payload: typeof snapshot.payload === "string" ? snapshot.payload : undefined,
+              payloadType:
+                typeof snapshot.payloadType === "string" ? snapshot.payloadType : undefined,
+              error: {
+                type: "STRING_ERROR",
+                raw: `Mollifier drainer terminal failure: ${reason}`,
+              },
+              parentTaskRunId:
+                typeof snapshot.parentTaskRunId === "string"
+                  ? snapshot.parentTaskRunId
+                  : undefined,
+              rootTaskRunId:
+                typeof snapshot.rootTaskRunId === "string"
+                  ? snapshot.rootTaskRunId
+                  : undefined,
+              depth: typeof snapshot.depth === "number" ? snapshot.depth : 0,
+              resumeParentOnCompletion: snapshot.resumeParentOnCompletion === true,
+              traceId: typeof snapshot.traceId === "string" ? snapshot.traceId : undefined,
+              spanId: typeof snapshot.spanId === "string" ? snapshot.spanId : undefined,
+              taskEventStore:
+                typeof snapshot.taskEventStore === "string"
+                  ? snapshot.taskEventStore
+                  : undefined,
+              queue: typeof snapshot.queue === "string" ? snapshot.queue : undefined,
+              lockedQueueId:
+                typeof snapshot.lockedQueueId === "string" ? snapshot.lockedQueueId : undefined,
+            });
+          } catch (writeErr) {
+            // Class A — PG itself is failing. Rethrow the original
+            // error so the drainer falls back to buffer.fail. NOTE:
+            // writeErr is discarded here; it is not logged or attached.
+            throw err;
+          }
+        }
       });
     });
   };
diff --git a/apps/webapp/test/mollifierDrainerHandler.test.ts b/apps/webapp/test/mollifierDrainerHandler.test.ts
@@ -90,12 +90,14 @@ describe("createDrainerHandler", () => {
     expect(observedTraceId).toBe(snapshotTraceId);
   });
 
-  it("propagates engine.trigger errors so MollifierDrainer can classify them", async () => {
+  it("rethrows retryable PG errors so MollifierDrainer requeues the entry", async () => {
+    const err = new Error("Can't reach database server");
     const trigger = vi.fn(async () => {
-      throw new Error("boom");
+      throw err;
     });
+    const createFailedTaskRun = vi.fn();
     const handler = createDrainerHandler({
-      engine: { trigger } as any,
+      engine: { trigger, createFailedTaskRun } as any,
       prisma: {} as any,
     });
 
@@ -108,6 +110,97 @@ describe("createDrainerHandler", () => {
         attempts: 0,
         createdAt: new Date(),
       } as any),
-    ).rejects.toThrow("boom");
+    ).rejects.toThrow("Can't reach database server");
+    // Retryable: we do NOT write a SYSTEM_FAILURE row, the entry should
+    // be requeued for another shot.
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
+  });
+
+  const envFixture = {
+    id: "env_a",
+    type: "DEVELOPMENT",
+    project: { id: "proj_1" },
+    organization: { id: "org_1" },
+  };
+
+  it("writes a SYSTEM_FAILURE PG row when engine.trigger fails non-retryably", async () => {
+    const trigger = vi.fn(async () => {
+      throw new Error("validation failed: payload too large");
+    });
+    const createFailedTaskRun = vi.fn(async () => ({
+      id: "internal",
+      friendlyId: "run_x",
+    }));
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).resolves.toBeUndefined();
+
+    expect(trigger).toHaveBeenCalledOnce();
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+    const arg = createFailedTaskRun.mock.calls[0][0] as { error: { raw: string } };
+    expect(arg.error.raw).toContain("validation failed");
+  });
+
+  it("rethrows the original error when createFailedTaskRun also fails (PG genuinely unreachable)", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn(async () => {
+      throw new Error("connection refused");
+    });
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t", environment: envFixture },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    // Drainer's outer drainOne loop now decides retry vs buffer.fail.
+    expect(createFailedTaskRun).toHaveBeenCalledOnce();
+  });
+
+  it("rethrows the original error when the snapshot lacks an environment block", async () => {
+    const triggerErr = new Error("engine rejected the snapshot");
+    const trigger = vi.fn(async () => {
+      throw triggerErr;
+    });
+    const createFailedTaskRun = vi.fn();
+    const handler = createDrainerHandler({
+      engine: { trigger, createFailedTaskRun } as any,
+      prisma: {} as any,
+    });
+
+    await expect(
+      handler({
+        runId: "run_x",
+        envId: "env_a",
+        orgId: "org_1",
+        payload: { taskIdentifier: "t" /* no environment */ },
+        attempts: 0,
+        createdAt: new Date(),
+      } as any),
+    ).rejects.toThrow("engine rejected the snapshot");
+    expect(createFailedTaskRun).not.toHaveBeenCalled();
   });
 });