diff --git a/.changeset/retry-sigsegv.md b/.changeset/retry-sigsegv.md new file mode 100644 index 00000000000..5a53c351efe --- /dev/null +++ b/.changeset/retry-sigsegv.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Retry `TASK_PROCESS_SIGSEGV` task crashes under the user's retry policy instead of failing the run on the first segfault. SIGSEGV in Node tasks is frequently non-deterministic (native addon races, JIT/GC interaction, near-OOM in native code, host issues), so retrying on a fresh process often succeeds. The retry is gated by the task's existing `retry` config + `maxAttempts` — same path `TASK_PROCESS_SIGTERM` and uncaught exceptions already use — so tasks without a retry policy still fail fast. diff --git a/packages/core/src/v3/errors.ts b/packages/core/src/v3/errors.ts index a538ca9357b..80124e1d386 100644 --- a/packages/core/src/v3/errors.ts +++ b/packages/core/src/v3/errors.ts @@ -199,6 +199,9 @@ export function truncateStack(stack: string | undefined): string { ].join("\n"); } +/** + * Truncates error messages that exceed MAX_MESSAGE_LENGTH to prevent OOM. + */ export function truncateMessage(message: string | undefined): string { if (!message) return ""; return message.length > MAX_MESSAGE_LENGTH @@ -206,6 +209,10 @@ export function truncateMessage(message: string | undefined): string { : message; } +/** + * Parses an unknown error into a TaskRunError structure. + * Handles InternalError, built-in Error, strings, and custom error objects. + */ export function parseError(error: unknown): TaskRunError { if (isInternalError(error)) { return { @@ -301,6 +308,10 @@ export function createJsonErrorObject(error: TaskRunError): SerializedError { } // Removes null characters and truncates oversized fields to prevent OOM +/** + * Sanitizes TaskRunError by removing null bytes and truncating long fields. + * Used to clean errors before storage or transmission. 
+ */ export function sanitizeError(error: TaskRunError): TaskRunError { switch (error.type) { case "BUILT_IN_ERROR": { @@ -351,6 +362,11 @@ export function sanitizeError(error: TaskRunError): TaskRunError { } } +/** + * Determines whether an error should trigger a retry attempt. + * Returns true for errors that are retriable under the user's retry policy. + * Returns false for errors not retried here (like SIGKILL_TIMEOUT, or OOM kills, which are handled by the separate machine-bump path). + */ export function shouldRetryError(error: TaskRunError): boolean { switch (error.type) { case "INTERNAL_ERROR": { @@ -361,7 +377,6 @@ export function shouldRetryError(error: TaskRunError): boolean { case "CONFIGURED_INCORRECTLY": case "TASK_ALREADY_RUNNING": case "TASK_PROCESS_SIGKILL_TIMEOUT": - case "TASK_PROCESS_SIGSEGV": case "TASK_PROCESS_OOM_KILLED": case "TASK_PROCESS_MAYBE_OOM_KILLED": case "TASK_RUN_CANCELLED": @@ -398,6 +413,7 @@ export function shouldRetryError(error: TaskRunError): boolean { case "TASK_RUN_UNCAUGHT_EXCEPTION": case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE": case "TASK_PROCESS_SIGTERM": + case "TASK_PROCESS_SIGSEGV": return true; default: @@ -419,6 +435,10 @@ export function shouldRetryError(error: TaskRunError): boolean { } } +/** + * Checks if retry settings should be looked up for this error type. + * Some errors (like SIGSEGV, SIGTERM, uncaught exceptions) respect user retry config. + */ export function shouldLookupRetrySettings(error: TaskRunError): boolean { switch (error.type) { case "INTERNAL_ERROR": { @@ -448,6 +468,10 @@ export function shouldLookupRetrySettings(error: TaskRunError): boolean { } } +/** + * Corrects error stack traces by normalizing file paths and removing noise. + * Used to make stack traces more readable in logs and error UI.
+ */ export function correctErrorStackTrace( stackTrace: string, projectDir?: string, @@ -490,6 +514,10 @@ function correctStackTraceLine(line: string, projectDir?: string, isDev?: boolea return line.trim(); } +/** + * Groups Zod validation issues by task index for better error reporting. + * Used when parsing task metadata fails. + */ export function groupTaskMetadataIssuesByTask(tasks: any, issues: z.ZodIssue[]) { return issues.reduce( (acc, issue) => { @@ -773,6 +801,10 @@ const findSignalInMessage = (message?: string, truncateLength = 100) => { } }; +/** + * Enhances TaskRunError with additional context like signals, OOM detection. + * Used to enrich errors before displaying or logging. + */ export function taskRunErrorEnhancer(error: TaskRunError): EnhanceError { switch (error.type) { case "BUILT_IN_ERROR": { diff --git a/packages/core/test/errors.test.ts b/packages/core/test/errors.test.ts index dee6509d3a2..9a94366d845 100644 --- a/packages/core/test/errors.test.ts +++ b/packages/core/test/errors.test.ts @@ -1,5 +1,13 @@ import { describe, it, expect } from "vitest"; -import { truncateStack, truncateMessage, parseError, sanitizeError } from "../src/v3/errors.js"; +import { + truncateStack, + truncateMessage, + parseError, + sanitizeError, + shouldRetryError, + shouldLookupRetrySettings, +} from "../src/v3/errors.js"; +import type { TaskRunError } from "../src/v3/schemas/common.js"; // Helper: build a fake stack with N frames function buildStack(messageLines: string[], frameCount: number): string { @@ -238,3 +246,29 @@ describe("truncateStack message line bounding", () => { expect(result).toContain("...[truncated]"); }); }); + +describe("shouldRetryError + shouldLookupRetrySettings", () => { + const internal = (code: string): TaskRunError => + ({ type: "INTERNAL_ERROR", code } as TaskRunError); + + it("retries SIGSEGV (changed from non-retriable) and looks up retry settings", () => { + const err = internal("TASK_PROCESS_SIGSEGV"); + 
expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("retries SIGTERM via the same path", () => { + const err = internal("TASK_PROCESS_SIGTERM"); + expect(shouldRetryError(err)).toBe(true); + expect(shouldLookupRetrySettings(err)).toBe(true); + }); + + it("still does not retry SIGKILL timeout", () => { + expect(shouldRetryError(internal("TASK_PROCESS_SIGKILL_TIMEOUT"))).toBe(false); + }); + + it("still does not retry OOM kills (handled by the separate machine-bump path)", () => { + expect(shouldRetryError(internal("TASK_PROCESS_OOM_KILLED"))).toBe(false); + expect(shouldRetryError(internal("TASK_PROCESS_MAYBE_OOM_KILLED"))).toBe(false); + }); +});