diff --git a/.changeset/errors-api-schemas.md b/.changeset/errors-api-schemas.md new file mode 100644 index 00000000000..023ca825885 --- /dev/null +++ b/.changeset/errors-api-schemas.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Add request and response schemas for the new Errors API (error groups). These back the env-scoped HTTP endpoints for listing error groups, retrieving a single group, and changing its state (resolve, ignore, unresolve), plus a `filter[error]` option on the runs list to fetch the runs behind a group. Exported from `@trigger.dev/core/v3` so the SDK can reuse them. diff --git a/.claude/skills/errors-api-e2e/SKILL.md b/.claude/skills/errors-api-e2e/SKILL.md new file mode 100644 index 00000000000..e74c50bf5ea --- /dev/null +++ b/.claude/skills/errors-api-e2e/SKILL.md @@ -0,0 +1,199 @@ +--- +name: errors-api-e2e +description: End-to-end smoke test for the public Errors HTTP API (error groups). Seeds failed runs into ClickHouse so the error materialized views populate, then drives the real endpoints against the running webapp — list (with filters + pagination), retrieve, resolve/ignore/unresolve, the `filter[error]` runs filter, user attribution via the `trigger.dev mint-token` -> JWT exchange, and the 401/403/404 negatives. Use for "smoke test the errors API", "test the errors API e2e", "prove the errors endpoints work", or to re-verify after changes. +allowed-tools: Read, Bash +--- + +# Errors API — end-to-end smoke test + +Proves the public Errors API against the **running** webapp with real HTTP. No +mocks. The error data plane is ClickHouse (`errors_v1` + `error_occurrences_v1`, +both materialized-view-fed from `task_runs_v2`) plus Postgres `ErrorGroupState` +for lifecycle status; this skill seeds straight into `task_runs_v2` and lets the +MVs do the rest. + +Code under test: +- `apps/webapp/app/routes/api.v1.errors.ts` — `GET /api/v1/errors` (list). 
+- `apps/webapp/app/routes/api.v1.errors.$errorId.ts` — `GET /api/v1/errors/:errorId` (detail). +- `apps/webapp/app/routes/api.v1.errors.$errorId.{resolve,ignore,unresolve}.ts` — state actions. +- `apps/webapp/app/presenters/v3/ApiErrorListPresenter.server.ts` / `ApiErrorGroupPresenter.server.ts`. +- `apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts` — the `filter[error]` addition on `GET /api/v1/runs`. +- `apps/webapp/app/v3/services/errorGroupActions.server.ts` — resolve/ignore/unresolve (nullable `userId`). +- Attribution: `api.v1.projects.$projectRef.$env.jwt.ts` stamps `act:{sub}` for PAT **and** UAT exchanges; `@trigger.dev/rbac` surfaces `act.sub` through bearer auth; the action handlers read `authentication.actor?.sub`. + +`errorId` is `error_<fingerprint>` (round-trips via `ErrorId` in `@trigger.dev/core/v3/isomorphic`). + +## Prerequisites + +- Webapp running on http://localhost:3030 (`pnpm run dev --filter webapp`). Confirm `curl -s http://localhost:3030/healthcheck`. +- DB seeded (`pnpm run db:seed`), and a local ClickHouse reachable at `CLICKHOUSE_URL` (the `pnpm run docker` stack). +- The CLI built + logged in to localhost:3030 (`pnpm run build --filter trigger.dev`; profile `default` points at localhost:3030). Needed only for the attribution leg. + +> Important wiring facts the seed relies on (verified): +> - The MVs read the error type/message from `error.data.*`, so the seeded +> `error` JSON column **must** be wrapped: `{"data": {"type": ..., "message": ..., "stack": ...}}`. +> - The MVs only fire for failed statuses: `SYSTEM_FAILURE | CRASHED | INTERRUPTED | COMPLETED_WITH_ERRORS | TIMED_OUT`, and require a non-empty `error_fingerprint`. +> - `GET /api/v1/runs` lists run **ids** from ClickHouse but **hydrates from Postgres** `TaskRun`. So the error-list/detail/action legs work from a ClickHouse-only seed, but the `filter[error]` leg needs a **paired** Postgres `TaskRun` row whose `id` equals the ClickHouse `run_id`.
+ +Run everything from the repo root in one shell. Invoke the built CLI via a +function (a `CLI="node …"` variable won't word-split under zsh): +```bash +cli() { node packages/cli-v3/dist/esm/index.js "$@"; } +PROFILE=default +``` + +## Setup — resolve a dev environment + connection strings + +```bash +cd apps/webapp +CHURL=$(grep -E "^CLICKHOUSE_URL=" .env | head -1 | cut -d= -f2- | tr -d '"') +DBURL=$(grep -E "^DATABASE_URL=" .env | head -1 | cut -d= -f2- | tr -d '"' | tr -d "'" | sed 's/?.*//') + +# Pick the seeded hello-world dev env (proj_rrkpdguyagvsoktglnod). Adjust the +# WHERE if you want a different project. +read ENV ORG PROJ REF < <(psql "$DBURL" -t -A -F' ' -c " + SELECT re.id, re.\"organizationId\", re.\"projectId\", p.\"externalRef\" + FROM \"RuntimeEnvironment\" re + JOIN \"Project\" p ON p.id = re.\"projectId\" + WHERE re.slug='dev' AND p.\"externalRef\"='proj_rrkpdguyagvsoktglnod' LIMIT 1;") +APIKEY=$(psql "$DBURL" -t -A -c "SELECT \"apiKey\" FROM \"RuntimeEnvironment\" WHERE id='$ENV';") +cd .. +H="Authorization: Bearer $APIKEY" +B="http://localhost:3030" +``` + +## Steps + +### 1. 
Seed two error groups (ClickHouse, MV-fed) + +```bash +RUN=$(node -e 'console.log(Date.now().toString(36))') +TASK="errors-api-e2e-$RUN"; FP_A="fpA${RUN}"; FP_B="fpB${RUN}" +ERRID_A="error_$FP_A"; ERRID_B="error_$FP_B" +NOW_CH=$(node -e 'console.log(new Date().toISOString().replace("T"," ").replace("Z","").slice(0,23))') +NOW_MS=$(node -e 'console.log(Date.now())') +Q=$(python3 -c "import urllib.parse;print(urllib.parse.quote('INSERT INTO trigger_dev.task_runs_v2 FORMAT JSONEachRow'))") + +mkrow() { # status fingerprint errorType message runId + echo "{\"environment_id\":\"$ENV\",\"organization_id\":\"$ORG\",\"project_id\":\"$PROJ\",\"run_id\":\"$5\",\"friendly_id\":\"run_$5\",\"status\":\"$1\",\"environment_type\":\"DEVELOPMENT\",\"engine\":\"V2\",\"task_identifier\":\"$TASK\",\"created_at\":\"$NOW_CH\",\"updated_at\":\"$NOW_CH\",\"error\":{\"data\":{\"type\":\"$3\",\"message\":\"$4\",\"stack\":\"at x (a.ts:1:1)\"}},\"error_fingerprint\":\"$2\",\"task_version\":\"20240101.1\",\"_version\":\"$NOW_MS\",\"_is_deleted\":0}" +} +ROWS="$(mkrow COMPLETED_WITH_ERRORS $FP_A AlphaBoom 'alpha boom happened' r_a1_$RUN) +$(mkrow COMPLETED_WITH_ERRORS $FP_A AlphaBoom 'alpha boom happened' r_a2_$RUN) +$(mkrow CRASHED $FP_B BetaCrash 'beta crash happened' r_b1_$RUN)" +printf '%s' "$ROWS" | curl -s "$CHURL/?query=$Q" --data-binary @- + +# Poll until both fingerprints appear in errors_v1 (the MV is near-instant locally). +for i in $(seq 1 10); do + N=$(curl -s "$CHURL" --data-binary "SELECT count() FROM (SELECT 1 FROM trigger_dev.errors_v1 WHERE environment_id='$ENV' AND error_fingerprint IN ('$FP_A','$FP_B') GROUP BY error_fingerprint)") + [ "$N" = "2" ] && break; sleep 1 +done +echo "seeded fingerprints in errors_v1: $N (want 2)" +``` +PASS: `N = 2`. Alpha has 2 occurrences, beta 1. + +### 2. 
List + filters + pagination + +```bash +curl -s "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK&filter%5Bperiod%5D=1d" -H "$H" \ + | python3 -c "import sys,json;d=json.load(sys.stdin);print('count',len(d['data']),[(e['id'],e['status'],e['count']) for e in d['data']])" +``` +PASS: 2 groups, both `status=unresolved`, alpha `count=2`, beta `count=1`, ids `error_<fingerprint>`. + +Assert each filter narrows correctly (each should return the noted shape): +```bash +curl -s "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK&filter%5Bstatus%5D=unresolved&filter%5Bperiod%5D=1d" -H "$H" | python3 -c "import sys,json;print('unresolved:',len(json.load(sys.stdin)['data']))" # 2 +curl -s "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK&filter%5Bsearch%5D=AlphaBoom&filter%5Bperiod%5D=1d" -H "$H" | python3 -c "import sys,json;print('search:',[e['errorType'] for e in json.load(sys.stdin)['data']])" # ['AlphaBoom'] +curl -s "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK&filter%5Bperiod%5D=1d&page%5Bsize%5D=1" -H "$H" | python3 -c "import sys,json;d=json.load(sys.stdin);print('page size 1:',len(d['data']),'next?',bool(d['pagination'].get('next')))" # 1 / True +``` +PASS: `unresolved: 2`, `search: ['AlphaBoom']`, `page size 1: 1 / next? True`. + +### 3. Retrieve detail + +```bash +curl -s "$B/api/v1/errors/$ERRID_A" -H "$H" \ + | python3 -c "import sys,json;d=json.load(sys.stdin);print(d['id'],d['errorType'],d['status'],d['count'],d['affectedVersions'],d['resolvedBy'])" +``` +PASS: `error_<FP_A> AlphaBoom unresolved 2 ['20240101.1'] None`. + +### 4.
Resolve / ignore / unresolve (env API key — `resolvedBy` null) + +```bash +st(){ python3 -c "import sys,json;d=json.load(sys.stdin);print('status',d['status'],'| resolvedInVersion',d['resolvedInVersion'],'| resolvedBy',d['resolvedBy'],'| ignoredUntil',bool(d['ignoredUntil']),'| reason',d['ignoredReason'])"; } + +curl -s -X POST "$B/api/v1/errors/$ERRID_A/resolve" -H "$H" -H 'Content-Type: application/json' -d '{"resolvedInVersion":"20240101.1"}' >/dev/null +curl -s "$B/api/v1/errors/$ERRID_A" -H "$H" | st # status resolved | resolvedInVersion 20240101.1 | resolvedBy None + +curl -s -X POST "$B/api/v1/errors/$ERRID_B/ignore" -H "$H" -H 'Content-Type: application/json' -d '{"duration":3600000,"reason":"known flake"}' >/dev/null +curl -s "$B/api/v1/errors/$ERRID_B" -H "$H" | st # status ignored | ignoredUntil True | reason known flake + +curl -s -X POST "$B/api/v1/errors/$ERRID_A/unresolve" -H "$H" >/dev/null +curl -s "$B/api/v1/errors/$ERRID_A" -H "$H" | st # status unresolved +``` +PASS: each transition reflected; `filter[status]=ignored` returns only beta: +```bash +curl -s "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK&filter%5Bstatus%5D=ignored&filter%5Bperiod%5D=1d" -H "$H" | python3 -c "import sys,json;print([e['id'] for e in json.load(sys.stdin)['data']])" # [error_] +``` + +### 5. 
`filter[error]` on the runs list (paired PG + CH seed) + +The runs list hydrates from Postgres, so seed a matching `TaskRun` row + a CH row +that share `run_id`/`id` and carry a fingerprint: +```bash +RID="re2e${RUN}"; FRID="run_${RID}"; FP_R="fpR${RUN}" +psql "$DBURL" -v ON_ERROR_STOP=1 -c " + INSERT INTO \"TaskRun\" (id, \"friendlyId\", \"taskIdentifier\", payload, \"traceId\", \"spanId\", \"runtimeEnvironmentId\", \"projectId\", queue, status, \"createdAt\", \"updatedAt\") + VALUES ('$RID','$FRID','$TASK','{}','trace_$RID','span_$RID','$ENV','$PROJ','task/$TASK','COMPLETED_WITH_ERRORS', now(), now()) + ON CONFLICT (id) DO NOTHING;" >/dev/null +ROW="{\"environment_id\":\"$ENV\",\"organization_id\":\"$ORG\",\"project_id\":\"$PROJ\",\"run_id\":\"$RID\",\"friendly_id\":\"$FRID\",\"status\":\"COMPLETED_WITH_ERRORS\",\"environment_type\":\"DEVELOPMENT\",\"engine\":\"V2\",\"task_identifier\":\"$TASK\",\"created_at\":\"$NOW_CH\",\"updated_at\":\"$NOW_CH\",\"error\":{\"data\":{\"type\":\"RunsFilterErr\",\"message\":\"for runs filter\",\"stack\":\"at x\"}},\"error_fingerprint\":\"$FP_R\",\"task_version\":\"20240101.1\",\"_version\":\"$NOW_MS\",\"_is_deleted\":0}" +printf '%s' "$ROW" | curl -s "$CHURL/?query=$Q" --data-binary @- +sleep 1 +curl -s "$B/api/v1/runs?filter%5Berror%5D=error_$FP_R" -H "$H" | python3 -c "import sys,json;d=json.load(sys.stdin);print('runs:',[r['id'] for r in d['data']])" +``` +PASS: one run, `run_` (status maps to `FAILED`). Proves `filter[error]` -> fingerprint -> CH -> PG hydration. + +### 6. 
Attribution — `mint-token` -> JWT exchange records the acting user + +```bash +TOKEN=$(cli mint-token --profile $PROFILE --client errors-api-e2e 2>/dev/null) # UAT +ENVJWT=$(curl -sS -X POST "$B/api/v1/projects/$REF/dev/jwt" -H "Authorization: Bearer $TOKEN" \ + -H 'Content-Type: application/json' -d '{"claims":{"scopes":["read:errors","write:errors"]}}' \ + | python3 -c "import sys,json;print(json.load(sys.stdin)['token'])") +# Decoded env JWT carries act.sub = the user id. +node -e 'const p=JSON.parse(Buffer.from(process.argv[1].split(".")[1],"base64url").toString());console.log("act:",JSON.stringify(p.act))' "$ENVJWT" + +curl -s -X POST "$B/api/v1/errors/$ERRID_A/resolve" -H "Authorization: Bearer $ENVJWT" \ + -H 'Content-Type: application/json' -d '{"resolvedInVersion":"20240101.2"}' >/dev/null +curl -s "$B/api/v1/errors/$ERRID_A" -H "$H" | python3 -c "import sys,json;d=json.load(sys.stdin);print('resolvedBy:',d['resolvedBy'])" +``` +PASS: `act.sub` is the user id (matches `cli whoami`), and `detail.resolvedBy` equals that user id (not null). A plain env key leaves it null (step 4). A **PAT** exchanged the same way also stamps `act` — repeat with the stored PAT to confirm `ignoredByUserId` attribution. + +### 7. 
Negatives + +```bash +curl -s -o /dev/null -w 'unknown id: %{http_code} (404)\n' "$B/api/v1/errors/error_doesnotexist0000" -H "$H" +curl -s -o /dev/null -w 'no auth list: %{http_code} (401)\n' "$B/api/v1/errors" +curl -s -o /dev/null -w 'no auth resolve: %{http_code} (401)\n' -X POST "$B/api/v1/errors/$ERRID_B/resolve" -H 'Content-Type: application/json' -d '{}' + +# read-only JWT must be denied on write, allowed on read +READJWT=$(curl -sS -X POST "$B/api/v1/projects/$REF/dev/jwt" -H "Authorization: Bearer $TOKEN" \ + -H 'Content-Type: application/json' -d '{"claims":{"scopes":["read:errors"]}}' | python3 -c "import sys,json;print(json.load(sys.stdin)['token'])") +curl -s -o /dev/null -w 'read JWT write: %{http_code} (403)\n' -X POST "$B/api/v1/errors/$ERRID_B/resolve" -H "Authorization: Bearer $READJWT" -H 'Content-Type: application/json' -d '{}' +curl -s -o /dev/null -w 'read JWT read: %{http_code} (200)\n' "$B/api/v1/errors?filter%5BtaskIdentifier%5D=$TASK" -H "Authorization: Bearer $READJWT" +``` +PASS: `404`, `401`, `401`, `403`, `200` respectively. + +## Result + +Report PASS only if: step 1 lands 2 groups in `errors_v1`; step 2's filters and +pagination narrow correctly; step 3 returns the detail; step 4's resolve/ignore/ +unresolve flip status (and `filter[status]` follows); step 5's `filter[error]` +returns the paired run; step 6 records `resolvedBy` = the acting user via the +JWT exchange (null with a plain env key); and step 7 returns 404/401/401/403/200. +A red leg is a bug or a missing prereq — report the exact status + body and file +a Linear issue, don't tune around it. + +## Notes / gotchas + +- Run files use a unique `$RUN` suffix per invocation, so reruns don't collide and seeded rows stay isolated by their unique task identifier. They are local-dev test rows (90-day ClickHouse TTL); no cleanup required. 
+- After **adding** the route files, the classic Remix dev compiler may not register them until a dev-server restart (a stale manifest returns Remix's HTML 404 on the new paths). If `POST …/resolve` returns a 404 HTML page rather than 401/200, restart `pnpm run dev --filter webapp`. +- The rbac `act` extraction lives in `@trigger.dev/rbac` (a built dep). After editing it, `pnpm run build --filter @trigger.dev/rbac` and restart the webapp so the attribution leg (step 6) reflects the change. diff --git a/apps/webapp/app/presenters/v3/ApiErrorGroupPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiErrorGroupPresenter.server.ts new file mode 100644 index 00000000000..2ecf7cccc86 --- /dev/null +++ b/apps/webapp/app/presenters/v3/ApiErrorGroupPresenter.server.ts @@ -0,0 +1,227 @@ +import { type ErrorGroupDetail } from "@trigger.dev/core/v3"; +import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; +import { type ErrorGroupStatus } from "@trigger.dev/database"; +import { type ApiAuthenticationResultSuccess } from "~/services/apiAuth.server"; +import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server"; +import { sortVersionsDescending } from "~/utils/semver"; +import { BasePresenter } from "./basePresenter.server"; + +/** + * Resolves the friendly `error_<fingerprint>` id from the URL to the full error + * group detail for the authenticated environment, or `undefined` if it doesn't + * exist. Shared by the detail loader and the resolve/ignore/unresolve actions.
+ */
+export function findErrorGroupResource(
+  authentication: ApiAuthenticationResultSuccess,
+  errorId: string
+): Promise<ErrorGroupDetail | undefined> {
+  const fingerprint = ErrorId.toId(errorId);
+  return new ApiErrorGroupPresenter().call(
+    authentication.environment.organizationId,
+    authentication.environment.project.id,
+    authentication.environment.id,
+    fingerprint
+  );
+}
+
+const DB_STATUS_TO_API: Record<ErrorGroupStatus, ErrorGroupDetail["status"]> = {
+  UNRESOLVED: "unresolved",
+  RESOLVED: "resolved",
+  IGNORED: "ignored",
+};
+
+function parseClickHouseDateTime(value: string): Date {
+  const asNum = Number(value);
+  if (!isNaN(asNum) && asNum > 1e12) {
+    return new Date(asNum);
+  }
+  return new Date(value.replace(" ", "T") + "Z");
+}
+
+export class ApiErrorGroupPresenter extends BasePresenter {
+  /**
+   * Resolves a single error group to its API detail shape, or `undefined` if no
+   * such fingerprint exists in the environment (the route turns that into 404).
+   * Reuses the same ClickHouse query builders + `ErrorGroupState` reads the
+   * dashboard presenter uses.
+   */
+  public async call(
+    organizationId: string,
+    projectId: string,
+    environmentId: string,
+    fingerprint: string
+  ): Promise<ErrorGroupDetail | undefined> {
+    return this.trace("call", async () => {
+      const clickhouse = await clickhouseFactory.getClickhouseForOrganization(
+        organizationId,
+        "logs"
+      );
+
+      const summary = await this.getSummary(
+        clickhouse,
+        organizationId,
+        projectId,
+        environmentId,
+        fingerprint
+      );
+
+      if (!summary) {
+        return undefined;
+      }
+
+      const [affectedVersions, state] = await Promise.all([
+        this.getAffectedVersions(clickhouse, organizationId, projectId, environmentId, fingerprint),
+        this.getState(environmentId, summary.taskIdentifier, fingerprint),
+      ]);
+
+      return {
+        id: ErrorId.toFriendlyId(fingerprint),
+        fingerprint,
+        taskIdentifier: summary.taskIdentifier,
+        errorType: summary.errorType,
+        errorMessage: summary.errorMessage,
+        count: summary.count,
+        firstSeen: summary.firstSeen,
+        lastSeen: summary.lastSeen,
+        affectedVersions,
+        status: state ? DB_STATUS_TO_API[state.status] : "unresolved",
+        resolvedAt: state?.resolvedAt ?? null,
+        resolvedInVersion: state?.resolvedInVersion ?? null,
+        resolvedBy: state?.resolvedBy ?? null,
+        ignoredAt: state?.ignoredAt ?? null,
+        ignoredUntil: state?.ignoredUntil ?? null,
+        ignoredReason: state?.ignoredReason ?? null,
+        ignoredByUserId: state?.ignoredByUserId ?? null,
+        ignoredUntilOccurrenceRate: state?.ignoredUntilOccurrenceRate ?? null,
+        ignoredUntilTotalOccurrences: state?.ignoredUntilTotalOccurrences ??
null,
+      };
+    });
+  }
+
+  private async getSummary(
+    clickhouse: Awaited<ReturnType<typeof clickhouseFactory.getClickhouseForOrganization>>,
+    organizationId: string,
+    projectId: string,
+    environmentId: string,
+    fingerprint: string
+  ): Promise<
+    | {
+        taskIdentifier: string;
+        errorType: string;
+        errorMessage: string;
+        count: number;
+        firstSeen: Date;
+        lastSeen: Date;
+      }
+    | undefined
+  > {
+    const queryBuilder = clickhouse.errors.listQueryBuilder();
+
+    queryBuilder.where("organization_id = {organizationId: String}", { organizationId });
+    queryBuilder.where("project_id = {projectId: String}", { projectId });
+    queryBuilder.where("environment_id = {environmentId: String}", { environmentId });
+    queryBuilder.where("error_fingerprint = {fingerprint: String}", { fingerprint });
+    queryBuilder.groupBy("error_fingerprint, task_identifier");
+    queryBuilder.limit(1);
+
+    const [queryError, records] = await queryBuilder.execute();
+
+    if (queryError) {
+      throw queryError;
+    }
+
+    if (!records || records.length === 0) {
+      return undefined;
+    }
+
+    const record = records[0];
+    return {
+      taskIdentifier: record.task_identifier,
+      errorType: record.error_type,
+      errorMessage: record.error_message,
+      count: record.occurrence_count,
+      firstSeen: parseClickHouseDateTime(record.first_seen),
+      lastSeen: parseClickHouseDateTime(record.last_seen),
+    };
+  }
+
+  private async getAffectedVersions(
+    clickhouse: Awaited<ReturnType<typeof clickhouseFactory.getClickhouseForOrganization>>,
+    organizationId: string,
+    projectId: string,
+    environmentId: string,
+    fingerprint: string
+  ): Promise<string[]> {
+    const queryBuilder = clickhouse.errors.affectedVersionsQueryBuilder();
+
+    queryBuilder.where("organization_id = {organizationId: String}", { organizationId });
+    queryBuilder.where("project_id = {projectId: String}", { projectId });
+    queryBuilder.where("environment_id = {environmentId: String}", { environmentId });
+    queryBuilder.where("error_fingerprint = {fingerprint: String}", { fingerprint });
+    queryBuilder.where("task_version != ''");
+    queryBuilder.limit(100);
+
+    const [queryError, records] = await
queryBuilder.execute(); + + if (queryError || !records) { + return []; + } + + const versions = records.map((r) => r.task_version).filter((v) => v.length > 0); + return sortVersionsDescending(versions).slice(0, 5); + } + + private async getState( + environmentId: string, + taskIdentifier: string, + fingerprint: string + ): Promise<{ + status: ErrorGroupStatus; + resolvedAt: Date | null; + resolvedInVersion: string | null; + resolvedBy: string | null; + ignoredAt: Date | null; + ignoredUntil: Date | null; + ignoredReason: string | null; + ignoredByUserId: string | null; + ignoredUntilOccurrenceRate: number | null; + ignoredUntilTotalOccurrences: number | null; + } | null> { + const row = await this._replica.errorGroupState.findFirst({ + where: { + environmentId, + taskIdentifier, + errorFingerprint: fingerprint, + }, + select: { + status: true, + resolvedAt: true, + resolvedInVersion: true, + resolvedBy: true, + ignoredAt: true, + ignoredUntil: true, + ignoredReason: true, + ignoredByUserId: true, + ignoredUntilOccurrenceRate: true, + ignoredUntilTotalOccurrences: true, + }, + }); + + if (!row) { + return null; + } + + return { + status: row.status, + resolvedAt: row.resolvedAt, + resolvedInVersion: row.resolvedInVersion, + resolvedBy: row.resolvedBy, + ignoredAt: row.ignoredAt, + ignoredUntil: row.ignoredUntil, + ignoredReason: row.ignoredReason, + ignoredByUserId: row.ignoredByUserId, + ignoredUntilOccurrenceRate: row.ignoredUntilOccurrenceRate, + ignoredUntilTotalOccurrences: row.ignoredUntilTotalOccurrences, + }; + } +} diff --git a/apps/webapp/app/presenters/v3/ApiErrorListPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiErrorListPresenter.server.ts new file mode 100644 index 00000000000..db86753a6fb --- /dev/null +++ b/apps/webapp/app/presenters/v3/ApiErrorListPresenter.server.ts @@ -0,0 +1,164 @@ +import { type ErrorGroupListItem } from "@trigger.dev/core/v3"; +import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; +import { type ErrorGroupStatus, 
type Project, type RuntimeEnvironment } from "@trigger.dev/database";
+import { z } from "zod";
+import { clickhouseFactory } from "~/services/clickhouse/clickhouseFactoryInstance.server";
+import { getCurrentPlan } from "~/services/platform.v3.server";
+import { CoercedDate } from "~/utils/zod";
+import {
+  ErrorsListPresenter,
+  type ErrorsListOptions,
+} from "./ErrorsListPresenter.server";
+import { BasePresenter } from "./basePresenter.server";
+
+// API status (lowercase) <-> DB ErrorGroupState.status (uppercase).
+const API_STATUS_TO_DB: Record<string, ErrorGroupStatus> = {
+  unresolved: "UNRESOLVED",
+  resolved: "RESOLVED",
+  ignored: "IGNORED",
+};
+
+const DB_STATUS_TO_API: Record<ErrorGroupStatus, ErrorGroupListItem["status"]> = {
+  UNRESOLVED: "unresolved",
+  RESOLVED: "resolved",
+  IGNORED: "ignored",
+};
+
+export const ApiErrorListSearchParams = z.object({
+  "page[size]": z.coerce.number().int().positive().min(1).max(100).optional(),
+  "page[after]": z.string().optional(),
+  "page[before]": z.string().optional(),
+  "filter[taskIdentifier]": z
+    .string()
+    .optional()
+    .transform((value) => (value ? value.split(",") : undefined)),
+  "filter[version]": z
+    .string()
+    .optional()
+    .transform((value) => (value ? value.split(",") : undefined)),
+  "filter[status]": z
+    .string()
+    .optional()
+    .transform((value, ctx) => {
+      if (!value) {
+        return undefined;
+      }
+
+      const statuses = value.split(",");
+      // hasOwnProperty, not `in`: `in` walks the prototype chain, so
+      // `filter[status]=toString` would pass and map to a function.
+      const invalid = statuses.filter(
+        (status) => !Object.prototype.hasOwnProperty.call(API_STATUS_TO_DB, status)
+      );
+
+      if (invalid.length > 0) {
+        ctx.addIssue({
+          code: z.ZodIssueCode.custom,
+          message: `Invalid status values: ${invalid.join(
+            ", "
+          )}.
Allowed: unresolved, resolved, ignored.`,
+        });
+        return z.NEVER;
+      }
+
+      return Array.from(new Set(statuses.map((status) => API_STATUS_TO_DB[status])));
+    }),
+  "filter[search]": z.string().max(1000).optional(),
+  "filter[period]": z.string().optional(),
+  "filter[from]": CoercedDate,
+  "filter[to]": CoercedDate,
+});
+
+type ApiErrorListSearchParams = z.infer<typeof ApiErrorListSearchParams>;
+
+export class ApiErrorListPresenter extends BasePresenter {
+  public async call(
+    project: Pick<Project, "id">,
+    environment: Pick<RuntimeEnvironment, "id" | "organizationId">,
+    searchParams: ApiErrorListSearchParams
+  ): Promise<{
+    data: ErrorGroupListItem[];
+    pagination: { next?: string; previous?: string };
+  }> {
+    return this.trace("call", async () => {
+      const options: ErrorsListOptions = {
+        projectId: project.id,
+        defaultPeriod: "1d",
+      };
+
+      if (searchParams["page[size]"]) {
+        options.pageSize = searchParams["page[size]"];
+      }
+
+      if (searchParams["page[after]"]) {
+        options.cursor = searchParams["page[after]"];
+        options.direction = "forward";
+      }
+
+      if (searchParams["page[before]"]) {
+        options.cursor = searchParams["page[before]"];
+        options.direction = "backward";
+      }
+
+      if (searchParams["filter[taskIdentifier]"]) {
+        options.tasks = searchParams["filter[taskIdentifier]"];
+      }
+
+      if (searchParams["filter[version]"]) {
+        options.versions = searchParams["filter[version]"];
+      }
+
+      if (searchParams["filter[status]"]) {
+        options.statuses = searchParams["filter[status]"];
+      }
+
+      if (searchParams["filter[search]"]) {
+        options.search = searchParams["filter[search]"];
+      }
+
+      if (searchParams["filter[period]"]) {
+        options.period = searchParams["filter[period]"];
+      }
+
+      if (searchParams["filter[from]"]) {
+        options.from = searchParams["filter[from]"].getTime();
+      }
+
+      if (searchParams["filter[to]"]) {
+        options.to = searchParams["filter[to]"].getTime();
+      }
+
+      const organizationId = environment.organizationId;
+
+      const plan = await getCurrentPlan(organizationId);
+      options.retentionLimitDays =
plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; + + // The errors data lives in the "logs" ClickHouse client, matching the + // dashboard list loader. + const clickhouse = await clickhouseFactory.getClickhouseForOrganization( + organizationId, + "logs" + ); + + const presenter = new ErrorsListPresenter(this._replica, clickhouse); + const result = await presenter.call(organizationId, environment.id, options); + + return { + data: result.errorGroups.map((group) => ({ + id: ErrorId.toFriendlyId(group.fingerprint), + fingerprint: group.fingerprint, + taskIdentifier: group.taskIdentifier, + errorType: group.errorType, + errorMessage: group.errorMessage, + status: DB_STATUS_TO_API[group.status], + count: group.count, + firstSeen: group.firstSeen, + lastSeen: group.lastSeen, + resolvedAt: group.resolvedAt, + ignoredUntil: group.ignoredUntil, + })), + pagination: result.pagination, + }; + }); + } +} diff --git a/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts b/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts index 0e7077b3dfc..a48a9a29e6d 100644 --- a/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ApiRunListPresenter.server.ts @@ -83,6 +83,8 @@ export const ApiRunListSearchParams = z.object({ }), "filter[bulkAction]": z.string().optional(), "filter[schedule]": z.string().optional(), + // An `error_` id — lists the runs behind an error group. 
+ "filter[error]": z.string().optional(), "filter[isTest]": z .string() .optional() @@ -237,6 +239,10 @@ export class ApiRunListPresenter extends BasePresenter { options.scheduleId = searchParams["filter[schedule]"]; } + if (searchParams["filter[error]"]) { + options.errorId = searchParams["filter[error]"]; + } + if (searchParams["filter[createdAt][from]"]) { options.from = searchParams["filter[createdAt][from]"].getTime(); } diff --git a/apps/webapp/app/routes/api.v1.errors.$errorId.ignore.ts b/apps/webapp/app/routes/api.v1.errors.$errorId.ignore.ts new file mode 100644 index 00000000000..4b0ad2ab671 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.errors.$errorId.ignore.ts @@ -0,0 +1,58 @@ +import { json } from "@remix-run/server-runtime"; +import { IgnoreErrorRequestBody } from "@trigger.dev/core/v3"; +import { z } from "zod"; +import { findErrorGroupResource } from "~/presenters/v3/ApiErrorGroupPresenter.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { ErrorGroupActions } from "~/v3/services/errorGroupActions.server"; + +const ParamsSchema = z.object({ + errorId: z.string(), +}); + +export const { action, loader } = createActionApiRoute( + { + params: ParamsSchema, + body: IgnoreErrorRequestBody, + method: "POST", + allowJWT: true, + corsStrategy: "all", + findResource: async (params, authentication) => + findErrorGroupResource(authentication, params.errorId), + authorization: { + action: "write", + resource: (params) => ({ type: "errors", id: params.errorId }), + }, + }, + async ({ authentication, body, resource, params }) => { + if (!resource) { + return json({ error: "Not found" }, { status: 404 }); + } + + const environment = authentication.environment; + + await new ErrorGroupActions().ignoreError( + { + organizationId: environment.organizationId, + projectId: environment.project.id, + environmentId: environment.id, + taskIdentifier: resource.taskIdentifier, + errorFingerprint: resource.fingerprint, + 
}, + { + userId: authentication.actor?.sub ?? null, + duration: body.duration, + occurrenceRateThreshold: body.occurrenceRate, + totalOccurrencesThreshold: body.totalOccurrences, + // The "re-surface after N more occurrences" threshold is relative to + // the count at ignore time. The resolved resource's `count` is the + // group's current global occurrence count (same source the dashboard + // uses), so reuse it instead of issuing a second query. + occurrenceCountAtIgnoreTime: body.totalOccurrences ? resource.count : undefined, + reason: body.reason, + } + ); + + const updated = await findErrorGroupResource(authentication, params.errorId); + return json(updated); + } +); diff --git a/apps/webapp/app/routes/api.v1.errors.$errorId.resolve.ts b/apps/webapp/app/routes/api.v1.errors.$errorId.resolve.ts new file mode 100644 index 00000000000..e0818c89f49 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.errors.$errorId.resolve.ts @@ -0,0 +1,50 @@ +import { json } from "@remix-run/server-runtime"; +import { ResolveErrorRequestBody } from "@trigger.dev/core/v3"; +import { z } from "zod"; +import { findErrorGroupResource } from "~/presenters/v3/ApiErrorGroupPresenter.server"; +import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +import { ErrorGroupActions } from "~/v3/services/errorGroupActions.server"; + +const ParamsSchema = z.object({ + errorId: z.string(), +}); + +export const { action, loader } = createActionApiRoute( + { + params: ParamsSchema, + body: ResolveErrorRequestBody, + method: "POST", + allowJWT: true, + corsStrategy: "all", + findResource: async (params, authentication) => + findErrorGroupResource(authentication, params.errorId), + authorization: { + action: "write", + resource: (params) => ({ type: "errors", id: params.errorId }), + }, + }, + async ({ authentication, body, resource, params }) => { + if (!resource) { + return json({ error: "Not found" }, { status: 404 }); + } + + const environment = 
authentication.environment; + + await new ErrorGroupActions().resolveError( + { + organizationId: environment.organizationId, + projectId: environment.project.id, + environmentId: environment.id, + taskIdentifier: resource.taskIdentifier, + errorFingerprint: resource.fingerprint, + }, + { + userId: authentication.actor?.sub ?? null, + resolvedInVersion: body.resolvedInVersion, + } + ); + + const updated = await findErrorGroupResource(authentication, params.errorId); + return json(updated); + } +); diff --git a/apps/webapp/app/routes/api.v1.errors.$errorId.ts b/apps/webapp/app/routes/api.v1.errors.$errorId.ts new file mode 100644 index 00000000000..a1dd99e5551 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.errors.$errorId.ts @@ -0,0 +1,25 @@ +import { json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { findErrorGroupResource } from "~/presenters/v3/ApiErrorGroupPresenter.server"; +import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; + +const ParamsSchema = z.object({ + errorId: z.string(), +}); + +export const loader = createLoaderApiRoute( + { + params: ParamsSchema, + allowJWT: true, + corsStrategy: "all", + findResource: async (params, authentication) => + findErrorGroupResource(authentication, params.errorId), + authorization: { + action: "read", + resource: (_resource, params) => ({ type: "errors", id: params.errorId }), + }, + }, + async ({ resource }) => { + return json(resource); + } +); diff --git a/apps/webapp/app/routes/api.v1.errors.$errorId.unresolve.ts b/apps/webapp/app/routes/api.v1.errors.$errorId.unresolve.ts new file mode 100644 index 00000000000..9362b7c4c4b --- /dev/null +++ b/apps/webapp/app/routes/api.v1.errors.$errorId.unresolve.ts @@ -0,0 +1,42 @@ +import { json } from "@remix-run/server-runtime"; +import { z } from "zod"; +import { findErrorGroupResource } from "~/presenters/v3/ApiErrorGroupPresenter.server"; +import { createActionApiRoute } from 
"~/services/routeBuilders/apiBuilder.server"; +import { ErrorGroupActions } from "~/v3/services/errorGroupActions.server"; + +const ParamsSchema = z.object({ + errorId: z.string(), +}); + +export const { action, loader } = createActionApiRoute( + { + params: ParamsSchema, + method: "POST", + allowJWT: true, + corsStrategy: "all", + findResource: async (params, authentication) => + findErrorGroupResource(authentication, params.errorId), + authorization: { + action: "write", + resource: (params) => ({ type: "errors", id: params.errorId }), + }, + }, + async ({ authentication, resource, params }) => { + if (!resource) { + return json({ error: "Not found" }, { status: 404 }); + } + + const environment = authentication.environment; + + await new ErrorGroupActions().unresolveError({ + organizationId: environment.organizationId, + projectId: environment.project.id, + environmentId: environment.id, + taskIdentifier: resource.taskIdentifier, + errorFingerprint: resource.fingerprint, + }); + + const updated = await findErrorGroupResource(authentication, params.errorId); + return json(updated); + } +); diff --git a/apps/webapp/app/routes/api.v1.errors.ts b/apps/webapp/app/routes/api.v1.errors.ts new file mode 100644 index 00000000000..2826b217604 --- /dev/null +++ b/apps/webapp/app/routes/api.v1.errors.ts @@ -0,0 +1,29 @@ +import { json } from "@remix-run/server-runtime"; +import { + ApiErrorListPresenter, + ApiErrorListSearchParams, +} from "~/presenters/v3/ApiErrorListPresenter.server"; +import { createLoaderApiRoute } from "~/services/routeBuilders/apiBuilder.server"; + +export const loader = createLoaderApiRoute( + { + searchParams: ApiErrorListSearchParams, + allowJWT: true, + corsStrategy: "all", + authorization: { + action: "read", + resource: () => ({ type: "errors" }), + }, + findResource: async () => 1, // Collection route — nothing to resolve. 
+ }, + async ({ searchParams, authentication }) => { + const presenter = new ApiErrorListPresenter(); + const result = await presenter.call( + authentication.environment.project, + authentication.environment, + searchParams + ); + + return json(result); + } +); diff --git a/apps/webapp/app/routes/api.v1.projects.$projectRef.$env.jwt.ts b/apps/webapp/app/routes/api.v1.projects.$projectRef.$env.jwt.ts index 1214c80f24a..45dca11fbae 100644 --- a/apps/webapp/app/routes/api.v1.projects.$projectRef.$env.jwt.ts +++ b/apps/webapp/app/routes/api.v1.projects.$projectRef.$env.jwt.ts @@ -117,11 +117,22 @@ export async function action({ request, params }: ActionFunctionArgs) { : uatCap : requestedScopes; + // Attribution: stamp the acting user on the minted env JWT. A UAT carries + // its user as `userActorId`; a PAT exchange resolves the user from the + // authentication result. Either way downstream handlers read `act.sub` + // (e.g. the errors API records who resolved/ignored an error). An org + // access token has no user, so `act` is omitted. + const actorUserId = + userActorId ?? + (authenticationResult.type === "personalAccessToken" + ? authenticationResult.result.userId + : undefined); + const claims = { sub: runtimeEnv.id, pub: true, ...(scopes ? { scopes } : {}), - ...(userActorId ? { act: { sub: userActorId } } : {}), + ...(actorUserId ? { act: { sub: actorUserId } } : {}), }; const jwt = await internal_generateJWT({ diff --git a/apps/webapp/app/services/apiAuth.server.ts b/apps/webapp/app/services/apiAuth.server.ts index 915311c07c1..c19c5d4c516 100644 --- a/apps/webapp/app/services/apiAuth.server.ts +++ b/apps/webapp/app/services/apiAuth.server.ts @@ -56,6 +56,13 @@ export type ApiAuthenticationResultSuccess = { realtime?: { skipColumns?: string[]; }; + // Present when the request used a public JWT minted from a PAT/UAT exchange + // that stamped an `act` delegation claim. `actor.sub` is the acting user id, + // used for attribution (e.g. who resolved an error). 
Absent for plain env + // API keys (no user) and JWTs minted without delegation. + actor?: { + sub: string; + }; }; export type ApiAuthenticationResultFailure = { diff --git a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts index bee8fdc5a71..593c916f7c9 100644 --- a/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts +++ b/apps/webapp/app/services/routeBuilders/apiBuilder.server.ts @@ -78,6 +78,9 @@ async function authenticateRequestForApiBuilder( environment: result.environment, realtime: result.jwt?.realtime, oneTimeUse: result.jwt?.oneTimeUse, + // Surface the delegation actor (PAT/UAT-exchanged JWT) so handlers can + // attribute writes to the acting user. + actor: result.jwt?.act, }; return { ok: true, authentication, ability: result.ability }; diff --git a/apps/webapp/app/v3/services/errorGroupActions.server.ts b/apps/webapp/app/v3/services/errorGroupActions.server.ts index c026efe2aba..bc6a23b1c11 100644 --- a/apps/webapp/app/v3/services/errorGroupActions.server.ts +++ b/apps/webapp/app/v3/services/errorGroupActions.server.ts @@ -14,7 +14,10 @@ export class ErrorGroupActions { async resolveError( identifier: ErrorGroupIdentifier, params: { - userId: string; + // Nullable: a resolve via an env API key has no acting user, so + // `resolvedBy` stays null. The dashboard always passes a userId; the + // API passes the `act.sub` user from a PAT/UAT-exchanged JWT, else null. + userId?: string | null; resolvedInVersion?: string; } ) { @@ -34,7 +37,7 @@ export class ErrorGroupActions { status: "RESOLVED", resolvedAt: now, resolvedInVersion: params.resolvedInVersion ?? null, - resolvedBy: params.userId, + resolvedBy: params.userId ?? null, ignoredUntil: null, ignoredUntilOccurrenceRate: null, ignoredUntilTotalOccurrences: null, @@ -52,7 +55,7 @@ export class ErrorGroupActions { status: "RESOLVED", resolvedAt: now, resolvedInVersion: params.resolvedInVersion ?? 
null, - resolvedBy: params.userId, + resolvedBy: params.userId ?? null, }, }); } @@ -60,7 +63,7 @@ export class ErrorGroupActions { async ignoreError( identifier: ErrorGroupIdentifier, params: { - userId: string; + userId?: string | null; duration?: number; occurrenceRateThreshold?: number; totalOccurrencesThreshold?: number; @@ -87,7 +90,7 @@ export class ErrorGroupActions { ignoredUntilTotalOccurrences: params.totalOccurrencesThreshold ?? null, ignoredAtOccurrenceCount: params.occurrenceCountAtIgnoreTime ?? null, ignoredReason: params.reason ?? null, - ignoredByUserId: params.userId, + ignoredByUserId: params.userId ?? null, resolvedAt: null, resolvedInVersion: null, resolvedBy: null, diff --git a/docs/docs.json b/docs/docs.json index 77269340944..533880015a7 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -379,6 +379,16 @@ "management/runs/retrieve-result" ] }, + { + "group": "Errors API", + "pages": [ + "management/errors/list", + "management/errors/retrieve", + "management/errors/resolve", + "management/errors/ignore", + "management/errors/unresolve" + ] + }, { "group": "Queues API", "pages": [ diff --git a/docs/management/errors/ignore.mdx b/docs/management/errors/ignore.mdx new file mode 100644 index 00000000000..43392e62e0e --- /dev/null +++ b/docs/management/errors/ignore.mdx @@ -0,0 +1,4 @@ +--- +title: "Ignore an error" +openapi: "v3-openapi POST /api/v1/errors/{errorId}/ignore" +--- diff --git a/docs/management/errors/list.mdx b/docs/management/errors/list.mdx new file mode 100644 index 00000000000..49a99af05ee --- /dev/null +++ b/docs/management/errors/list.mdx @@ -0,0 +1,4 @@ +--- +title: "List errors" +openapi: "v3-openapi GET /api/v1/errors" +--- diff --git a/docs/management/errors/resolve.mdx b/docs/management/errors/resolve.mdx new file mode 100644 index 00000000000..15b72cad7ef --- /dev/null +++ b/docs/management/errors/resolve.mdx @@ -0,0 +1,4 @@ +--- +title: "Resolve an error" +openapi: "v3-openapi POST /api/v1/errors/{errorId}/resolve" 
+--- diff --git a/docs/management/errors/retrieve.mdx b/docs/management/errors/retrieve.mdx new file mode 100644 index 00000000000..b3f41ae111d --- /dev/null +++ b/docs/management/errors/retrieve.mdx @@ -0,0 +1,4 @@ +--- +title: "Retrieve an error" +openapi: "v3-openapi GET /api/v1/errors/{errorId}" +--- diff --git a/docs/management/errors/unresolve.mdx b/docs/management/errors/unresolve.mdx new file mode 100644 index 00000000000..1232d864580 --- /dev/null +++ b/docs/management/errors/unresolve.mdx @@ -0,0 +1,4 @@ +--- +title: "Unresolve an error" +openapi: "v3-openapi POST /api/v1/errors/{errorId}/unresolve" +--- diff --git a/docs/v3-openapi.yaml b/docs/v3-openapi.yaml index 56100db3fe6..63700c90dc0 100644 --- a/docs/v3-openapi.yaml +++ b/docs/v3-openapi.yaml @@ -3515,6 +3515,190 @@ paths: await sessions.close(chatId, { reason: "user signed out" }); + "/api/v1/errors": + get: + operationId: list_errors_v1 + summary: List errors + description: | + List error groups in a specific environment. Runs that fail are grouped by a fingerprint derived from the error type, message, and stack trace. Filter by task identifier, version, status, search text, and time range. 
+ parameters: + - $ref: "#/components/parameters/errorsCursorPagination" + - $ref: "#/components/parameters/errorsFilter" + responses: + "200": + description: Successful request + content: + application/json: + schema: + $ref: "#/components/schemas/ListErrorsResult" + "400": + description: Invalid query parameters + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorWithDetailsResponse" + "401": + description: Unauthorized request + tags: + - errors + security: + - secretKey: [] + x-codeSamples: + - lang: curl + label: List errors + source: |- + curl -X GET "https://api.trigger.dev/api/v1/errors?filter[status]=unresolved" \ + -H "Authorization: Bearer $TRIGGER_SECRET_KEY" + "/api/v1/errors/{errorId}": + parameters: + - $ref: "#/components/parameters/errorId" + get: + operationId: retrieve_error_v1 + summary: Retrieve an error + description: Retrieve detailed information about a single error group, including its lifecycle state and the worker versions it has affected. + responses: + "200": + description: Successful request + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorObject" + "401": + description: Unauthorized request + "404": + description: Error not found + tags: + - errors + security: + - secretKey: [] + x-codeSamples: + - lang: curl + label: Retrieve an error + source: |- + curl -X GET "https://api.trigger.dev/api/v1/errors/error_8f3b2a1c9d4e5f60" \ + -H "Authorization: Bearer $TRIGGER_SECRET_KEY" + "/api/v1/errors/{errorId}/resolve": + parameters: + - $ref: "#/components/parameters/errorId" + post: + operationId: resolve_error_v1 + summary: Resolve an error + description: | + Mark an error group as resolved. Optionally record the worker version that resolved it. Send a JSON body (use `{}` when you have no fields to set). 
+ requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + resolvedInVersion: + type: string + description: The worker version the error was resolved in. + example: "20240101.1" + responses: + "200": + description: Successful request + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorObject" + "401": + description: Unauthorized request + "404": + description: Error not found + tags: + - errors + security: + - secretKey: [] + x-codeSamples: + - lang: curl + label: Resolve an error + source: |- + curl -X POST "https://api.trigger.dev/api/v1/errors/error_8f3b2a1c9d4e5f60/resolve" \ + -H "Authorization: Bearer $TRIGGER_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{"resolvedInVersion": "20240101.1"}' + "/api/v1/errors/{errorId}/ignore": + parameters: + - $ref: "#/components/parameters/errorId" + post: + operationId: ignore_error_v1 + summary: Ignore an error + description: | + Mark an error group as ignored. Provide a `duration` to ignore it for a fixed window, and/or thresholds that re-surface the error when exceeded. Send a JSON body (use `{}` to ignore indefinitely). + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + duration: + type: integer + description: How long to ignore the error for, in milliseconds. + example: 86400000 + occurrenceRate: + type: number + description: Re-surface the error if its occurrence rate exceeds this many occurrences per minute. + totalOccurrences: + type: integer + description: Re-surface the error once it accrues this many new occurrences after being ignored. + reason: + type: string + description: An optional human-readable reason for ignoring the error. 
+ responses: + "200": + description: Successful request + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorObject" + "401": + description: Unauthorized request + "404": + description: Error not found + tags: + - errors + security: + - secretKey: [] + x-codeSamples: + - lang: curl + label: Ignore an error + source: |- + curl -X POST "https://api.trigger.dev/api/v1/errors/error_8f3b2a1c9d4e5f60/ignore" \ + -H "Authorization: Bearer $TRIGGER_SECRET_KEY" \ + -H "Content-Type: application/json" \ + -d '{"duration": 86400000, "reason": "Known flaky dependency"}' + "/api/v1/errors/{errorId}/unresolve": + parameters: + - $ref: "#/components/parameters/errorId" + post: + operationId: unresolve_error_v1 + summary: Unresolve an error + description: Move a resolved or ignored error group back to unresolved. + responses: + "200": + description: Successful request + content: + application/json: + schema: + $ref: "#/components/schemas/ErrorObject" + "401": + description: Unauthorized request + "404": + description: Error not found + tags: + - errors + security: + - secretKey: [] + x-codeSamples: + - lang: curl + label: Unresolve an error + source: |- + curl -X POST "https://api.trigger.dev/api/v1/errors/error_8f3b2a1c9d4e5f60/unresolve" \ + -H "Authorization: Bearer $TRIGGER_SECRET_KEY" + components: parameters: taskIdentifier: @@ -3601,6 +3785,51 @@ components: before: type: string description: The ID of the session to start the page before. Sets the pagination direction to backward. + errorId: + in: path + name: errorId + required: true + schema: + type: string + description: The ID of an error group, starts with `error_`. + example: error_8f3b2a1c9d4e5f60 + errorsFilter: + in: query + name: filter + style: deepObject + explode: true + description: | + Filter the error groups. Filter by task identifier, version, status, search text, and time range. + + For array fields, provide multiple values as a comma-separated list. 
For example, to get unresolved and ignored errors, use `filter[status]=unresolved,ignored`. + + For the time range, use `filter[period]` (e.g. `filter[period]=7d`) or `filter[from]` / `filter[to]` with ISO timestamps. + schema: + $ref: "#/components/schemas/ErrorsFilter" + errorsCursorPagination: + in: query + name: page + style: deepObject + explode: true + description: | + Paginate the results. Specify the number of errors per page, and the ID of the error to start the page after or before. + + For object fields like `page`, use the "deepObject" encoding style (bracket notation). For example, to get the next page, use `page[after]=error_1234`. + schema: + type: object + properties: + size: + type: integer + maximum: 100 + minimum: 1 + default: 25 + description: Number of errors per page. Maximum is 100. + after: + type: string + description: The ID of the error to start the page after. Sets the pagination direction to forward. + before: + type: string + description: The ID of the error to start the page before. Sets the pagination direction to backward. runId: in: path name: runId @@ -4075,6 +4304,199 @@ components: items: type: string description: The tags that are attached to the run + error: + type: string + description: An error group ID (starts with `error_`). Lists the runs behind that error group. 
+ example: error_8f3b2a1c9d4e5f60 + ErrorsFilter: + type: object + properties: + taskIdentifier: + type: array + items: + type: string + description: The identifier of the task the error belongs to + version: + type: array + items: + type: string + description: The worker version the error occurred in + status: + type: array + items: + type: string + enum: + - unresolved + - resolved + - ignored + description: The lifecycle status of the error group + search: + type: string + description: Free-text search across the error type and message + from: + type: string + format: date-time + description: The start of the time range to count occurrences within + to: + type: string + format: date-time + description: The end of the time range to count occurrences within + period: + type: string + description: A relative time range to count occurrences within + example: 7d + ErrorListItem: + type: object + required: + - id + - fingerprint + - taskIdentifier + - errorType + - errorMessage + - status + - count + - firstSeen + - lastSeen + properties: + id: + type: string + description: The unique ID of the error group, prefixed with `error_` + example: error_8f3b2a1c9d4e5f60 + fingerprint: + type: string + description: The raw fingerprint the error is grouped by + taskIdentifier: + type: string + description: The identifier of the task the error belongs to + errorType: + type: string + description: The error type or name (e.g. 
`TypeError`) + errorMessage: + type: string + description: The normalized error message + status: + type: string + enum: + - unresolved + - resolved + - ignored + count: + type: integer + description: The number of occurrences within the requested time range + firstSeen: + type: string + format: date-time + description: When the error group was first seen (global) + lastSeen: + type: string + format: date-time + description: When the error group was last seen (global) + resolvedAt: + type: string + format: date-time + nullable: true + ignoredUntil: + type: string + format: date-time + nullable: true + ListErrorsResult: + type: object + properties: + data: + type: array + items: + "$ref": "#/components/schemas/ErrorListItem" + pagination: + type: object + properties: + next: + type: string + description: The error ID to start the next page after. Pass it as `page[after]`. + example: error_8f3b2a1c9d4e5f60 + previous: + type: string + description: The error ID to start the previous page before. Pass it as `page[before]`. + ErrorObject: + type: object + required: + - id + - fingerprint + - taskIdentifier + - errorType + - errorMessage + - status + - count + - firstSeen + - lastSeen + - affectedVersions + properties: + id: + type: string + description: The unique ID of the error group, prefixed with `error_` + example: error_8f3b2a1c9d4e5f60 + fingerprint: + type: string + description: The raw fingerprint the error is grouped by + taskIdentifier: + type: string + description: The identifier of the task the error belongs to + errorType: + type: string + description: The error type or name (e.g. 
`TypeError`) + errorMessage: + type: string + description: The normalized error message + count: + type: integer + description: The total number of occurrences of this error group + firstSeen: + type: string + format: date-time + lastSeen: + type: string + format: date-time + affectedVersions: + type: array + items: + type: string + description: The most recent worker versions the error has occurred in (up to five) + status: + type: string + enum: + - unresolved + - resolved + - ignored + resolvedAt: + type: string + format: date-time + nullable: true + resolvedInVersion: + type: string + nullable: true + resolvedBy: + type: string + nullable: true + description: The ID of the user who resolved the error, when attributable + ignoredAt: + type: string + format: date-time + nullable: true + ignoredUntil: + type: string + format: date-time + nullable: true + ignoredReason: + type: string + nullable: true + ignoredByUserId: + type: string + nullable: true + ignoredUntilOccurrenceRate: + type: number + nullable: true + ignoredUntilTotalOccurrences: + type: integer + nullable: true ListRunsResult: type: object properties: diff --git a/internal-packages/rbac/src/fallback.ts b/internal-packages/rbac/src/fallback.ts index a8b0e719d71..2a135a475dc 100644 --- a/internal-packages/rbac/src/fallback.ts +++ b/internal-packages/rbac/src/fallback.ts @@ -127,6 +127,10 @@ class RoleBaseAccessFallbackController implements RoleBaseAccessController { : []; const realtime = result.payload.realtime as { skipColumns?: string[] } | undefined; const oneTimeUse = result.payload.otu === true; + // A JWT minted from a PAT/UAT exchange stamps `act: { sub: userId }` for + // attribution. Surface it so write handlers can record the acting user. + const act = result.payload.act as { sub?: unknown } | undefined; + const actSub = typeof act?.sub === "string" ? 
act.sub : undefined; return { ok: true, @@ -138,7 +142,7 @@ class RoleBaseAccessFallbackController implements RoleBaseAccessController { projectId: env.projectId, }, ability: buildJwtAbility(scopes), - jwt: { realtime, oneTimeUse }, + jwt: { realtime, oneTimeUse, ...(actSub ? { act: { sub: actSub } } : {}) }, }; } diff --git a/packages/core/src/v3/schemas/errors.ts b/packages/core/src/v3/schemas/errors.ts new file mode 100644 index 00000000000..192da5b6de7 --- /dev/null +++ b/packages/core/src/v3/schemas/errors.ts @@ -0,0 +1,87 @@ +import { z } from "zod"; + +/** + * The lifecycle state of an error group. Mirrors the dashboard's + * `ErrorGroupState.status` (`UNRESOLVED | RESOLVED | IGNORED`) but is exposed + * lowercase over the API, matching the `filter[status]` query value. + */ +export const ErrorGroupStatus = z.enum(["unresolved", "resolved", "ignored"]); + +export type ErrorGroupStatus = z.infer<typeof ErrorGroupStatus>; + +/** + * A single error group as returned by the list endpoint. `count` is the number + * of occurrences within the requested time range; `firstSeen`/`lastSeen` are + * the group's global first/last occurrence. + */ +export const ErrorGroupListItem = z.object({ + id: z.string(), + fingerprint: z.string(), + taskIdentifier: z.string(), + errorType: z.string(), + errorMessage: z.string(), + status: ErrorGroupStatus, + count: z.number(), + firstSeen: z.coerce.date(), + lastSeen: z.coerce.date(), + resolvedAt: z.coerce.date().nullable(), + ignoredUntil: z.coerce.date().nullable(), +}); + +export type ErrorGroupListItem = z.infer<typeof ErrorGroupListItem>; + +export const ListErrorsResponse = z.object({ + data: z.array(ErrorGroupListItem), + pagination: z.object({ + next: z.string().optional(), + previous: z.string().optional(), + }), +}); + +export type ListErrorsResponse = z.infer<typeof ListErrorsResponse>; + +/** + * The full detail for a single error group: summary fields, the affected task + * versions (most recent five), and the complete lifecycle state. 
+ */ +export const ErrorGroupDetail = z.object({ + id: z.string(), + fingerprint: z.string(), + taskIdentifier: z.string(), + errorType: z.string(), + errorMessage: z.string(), + count: z.number(), + firstSeen: z.coerce.date(), + lastSeen: z.coerce.date(), + affectedVersions: z.array(z.string()), + status: ErrorGroupStatus, + resolvedAt: z.coerce.date().nullable(), + resolvedInVersion: z.string().nullable(), + resolvedBy: z.string().nullable(), + ignoredAt: z.coerce.date().nullable(), + ignoredUntil: z.coerce.date().nullable(), + ignoredReason: z.string().nullable(), + ignoredByUserId: z.string().nullable(), + ignoredUntilOccurrenceRate: z.number().nullable(), + ignoredUntilTotalOccurrences: z.number().nullable(), +}); + +export type ErrorGroupDetail = z.infer<typeof ErrorGroupDetail>; + +export const ResolveErrorRequestBody = z.object({ + resolvedInVersion: z.string().optional(), +}); + +export type ResolveErrorRequestBody = z.infer<typeof ResolveErrorRequestBody>; + +export const IgnoreErrorRequestBody = z.object({ + /** How long to ignore the error for, in milliseconds. */ + duration: z.number().int().positive().optional(), + /** Re-surface the error if its occurrence rate exceeds this many per minute. */ + occurrenceRate: z.number().positive().optional(), + /** Re-surface the error once it accrues this many new occurrences. 
*/ + totalOccurrences: z.number().int().positive().optional(), + reason: z.string().max(1000).optional(), +}); + +export type IgnoreErrorRequestBody = z.infer<typeof IgnoreErrorRequestBody>; diff --git a/packages/core/src/v3/schemas/index.ts b/packages/core/src/v3/schemas/index.ts index 11857d6197f..8587ed04e04 100644 --- a/packages/core/src/v3/schemas/index.ts +++ b/packages/core/src/v3/schemas/index.ts @@ -16,3 +16,4 @@ export * from "./checkpoints.js"; export * from "./warmStart.js"; export * from "./queues.js"; export * from "./query.js"; +export * from "./errors.js"; diff --git a/packages/plugins/src/rbac.ts b/packages/plugins/src/rbac.ts index 002da1bb63d..ca9a1a0494f 100644 --- a/packages/plugins/src/rbac.ts +++ b/packages/plugins/src/rbac.ts @@ -225,7 +225,10 @@ export type BearerAuthResult = environment: RbacEnv; subject: RbacSubject; ability: RbacAbility; - jwt?: { realtime?: { skipColumns?: string[] }; oneTimeUse?: boolean }; + // `act` carries the acting user (`act.sub`) when the public JWT was + // minted from a PAT/UAT exchange that stamped a delegation claim. Hosts + // surface it for attribution (e.g. who resolved an error). + jwt?: { realtime?: { skipColumns?: string[] }; oneTimeUse?: boolean; act?: { sub: string } }; }; export type SessionAuthResult =