From 62898a93ba94dea9270d7269103ad80f8bcd1a09 Mon Sep 17 00:00:00 2001 From: espadonne Date: Mon, 18 May 2026 00:39:12 -0400 Subject: [PATCH] billing: cap actions minutes by job timeout --- docs/internal/billing.md | 4 ++- docs/internal/runbooks/actions.md | 5 ++++ docs/internal/runbooks/stripe-billing.md | 9 ++++++- .../actions/queries/workflow_insights.sql | 25 +++++++++++++---- .../actions/sqlc/workflow_insights.sql.go | 25 +++++++++++++---- internal/actions/trigger/enqueue_test.go | 3 +++ internal/billing/billing_test.go | 27 ++++++++++++++++++- internal/billing/queries/billing.sql | 5 +++- internal/billing/sqlc/billing.sql.go | 5 +++- 9 files changed, 93 insertions(+), 15 deletions(-) diff --git a/docs/internal/billing.md b/docs/internal/billing.md index 454ca7e8..ff0037d6 100644 --- a/docs/internal/billing.md +++ b/docs/internal/billing.md @@ -351,7 +351,9 @@ PAYMENTS SP08 starts hosted-cost metering: durable. - Actions minutes are counted from completed or canceled workflow job runtime, rounded up to the next whole minute, within the current - monthly usage period. + monthly usage period. Metered runtime is capped at the job's declared + `timeout_minutes` so stale terminal rows cannot bill past the maximum + execution time the runner was allowed to consume. - `org_usage_counters` stores the current projection, `org_usage_snapshots` records audit snapshots, and `org_quota_overrides` lets site admins temporarily override a quota diff --git a/docs/internal/runbooks/actions.md b/docs/internal/runbooks/actions.md index 62d2d005..9ce6f35a 100644 --- a/docs/internal/runbooks/actions.md +++ b/docs/internal/runbooks/actions.md @@ -259,6 +259,11 @@ active container, and reports terminal `cancelled`. with matching labels and capacity. Unsupported hosted labels such as `windows-latest` and `macos-latest` intentionally remain queued until an operator registers matching runners. +- **Actions usage looks too high:** compare raw job wall-clock runtime with + timeout-capped runtime. Billing and usage metrics cap completed/cancelled job + runtime at `workflow_jobs.timeout_minutes`; a larger raw + `completed_at - started_at` gap usually means a stale running row was + cancelled later than the container actually stopped. - **Step logs buffer:** verify the Caddy route above and confirm the SSE route is still mounted outside compression and short timeouts. - **`actions/checkout@v4` fails:** confirm the job is still running, the repo diff --git a/docs/internal/runbooks/stripe-billing.md b/docs/internal/runbooks/stripe-billing.md index 6e79a32b..2f15b1cb 100644 --- a/docs/internal/runbooks/stripe-billing.md +++ b/docs/internal/runbooks/stripe-billing.md @@ -382,7 +382,14 @@ Use this when `BillingQuotaOverage` fires. 1. Open the org billing settings page as a site admin. The Usage panel shows storage and Actions minutes against the effective limits. -2. If counters look stale, run `org:usage_recalc` for that org. +2. If counters look stale, run `org:usage_recalc` for that org: + + ```sh + shithubd admin run-job org:usage_recalc '{"org_id":1,"source":"support-recalc"}' + ``` + + Replace `org_id` with the affected organization id and use a source label + that names the incident or support case. 3. If the overage is legitimate, ask the org owner to upgrade or reduce usage. For support incidents, add a temporary quota override in the site-admin debug panel; it is attributed to the operator. diff --git a/internal/actions/queries/workflow_insights.sql b/internal/actions/queries/workflow_insights.sql index c0e41ff8..2efb39f0 100644 --- a/internal/actions/queries/workflow_insights.sql +++ b/internal/actions/queries/workflow_insights.sql @@ -4,7 +4,10 @@ SELECT COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::bigint AS completed_job_seconds FROM workflow_runs run @@ -18,7 +21,10 @@ SELECT COALESCE(NULLIF(run.workflow_name, ''), run.workflow_file)::text AS workflow_name, COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::bigint AS completed_job_seconds FROM workflow_runs run @@ -31,7 +37,10 @@ LIMIT $3; -- name: GetActionsPerformanceSummaryForRepo :one SELECT - COALESCE(AVG(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(AVG(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::double precision AS avg_job_seconds, COALESCE(AVG(EXTRACT(EPOCH FROM (job.started_at - job.created_at))) FILTER ( @@ -45,7 +54,10 @@ SELECT AND job.conclusion IS NOT NULL AND job.conclusion <> 'success' )::bigint AS failed_job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL AND job.conclusion IS NOT NULL @@ -62,7 +74,10 @@ SELECT COALESCE(NULLIF(run.workflow_name, ''), run.workflow_file)::text AS workflow_name, COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(AVG(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(AVG(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::double precision AS avg_job_seconds, COALESCE(AVG(EXTRACT(EPOCH FROM (job.started_at - job.created_at))) FILTER ( diff --git a/internal/actions/sqlc/workflow_insights.sql.go b/internal/actions/sqlc/workflow_insights.sql.go index 911803a5..db096d3e 100644 --- a/internal/actions/sqlc/workflow_insights.sql.go +++ b/internal/actions/sqlc/workflow_insights.sql.go @@ -13,7 +13,10 @@ import ( const getActionsPerformanceSummaryForRepo = `-- name: GetActionsPerformanceSummaryForRepo :one SELECT - COALESCE(AVG(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(AVG(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::double precision AS avg_job_seconds, COALESCE(AVG(EXTRACT(EPOCH FROM (job.started_at - job.created_at))) FILTER ( @@ -27,7 +30,10 @@ SELECT AND job.conclusion IS NOT NULL AND job.conclusion <> 'success' )::bigint AS failed_job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL AND job.conclusion IS NOT NULL @@ -70,7 +76,10 @@ const getActionsUsageSummaryForRepo = `-- name: GetActionsUsageSummaryForRepo :o SELECT COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::bigint AS completed_job_seconds FROM workflow_runs run @@ -104,7 +113,10 @@ SELECT COALESCE(NULLIF(run.workflow_name, ''), run.workflow_file)::text AS workflow_name, COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(AVG(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(AVG(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::double precision AS avg_job_seconds, COALESCE(AVG(EXTRACT(EPOCH FROM (job.started_at - job.created_at))) FILTER ( @@ -179,7 +191,10 @@ SELECT COALESCE(NULLIF(run.workflow_name, ''), run.workflow_file)::text AS workflow_name, COUNT(DISTINCT run.id)::bigint AS run_count, COUNT(job.id)::bigint AS job_count, - COALESCE(SUM(EXTRACT(EPOCH FROM (job.completed_at - job.started_at))) FILTER ( + COALESCE(SUM(LEAST( + GREATEST(EXTRACT(EPOCH FROM (job.completed_at - job.started_at)), 0), + job.timeout_minutes::double precision * 60.0 + )) FILTER ( WHERE job.started_at IS NOT NULL AND job.completed_at IS NOT NULL ), 0)::bigint AS completed_job_seconds FROM workflow_runs run diff --git a/internal/actions/trigger/enqueue_test.go b/internal/actions/trigger/enqueue_test.go index 38908bda..d3c6c7b4 100644 --- a/internal/actions/trigger/enqueue_test.go +++ b/internal/actions/trigger/enqueue_test.go @@ -388,6 +388,9 @@ func seedCompletedActionsMinutes(t *testing.T, f enqFx, completedAt time.Time, m t.Fatalf("claimed seed job id=%d, want %d", claimed.ID, jobs[0].ID) } startedAt := completedAt.Add(-time.Duration(minutes) * time.Minute) + if _, err := f.pool.Exec(ctx, `UPDATE workflow_jobs SET timeout_minutes = $2 WHERE id = $1`, jobs[0].ID, minutes); err != nil { + t.Fatalf("set seed timeout_minutes: %v", err) + } if _, err := q.UpdateWorkflowJobStatus(ctx, f.pool, actionsdb.UpdateWorkflowJobStatusParams{ ID: jobs[0].ID, Status: actionsdb.WorkflowJobStatusCompleted, diff --git a/internal/billing/billing_test.go b/internal/billing/billing_test.go index 17847b63..c24696ae 100644 --- a/internal/billing/billing_test.go +++ b/internal/billing/billing_test.go @@ -544,6 +544,31 @@ func TestOrgUsageCountersAndQuotaOverrides(t *testing.T) { `, org.ID, start.Add(12*time.Hour)); err != nil { t.Fatalf("insert workflow usage: %v", err) } + if _, err := pool.Exec(ctx, ` + WITH repo AS ( + SELECT id FROM repos WHERE owner_org_id = $1 AND name = 'metered-repo' + ), runner AS ( + SELECT id FROM workflow_runners WHERE name = 'metered-runner' + ), run AS ( + INSERT INTO workflow_runs ( + repo_id, run_index, workflow_file, head_sha, event, + status, conclusion, started_at, completed_at + ) + SELECT repo.id, 2, '.shithub/workflows/stale.yml', 'abcdef2', 'push', + 'cancelled', 'cancelled', $2::timestamptz, $2::timestamptz + interval '3 days' + FROM repo + RETURNING id + ) + INSERT INTO workflow_jobs ( + run_id, job_index, job_key, runner_id, status, conclusion, + timeout_minutes, started_at, completed_at + ) + SELECT run.id, 0, 'stale', runner.id, 'cancelled', 'cancelled', + 360, $2::timestamptz, $2::timestamptz + interval '3 days' + FROM run, runner + `, org.ID, start.Add(13*time.Hour)); err != nil { + t.Fatalf("insert stale workflow usage: %v", err) + } recalc, err := billing.RecalculateOrgUsageCounters(ctx, deps, org.ID, start, end) if err != nil { t.Fatalf("RecalculateOrgUsageCounters: %v", err) @@ -552,7 +577,7 @@ func TestOrgUsageCountersAndQuotaOverrides(t *testing.T) { recalc.ActionsLogBytes != 1234 || recalc.ActionsArtifactBytes != 3456 || recalc.ObjectStorageBytes != 4690 || - recalc.ActionsMinutesUsed != 6 { + recalc.ActionsMinutesUsed != 366 { t.Fatalf("unexpected recalculated usage: %+v", recalc) } } diff --git a/internal/billing/queries/billing.sql b/internal/billing/queries/billing.sql index d579e13a..0e0b45a5 100644 --- a/internal/billing/queries/billing.sql +++ b/internal/billing/queries/billing.sql @@ -931,7 +931,10 @@ actions_minutes AS ( AND j.completed_at IS NOT NULL AND j.completed_at >= sqlc.arg(actions_period_start)::timestamptz AND j.completed_at < sqlc.arg(actions_period_end)::timestamptz - THEN CEIL(EXTRACT(EPOCH FROM (j.completed_at - j.started_at)) / 60.0)::bigint + THEN LEAST( + CEIL(GREATEST(EXTRACT(EPOCH FROM (j.completed_at - j.started_at)), 0) / 60.0)::bigint, + j.timeout_minutes::bigint + ) ELSE 0 END ), 0)::bigint AS actions_minutes_used diff --git a/internal/billing/sqlc/billing.sql.go b/internal/billing/sqlc/billing.sql.go index aec31a4c..71d71d48 100644 --- a/internal/billing/sqlc/billing.sql.go +++ b/internal/billing/sqlc/billing.sql.go @@ -2146,7 +2146,10 @@ actions_minutes AS ( AND j.completed_at IS NOT NULL AND j.completed_at >= $2::timestamptz AND j.completed_at < $3::timestamptz - THEN CEIL(EXTRACT(EPOCH FROM (j.completed_at - j.started_at)) / 60.0)::bigint + THEN LEAST( + CEIL(GREATEST(EXTRACT(EPOCH FROM (j.completed_at - j.started_at)), 0) / 60.0)::bigint, + j.timeout_minutes::bigint + ) ELSE 0 END ), 0)::bigint AS actions_minutes_used