From d3326c39060162c953811971d6bd5a57c12f1da7 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 16:11:55 -0400 Subject: [PATCH 1/7] fix(auth): return 401 for anonymous callers on protected endpoints An anonymous request (no credentials, or a session cookie that expired in the browser and is no longer sent) to a protected endpoint now returns 401 auth.required instead of 403. The SPA redirects to login on a 401, so an expired session surfaces as a clean re-login prompt rather than a dead-end 'failed to load'. An authenticated caller whose role lacks the permission still gets 403 authz.permission_denied; the audit event is unchanged for both. --- CHANGELOG.md | 7 +++++++ api/error_codes.yaml | 11 +++++++++++ internal/auth/middleware.go | 26 +++++++++++++++++++++----- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8750821a..beca0a2ba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,13 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Changed +- Auth: an anonymous request to a protected endpoint (no credentials, or a + session cookie that expired in the browser and is no longer sent) now returns + **401 `auth.required`** instead of 403. The SPA redirects to login on a 401, + so an expired session surfaces as a clean re-login prompt rather than a + dead-end "failed to load." An authenticated caller whose role lacks the + permission still gets 403 `authz.permission_denied`. + - CI release safety: the release workflow now fails closed on a `v*` tag push when no GPG signing key is configured, rather than publishing unsigned packages. Manual `workflow_dispatch` trial builds stay permissive (warn + diff --git a/api/error_codes.yaml b/api/error_codes.yaml index 8e6d79059..33f164b2b 100644 --- a/api/error_codes.yaml +++ b/api/error_codes.yaml @@ -69,6 +69,17 @@ errors: # ========================================================================= # auth - authentication # ========================================================================= + - code: auth.required + http_status: 401 + fault: client + retryable: false + description: > + The request reached a protected endpoint without a usable session (no + credentials presented, or the session cookie expired in the browser and + was not sent). The caller must sign in. Distinct from + authz.permission_denied (403), which is an authenticated caller whose role + lacks the permission. The SPA redirects to login on this 401. + - code: auth.invalid_credentials http_status: 401 fault: client diff --git a/internal/auth/middleware.go b/internal/auth/middleware.go index 1936aec59..0ac8d693d 100644 --- a/internal/auth/middleware.go +++ b/internal/auth/middleware.go @@ -60,15 +60,28 @@ func RequirePermission(p Permission) func(http.Handler) http.Handler { } } -// denyPermission writes the canonical 403 envelope and emits the +// denyPermission writes the RBAC denial envelope and emits the // authz.permission_denied audit event with detail.required_permission // set to the permission id. Per system-rbac AC-09 + AC-11 + C-04. +// +// The HTTP status distinguishes the two denial classes so the SPA can react +// correctly: an ANONYMOUS caller (no or expired credentials — the request +// arrived without a usable session) gets 401 auth.required so the client +// redirects to login; an AUTHENTICATED caller whose role lacks the permission +// gets 403 authz.permission_denied. The audit record is identical either way — +// a denial is a denial. func denyPermission(w http.ResponseWriter, r *http.Request, p Permission, id Identity) { + status, code, fault, msg := http.StatusForbidden, "authz.permission_denied", "policy", + "this operation requires a permission your role does not grant" + if id.IsAnonymous { + status, code, fault, msg = http.StatusUnauthorized, "auth.required", "client", + "authentication required; sign in to continue" + } errBody := map[string]any{ - "code": "authz.permission_denied", - "fault": "policy", + "code": code, + "fault": fault, "retryable": false, - "human_message": "this operation requires a permission your role does not grant", + "human_message": msg, "detail": map[string]any{ "required_permission": string(p), }, @@ -78,8 +91,11 @@ func denyPermission(w http.ResponseWriter, r *http.Request, p Permission, id Ide } envelope := map[string]any{"error": errBody} body, _ := json.Marshal(envelope) + if status == http.StatusUnauthorized { + w.Header().Set("WWW-Authenticate", "Bearer") + } w.Header().Set("Content-Type", "application/json") - w.WriteHeader(http.StatusForbidden) + w.WriteHeader(status) _, _ = w.Write(body) actorID := id.ID From 1b2b56a75490ee150c13319fd0c923ac8d5b7f6b Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 16:11:55 -0400 Subject: [PATCH 2/7] test+spec: update anonymous-denial contract to 401 across specs/tests The 12 specs/tests that strictly asserted anonymous -> 403 now assert 401 auth.required (alerts, audit-events-query, fleet-observability, host-system-info, os-intelligence, system-rbac AC-09/AC-15, system/fleet connectivity, discovery/ intelligence config). Authenticated-but-unauthorized -> 403 language preserved. Specs that already said '401/403' are unchanged. --- internal/server/api_alerts_test.go | 4 +-- internal/server/api_audit_query_test.go | 4 +-- internal/server/api_fleet_test.go | 8 +++--- internal/server/api_host_system_info_test.go | 4 +-- internal/server/api_intelligence_test.go | 4 +-- internal/server/api_rbac_test.go | 28 +++++++++++++++---- .../server/api_system_connectivity_test.go | 12 ++++---- .../api_system_discovery_config_test.go | 4 +-- .../api_system_intelligence_config_test.go | 4 +-- specs/api/alerts.spec.yaml | 2 +- specs/api/audit-events-query.spec.yaml | 4 +-- .../fleet-connectivity-breakdown.spec.yaml | 4 +-- specs/api/fleet-observability.spec.yaml | 2 +- specs/api/host-system-info.spec.yaml | 2 +- specs/api/os-intelligence.spec.yaml | 2 +- specs/api/system-connectivity.spec.yaml | 6 ++-- specs/api/system-discovery-config.spec.yaml | 2 +- .../api/system-intelligence-config.spec.yaml | 2 +- specs/system/rbac.spec.yaml | 2 +- 19 files changed, 58 insertions(+), 42 deletions(-) diff --git a/internal/server/api_alerts_test.go b/internal/server/api_alerts_test.go index bc12b6af1..d63e5026f 100644 --- a/internal/server/api_alerts_test.go +++ b/internal/server/api_alerts_test.go @@ -89,8 +89,8 @@ func TestAPI_Alerts_List_Anonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Errorf("status=%d, want 403", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Errorf("status=%d, want 401 auth.required (anonymous)", resp.StatusCode) } }) } diff --git a/internal/server/api_audit_query_test.go b/internal/server/api_audit_query_test.go index eb38fded2..5ad03672d 100644 --- a/internal/server/api_audit_query_test.go +++ b/internal/server/api_audit_query_test.go @@ -354,8 +354,8 @@ func TestAPI_AuditEvents_RequiresAuditRead(t *testing.T) { // Anonymous → 403, and no event body leaks. anon := doReq(t, asRole(t, "GET", url+"/api/v1/audit/events", "", nil)) defer anon.Body.Close() - if anon.StatusCode != http.StatusForbidden { - t.Fatalf("anonymous GET /audit/events = %d, want 403", anon.StatusCode) + if anon.StatusCode != http.StatusUnauthorized { + t.Fatalf("anonymous GET /audit/events = %d, want 401", anon.StatusCode) } body, _ := io.ReadAll(anon.Body) if strings.Contains(string(body), "\"items\"") { diff --git a/internal/server/api_fleet_test.go b/internal/server/api_fleet_test.go index 418e78989..55b211c93 100644 --- a/internal/server/api_fleet_test.go +++ b/internal/server/api_fleet_test.go @@ -470,13 +470,13 @@ func TestAPI_Fleet_Anonymous_Returns403(t *testing.T) { req, _ := http.NewRequest("GET", url+"/api/v1/fleet/score", nil) resp := doReq(t, req) defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { + if resp.StatusCode != http.StatusUnauthorized { b, _ := io.ReadAll(resp.Body) - t.Fatalf("status = %d, want 403; body=%s", resp.StatusCode, b) + t.Fatalf("status = %d, want 401 (anonymous); body=%s", resp.StatusCode, b) } b, _ := io.ReadAll(resp.Body) - if !strings.Contains(string(b), "authz.permission_denied") { - t.Errorf("body lacks authz.permission_denied: %s", b) + if !strings.Contains(string(b), "auth.required") { + t.Errorf("body lacks auth.required: %s", b) } }) } diff --git a/internal/server/api_host_system_info_test.go b/internal/server/api_host_system_info_test.go index 8959f8e56..aa784469f 100644 --- a/internal/server/api_host_system_info_test.go +++ b/internal/server/api_host_system_info_test.go @@ -170,8 +170,8 @@ func TestAPI_HostSystemInfo_GET_Anonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } diff --git a/internal/server/api_intelligence_test.go b/internal/server/api_intelligence_test.go index a967f4ea1..f54f211af 100644 --- a/internal/server/api_intelligence_test.go +++ b/internal/server/api_intelligence_test.go @@ -115,8 +115,8 @@ func TestAPI_Intelligence_Events_Anonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Errorf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Errorf("expected 401 anon, got %d", resp.StatusCode) } }) } diff --git a/internal/server/api_rbac_test.go b/internal/server/api_rbac_test.go index edbdb1311..b6fa36d90 100644 --- a/internal/server/api_rbac_test.go +++ b/internal/server/api_rbac_test.go @@ -53,13 +53,29 @@ func TestAPI_RBAC_DeniesWithoutPermission(t *testing.T) { // No session cookie → anonymous. resp := doReq(t, req) defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { + if resp.StatusCode != http.StatusUnauthorized { b, _ := io.ReadAll(resp.Body) - t.Fatalf("status = %d, want 403; body=%s", resp.StatusCode, b) + t.Fatalf("anonymous status = %d, want 401; body=%s", resp.StatusCode, b) } b, _ := io.ReadAll(resp.Body) - if !strings.Contains(string(b), "authz.permission_denied") { - t.Errorf("body lacks authz.permission_denied: %s", b) + if !strings.Contains(string(b), "auth.required") { + t.Errorf("anonymous body lacks auth.required: %s", b) + } + + // Authenticated caller whose role lacks the permission still gets 403 + // authz.permission_denied (a viewer holds host:read but not host:write). + vreq := asRole(t, "POST", url+"/api/v1/diagnostics:require-host-write", auth.RoleViewer, + map[string]any{"message": "rbac-deny-authed"}) + vreq.Header.Set("Idempotency-Key", "rbac-deny-authed-key") + vresp := doReq(t, vreq) + defer vresp.Body.Close() + if vresp.StatusCode != http.StatusForbidden { + vb, _ := io.ReadAll(vresp.Body) + t.Fatalf("authenticated viewer status = %d, want 403; body=%s", vresp.StatusCode, vb) + } + vb, _ := io.ReadAll(vresp.Body) + if !strings.Contains(string(vb), "authz.permission_denied") { + t.Errorf("authenticated-denial body lacks authz.permission_denied: %s", vb) } }) } @@ -297,8 +313,8 @@ func TestAPI_RBAC_AdminRolesDeniesAnonymous(t *testing.T) { url, _ := freshAPIServer(t) resp := doGet(t, url+"/api/v1/roles") defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Errorf("status = %d, want 403 (no role)", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Errorf("status = %d, want 401 (no role / anonymous)", resp.StatusCode) } }) } diff --git a/internal/server/api_system_connectivity_test.go b/internal/server/api_system_connectivity_test.go index 05a62f596..1a845e7e7 100644 --- a/internal/server/api_system_connectivity_test.go +++ b/internal/server/api_system_connectivity_test.go @@ -192,8 +192,8 @@ func TestAPI_SystemConnectivity_Config_PUT_Anonymous_Forbidden(t *testing.T) { t.Fatalf("PUT: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } @@ -239,8 +239,8 @@ func TestAPI_SystemConnectivity_Status_Anonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } @@ -387,8 +387,8 @@ func TestAPI_FleetConnectivity_Breakdown_Anonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } diff --git a/internal/server/api_system_discovery_config_test.go b/internal/server/api_system_discovery_config_test.go index fcb794f37..09dec4965 100644 --- a/internal/server/api_system_discovery_config_test.go +++ b/internal/server/api_system_discovery_config_test.go @@ -89,8 +89,8 @@ func TestAPI_SystemDiscoveryConfig_GET_AsAnonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } diff --git a/internal/server/api_system_intelligence_config_test.go b/internal/server/api_system_intelligence_config_test.go index 7d6d8b1e7..0b142e1a2 100644 --- a/internal/server/api_system_intelligence_config_test.go +++ b/internal/server/api_system_intelligence_config_test.go @@ -89,8 +89,8 @@ func TestAPI_SystemIntelligenceConfig_GET_AsAnonymous_Forbidden(t *testing.T) { t.Fatalf("GET: %v", err) } defer resp.Body.Close() - if resp.StatusCode != http.StatusForbidden { - t.Fatalf("expected 403 anon, got %d", resp.StatusCode) + if resp.StatusCode != http.StatusUnauthorized { + t.Fatalf("expected 401 anon, got %d", resp.StatusCode) } }) } diff --git a/specs/api/alerts.spec.yaml b/specs/api/alerts.spec.yaml index fabf94948..858ffc4cb 100644 --- a/specs/api/alerts.spec.yaml +++ b/specs/api/alerts.spec.yaml @@ -79,7 +79,7 @@ spec: priority: critical references_constraints: [C-01] - id: AC-02 - description: 'GET /api/v1/alerts without an authenticated session returns 403' + description: 'GET /api/v1/alerts as an anonymous caller (no/expired session) returns 401 auth.required' priority: critical references_constraints: [C-01] - id: AC-03 diff --git a/specs/api/audit-events-query.spec.yaml b/specs/api/audit-events-query.spec.yaml index e86f6b00e..6276f3fd4 100644 --- a/specs/api/audit-events-query.spec.yaml +++ b/specs/api/audit-events-query.spec.yaml @@ -48,7 +48,7 @@ spec: type: technical enforcement: error - id: C-06 - description: GET /audit/events MUST require the audit:read permission. The audit trail is security-sensitive (actor ids, IPs, resource ids, action detail); an anonymous or unauthorized caller MUST receive 403 and MUST NOT receive any events. Enforcement is via auth.EnforcePermission(AuditRead) as the first statement of the handler, matching every sibling data handler. + description: GET /audit/events MUST require the audit:read permission. The audit trail is security-sensitive (actor ids, IPs, resource ids, action detail); an anonymous caller MUST receive 401 auth.required and an authenticated-but-unauthorized caller 403 and MUST NOT receive any events. Enforcement is via auth.EnforcePermission(AuditRead) as the first statement of the handler, matching every sibling data handler. type: security enforcement: error @@ -87,6 +87,6 @@ spec: description: Stored detail with redacted fields shows "" placeholders; redactions lists scrubbed names. priority: critical - id: AC-11 - description: GET /audit/events with no session/credential (anonymous) returns 403 and no event body — the deny path runs before any query. A caller holding audit:read returns 200. (A role lacking audit:read is denied by the same auth.EnforcePermission mechanism, covered generically by system-rbac.) + description: GET /audit/events with no session/credential (anonymous) returns 401 auth.required and no event body — the deny path runs before any query. A caller holding audit:read returns 200. (A role lacking audit:read is denied by the same auth.EnforcePermission mechanism, covered generically by system-rbac.) priority: critical references_constraints: [C-06] diff --git a/specs/api/fleet-connectivity-breakdown.spec.yaml b/specs/api/fleet-connectivity-breakdown.spec.yaml index dc94669bd..1e651e385 100644 --- a/specs/api/fleet-connectivity-breakdown.spec.yaml +++ b/specs/api/fleet-connectivity-breakdown.spec.yaml @@ -55,7 +55,7 @@ spec: type: technical enforcement: error - id: C-03 - description: The endpoint requires system:read; an anonymous caller is rejected. + description: The endpoint requires system:read; an anonymous caller is rejected with 401 auth.required. type: security enforcement: error @@ -73,6 +73,6 @@ spec: priority: high references_constraints: [C-01] - id: AC-07 - description: GET /api/v1/fleet/connectivity/breakdown by an anonymous caller is rejected (not authenticated). + description: GET /api/v1/fleet/connectivity/breakdown by an anonymous caller is rejected with 401 auth.required. priority: high references_constraints: [C-03] diff --git a/specs/api/fleet-observability.spec.yaml b/specs/api/fleet-observability.spec.yaml index a776ec13e..89a9a2215 100644 --- a/specs/api/fleet-observability.spec.yaml +++ b/specs/api/fleet-observability.spec.yaml @@ -53,7 +53,7 @@ spec: type: technical enforcement: error - id: C-02 - description: Every endpoint MUST require the system:read permission via auth.EnforcePermission. Both anonymous callers and authenticated callers without system:read return 403 authz.permission_denied (matches the established RBAC pattern — see system-rbac AC-09) + description: Every endpoint MUST require the system:read permission via auth.EnforcePermission. Anonymous callers return 401 auth.required; authenticated callers without system:read return 403 authz.permission_denied (matches the established RBAC pattern — see system-rbac AC-09) type: security enforcement: error - id: C-03 diff --git a/specs/api/host-system-info.spec.yaml b/specs/api/host-system-info.spec.yaml index 44a725e97..c3fa7d036 100644 --- a/specs/api/host-system-info.spec.yaml +++ b/specs/api/host-system-info.spec.yaml @@ -84,7 +84,7 @@ spec: priority: critical references_constraints: [C-03] - id: AC-04 - description: 'GET /api/v1/hosts/{id}/system-info as anonymous (no session) returns 403 authz.permission_denied. host:read failure surfaces the same envelope used by GET /hosts/{id}' + description: 'GET /api/v1/hosts/{id}/system-info as an anonymous caller (no/expired session) returns 401 auth.required. host:read failure surfaces the same envelope used by GET /hosts/{id}' priority: critical references_constraints: [C-01] - id: AC-05 diff --git a/specs/api/os-intelligence.spec.yaml b/specs/api/os-intelligence.spec.yaml index c7ede66c8..1c4cd80db 100644 --- a/specs/api/os-intelligence.spec.yaml +++ b/specs/api/os-intelligence.spec.yaml @@ -81,7 +81,7 @@ spec: priority: critical references_constraints: [C-01] - id: AC-02 - description: 'GET /api/v1/intelligence/events without an authenticated session returns 403' + description: 'GET /api/v1/intelligence/events as an anonymous caller (no/expired session) returns 401 auth.required' priority: critical references_constraints: [C-01] - id: AC-03 diff --git a/specs/api/system-connectivity.spec.yaml b/specs/api/system-connectivity.spec.yaml index 9b2907695..9fe310a7d 100644 --- a/specs/api/system-connectivity.spec.yaml +++ b/specs/api/system-connectivity.spec.yaml @@ -64,7 +64,7 @@ spec: type: technical enforcement: error - id: C-04 - description: Reads require system:read and the mutating PUT requires system:write; a caller lacking the permission gets 403, and an anonymous caller is rejected. + description: Reads require system:read and the mutating PUT requires system:write; a caller lacking the permission gets 403, and an anonymous caller is rejected with 401 auth.required. type: security enforcement: error @@ -90,7 +90,7 @@ spec: priority: high references_constraints: [C-04] - id: AC-07 - description: PUT /api/v1/system/connectivity/config by an anonymous caller is rejected (not authenticated). + description: PUT /api/v1/system/connectivity/config by an anonymous caller is rejected with 401 auth.required. priority: high references_constraints: [C-04] - id: AC-08 @@ -98,6 +98,6 @@ spec: priority: high references_constraints: [C-03] - id: AC-09 - description: GET /api/v1/system/connectivity/status by an anonymous caller is rejected (not authenticated). + description: GET /api/v1/system/connectivity/status by an anonymous caller is rejected with 401 auth.required. priority: high references_constraints: [C-04] diff --git a/specs/api/system-discovery-config.spec.yaml b/specs/api/system-discovery-config.spec.yaml index 52837283e..26232c878 100644 --- a/specs/api/system-discovery-config.spec.yaml +++ b/specs/api/system-discovery-config.spec.yaml @@ -102,7 +102,7 @@ spec: priority: critical references_constraints: [C-01] - id: AC-02 - description: 'GET /api/v1/system/discovery/config with a caller missing system:read returns 403 authz.permission_denied' + description: 'GET /api/v1/system/discovery/config as an anonymous caller returns 401 auth.required (an authenticated caller missing system:read returns 403 authz.permission_denied)' priority: critical references_constraints: [C-02] - id: AC-03 diff --git a/specs/api/system-intelligence-config.spec.yaml b/specs/api/system-intelligence-config.spec.yaml index a30d0c7c8..a642b037d 100644 --- a/specs/api/system-intelligence-config.spec.yaml +++ b/specs/api/system-intelligence-config.spec.yaml @@ -85,7 +85,7 @@ spec: priority: critical references_constraints: [C-01] - id: AC-02 - description: 'GET /api/v1/system/intelligence/config with a caller missing system:read returns 403 authz.permission_denied' + description: 'GET /api/v1/system/intelligence/config as an anonymous caller returns 401 auth.required (an authenticated caller missing system:read returns 403 authz.permission_denied)' priority: critical references_constraints: [C-02] - id: AC-03 diff --git a/specs/system/rbac.spec.yaml b/specs/system/rbac.spec.yaml index 27bc10edf..510e01fbc 100644 --- a/specs/system/rbac.spec.yaml +++ b/specs/system/rbac.spec.yaml @@ -97,7 +97,7 @@ spec: description: RequirePermission middleware allows the request when the identity grants the permission and the license gate passes. priority: critical - id: AC-09 - description: RequirePermission middleware denies with 403 authz.permission_denied when the identity lacks the permission. + description: "RequirePermission / EnforcePermission distinguishes the two denial classes by HTTP status. An ANONYMOUS caller (no credentials, or a session cookie that expired in the browser and is no longer sent) is denied 401 auth.required so the SPA redirects to login. An AUTHENTICATED caller whose role lacks the permission is denied 403 authz.permission_denied. The authz.permission.denied audit event is emitted in both cases." priority: critical references_constraints: [C-03, C-04] - id: AC-10 From 9b898393c190d0ee4fb7815d5d8c0f4cb21ecb26 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 16:35:12 -0400 Subject: [PATCH 3/7] =?UTF-8?q?feat(remediation):=20conditional=20approval?= =?UTF-8?q?=20(A-keep)=20=E2=80=94=20free-core=20auto-approves?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the A-keep ADR: free-core single-rule remediation no longer requires a separate human approval, so a single operator can request and Fix a finding directly (removing the self-review deadlock). The approve/reject flow with separation of duties is retained for the licensed bulk/auto track. - Request(...requiresApproval bool): false (free core) inserts an 'approved' row directly (reviewed_at set, reviewed_by NULL, auto-approved review_note) and emits remediation.requested + remediation.approved; true (licensed bulk/auto) inserts 'pending_approval' and goes through Approve/Reject. - The single-rule request handler passes false. - Tests: AC-01 covers auto-approve + the approval-required path; the HTTP AC-05/AC-06 approve and pending-execute paths seed a pending_approval request (the free-core POST auto-approves). Frontend unchanged (the hook already renders approved -> Fix and keeps the pending_approval/approve UI for the licensed track). Note: the ADR + governance docs land in #604; their status flips to 'implemented' once both merge. --- CHANGELOG.md | 8 +++ internal/remediation/service.go | 48 +++++++++++--- internal/remediation/service_test.go | 47 +++++++++----- internal/server/api_remediation_test.go | 73 +++++++++++++--------- internal/server/remediation_handlers.go | 5 +- internal/worker/remediation_worker_test.go | 2 +- specs/api/remediation.spec.yaml | 6 +- 7 files changed, 132 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8750821a..0f5e1ab3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,14 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Changed +- Remediation: free-core single-rule remediation is now **auto-approved** on + request, so an operator can apply a fix without a separate approver. This + removes the self-review deadlock for single-operator workspaces (you could + request a fix but never approve your own request). The request lifecycle and + the approve/reject flow with separation of duties are retained for the + licensed bulk/automated remediation track (which requests with approval + required). See `docs/engineering/remediation_governance_adr.md` ("A-keep"). + - CI release safety: the release workflow now fails closed on a `v*` tag push when no GPG signing key is configured, rather than publishing unsigned packages. Manual `workflow_dispatch` trial builds stay permissive (warn + diff --git a/internal/remediation/service.go b/internal/remediation/service.go index 65550fa08..fdb3669cb 100644 --- a/internal/remediation/service.go +++ b/internal/remediation/service.go @@ -75,8 +75,21 @@ func scanListRequest(row pgx.Row) (Request, error) { // recording a best-effort projected per-framework lift. Returns // ErrDuplicateOpen when an open request already exists for the same host+rule. // NEVER contacts the host. Emits remediation.requested. +// Request opens a remediation request for one rule on one host. +// +// Approval is conditional on the remediation track (ADR +// docs/engineering/remediation_governance_adr.md, "A-keep"): +// +// - requiresApproval=false (FREE-CORE single-rule manual remediation): the +// request is AUTO-APPROVED on creation — inserted directly in 'approved' +// with reviewed_at set and an explanatory review_note, so the requester can +// Fix it without a separate approver. This makes single-operator workspaces +// workable (no self-review deadlock). +// - requiresApproval=true (LICENSED bulk/auto track): the request is inserted +// 'pending_approval' and must go through Approve/Reject with separation of +// duties (the requester cannot review their own request) before it can run. func (s *Service) Request(ctx context.Context, hostID uuid.UUID, ruleID string, - scanRunID *uuid.UUID, requestedBy uuid.UUID) (Request, error) { + scanRunID *uuid.UUID, requestedBy uuid.UUID, requiresApproval bool) (Request, error) { ruleID = strings.TrimSpace(ruleID) if ruleID == "" { return Request{}, ErrInvalidInput @@ -87,13 +100,27 @@ func (s *Service) Request(ctx context.Context, hostID uuid.UUID, ruleID string, proj, _ := s.ProjectLift(ctx, hostID, ruleID) id := uuid.Must(uuid.NewV7()) - row := s.pool.QueryRow(ctx, ` - INSERT INTO remediation_requests - (id, host_id, rule_id, scan_run_id, status, requested_by, - projected_cis, projected_stig, projected_nist) - VALUES ($1, $2, $3, $4, 'pending_approval', $5, $6, $7, $8) - RETURNING `+selectCols, - id, hostID, ruleID, scanRunID, requestedBy, proj.CIS, proj.STIG, proj.NIST) + var row pgx.Row + if requiresApproval { + row = s.pool.QueryRow(ctx, ` + INSERT INTO remediation_requests + (id, host_id, rule_id, scan_run_id, status, requested_by, + projected_cis, projected_stig, projected_nist) + VALUES ($1, $2, $3, $4, 'pending_approval', $5, $6, $7, $8) + RETURNING `+selectCols, + id, hostID, ruleID, scanRunID, requestedBy, proj.CIS, proj.STIG, proj.NIST) + } else { + row = s.pool.QueryRow(ctx, ` + INSERT INTO remediation_requests + (id, host_id, rule_id, scan_run_id, status, requested_by, + reviewed_at, review_note, + projected_cis, projected_stig, projected_nist) + VALUES ($1, $2, $3, $4, 'approved', $5, now(), + 'auto-approved: free-core single-rule remediation requires no separate approval', + $6, $7, $8) + RETURNING `+selectCols, + id, hostID, ruleID, scanRunID, requestedBy, proj.CIS, proj.STIG, proj.NIST) + } rq, err := scanRequest(row) if err != nil { if isUniqueViolation(err) { @@ -103,6 +130,11 @@ func (s *Service) Request(ctx context.Context, hostID uuid.UUID, ruleID string, } s.emitEvent(ctx, audit.RemediationRequested, rq, requestedBy, "requested") + if !requiresApproval { + // Record the auto-approval honestly in the audit trail; there is no + // human reviewer (reviewed_by stays NULL). + s.emitEvent(ctx, audit.RemediationApproved, rq, requestedBy, "auto_approved") + } return rq, nil } diff --git a/internal/remediation/service_test.go b/internal/remediation/service_test.go index 5c04ca1cf..273b668a7 100644 --- a/internal/remediation/service_test.go +++ b/internal/remediation/service_test.go @@ -102,33 +102,50 @@ func TestRequest_InsertDuplicateInvalidReopen(t *testing.T) { var calls []emitCall svc := NewService(pool, fakeEmitter(&calls)) - rq, err := svc.Request(ctx, hostID, "sshd-permit-root-no", nil, user) + // Free-core single-rule remediation (requiresApproval=false) is + // AUTO-APPROVED on creation. + rq, err := svc.Request(ctx, hostID, "sshd-permit-root-no", nil, user, false) if err != nil { t.Fatalf("Request: %v", err) } - if rq.Status != StatusPendingApproval || rq.RuleID != "sshd-permit-root-no" { - t.Errorf("requested remediation = %+v", rq) + if rq.Status != StatusApproved || rq.RuleID != "sshd-permit-root-no" { + t.Errorf("requested remediation = %+v, want status=approved", rq) } - if len(calls) != 1 || calls[0].Code != audit.RemediationRequested { - t.Errorf("audit calls = %+v, want one requested", calls) + if rq.ReviewedAt == nil || rq.ReviewedBy != nil { + t.Errorf("auto-approved: want reviewed_at set + reviewed_by nil, got reviewed_at=%v reviewed_by=%v", rq.ReviewedAt, rq.ReviewedBy) + } + if !strings.Contains(rq.ReviewNote, "auto-approved") { + t.Errorf("auto-approved review_note = %q, want an auto-approved note", rq.ReviewNote) + } + // Auto-approve emits remediation.requested then remediation.approved. + if len(calls) != 2 || calls[0].Code != audit.RemediationRequested || calls[1].Code != audit.RemediationApproved { + t.Errorf("audit calls = %+v, want requested + approved", calls) } - // Duplicate open: rejected. - if _, err := svc.Request(ctx, hostID, "sshd-permit-root-no", nil, user); !errors.Is(err, ErrDuplicateOpen) { + // Duplicate open (the auto-approved request is still open): rejected. + if _, err := svc.Request(ctx, hostID, "sshd-permit-root-no", nil, user, false); !errors.Is(err, ErrDuplicateOpen) { t.Errorf("duplicate Request err = %v, want ErrDuplicateOpen", err) } // Invalid input (empty rule). - if _, err := svc.Request(ctx, hostID, " ", nil, user); !errors.Is(err, ErrInvalidInput) { + if _, err := svc.Request(ctx, hostID, " ", nil, user, false); !errors.Is(err, ErrInvalidInput) { t.Errorf("empty rule err = %v, want ErrInvalidInput", err) } - // Reopen after the prior is rejected: a fresh request succeeds. + // Approval-required (requiresApproval=true) inserts pending_approval and + // reopens after a terminal state: reject -> a fresh request succeeds. reviewer := seedUser(t, pool, "reviewer") - if _, err := svc.Reject(ctx, rq.ID, reviewer, "not now"); err != nil { + pend, err := svc.Request(ctx, hostID, "needs-approval", nil, user, true) + if err != nil { + t.Fatalf("approval-required Request: %v", err) + } + if pend.Status != StatusPendingApproval { + t.Errorf("approval-required status = %v, want pending_approval", pend.Status) + } + if _, err := svc.Reject(ctx, pend.ID, reviewer, "not now"); err != nil { t.Fatalf("Reject: %v", err) } - if _, err := svc.Request(ctx, hostID, "sshd-permit-root-no", nil, user); err != nil { + if _, err := svc.Request(ctx, hostID, "needs-approval", nil, user, true); err != nil { t.Errorf("reopen after reject failed: %v", err) } }) @@ -146,7 +163,7 @@ func TestLifecycle_Transitions(t *testing.T) { svc := NewService(pool, fakeEmitter(&calls)) // approve path - a, _ := svc.Request(ctx, hostID, "rule-a", nil, requester) + a, _ := svc.Request(ctx, hostID, "rule-a", nil, requester, true) got, err := svc.Approve(ctx, a.ID, reviewer, "ok") if err != nil || got.Status != StatusApproved || got.ReviewedBy == nil || *got.ReviewedBy != reviewer { t.Fatalf("Approve = %+v, err %v", got, err) @@ -157,7 +174,7 @@ func TestLifecycle_Transitions(t *testing.T) { } // reject path - b, _ := svc.Request(ctx, hostID, "rule-b", nil, requester) + b, _ := svc.Request(ctx, hostID, "rule-b", nil, requester, true) if got, err := svc.Reject(ctx, b.ID, reviewer, "no"); err != nil || got.Status != StatusRejected { t.Fatalf("Reject = %+v, err %v", got, err) } @@ -198,7 +215,7 @@ func TestSeparationOfDuties(t *testing.T) { hostID := seedHost(t, pool, user) svc := NewService(pool, fakeEmitter(&[]emitCall{})) - rq, _ := svc.Request(ctx, hostID, "rule-s", nil, user) + rq, _ := svc.Request(ctx, hostID, "rule-s", nil, user, true) // self-approve blocked if _, err := svc.Approve(ctx, rq.ID, user, "me"); !errors.Is(err, ErrSelfReview) { t.Errorf("self-approve err = %v, want ErrSelfReview", err) @@ -260,7 +277,7 @@ func TestProjectLift_AndOverlayNeverMutatesRuleState(t *testing.T) { } // Request persists the projection snapshot. - rq, err := svc.Request(ctx, hostID, "rule-active", nil, requester) + rq, err := svc.Request(ctx, hostID, "rule-active", nil, requester, false) if err != nil { t.Fatalf("Request: %v", err) } diff --git a/internal/server/api_remediation_test.go b/internal/server/api_remediation_test.go index 234f61960..8a91b5e36 100644 --- a/internal/server/api_remediation_test.go +++ b/internal/server/api_remediation_test.go @@ -62,8 +62,10 @@ func TestAPI_Remediation_LifecycleAndRBAC(t *testing.T) { if err := json.NewDecoder(or.Body).Decode(&created); err != nil { t.Fatalf("decode created: %v", err) } - if created.Status != "pending_approval" || created.RuleID != "sshd-permit-root-no" { - t.Errorf("created = %+v", created) + // Free core: a single-rule request is auto-approved on creation (no + // separate approver). See remediation_governance_adr.md. + if created.Status != "approved" || created.RuleID != "sshd-permit-root-no" { + t.Errorf("created = %+v, want status=approved (auto-approved free core)", created) } // --- duplicate open -> 409 --- @@ -81,16 +83,22 @@ func TestAPI_Remediation_LifecycleAndRBAC(t *testing.T) { t.Errorf("anonymous list status = %d, want 401/403", ar.StatusCode) } - // --- approve: ops_lead is 403 (no remediation:approve) --- - oa := doReq(t, asRole(t, "POST", base+"/"+created.ID+":approve", auth.RoleOpsLead, map[string]any{})) + // --- approve endpoint RBAC + separation of duties. The free-core POST + // above auto-approves, so it never yields a pending row; seed an + // approval-required (pending_approval) request directly to exercise the + // approve endpoint (this is the licensed bulk/auto track's shape). --- + pendID := seedPendingRemediation(t, pool, hostID, roleUserIDs[auth.RoleOpsLead], "needs-approval") + pendPath := base + "/" + pendID.String() + + // ops_lead (the requester; no remediation:approve) -> 403 + oa := doReq(t, asRole(t, "POST", pendPath+":approve", auth.RoleOpsLead, map[string]any{})) oa.Body.Close() if oa.StatusCode != http.StatusForbidden { t.Fatalf("ops_lead approve status = %d, want 403", oa.StatusCode) } - // --- approve: security_admin (different user from the ops_lead - // requester) succeeds --- - sa := doReq(t, asRole(t, "POST", base+"/"+created.ID+":approve", + // security_admin (different user from the requester) approves -> 200 + sa := doReq(t, asRole(t, "POST", pendPath+":approve", auth.RoleSecurityAdmin, map[string]any{"note": "reviewed"})) defer sa.Body.Close() if sa.StatusCode != http.StatusOK { @@ -102,21 +110,17 @@ func TestAPI_Remediation_LifecycleAndRBAC(t *testing.T) { t.Errorf("approved status = %q, want approved", approved.Status) } - // --- re-approve -> 409 wrong state --- - ra := doReq(t, asRole(t, "POST", base+"/"+created.ID+":approve", auth.RoleSecurityAdmin, map[string]any{})) + // re-approve -> 409 wrong state + ra := doReq(t, asRole(t, "POST", pendPath+":approve", auth.RoleSecurityAdmin, map[string]any{})) ra.Body.Close() if ra.StatusCode != http.StatusConflict { t.Errorf("re-approve status = %d, want 409", ra.StatusCode) } - // --- separation of duties at the HTTP layer: a security_admin - // requests, then tries to approve their own -> 409 self_review --- - selfReq := doReq(t, asRole(t, "POST", base, auth.RoleSecurityAdmin, - map[string]any{"host_id": hostID.String(), "rule_id": "self-rule"})) - defer selfReq.Body.Close() - var selfRR apiRem - _ = json.NewDecoder(selfReq.Body).Decode(&selfRR) - selfAp := doReq(t, asRole(t, "POST", base+"/"+selfRR.ID+":approve", auth.RoleSecurityAdmin, map[string]any{})) + // separation of duties at the HTTP layer: the security_admin requester + // cannot approve their own pending request -> 409 self_review. + selfID := seedPendingRemediation(t, pool, hostID, roleUserIDs[auth.RoleSecurityAdmin], "self-rule") + selfAp := doReq(t, asRole(t, "POST", base+"/"+selfID.String()+":approve", auth.RoleSecurityAdmin, map[string]any{})) selfAp.Body.Close() if selfAp.StatusCode != http.StatusConflict { t.Errorf("self-approve status = %d, want 409 (separation of duties)", selfAp.StatusCode) @@ -181,23 +185,18 @@ func TestAPI_Remediation_ExecuteFreeCore(t *testing.T) { t.Fatalf("viewer execute status = %d, want 403", o.StatusCode) } - // security_admin has the perm, but the request is still pending - // (not approved) -> 409 wrong_state. NOT 402. - pre := doReq(t, asRole(t, "POST", execURL, auth.RoleSecurityAdmin, map[string]any{})) + // Executing a NON-approved (pending) request -> 409 wrong_state, NOT 402 + // (free core, no license gate). The free-core request above auto-approves, + // so seed a pending row to exercise this path. + pendID := seedPendingRemediation(t, pool, hostID, roleUserIDs[auth.RoleOpsLead], "rule-pending") + pre := doReq(t, asRole(t, "POST", base+"/"+pendID.String()+":execute", auth.RoleSecurityAdmin, map[string]any{})) pre.Body.Close() if pre.StatusCode != http.StatusConflict { t.Fatalf("execute-before-approve status = %d, want 409", pre.StatusCode) } - // Approve it (security_admin != ops_lead requester). - ap := doReq(t, asRole(t, "POST", base+"/"+created.ID+":approve", - auth.RoleSecurityAdmin, map[string]any{"note": "ok"})) - ap.Body.Close() - if ap.StatusCode != http.StatusOK { - t.Fatalf("approve status = %d, want 200", ap.StatusCode) - } - - // Now execute -> 202 Accepted, a remediation job enqueued. + // The free-core request is already approved (auto). Execute it directly + // -> 202 Accepted, a remediation job enqueued. ex := doReq(t, asRole(t, "POST", execURL, auth.RoleSecurityAdmin, map[string]any{})) var acc struct { RequestID string `json:"request_id"` @@ -226,6 +225,22 @@ func TestAPI_Remediation_ExecuteFreeCore(t *testing.T) { }) } +// seedPendingRemediation inserts a pending_approval remediation request +// directly (the approval-required / licensed track's shape). The free-core HTTP +// POST auto-approves, so this is how the approve-endpoint and pending-execute +// paths are exercised at the HTTP layer. +func seedPendingRemediation(t *testing.T, pool *pgxpool.Pool, hostID, requestedBy uuid.UUID, ruleID string) uuid.UUID { + t.Helper() + id := uuid.Must(uuid.NewV7()) + if _, err := pool.Exec(context.Background(), + `INSERT INTO remediation_requests (id, host_id, rule_id, status, requested_by) + VALUES ($1, $2, $3, 'pending_approval', $4)`, + id, hostID, ruleID, requestedBy); err != nil { + t.Fatalf("seed pending remediation: %v", err) + } + return id +} + // countRemediationJobs counts pending remediation jobs on the queue. func countRemediationJobs(t *testing.T, pool *pgxpool.Pool) int { t.Helper() diff --git a/internal/server/remediation_handlers.go b/internal/server/remediation_handlers.go index fa83b0e9c..27dd2fce5 100644 --- a/internal/server/remediation_handlers.go +++ b/internal/server/remediation_handlers.go @@ -207,7 +207,10 @@ func (h *handlers) RequestRemediation(w http.ResponseWriter, r *http.Request) { s := uuid.UUID(*req.ScanRunId) scanRunID = &s } - rq, err := h.remediationSvc.Request(ctx, hostID, req.RuleId, scanRunID, requestedBy) + // Free-core: single-rule manual remediation is auto-approved (no separate + // approver). Bulk/auto remediation (OpenWatch+) will request with approval + // required via its own gated handler. + rq, err := h.remediationSvc.Request(ctx, hostID, req.RuleId, scanRunID, requestedBy, false) if mapRemediationErr(w, err) { return } diff --git a/internal/worker/remediation_worker_test.go b/internal/worker/remediation_worker_test.go index 621fbbaee..5d7d87bf4 100644 --- a/internal/worker/remediation_worker_test.go +++ b/internal/worker/remediation_worker_test.go @@ -36,7 +36,7 @@ func seedApprovedRequest(t *testing.T, pool *pgxpool.Pool, svc *remediation.Serv requester := seedUniqueUser(t, pool) reviewer := seedUniqueUser(t, pool) ctx := context.Background() - rq, err := svc.Request(ctx, hostID, ruleID, nil, requester) + rq, err := svc.Request(ctx, hostID, ruleID, nil, requester, true) if err != nil { t.Fatalf("seed request: %v", err) } diff --git a/specs/api/remediation.spec.yaml b/specs/api/remediation.spec.yaml index ae042ff21..29b5e52e5 100644 --- a/specs/api/remediation.spec.yaml +++ b/specs/api/remediation.spec.yaml @@ -121,7 +121,7 @@ spec: acceptance_criteria: - id: AC-01 - description: 'Request inserts a pending_approval row (host_id, rule_id, requested_by, optional scan_run_id and projected-lift snapshot) and emits remediation.requested; a second Request for the same host+rule while one is open (pending_approval/approved/dry_run_complete/executing) returns ErrDuplicateOpen; an empty rule_id returns ErrInvalidInput; a fresh Request after the prior one reached a terminal state succeeds' + description: 'Approval is conditional on the remediation track. A free-core Request (requiresApproval=false) is AUTO-APPROVED: it inserts an approved row (host_id, rule_id, requested_by, optional scan_run_id and projected-lift snapshot, reviewed_at set, reviewed_by NULL, an auto-approved review_note) and emits remediation.requested then remediation.approved. A requiresApproval=true Request (the licensed bulk/auto track) inserts pending_approval instead. A second Request for the same host+rule while one is open (pending_approval/approved/dry_run_complete/executing) returns ErrDuplicateOpen; an empty rule_id returns ErrInvalidInput; a fresh Request after the prior one reached a terminal state succeeds' priority: critical references_constraints: [C-02] - id: AC-02 @@ -137,11 +137,11 @@ spec: priority: critical references_constraints: [C-01, C-07] - id: AC-05 - description: 'Endpoint RBAC + lifecycle: POST /api/v1/remediation/requests requires remediation:request and 404s an unknown host_id; GET /requests, GET /requests/{id}, GET /requests/{id}/steps require remediation:read; POST :approve and :reject require remediation:approve; anonymous callers are rejected on all. A full request->approve happy path round-trips through the HTTP layer' + description: 'Endpoint RBAC + lifecycle: POST /api/v1/remediation/requests requires remediation:request, 404s an unknown host_id, and (free core) returns the request AUTO-APPROVED; GET /requests, GET /requests/{id}, GET /requests/{id}/steps require remediation:read; POST :approve and :reject require remediation:approve and enforce separation of duties (the requester cannot approve their own request, 409); anonymous callers are rejected on all. The approve endpoint is exercised on an approval-required (pending_approval) request since the free-core request auto-approves' priority: critical references_constraints: [C-05, C-03] - id: AC-06 - description: 'Execute/rollback are FREE core: a caller WITH remediation:execute executing an approved request gets 202 and a remediation job is enqueued; a caller LACKING remediation:execute is 403; executing a non-approved request is 409; rolling back (remediation:rollback) a non-executed request is 409. No remediation act endpoint returns 402. :dry-run returns 501. request/approve/reject and the GET reads return their normal status codes.' + description: 'Execute/rollback are FREE core: a caller WITH remediation:execute executing an approved request gets 202 and a remediation job is enqueued. A free-core request is auto-approved, so it executes directly with no separate approval step. A caller LACKING remediation:execute is 403; executing a non-approved (pending_approval) request is 409; rolling back (remediation:rollback) a non-executed request is 409. No remediation act endpoint returns 402. :dry-run returns 501. request/approve/reject and the GET reads return their normal status codes.' priority: critical references_constraints: [C-06] - id: AC-07 From 7c75fcad4474ebbc9993e0ba906a37fa87ace8dc Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 17:46:11 -0400 Subject: [PATCH 4/7] fix(remediation): serialize concurrent fixes on a host instead of failing Clicking Fix on several findings on the same host enqueued multiple jobs that ran concurrently; the second collided on the per-host SSH guard (ErrHostBusy) and the remediation worker marked it failed. Now the worker treats a busy host as transient: it backs off and requeues (queue.EnqueueAfter) until the host is free, so the fixes apply one at a time. - queue: add a delayed-visibility column (migration 0039 available_at) + EnqueueAfter(delay); Dequeue skips not-yet-available rows so the requeue does not busy-loop the drain (job-queue AC-13). - remediation: HostHasExecuting + RevertToApproved primitives (api-remediation AC-08); worker processExecute/processRollback pre-check the host and revert+ requeue on an ErrHostBusy race instead of failing the request. --- .../0039_job_queue_available_at.sql | 24 +++++++ internal/queue/dequeue.go | 4 +- internal/queue/enqueue.go | 30 +++++++-- internal/queue/queue_test.go | 46 ++++++++++++++ internal/remediation/execution.go | 26 ++++++++ internal/remediation/service_test.go | 49 +++++++++++++++ internal/worker/remediation_worker.go | 63 ++++++++++++++++++- specs/api/remediation.spec.yaml | 3 + specs/system/job-queue.spec.yaml | 3 + 9 files changed, 238 insertions(+), 10 deletions(-) create mode 100644 internal/db/migrations/0039_job_queue_available_at.sql diff --git a/internal/db/migrations/0039_job_queue_available_at.sql b/internal/db/migrations/0039_job_queue_available_at.sql new file mode 100644 index 000000000..c1033d832 --- /dev/null +++ b/internal/db/migrations/0039_job_queue_available_at.sql @@ -0,0 +1,24 @@ +-- Delayed-visibility for the job queue. A pending job becomes dequeuable only +-- at or after available_at, which defaults to now() — so every existing enqueue +-- path (scans, diagnostics, etc.) is immediately visible and unchanged. The +-- remediation worker sets a future available_at to back off and requeue a job +-- whose target host is already being remediated, so concurrent "Fix" clicks on +-- one host serialize (queue) instead of colliding on the per-host SSH guard and +-- failing. +-- +-- Spec: specs/system/job-queue.spec.yaml. + +-- +goose Up +ALTER TABLE job_queue ADD COLUMN available_at TIMESTAMPTZ NOT NULL DEFAULT now(); + +-- The dequeue hot path now filters and orders on availability. Replace the +-- status-only partial index so the claim query stays index-driven. +DROP INDEX IF EXISTS idx_job_queue_pending; +CREATE INDEX idx_job_queue_pending ON job_queue (available_at, created_at) + WHERE status = 'pending'; + +-- +goose Down +DROP INDEX IF EXISTS idx_job_queue_pending; +CREATE INDEX idx_job_queue_pending ON job_queue (created_at) + WHERE status = 'pending'; +ALTER TABLE job_queue DROP COLUMN IF EXISTS available_at; diff --git a/internal/queue/dequeue.go b/internal/queue/dequeue.go index 077d336cc..7d0533e24 100644 --- a/internal/queue/dequeue.go +++ b/internal/queue/dequeue.go @@ -32,8 +32,8 @@ func Dequeue(ctx context.Context, pool *pgxpool.Pool) (*Job, context.Context, er attempts = attempts + 1 WHERE id = ( SELECT id FROM job_queue - WHERE status = 'pending' - ORDER BY created_at + WHERE status = 'pending' AND available_at <= now() + ORDER BY available_at, created_at FOR UPDATE SKIP LOCKED LIMIT 1 ) diff --git a/internal/queue/enqueue.go b/internal/queue/enqueue.go index a7bb5906e..207d54fdf 100644 --- a/internal/queue/enqueue.go +++ b/internal/queue/enqueue.go @@ -4,15 +4,16 @@ import ( "context" "encoding/json" "fmt" + "time" "github.com/Hanalyx/openwatch/internal/correlation" "github.com/google/uuid" "github.com/jackc/pgx/v5/pgxpool" ) -// Enqueue persists a new job_queue row. The caller's ctx MUST carry a -// correlation_id; this is a programming-error guard per spec C-01. The -// row's correlation_id pins the job to the originating intent across +// Enqueue persists a new job_queue row, immediately dequeuable. The caller's +// ctx MUST carry a correlation_id; this is a programming-error guard per spec +// C-01. The row's correlation_id pins the job to the originating intent across // the async boundary. // // Returns the inserted job's ID. The job is in "pending" status with @@ -20,6 +21,17 @@ import ( // // Spec system-job-queue AC-01. func Enqueue(ctx context.Context, pool *pgxpool.Pool, jobType string, payload any) (uuid.UUID, error) { + return EnqueueAfter(ctx, pool, jobType, payload, 0) +} + +// EnqueueAfter is Enqueue with a delay: the job is not dequeuable until +// `delay` from now (available_at = now() + delay). A non-positive delay makes +// it immediately available, identical to Enqueue. Used to back off and requeue +// a job that can't run yet (e.g. the target host is busy) without a tight +// re-dequeue loop, since Dequeue skips not-yet-available rows. +// +// Spec system-job-queue AC-13 (delayed visibility). +func EnqueueAfter(ctx context.Context, pool *pgxpool.Pool, jobType string, payload any, delay time.Duration) (uuid.UUID, error) { corrID, ok := correlation.From(ctx) if !ok { return uuid.Nil, ErrMissingCorrelation @@ -44,10 +56,16 @@ func Enqueue(ctx context.Context, pool *pgxpool.Pool, jobType string, payload an return uuid.Nil, fmt.Errorf("queue: uuid: %w", err) } + // available_at is computed server-side (now() + interval) so it shares the + // database clock that Dequeue compares against. + secs := delay.Seconds() + if secs < 0 { + secs = 0 + } const stmt = ` - INSERT INTO job_queue (id, job_type, payload, correlation_id, status, attempts) - VALUES ($1, $2, $3::jsonb, $4, 'pending', 0)` - if _, err := pool.Exec(ctx, stmt, id, jobType, payloadJSON, corrID); err != nil { + INSERT INTO job_queue (id, job_type, payload, correlation_id, status, attempts, available_at) + VALUES ($1, $2, $3::jsonb, $4, 'pending', 0, now() + make_interval(secs => $5))` + if _, err := pool.Exec(ctx, stmt, id, jobType, payloadJSON, corrID, secs); err != nil { return uuid.Nil, fmt.Errorf("queue: insert: %w", err) } return id, nil diff --git a/internal/queue/queue_test.go b/internal/queue/queue_test.go index bff1a5d58..09b28cae1 100644 --- a/internal/queue/queue_test.go +++ b/internal/queue/queue_test.go @@ -29,6 +29,52 @@ func freshPool(t *testing.T) *pgxpool.Pool { return pool } +// @ac AC-13 +// AC-13: EnqueueAfter delays a job's visibility. Dequeue skips a job whose +// available_at is in the future and claims it only once available_at <= now(), +// so a requeue-with-backoff does not busy-loop the drain. +func TestEnqueueAfter_DelaysVisibility(t *testing.T) { + t.Run("system-job-queue/AC-13", func(t *testing.T) { + pool := freshPool(t) + ctx := correlation.Set(context.Background(), "req-ac13-001") + + future, err := EnqueueAfter(ctx, pool, "diagnostics.test_job", map[string]any{"k": "future"}, time.Hour) + if err != nil { + t.Fatalf("EnqueueAfter(future): %v", err) + } + now, err := EnqueueAfter(ctx, pool, "diagnostics.test_job", map[string]any{"k": "now"}, 0) + if err != nil { + t.Fatalf("EnqueueAfter(now): %v", err) + } + + // Dequeue claims the immediately-available job, never the future one. + job, _, err := Dequeue(ctx, pool) + if err != nil { + t.Fatalf("Dequeue: %v", err) + } + if job.ID != now { + t.Fatalf("Dequeue claimed %s, want the immediately-available job %s", job.ID, now) + } + // The future job stays hidden. + if _, _, err := Dequeue(ctx, pool); !errors.Is(err, ErrNoJob) { + t.Fatalf("second Dequeue err = %v, want ErrNoJob (future job hidden)", err) + } + + // Once its available_at passes, the future job becomes dequeuable. + if _, err := pool.Exec(ctx, + `UPDATE job_queue SET available_at = now() - interval '1 second' WHERE id = $1`, future); err != nil { + t.Fatalf("backdate: %v", err) + } + job2, _, err := Dequeue(ctx, pool) + if err != nil { + t.Fatalf("Dequeue after backdate: %v", err) + } + if job2.ID != future { + t.Errorf("Dequeue claimed %s, want the now-available job %s", job2.ID, future) + } + }) +} + // @ac AC-01 // AC-01: Enqueue persists a job_queue row with the expected fields populated. func TestEnqueue_PersistsRow(t *testing.T) { diff --git a/internal/remediation/execution.go b/internal/remediation/execution.go index 440fcf5e4..08ce4f721 100644 --- a/internal/remediation/execution.go +++ b/internal/remediation/execution.go @@ -32,6 +32,32 @@ func (s *Service) MarkRolledBack(ctx context.Context, id uuid.UUID) (Request, er return s.transition(ctx, id, StatusExecuted, StatusRolledBack) } +// RevertToApproved transitions an 'executing' request back to 'approved'. The +// remediation worker calls this when it has marked a request executing but the +// host turned out to be busy (lost a race for the per-host guard): the request +// returns to approved so the requeued job can run it once the host frees up. +func (s *Service) RevertToApproved(ctx context.Context, id uuid.UUID) (Request, error) { + return s.transition(ctx, id, StatusExecuting, StatusApproved) +} + +// HostHasExecuting reports whether the host already has a remediation request +// in the 'executing' state. The worker uses this to serialize per-host +// remediation: only one rule is applied on a host at a time (they share one +// SSH session via the executor's per-host guard), so a second concurrent +// request backs off and requeues instead of colliding. +func (s *Service) HostHasExecuting(ctx context.Context, hostID uuid.UUID) (bool, error) { + var exists bool + err := s.pool.QueryRow(ctx, ` + SELECT EXISTS ( + SELECT 1 FROM remediation_requests + WHERE host_id = $1 AND status = 'executing' + )`, hostID).Scan(&exists) + if err != nil { + return false, fmt.Errorf("remediation: host-executing check: %w", err) + } + return exists, nil +} + // transition performs a guarded fromState -> toState update under FOR UPDATE. // Unlike review() it does not touch reviewed_by/reviewed_at — execution // transitions are system-driven, not a human review. diff --git a/internal/remediation/service_test.go b/internal/remediation/service_test.go index 5c04ca1cf..1e01b55b1 100644 --- a/internal/remediation/service_test.go +++ b/internal/remediation/service_test.go @@ -283,3 +283,52 @@ func TestProjectLift_AndOverlayNeverMutatesRuleState(t *testing.T) { } }) } + +// @ac AC-08 +// AC-08: per-host serialization primitives. HostHasExecuting reports whether a +// request is 'executing' on a host; RevertToApproved returns an 'executing' +// request to 'approved'. The worker uses these (with a backoff requeue via +// queue.EnqueueAfter) so a second concurrent execute on a busy host requeues +// instead of failing. +func TestSerializePrimitives(t *testing.T) { + t.Run("api-remediation/AC-08", func(t *testing.T) { + pool := freshPool(t) + ctx := context.Background() + user := seedUser(t, pool, "ser") + hostID := seedHost(t, pool, user) + svc := NewService(pool, fakeEmitter(&[]emitCall{})) + + // Idle host: nothing executing. + if busy, err := svc.HostHasExecuting(ctx, hostID); err != nil || busy { + t.Fatalf("HostHasExecuting (idle) = %v, %v; want false", busy, err) + } + + // Seed an executing request directly (the worker sets this via + // MarkExecuting; we seed it to test the primitives in isolation). + execID := uuid.Must(uuid.NewV7()) + if _, err := pool.Exec(ctx, + `INSERT INTO remediation_requests (id, host_id, rule_id, status, requested_by) + VALUES ($1, $2, 'rule-x', 'executing', $3)`, execID, hostID, user); err != nil { + t.Fatalf("seed executing: %v", err) + } + + // The host is now busy. + if busy, err := svc.HostHasExecuting(ctx, hostID); err != nil || !busy { + t.Errorf("HostHasExecuting (executing) = %v, %v; want true", busy, err) + } + + // RevertToApproved returns it to approved and clears busy. + reverted, err := svc.RevertToApproved(ctx, execID) + if err != nil || reverted.Status != StatusApproved { + t.Errorf("RevertToApproved = %+v, %v; want approved", reverted, err) + } + if busy, _ := svc.HostHasExecuting(ctx, hostID); busy { + t.Errorf("HostHasExecuting after revert = true; want false") + } + + // RevertToApproved on a non-executing request -> ErrWrongState. + if _, err := svc.RevertToApproved(ctx, execID); !errors.Is(err, ErrWrongState) { + t.Errorf("RevertToApproved (approved) err = %v, want ErrWrongState", err) + } + }) +} diff --git a/internal/worker/remediation_worker.go b/internal/worker/remediation_worker.go index 60a4e6241..59e0a23e7 100644 --- a/internal/worker/remediation_worker.go +++ b/internal/worker/remediation_worker.go @@ -136,8 +136,25 @@ func (w *RemediationWorker) ProcessJob(ctx context.Context, j *queue.Job) { } } -// processExecute drives approved -> executing -> executed|failed. +// remediationBusyBackoff is how long a remediation job waits before being +// retried when its target host is busy with another remediation. Only one rule +// is applied on a host at a time (they share a single SSH session via the +// executor's per-host guard), so concurrent "Fix" clicks serialize instead of +// failing. Kept short — the running remediation, not the wait, is the bottleneck. +const remediationBusyBackoff = 3 * time.Second + +// processExecute drives approved -> executing -> executed|failed. When the host +// is already being remediated, it backs off and requeues (serialize per host) +// instead of failing the request. func (w *RemediationWorker) processExecute(ctx context.Context, j *queue.Job, p RemediationPayload) { + // Serialize per host: if another remediation is already executing on this + // host, requeue with a backoff rather than colliding on the per-host SSH + // guard (which would fail this request). The request stays 'approved'. + if busy, err := w.svc.HostHasExecuting(ctx, p.HostID); err == nil && busy { + w.requeueBusy(ctx, j, p) + return + } + // Guard + transition approved -> executing (row-locked). A duplicate // enqueue or a request not in 'approved' fails here without touching the // host. @@ -158,7 +175,19 @@ func (w *RemediationWorker) processExecute(ctx context.Context, j *queue.Job, p // executor owns the per-host concurrency guard. result, remErr := w.executor.Remediate(ctx, p.HostID, p.RuleID) if remErr != nil { - // Host-side failure: record an empty journal + transition to failed. + // A lost race for the per-host guard is TRANSIENT, not a host-side + // failure: revert executing -> approved and requeue so it retries once + // the host frees, rather than marking the request failed. + if errors.Is(remErr, kensa.ErrHostBusy) { + if _, rerr := w.svc.RevertToApproved(ctx, p.RequestID); rerr != nil { + slog.WarnContext(ctx, "remediation revert-to-approved failed", + slog.String("request_id", p.RequestID.String()), + slog.String("error", rerr.Error())) + } + w.requeueBusy(ctx, j, p) + return + } + // Real host-side failure: record an empty journal + transition to failed. w.finishExecute(ctx, j, rq, p, nil, false) slog.WarnContext(ctx, "remediation execute failed on host", slog.String("request_id", p.RequestID.String()), @@ -172,6 +201,23 @@ func (w *RemediationWorker) processExecute(ctx context.Context, j *queue.Job, p w.finishExecute(ctx, j, rq, p, txns, committed) } +// requeueBusy completes the current job and re-enqueues the same signed action +// after remediationBusyBackoff, so a worker retries it once the host frees up. +// Dequeue skips the not-yet-available row, so this does not busy-loop the +// drain. A failure to re-enqueue falls back to failing the job (visible) rather +// than silently dropping the action. +func (w *RemediationWorker) requeueBusy(ctx context.Context, j *queue.Job, p RemediationPayload) { + body := MarshalRemediationJob(w.queueKey, p) + if _, err := queue.EnqueueAfter(ctx, w.pool, RemediationJobType, body, remediationBusyBackoff); err != nil { + slog.WarnContext(ctx, "remediation requeue (host busy) failed", + slog.String("request_id", p.RequestID.String()), + slog.String("error", err.Error())) + _ = queue.Fail(ctx, w.pool, j.ID, "requeue (host busy) failed: "+err.Error()) + return + } + _ = queue.Complete(ctx, w.pool, j.ID) +} + // finishExecute writes the journal, transitions to executed|failed, flips the // rule to pass on a committed execute, publishes + audits, completes the job. func (w *RemediationWorker) finishExecute(ctx context.Context, j *queue.Job, @@ -235,6 +281,13 @@ func (w *RemediationWorker) processRollback(ctx context.Context, j *queue.Job, p return } + // Serialize per host: a rollback shares the per-host SSH guard with execute, + // so if another remediation is executing on this host, back off and requeue. + if busy, herr := w.svc.HostHasExecuting(ctx, p.HostID); herr == nil && busy { + w.requeueBusy(ctx, j, p) + return + } + // Resolve the rollback handle: the payload's txn id, or the first // committed transaction recorded for the request. txnID := p.TxnID @@ -248,6 +301,12 @@ func (w *RemediationWorker) processRollback(ctx context.Context, j *queue.Job, p } res, rbErr := w.executor.Rollback(ctx, p.HostID, txnID) + // A lost race for the per-host guard is transient: requeue and retry rather + // than recording a failed rollback (the request stays 'executed'). + if errors.Is(rbErr, kensa.ErrHostBusy) { + w.requeueBusy(ctx, j, p) + return + } status := "failed" if rbErr == nil && res != nil { status = res.Status diff --git a/specs/api/remediation.spec.yaml b/specs/api/remediation.spec.yaml index ae042ff21..541ba7d0c 100644 --- a/specs/api/remediation.spec.yaml +++ b/specs/api/remediation.spec.yaml @@ -148,3 +148,6 @@ spec: description: 'Worker execution path: a queued remediation job with a valid HMAC drives an approved request approved -> executing -> executed when the executor returns a committed transaction, writes a remediation_transactions journal row (kensa_txn_id, phase_result=committed), and flips the rule to pass in host_rule_state (the compliance score moves). A committed run that the executor reports as errored, or a host-side failure, transitions the request to failed and writes no flip. A job whose payload HMAC does not verify is dead-lettered (queue.Fail) with scheduler.job.hmac_rejected and no executor invocation.' priority: critical references_constraints: [C-08] + - id: AC-08 + description: 'Per-host remediation serialization: HostHasExecuting(host) reports whether a request is in the executing state on the host; RevertToApproved(id) returns an executing request to approved (and is ErrWrongState otherwise). The worker uses these plus a backoff requeue (queue.EnqueueAfter) so a second concurrent execute/rollback on a busy host requeues and retries once the host frees, instead of colliding on the per-host SSH guard and being marked failed.' + priority: high diff --git a/specs/system/job-queue.spec.yaml b/specs/system/job-queue.spec.yaml index 1debe87f7..95ad930ec 100644 --- a/specs/system/job-queue.spec.yaml +++ b/specs/system/job-queue.spec.yaml @@ -128,3 +128,6 @@ spec: the default-1 worker stays strictly serial. priority: critical references_constraints: [C-07] + - id: AC-13 + description: 'EnqueueAfter(delay) sets a future available_at; Dequeue does not claim a job before its available_at and claims it once available_at <= now() (ORDER BY available_at, created_at). Enqueue (delay 0) is immediately available, so existing producers are unchanged. This backs a requeue-with-backoff without busy-looping the drain.' + priority: high From 13dab53d3f9657d33745c884f10d57fb53186603 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 17:46:11 -0400 Subject: [PATCH 5/7] feat(frontend): live remediation status via remediation.completed SSE The Remediation tab required a manual refresh to see a fix finish. The worker already publishes remediation.completed on the event bus; useLiveEvents now subscribes to it and invalidates ['host', id, 'remediations'] + ['host', id], so the tab and the compliance score update automatically when a queued fix or rollback reaches its terminal state. frontend-live-events AC-09 + AC-01 (topic set grows to 6). --- CHANGELOG.md | 11 +++++++++++ frontend/src/hooks/useLiveEvents.ts | 15 +++++++++++++++ frontend/tests/hooks/useLiveEvents.test.tsx | 20 +++++++++++++++++--- specs/frontend/live-events.spec.yaml | 9 +++++++-- 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8750821a..9c806687b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Changed +- Remediation now updates live. The Remediation tab and the compliance score + refresh automatically when a queued fix or rollback finishes, over the SSE + event stream (new `remediation.completed` topic), instead of requiring a + manual page refresh. + - CI release safety: the release workflow now fails closed on a `v*` tag push when no GPG signing key is configured, rather than publishing unsigned packages. Manual `workflow_dispatch` trial builds stay permissive (warn + @@ -24,6 +29,12 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Fixed +- Applying several fixes on the same host at once no longer fails the extra + ones. Concurrent remediations on a host now serialize: a fix whose host is + busy backs off and requeues (with a short delay, via a new delayed-visibility + column on the job queue) until the host is free, instead of colliding on the + per-host SSH guard and being marked failed. + - Documentation version drift: operator guides referenced `0.2.0-rc.5` while `packaging/version.env` was `0.2.0-rc.10`; all guides now match. - SPA static-delivery tests are self-contained (in-memory fixture) instead of diff --git a/frontend/src/hooks/useLiveEvents.ts b/frontend/src/hooks/useLiveEvents.ts index f2b009ff6..b2ef292b8 100644 --- a/frontend/src/hooks/useLiveEvents.ts +++ b/frontend/src/hooks/useLiveEvents.ts @@ -31,6 +31,7 @@ export const ALL_TOPICS = [ 'host.discovered', 'intelligence.event', 'scan.completed', + 'remediation.completed', ] as const; type Topic = (typeof ALL_TOPICS)[number]; @@ -141,6 +142,20 @@ export function useLiveEvents(options: UseLiveEventsOptions = {}) { queryClient.invalidateQueries({ queryKey: ['host', hostId] }); } }, + // remediation.completed -> the Remediation tab + Compliance score update + // without a manual refresh. The worker publishes this when a queued + // execute/rollback reaches its terminal state (executed | failed | + // rolled_back); a committed execute also flips the rule to pass, so the + // host detail (compliance) is invalidated too. + 'remediation.completed': (e) => { + const env = parseEnvelope(e); + if (!env) return; + const hostId = (env.payload?.HostID ?? env.payload?.host_id) as string | undefined; + if (hostId) { + queryClient.invalidateQueries({ queryKey: ['host', hostId, 'remediations'] }); + queryClient.invalidateQueries({ queryKey: ['host', hostId] }); + } + }, }; for (const k of topics) { diff --git a/frontend/tests/hooks/useLiveEvents.test.tsx b/frontend/tests/hooks/useLiveEvents.test.tsx index 6025dacfe..68e0d4b18 100644 --- a/frontend/tests/hooks/useLiveEvents.test.tsx +++ b/frontend/tests/hooks/useLiveEvents.test.tsx @@ -83,8 +83,8 @@ beforeEach(() => { }); // @ac AC-01 -// AC-01: ALL_TOPICS exported as the closed set of 5 topics (v1.1.0 -// adds scan.completed). +// AC-01: ALL_TOPICS exported as the closed set (v1.1.0 adds scan.completed; +// v1.2.0 adds remediation.completed). test('frontend-live-events/AC-01 — ALL_TOPICS is the closed v1.0 set', () => { const want = [ 'host.changed', @@ -92,9 +92,10 @@ test('frontend-live-events/AC-01 — ALL_TOPICS is the closed v1.0 set', () => { 'host.discovered', 'intelligence.event', 'scan.completed', + 'remediation.completed', ]; expect([...ALL_TOPICS]).toEqual(want); - expect(ALL_TOPICS.length).toBe(5); + expect(ALL_TOPICS.length).toBe(6); }); // Helper to mount the hook and return the stub + spies. @@ -124,6 +125,19 @@ test('frontend-live-events/AC-02 — host.changed invalidates [hosts] + [host, i expect(calls).toContainEqual(['host', 'h-aaa']); }); +// @ac AC-09 +// AC-09: remediation.completed invalidates the host's remediations list (the +// Remediation tab updates without a manual refresh) and the host detail (a +// committed fix flips a rule to pass, moving the compliance score). The worker +// publishes HostID (Go field name). +test('frontend-live-events/AC-09 — remediation.completed invalidates [host, id, remediations] + [host, id]', () => { + const { es, spy } = mountHook(); + es.fire('remediation.completed', { HostID: 'h-rem' }); + const calls = spy.mock.calls.map((c) => c[0]?.queryKey); + expect(calls).toContainEqual(['host', 'h-rem', 'remediations']); + expect(calls).toContainEqual(['host', 'h-rem']); +}); + // @ac AC-03 test('frontend-live-events/AC-03 — monitoring.band.changed invalidates [hosts] + [host, id]', () => { const { es, spy } = mountHook(); diff --git a/specs/frontend/live-events.spec.yaml b/specs/frontend/live-events.spec.yaml index 2cf21545e..a66cd258f 100644 --- a/specs/frontend/live-events.spec.yaml +++ b/specs/frontend/live-events.spec.yaml @@ -28,6 +28,7 @@ spec: invalidation; list page is NOT invalidated (intel events don't affect the list view) + - remediation.completed — remediations list + detail invalidation - scan.completed — list + detail invalidation (v1.1.0: compliance_summary on both views changes when a scan's @@ -65,7 +66,7 @@ spec: constraints: - id: C-01 - description: 'ALL_TOPICS MUST be a closed const-tuple containing exactly the five kinds: host.changed, monitoring.band.changed, host.discovered, intelligence.event, scan.completed (v1.1.0). Adding a sixth requires bumping this spec''s version and updating the test' + description: 'ALL_TOPICS MUST be a closed const-tuple containing exactly the six kinds: host.changed, monitoring.band.changed, host.discovered, intelligence.event, scan.completed, remediation.completed (v1.2.0). Adding another requires bumping this spec''s version and updating the test' type: technical enforcement: error - id: C-02 @@ -95,7 +96,7 @@ spec: acceptance_criteria: - id: AC-01 - description: 'ALL_TOPICS as exported from useLiveEvents.ts equals exactly the closed set ["host.changed", "monitoring.band.changed", "host.discovered", "intelligence.event", "scan.completed"]. Verified by importing the const and asserting the array contents + length' + description: 'ALL_TOPICS as exported from useLiveEvents.ts equals exactly the closed set ["host.changed", "monitoring.band.changed", "host.discovered", "intelligence.event", "scan.completed", "remediation.completed"]. Verified by importing the const and asserting the array contents + length' priority: critical references_constraints: [C-01] - id: AC-02 @@ -126,3 +127,7 @@ spec: description: 'Firing scan.completed with host_id=H invalidates ["hosts"] AND ["host", H] — the hero compliance card refreshes without a reload after a scan finishes' priority: critical references_constraints: [C-07] + - id: AC-09 + description: 'Firing remediation.completed with HostID=H invalidates ["host", H, "remediations"] AND ["host", H] — the Remediation tab and the compliance score refresh without a reload when a queued fix or rollback finishes (a committed fix flips a rule to pass)' + priority: high + references_constraints: [C-07] From 8922dfbe8e811a116606dc0bd54c91a50f7f673f Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 18:11:43 -0400 Subject: [PATCH 6/7] chore(release): bump Kensa to v0.5.2 and prepare 0.2.0-rc.11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kensa v0.5.2 is a PATCH release with a frozen api/ surface, so OpenWatch's library integration is unchanged. Its notable fix corrects a config_value matching bug ('" "' delimiter now matches any whitespace incl. TAB), which removes a class of false FAILs on TAB-delimited rules (RHEL login.defs) — affected hosts may see their compliance score improve. The jsonl skipped-vs- error fix (kensa#104) is confirmed no-impact for the library path (issue #603). - go.mod kensa v0.5.1 -> v0.5.2; KensaModuleVersion + kensa-executor spec pin updated to match (version-pin tests pass; corpus stays at 539 rules, the variable-catalog AC still sees exactly 3 placeholders). - version.env -> 0.2.0-rc.11; README + operator guides + CHANGELOG cut a 0.2.0-rc.11 section. --- CHANGELOG.md | 17 +++++++++++++++++ README.md | 2 +- docs/guides/API_GUIDE.md | 6 +++--- docs/guides/MONITORING_SETUP.md | 4 ++-- docs/guides/PRODUCTION_DEPLOYMENT.md | 2 +- docs/guides/QUICKSTART.md | 2 +- docs/guides/UPGRADE_PROCEDURE.md | 2 +- go.mod | 2 +- go.sum | 4 ++-- internal/kensa/types.go | 2 +- packaging/version.env | 2 +- specs/system/kensa-executor.spec.yaml | 4 ++-- 12 files changed, 33 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8750821a..0e7c3d78b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,8 +10,25 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +--- + +## [0.2.0-rc.11] Eyrie — 2026-06-19 + +The bundled Kensa scan engine moves to v0.5.2, which corrects a class of false +compliance FAILs on TAB-delimited rules; the GA-readiness pass also hardened CI +and the release workflow. + ### Changed +- Updated the bundled Kensa scan engine and rule corpus to v0.5.2. v0.5.2 fixes + a `config_value` matching bug so a `" "` delimiter matches any whitespace + (including TAB), correcting a class of false FAILs on TAB-delimited rules such + as RHEL `login.defs` — affected hosts may see their compliance score improve. + It also adds rule-engine correctness gates (check-method parameter contracts, + value-domain validation, a comparator + delimiter engine, and a schema/engine + parity gate). The corpus stays at 539 rules and the engine's frozen API + surface is unchanged, so OpenWatch's library integration is unaffected + (kensa v0.5.2). - CI release safety: the release workflow now fails closed on a `v*` tag push when no GPG signing key is configured, rather than publishing unsigned packages. Manual `workflow_dispatch` trial builds stay permissive (warn + diff --git a/README.md b/README.md index 39a1e1be4..23b17cdb3 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ OpenWatch is the compliance operating system for teams managing Linux infrastruc > Python/FastAPI implementation was archived out of the repo on 2026-06-05). The > Go tree lives at the **repo root**: Go 1.26 backend (`cmd/`, `internal/`), > React 19 + TanStack frontend (`frontend/`), PostgreSQL-only. The current -> version is `0.2.0-rc.10`, a pre-release — not a GA build. +> version is `0.2.0-rc.11`, a pre-release — not a GA build. ![OpenWatch Compliance Dashboard](docs/images/dashboard-preview.png) diff --git a/docs/guides/API_GUIDE.md b/docs/guides/API_GUIDE.md index bec8c307f..1ac3d87ca 100644 --- a/docs/guides/API_GUIDE.md +++ b/docs/guides/API_GUIDE.md @@ -11,7 +11,7 @@ contract source of truth is `api/openapi.yaml` in the repository; the running binary serves the same document, and `GET /api/v1/version` reports the build it came from. -This guide reflects OpenWatch `0.2.0-rc.10`, a pre-release. The API surface is +This guide reflects OpenWatch `0.2.0-rc.11`, a pre-release. The API surface is still growing — endpoints that the legacy Python API exposed (scan execution, remediation, exceptions, posture history, audit exports, the rule-reference browser) are not yet part of `api/v1`. See [What is not yet in the @@ -276,7 +276,7 @@ curl -s --cacert /etc/openwatch/tls/ca.crt https://localhost:8443/api/v1/health ``` ```json -{"status": "healthy", "db_connected": true, "version": "0.2.0-rc.10"} +{"status": "healthy", "db_connected": true, "version": "0.2.0-rc.11"} ``` `status` is `healthy` or `degraded`; the endpoint returns `503` when the service @@ -354,7 +354,7 @@ configuration steps, see ## What is not yet in the API The compliance scanning workflow runs through Kensa and the background worker, -not yet through public REST endpoints. As of `0.2.0-rc.10`, `api/v1` does not +not yet through public REST endpoints. As of `0.2.0-rc.11`, `api/v1` does not include: - Scan execution or scan-result endpoints (`/api/v1/scans/…`). diff --git a/docs/guides/MONITORING_SETUP.md b/docs/guides/MONITORING_SETUP.md index e3c75f814..41d468282 100644 --- a/docs/guides/MONITORING_SETUP.md +++ b/docs/guides/MONITORING_SETUP.md @@ -54,7 +54,7 @@ curl -k https://localhost:8443/api/v1/health A healthy response returns `200 OK`: ```json -{"status": "healthy", "db_connected": true, "version": "0.2.0-rc.10"} +{"status": "healthy", "db_connected": true, "version": "0.2.0-rc.11"} ``` When the database ping fails, the endpoint returns `503 Service Unavailable` @@ -76,7 +76,7 @@ curl -k https://localhost:8443/api/v1/version ```json { - "openwatch": "0.2.0-rc.10", + "openwatch": "0.2.0-rc.11", "kensa": "", "go": "", "commit": "", diff --git a/docs/guides/PRODUCTION_DEPLOYMENT.md b/docs/guides/PRODUCTION_DEPLOYMENT.md index f972a24a7..24a17b2a1 100644 --- a/docs/guides/PRODUCTION_DEPLOYMENT.md +++ b/docs/guides/PRODUCTION_DEPLOYMENT.md @@ -12,7 +12,7 @@ touches lightly: process layout, TLS, the background worker, backups, upgrades, and incident runbooks. > Verify the version you deploy. The current line is a pre-release -> (`0.2.0-rc.10` per `packaging/version.env`), not a GA build. Treat it +> (`0.2.0-rc.11` per `packaging/version.env`), not a GA build. Treat it > accordingly until a GA tag ships. --- diff --git a/docs/guides/QUICKSTART.md b/docs/guides/QUICKSTART.md index 71d598eaa..78f4b7213 100644 --- a/docs/guides/QUICKSTART.md +++ b/docs/guides/QUICKSTART.md @@ -49,7 +49,7 @@ A healthy response looks like this: { "status": "healthy", "db_connected": true, - "version": "0.2.0-rc.10" + "version": "0.2.0-rc.11" } ``` diff --git a/docs/guides/UPGRADE_PROCEDURE.md b/docs/guides/UPGRADE_PROCEDURE.md index 5d1a40ad0..018038a06 100644 --- a/docs/guides/UPGRADE_PROCEDURE.md +++ b/docs/guides/UPGRADE_PROCEDURE.md @@ -13,7 +13,7 @@ database backup and restore commands referenced below, see [`BACKUP_RECOVERY.md`](BACKUP_RECOVERY.md). For migration mechanics, see [`DATABASE_MIGRATIONS.md`](DATABASE_MIGRATIONS.md). -> Version note: the current release line is a pre-release (`0.2.0-rc.10`). Treat +> Version note: the current release line is a pre-release (`0.2.0-rc.11`). Treat > upgrades between pre-release builds as potentially breaking and always back up > first. diff --git a/go.mod b/go.mod index faed2fc67..385a5d3db 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.26.4 require ( github.com/BurntSushi/toml v1.6.0 - github.com/Hanalyx/kensa v0.5.1 + github.com/Hanalyx/kensa v0.5.2 github.com/getkin/kin-openapi v0.139.0 github.com/gliderlabs/ssh v0.3.8 github.com/go-chi/chi/v5 v5.3.0 diff --git a/go.sum b/go.sum index 1cea4c716..dd9543c7a 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,7 @@ github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk= github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= -github.com/Hanalyx/kensa v0.5.1 h1:ggIqW2fMXHUopAwn86EKq1n4qUsgKeVW62yQQC8rGy8= -github.com/Hanalyx/kensa v0.5.1/go.mod h1:oEJt9i8spIWwy6i6uF1YgShrLS67kFXKIWr+J1eYBOY= +github.com/Hanalyx/kensa v0.5.2 h1:9bp5KION7N1FlmJA4f0AKFS4uVXijXZWDiP8ucViriQ= +github.com/Hanalyx/kensa v0.5.2/go.mod h1:oEJt9i8spIWwy6i6uF1YgShrLS67kFXKIWr+J1eYBOY= github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro= github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= diff --git a/internal/kensa/types.go b/internal/kensa/types.go index 319cd8ee1..ae64dabdd 100644 --- a/internal/kensa/types.go +++ b/internal/kensa/types.go @@ -10,7 +10,7 @@ import ( // KensaModuleVersion is the version pin recorded in the spec's context // block. AC-10 source-inspects to verify this matches the corresponding // entry in app/go.mod. -const KensaModuleVersion = "v0.5.1" +const KensaModuleVersion = "v0.5.2" // Sentinel errors returned by Executor.Run. Tests use errors.Is for // classification; the audit emission path maps each to a typed diff --git a/packaging/version.env b/packaging/version.env index 5148256d2..39ea84993 100644 --- a/packaging/version.env +++ b/packaging/version.env @@ -2,5 +2,5 @@ # # The Go binary's ldflags read this file via the Makefile; build scripts # in packaging/{rpm,deb}/ source it for spec macros. -VERSION="0.2.0-rc.10" +VERSION="0.2.0-rc.11" CODENAME="Eyrie" diff --git a/specs/system/kensa-executor.spec.yaml b/specs/system/kensa-executor.spec.yaml index 8df5090a1..d887a180a 100644 --- a/specs/system/kensa-executor.spec.yaml +++ b/specs/system/kensa-executor.spec.yaml @@ -10,7 +10,7 @@ spec: feature: Kensa scan execution bridge description: > The executor invokes Kensa (Go module github.com/Hanalyx/kensa - pinned to v0.5.1) to run a scan against a single host using the + pinned to v0.5.2) to run a scan against a single host using the FULL rule corpus applicable to the host's detected OS capabilities. The Kensa API (`Kensa.Scan(ctx, host, rules, opts...)` per kensa-go/api/kensa.go:228) takes a `[]*api.Rule` @@ -131,7 +131,7 @@ spec: type: technical enforcement: error - id: C-13 - description: The production scanFunc MUST compose the scan-only Kensa via api.New with pkg/kensa.NewScanner (kensa v0.5.1 — stateless, concurrency-safe shared) and this package's TransportFactory; no engine, store, or signer is constructed for the scan path. The worker subcommand binds it via WithScanFunc(NewProductionScanFunc(...)). unwiredScanFunc may remain ONLY as the test fallback NewExecutor defaults to before binding, annotated as such + description: The production scanFunc MUST compose the scan-only Kensa via api.New with pkg/kensa.NewScanner (kensa v0.5.2 — stateless, concurrency-safe shared) and this package's TransportFactory; no engine, store, or signer is constructed for the scan path. The worker subcommand binds it via WithScanFunc(NewProductionScanFunc(...)). unwiredScanFunc may remain ONLY as the test fallback NewExecutor defaults to before binding, annotated as such type: technical enforcement: error - id: C-14 From 3b18347f766148eb687715bf5a1eb9219c6181de Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Fri, 19 Jun 2026 23:52:55 -0400 Subject: [PATCH 7/7] docs(changelog): reconcile rc.11 section (bundle #604-#608) --- CHANGELOG.md | 54 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index beca0a2ba..3d384d69e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,15 +10,40 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] -### Changed +--- -- Auth: an anonymous request to a protected endpoint (no credentials, or a - session cookie that expired in the browser and is no longer sent) now returns - **401 `auth.required`** instead of 403. The SPA redirects to login on a 401, - so an expired session surfaces as a clean re-login prompt rather than a - dead-end "failed to load." An authenticated caller whose role lacks the - permission still gets 403 `authz.permission_denied`. +## [0.2.0-rc.11] Eyrie — 2026-06-19 + +The bundled Kensa scan engine moves to v0.5.2, which corrects a class of false +compliance FAILs on TAB-delimited rules. Single-operator remediation no longer +deadlocks (free-core fixes auto-approve), the Remediation tab updates live and +serializes concurrent fixes, an expired session now redirects cleanly to login, +and the GA-readiness pass hardened CI and the release workflow. + +### Added + +- Remediation: free-core single-rule remediation is now **auto-approved** on + request, so an operator can apply a fix without a separate approver. This + removes the self-review deadlock for single-operator workspaces (you could + request a fix but never approve your own request). The request lifecycle and + the approve/reject flow with separation of duties are retained for the + licensed bulk/automated remediation track (which requests with approval + required). See `docs/engineering/remediation_governance_adr.md` ("A-keep"). +### Changed + +- Updated the bundled Kensa scan engine and rule corpus to v0.5.2. v0.5.2 fixes + a `config_value` matching bug so a `" "` delimiter matches any whitespace + (including TAB), correcting a class of false FAILs on TAB-delimited rules such + as RHEL `login.defs` — affected hosts may see their compliance score improve. + It also adds rule-engine correctness gates (check-method parameter contracts, + value-domain validation, a comparator + delimiter engine, and a schema/engine + parity gate). The corpus stays at 539 rules and the engine's frozen API + surface is unchanged, so OpenWatch's library integration is unaffected + (kensa v0.5.2). +- Documented remediation/exception governance: a remediation-approval ADR and a + role matrix covering who can request versus approve a fix or exception, plus a + self-review rule, and an RBAC spec that drift-locks the role/permission map. - CI release safety: the release workflow now fails closed on a `v*` tag push when no GPG signing key is configured, rather than publishing unsigned packages. Manual `workflow_dispatch` trial builds stay permissive (warn + @@ -31,6 +56,21 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ### Fixed +- Auth: an anonymous request to a protected endpoint (no credentials, or a + session cookie that expired in the browser and is no longer sent) now returns + **401 `auth.required`** instead of 403. The SPA redirects to login on a 401, + so an expired session surfaces as a clean re-login prompt rather than a + dead-end "failed to load." An authenticated caller whose role lacks the + permission still gets 403 `authz.permission_denied`. +- Remediation now updates live. The Remediation tab and the compliance score + refresh automatically when a queued fix or rollback finishes, over the SSE + event stream (new `remediation.completed` topic), instead of requiring a + manual page refresh. +- Applying several fixes on the same host at once no longer fails the extra + ones. Concurrent remediations on a host now serialize: a fix whose host is + busy backs off and requeues (with a short delay, via a new delayed-visibility + column on the job queue) until the host is free, instead of colliding on the + per-host SSH guard and being marked failed. - Documentation version drift: operator guides referenced `0.2.0-rc.5` while `packaging/version.env` was `0.2.0-rc.10`; all guides now match. - SPA static-delivery tests are self-contained (in-memory fixture) instead of