From 4c889b502c3f3ee51586e9b766b12ea6099e2d64 Mon Sep 17 00:00:00 2001 From: Harshal Patil <12152047+harche@users.noreply.github.com> Date: Fri, 19 Jun 2026 10:53:02 -0400 Subject: [PATCH 1/2] Drop estimatedImpact; align verification schema with CRD (#162) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit estimatedImpact on ProposalResult was +required in the AnalysisResult CRD but only optional in the LLM output schema, so analyses where the agent omitted it were rejected by CRD validation at status-patch time — persisting 0 options, marking the proposal Failed, and orphaning the analysis sandbox pod. - Remove estimatedImpact from the API type, the LLM output schema, the generated CRD, and the mock agent. - Tighten the verification branch of the LLM schema to match the CRD (steps[] require name+type; verification requires description), fixing the same latent drift class. - Add a guard test asserting every CRD-required field is also required in the corresponding LLM output schema. --- api/v1alpha1/proposal_analysis_types.go | 8 - .../agentic.openshift.io_analysisresults.yaml | 10 -- controller/proposal/schema_crd_drift_test.go | 143 ++++++++++++++++++ controller/proposal/schemas.go | 7 +- test/agent/main.go | 3 +- 5 files changed, 148 insertions(+), 23 deletions(-) create mode 100644 controller/proposal/schema_crd_drift_test.go diff --git a/api/v1alpha1/proposal_analysis_types.go b/api/v1alpha1/proposal_analysis_types.go index 61e63bc5..0814eb44 100644 --- a/api/v1alpha1/proposal_analysis_types.go +++ b/api/v1alpha1/proposal_analysis_types.go @@ -137,14 +137,6 @@ type ProposalResult struct { // Must be one of: Reversible, Irreversible, Partial. // +optional Reversible Reversibility `json:"reversible,omitempty"` - // estimatedImpact is a Markdown-formatted description of the expected - // impact of the remediation on the system - // (e.g., "Brief pod restart, ~30s downtime"). - // Maximum 1024 characters. - // +required - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=1024 - EstimatedImpact string `json:"estimatedImpact,omitempty"` // rollbackPlan describes how to undo the remediation if execution fails // or causes unexpected issues. Only the execution step mutates cluster // state, so rollback lives here alongside the actions it would undo. diff --git a/config/crd/bases/agentic.openshift.io_analysisresults.yaml b/config/crd/bases/agentic.openshift.io_analysisresults.yaml index 62922c52..85917640 100644 --- a/config/crd/bases/agentic.openshift.io_analysisresults.yaml +++ b/config/crd/bases/agentic.openshift.io_analysisresults.yaml @@ -243,15 +243,6 @@ spec: maxLength: 8192 minLength: 1 type: string - estimatedImpact: - description: |- - estimatedImpact is a Markdown-formatted description of the expected - impact of the remediation on the system - (e.g., "Brief pod restart, ~30s downtime"). - Maximum 1024 characters. - maxLength: 1024 - minLength: 1 - type: string reversible: description: |- reversible indicates whether the remediation can be rolled back @@ -298,7 +289,6 @@ spec: required: - actions - description - - estimatedImpact - risk type: object rbac: diff --git a/controller/proposal/schema_crd_drift_test.go b/controller/proposal/schema_crd_drift_test.go new file mode 100644 index 00000000..8ed1a0cc --- /dev/null +++ b/controller/proposal/schema_crd_drift_test.go @@ -0,0 +1,143 @@ +package proposal + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "sigs.k8s.io/yaml" +) + +// TestAnalysisSchemaCoversCRDRequiredFields guards against the schema/CRD +// contract drift described in lightspeed-agentic-operator#162: the JSON +// schema sent to the LLM for structured output must mark as required every +// field the AnalysisResult CRD requires. When the two disagree, a valid LLM +// response can be rejected by CRD validation at status-patch time, which +// fails the analysis and orphans the sandbox pod. +// +// The generated CRD (which encodes the +required markers) is the source of +// truth. For every object node the two schemas share, the CRD's required set +// must be a subset of the LLM schema's required set. This would have caught +// both the original estimatedImpact incident and the latent verification gap. +func TestAnalysisSchemaCoversCRDRequiredFields(t *testing.T) { + crdPath := filepath.Join("..", "..", "config", "crd", "bases", "agentic.openshift.io_analysisresults.yaml") + raw, err := os.ReadFile(crdPath) + if err != nil { + t.Fatalf("read CRD: %v", err) + } + + var crd apiextensionsv1.CustomResourceDefinition + if err := yaml.Unmarshal(raw, &crd); err != nil { + t.Fatalf("unmarshal CRD: %v", err) + } + if len(crd.Spec.Versions) == 0 { + t.Fatal("CRD declares no versions") + } + + // The LLM analysis schema describes a single option; its per-option shape + // lives at properties.options.items. + var llm map[string]any + if err := json.Unmarshal(AnalysisOutputSchema, &llm); err != nil { + t.Fatalf("unmarshal AnalysisOutputSchema: %v", err) + } + llmOption, ok := digObject(llm, "properties", "options", "items") + if !ok { + t.Fatal("AnalysisOutputSchema is missing properties.options.items") + } + + for _, v := range crd.Spec.Versions { + if v.Schema == nil || v.Schema.OpenAPIV3Schema == nil { + continue + } + // The matching per-option schema in the CRD is at status.options.items. + status, ok := v.Schema.OpenAPIV3Schema.Properties["status"] + if !ok { + continue + } + options, ok := status.Properties["options"] + if !ok || options.Items == nil || options.Items.Schema == nil { + t.Fatalf("CRD version %s is missing status.options.items schema", v.Name) + } + assertRequiredCoverage(t, "options[]", options.Items.Schema, llmOption) + } +} + +// assertRequiredCoverage walks a CRD schema node and the corresponding LLM +// schema node in parallel, asserting that every field required by the CRD is +// also required by the LLM schema (for fields the LLM schema actually models). +func assertRequiredCoverage(t *testing.T, path string, crd *apiextensionsv1.JSONSchemaProps, llm map[string]any) { + t.Helper() + if crd == nil || llm == nil { + return + } + + // Arrays: descend into the element schema on both sides. + if crd.Type == "array" { + if crd.Items == nil || crd.Items.Schema == nil { + return + } + llmItems, ok := asJSONObject(llm["items"]) + if !ok { + return + } + assertRequiredCoverage(t, path+"[]", crd.Items.Schema, llmItems) + return + } + + llmProps, _ := asJSONObject(llm["properties"]) + llmRequired := requiredSet(llm["required"]) + for _, req := range crd.Required { + // Only enforce coverage for fields the LLM schema actually models; + // the LLM may legitimately omit a field the CRD allows. + if _, modeled := llmProps[req]; !modeled { + continue + } + if !llmRequired[req] { + t.Errorf("schema/CRD drift at %s: CRD requires %q but the LLM output schema does not mark it required", path, req) + } + } + + // Recurse into properties present in both schemas. + for name := range crd.Properties { + child := crd.Properties[name] + llmChild, ok := asJSONObject(llmProps[name]) + if !ok { + continue + } + assertRequiredCoverage(t, path+"."+name, &child, llmChild) + } +} + +// digObject walks nested object properties of a decoded JSON schema. +func digObject(m map[string]any, keys ...string) (map[string]any, bool) { + cur := m + for _, k := range keys { + next, ok := asJSONObject(cur[k]) + if !ok { + return nil, false + } + cur = next + } + return cur, true +} + +func asJSONObject(v any) (map[string]any, bool) { + m, ok := v.(map[string]any) + return m, ok +} + +func requiredSet(v any) map[string]bool { + out := map[string]bool{} + arr, ok := v.([]any) + if !ok { + return out + } + for _, e := range arr { + if s, ok := e.(string); ok { + out[s] = true + } + } + return out +} diff --git a/controller/proposal/schemas.go b/controller/proposal/schemas.go index b8d490de..715bd428 100644 --- a/controller/proposal/schemas.go +++ b/controller/proposal/schemas.go @@ -50,7 +50,6 @@ var AnalysisOutputSchema = json.RawMessage(`{ }, "risk": { "type": "string", "enum": ["Low", "Medium", "High", "Critical"], "description": "Risk assessment. Low: safe to apply. Medium: review recommended. High: careful review required. Critical: manual approval strongly recommended" }, "reversible": { "type": "string", "enum": ["Reversible", "Irreversible", "Partial"], "description": "Whether this remediation can be rolled back. Reversible: fully undoable. Irreversible: cannot undo. Partial: some actions undoable" }, - "estimatedImpact": { "type": "string", "description": "Expected impact on the system (e.g., 'Brief pod restart, ~30s downtime')" }, "rollbackPlan": { "type": "object", "description": "How to undo the remediation if it fails or causes issues. Required when reversible is Reversible or Partial.", @@ -77,10 +76,12 @@ var AnalysisOutputSchema = json.RawMessage(`{ "command": { "type": "string", "description": "Command or API call to run (e.g., 'oc get pod -n production -l app=web -o jsonpath={.status.phase}')" }, "expected": { "type": "string", "description": "Expected output or condition (e.g., 'Running', 'ready=true')" }, "type": { "type": "string", "description": "Check category (e.g., 'command', 'metric', 'condition')" } - } + }, + "required": ["name", "type"] } } - } + }, + "required": ["description"] }, "rbac": { "type": "object", diff --git a/test/agent/main.go b/test/agent/main.go index 44a20694..ebf5bb78 100644 --- a/test/agent/main.go +++ b/test/agent/main.go @@ -255,8 +255,7 @@ func cannedResponse(phase, targetNS string) []byte { { "type": "patch", "description": "mock proposed action" } ], "risk": "Low", - "reversible": "Reversible", - "estimatedImpact": "Brief pod restart, ~30s downtime" + "reversible": "Reversible" }, "verification": { "description": "mock verification plan", From a192d1f33f94e88d1313760e3221633e702973a7 Mon Sep 17 00:00:00 2001 From: Harshal Patil <12152047+harche@users.noreply.github.com> Date: Fri, 19 Jun 2026 11:22:58 -0400 Subject: [PATCH 2/2] test: flag CRD-required fields absent from the LLM schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback: the drift guard silently skipped CRD-required fields that were not modeled at all in the LLM output schema, so removing such a field could pass undetected. A required field that the agent is never asked for is rejected at status-patch time, which is exactly the drift this guard exists to catch — so error in that case too, not only when a modeled field lacks the required marker. --- controller/proposal/schema_crd_drift_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/controller/proposal/schema_crd_drift_test.go b/controller/proposal/schema_crd_drift_test.go index 8ed1a0cc..c64d7922 100644 --- a/controller/proposal/schema_crd_drift_test.go +++ b/controller/proposal/schema_crd_drift_test.go @@ -89,9 +89,12 @@ func assertRequiredCoverage(t *testing.T, path string, crd *apiextensionsv1.JSON llmProps, _ := asJSONObject(llm["properties"]) llmRequired := requiredSet(llm["required"]) for _, req := range crd.Required { - // Only enforce coverage for fields the LLM schema actually models; - // the LLM may legitimately omit a field the CRD allows. + // A field the CRD requires must be both modeled and required in the LLM + // output schema. If it is absent entirely the agent is never asked for + // it, so the status patch is rejected at runtime — the exact drift this + // guard exists to catch. if _, modeled := llmProps[req]; !modeled { + t.Errorf("schema/CRD drift at %s: CRD requires %q but the LLM output schema does not model the field", path, req) continue } if !llmRequired[req] {