From d5232578ed375d75eed29d6b892898322947da12 Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Thu, 16 Apr 2026 18:56:55 -0700 Subject: [PATCH] fix(cli): --no-cluster must not deploy the snapshot-capture agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `--no-cluster` is documented as "dry-run, skip cluster operations", but validate.go only propagates it into `validator.WithNoCluster(...)` which affects *phase execution*. The earlier no-snapshot branch at `if snapshotFilePath != ""` still calls `deployAgentForValidation()` — which creates a K8s client, deploys a Job, and captures a snapshot from the live API. That directly contradicts the flag's contract. Two concrete consequences: 1. `go test ./pkg/cli/...` on any dev machine with a working ~/.kube/config mutates the live cluster as a side effect. Observed on aicr-cuj2: `TestValidateCmd_RecipeKindHandling` deployed the `aicr-validation` namespace and the `aicr-validate` Job before the test returned. 2. Three subtests in that test asserted `errContain: "kubernetes client"` — they only passed in CI because CI has no kubeconfig, so `GetKubeClient()` failed and the "failed to create kubernetes client" wrap propagated out. On dev machines the agent deploy succeeded, phases reported skipped, validate returned nil → `expected error but got nil`. Guard the agent-deploy path on `--no-cluster` and return a clear `ErrCodeInvalidRequest` when the user asks for no-cluster without a snapshot (nothing to validate against). Placed after `recipe.LoadFromFile` so the subtests continue to exercise the recipe loader, kind check, and RecipeMetadata auto-hydration before hitting the new error. Update the three subtests' `errContain` to the new message. The existing `errAbsent` assertions still prove those same kind-check / auto-hydration paths ran successfully. Known follow-up (out of scope for this PR): `cm://` URIs for `--recipe` and `--snapshot` still imply cluster reads under `--no-cluster`, so a literal "no cluster access anywhere" mode needs further work (likely a separate check in `recipe.LoadFromFile` / `serializer.FromFileWith…` paths). --- pkg/cli/validate.go | 10 ++++++++++ pkg/cli/validate_test.go | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pkg/cli/validate.go b/pkg/cli/validate.go index 4c8103a5..af6c23dd 100644 --- a/pkg/cli/validate.go +++ b/pkg/cli/validate.go @@ -516,6 +516,16 @@ Run validation without failing on check errors (informational mode): var snap *snapshotter.Snapshot + // --no-cluster means "do not touch the cluster". The agent-deploy + // branch below contradicts that (it creates a Job and captures a + // snapshot from the live API), so a snapshot file is the only valid + // data source in that mode. Placed after recipe.LoadFromFile so + // recipe kind-check and auto-hydration still run for CLI coverage. + if snapshotFilePath == "" && cmd.Bool("no-cluster") { + return errors.New(errors.ErrCodeInvalidRequest, + "--no-cluster requires --snapshot (cannot deploy the snapshot-capture agent without cluster access)") + } + if snapshotFilePath != "" { slog.Info("loading snapshot", "uri", snapshotFilePath) snap, err = serializer.FromFileWithKubeconfig[snapshotter.Snapshot](snapshotFilePath, kubeconfig) diff --git a/pkg/cli/validate_test.go b/pkg/cli/validate_test.go index de14ecf4..d570ea50 100644 --- a/pkg/cli/validate_test.go +++ b/pkg/cli/validate_test.go @@ -273,7 +273,7 @@ func TestValidateCmd_RecipeKindHandling(t *testing.T) { name: "RecipeMetadata with criteria auto-hydrates", yamlContent: "kind: RecipeMetadata\napiVersion: aicr.nvidia.com/v1alpha1\nmetadata:\n name: test\nspec:\n criteria:\n service: eks\n accelerator: h100\n intent: training\n", wantErr: true, - errContain: "kubernetes client", + errContain: "--no-cluster requires --snapshot", errAbsent: "has no criteria", }, { @@ -292,14 +292,14 @@ func TestValidateCmd_RecipeKindHandling(t *testing.T) { name: "RecipeResult kind passes kind check", yamlContent: "kind: RecipeResult\napiVersion: aicr.nvidia.com/v1alpha1\n", wantErr: true, - errContain: "kubernetes client", + errContain: "--no-cluster requires --snapshot", errAbsent: "is required", }, { name: "empty kind passes kind check", yamlContent: "apiVersion: aicr.nvidia.com/v1alpha1\n", wantErr: true, - errContain: "kubernetes client", + errContain: "--no-cluster requires --snapshot", errAbsent: "is required", }, }