From d6fd810e5e477ac907e87886cc75ec2094561cdc Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 15 Apr 2026 21:24:15 -0700 Subject: [PATCH] refactor(ci): unify GPU Chainsaw layout and validation flow Move shared assertions from cluster/ to common/, keep leaf-specific checks with their suites, align both H100 GPU workflows on the same validation flow, and temporarily remove the Dynamo smoke path. Decode all YAML documents in referencedAssertFiles (was only reading the first). Increase kind-load timeouts in aicr-build action. --- .github/actions/aicr-build/action.yml | 16 +- .../actions/gpu-snapshot-validate/action.yml | 24 ++ .../workflows/gpu-h100-inference-test.yaml | 205 ++-------------- .github/workflows/gpu-h100-training-test.yaml | 49 ++-- tests/chainsaw/ai-conformance/README.md | 24 +- .../ai-conformance/cluster/chainsaw-test.yaml | 10 +- .../assert-cert-manager.yaml | 0 .../assert-dra-driver.yaml | 0 .../assert-kai-scheduler.yaml | 0 .../assert-monitoring.yaml | 0 .../{cluster => common}/assert-skyhook.yaml | 0 .../kind-inference-dynamo/assert-crds.yaml | 65 +++++ .../kind-inference-dynamo/assert-dynamo.yaml | 32 +++ .../assert-kgateway.yaml | 32 +++ .../kind-inference-dynamo/chainsaw-test.yaml | 16 +- .../assert-kubeflow-trainer.yaml | 0 .../kind-training-kubeflow/chainsaw-test.yaml | 12 +- tests/chainsaw/ai-conformance/main.go | 99 +++++++- tests/chainsaw/ai-conformance/main_test.go | 228 ++++++++++++++++++ 19 files changed, 572 insertions(+), 240 deletions(-) rename tests/chainsaw/ai-conformance/{cluster => common}/assert-cert-manager.yaml (100%) rename tests/chainsaw/ai-conformance/{cluster => common}/assert-dra-driver.yaml (100%) rename tests/chainsaw/ai-conformance/{cluster => common}/assert-kai-scheduler.yaml (100%) rename tests/chainsaw/ai-conformance/{cluster => common}/assert-monitoring.yaml (100%) rename tests/chainsaw/ai-conformance/{cluster => common}/assert-skyhook.yaml (100%) create mode 100644 
tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml create mode 100644 tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml create mode 100644 tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml rename tests/chainsaw/ai-conformance/{cluster => kind-training-kubeflow}/assert-kubeflow-trainer.yaml (100%) create mode 100644 tests/chainsaw/ai-conformance/main_test.go diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index c4bbee90a..575bdd436 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -53,9 +53,14 @@ runs: # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but # does not set a node selector, so it can land on any GPU-capable node # including the control-plane (e.g., T4 smoke test). - timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { + # + # Timeout is intentionally generous (900s per attempt). H100 self-hosted + # runners transfer images over a shared Docker-in-Docker bridge; large + # CUDA base images (~250MB compressed) combined with I/O contention from + # parallel GPU operator pods regularly exceed the previous 600s limit. + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." - timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" } - name: Build validator images and load into kind @@ -94,9 +99,12 @@ runs: USER nonroot ENTRYPOINT ["/${phase}"] DOCKERFILE - timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || { + # Validator images are small (~30MB distroless), but share the same + # Docker-in-Docker bridge as the smoke-test load above. 
600s per + # attempt accommodates I/O queuing behind concurrent image pulls. + timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || { echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..." - timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" + timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" } done diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index 8a587bbd4..e1ee3c14b 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -59,3 +59,27 @@ runs: exit 1 fi echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" + + - name: Debug snapshot Job + if: failure() + shell: bash + run: | + echo "=== Snapshot Job ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true + echo "=== Snapshot Pods ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + get pods -l app.kubernetes.io/name=aicr -o wide || true + echo "=== Snapshot Job describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true + echo "=== Snapshot Pod describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + describe pods -l app.kubernetes.io/name=aicr || true + echo "=== Snapshot current logs ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true + echo "=== Snapshot previous logs ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true + echo "=== Snapshot ConfigMap ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + get 
configmap aicr-snapshot -o yaml || true diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 8e0e6e15b..bb2fc7842 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -57,19 +57,11 @@ jobs: - 'pkg/evidence/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' - - 'tests/manifests/dynamo-vllm-smoke-test.yaml' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' + - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - 'tests/chainsaw/ai-conformance/kind-inference-dynamo/**' - - 'tests/chainsaw/ai-conformance/cluster/assert-crds.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml' - 'recipes/mixins/platform-inference.yaml' - 'recipes/components/kgateway/**' - 'recipes/components/kgateway-crds/**' @@ -166,27 +158,19 @@ jobs: --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ --config tests/chainsaw/chainsaw-config.yaml + # --- CNCF AI Conformance validation --- + # Runs after the stack health checks so gateway and metrics validators + # see a settled inference stack. 
+ - name: Verify expected resources exist run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ --debug - # --- CNCF AI Conformance validation --- - # Runs before Dynamo so conformance validators retain access to free GPUs. - # dra-support allocates a GPU via ResourceClaim, and gang-scheduling - # requires 2 free GPUs. Running the Dynamo smoke test first would consume - # GPU capacity and create avoidable test flakiness. - - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -202,146 +186,32 @@ jobs: --output=validation-result.yaml \ --evidence-dir=conformance-evidence - # --- Dynamo vLLM inference smoke test --- - # Runs after conformance: Dynamo's DRA ResourceClaim consumes GPU - # capacity, so it is safer to keep the shared conformance gate first. - - - name: Deploy Dynamo vLLM smoke test - run: | - # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo). - # The kai-scheduler chart creates default-parent-queue + default-queue on install, - # but Dynamo needs its own queue as a child of the parent. 
- kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF' - apiVersion: scheduling.run.ai/v2 - kind: Queue - metadata: - name: dynamo - spec: - parentQueue: default-parent-queue - resources: - gpu: - quota: 0 - limit: -1 - overQuotaWeight: 1 - cpu: - quota: 0 - limit: -1 - overQuotaWeight: 1 - memory: - quota: 0 - limit: -1 - overQuotaWeight: 1 - EOF - - # Create DRA ResourceClaim for GPU allocation. - # Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled. - # The kai.scheduler/queue label is required for KAI scheduler to manage the claim. - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF' - apiVersion: resource.k8s.io/v1 - kind: ResourceClaim - metadata: - name: vllm-smoke-gpu-claim - namespace: dynamo-system - labels: - kai.scheduler/queue: dynamo - spec: - devices: - requests: - - name: gpu - exactly: - deviceClassName: gpu.nvidia.com - allocationMode: ExactCount - count: 1 - EOF - - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \ - -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system - - echo "Waiting for DynamoGraphDeployment to be reconciled..." - for i in $(seq 1 120); do - PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - get dynamographdeployment vllm-smoke-test \ - -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) - if [[ "${PHASE}" == "True" ]]; then - echo "DynamoGraphDeployment is ready." - break - fi - echo "Waiting for DGD readiness... 
(${i}/120)" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true - sleep 10 - done - - if [[ "${PHASE}" != "True" ]]; then - echo "::error::DynamoGraphDeployment did not become ready within 20 minutes" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true - exit 1 - fi - - echo "Dynamo pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods - - - name: Validate Dynamo inference - run: | - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - port-forward svc/vllm-smoke-test-frontend 8000:8000 & - PF_PID=$! - sleep 3 + # Dynamo smoke is intentionally disabled for now. The vLLM runtime image + # adds significant latency and flakiness in Kind CI, and training has no + # matching smoke path yet. Reintroduce it later alongside a symmetric + # training smoke test if needed. + # --- Validation artifacts --- - cleanup() { kill "${PF_PID}" 2>/dev/null || true; } - trap cleanup EXIT - - echo "=== Waiting for /v1/models (model registration may take time after worker ready) ===" - for i in $(seq 1 30); do - MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}') - if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then - echo "Models available after ${i} attempt(s)." - break - fi - echo "Waiting for model registration... (${i}/30)" - sleep 10 - done - echo "${MODELS}" | jq . - if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then - echo "::error::No models reported by frontend after 5 minutes" - exit 1 - fi - - echo "=== Sending chat completion ===" - RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}') - echo "${RESPONSE}" | jq . 
- - CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content') - if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then - echo "::error::Empty response from vLLM" - exit 1 - fi - echo "Dynamo vLLM inference smoke test passed." - - # --- Evidence collection --- - - - name: Collect AI conformance evidence + # Collect a post-run resource snapshot regardless of whether conformance + # validation ran, so triage always has a cluster-state artifact. + - name: Collect validation artifacts if: >- - !cancelled() + always() + && !cancelled() && steps.bundle-install.outcome == 'success' - && (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure') + continue-on-error: true + shell: bash run: | + set -o pipefail + mkdir -p conformance-evidence go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ - --debug + --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload conformance evidence + - name: Upload validation artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: @@ -364,20 +234,6 @@ jobs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail 
-30 || true echo "=== Dynamo pods ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== DynamoGraphDeployment status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true - echo "=== Dynamo vLLM frontend logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM frontend previous logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM worker logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM worker previous logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true echo "=== Dynamo operator logs ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true @@ -419,17 +275,6 @@ jobs: echo "=== Node status ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true - - name: Dynamo vLLM cleanup - if: always() - run: | - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \ - -f tests/manifests/dynamo-vllm-smoke-test.yaml \ - -n dynamo-system --ignore-not-found 2>/dev/null || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \ - vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \ - dynamo --ignore-not-found 2>/dev/null || true - - name: GPU Test Cleanup if: always() uses: ./.github/actions/gpu-test-cleanup diff --git 
a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 25e58ceea..90e518fa5 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -59,13 +59,8 @@ jobs: - '.github/actions/load-versions/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' + - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - - 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml' - 'tests/chainsaw/ai-conformance/kind-training-kubeflow/**' - 'recipes/overlays/kind.yaml' - 'recipes/overlays/h100-kind-training.yaml' @@ -160,23 +155,19 @@ jobs: --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ --config tests/chainsaw/chainsaw-config.yaml + # --- CNCF AI Conformance validation --- + # Runs last to ensure the DCGM → Prometheus → adapter pipeline + # has had time to bootstrap (pod-autoscaling check needs live metric data). 
+ - name: Verify expected resources exist run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ --debug - # --- CNCF AI Conformance validation --- - # Runs last to ensure the DCGM → Prometheus → adapter pipeline - # has had time to bootstrap (pod-autoscaling check needs live metric data). - - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -192,26 +183,28 @@ jobs: --output=validation-result.yaml \ --evidence-dir=conformance-evidence - # --- Evidence collection --- + # --- Validation artifacts --- - - name: Collect AI conformance evidence + # Collect a post-run resource snapshot regardless of whether conformance + # validation ran, so triage always has a cluster-state artifact. 
+ - name: Collect validation artifacts if: >- - !cancelled() + always() + && !cancelled() && steps.bundle-install.outcome == 'success' - && (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure') + continue-on-error: true + shell: bash run: | + set -o pipefail + mkdir -p conformance-evidence go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ - --debug + --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload conformance evidence + - name: Upload validation artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index 3c2bd2bae..a3e2d7e65 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -4,6 +4,7 @@ This directory contains the Chainsaw suites used to validate AI conformance flow - `offline/`: no-cluster recipe and bundle generation checks - `cluster/`: deployed inference-stack health checks for the external cluster flow +- `common/`: cross-environment shared assertions used by the cluster suite and both Kind GPU suites - `kind-inference-dynamo/`: H100 Kind inference leaf-suite checks used by GPU CI - `kind-training-kubeflow/`: H100 Kind training leaf-suite checks used by GPU CI - 
`kind-common/`: shared Kind-only assertions consumed by both GPU CI leaf suites @@ -68,16 +69,26 @@ The Kind GPU workflows use these leaf recipes instead: ``` tests/chainsaw/ai-conformance/ ├── README.md +├── common/ # Shared across cluster + Kind GPU suites +│ ├── assert-cert-manager.yaml # cert-manager healthy +│ ├── assert-dra-driver.yaml # DRA driver healthy +│ ├── assert-kai-scheduler.yaml # KAI scheduler healthy +│ ├── assert-monitoring.yaml # Prometheus stack healthy +│ └── assert-skyhook.yaml # Skyhook operator healthy ├── kind-common/ # Shared Kind-only assertions │ ├── assert-gpu-operator.yaml # GPU operator healthy on kind │ ├── assert-network-operator.yaml # Network operator healthy on kind │ └── assert-nvsentinel.yaml # NVSentinel healthy on kind ├── kind-inference-dynamo/ # Kind + H100 + inference + dynamo leaf suite │ ├── chainsaw-test.yaml # Inference leaf health check orchestration +│ ├── assert-crds.yaml # Inference-specific CRDs installed +│ ├── assert-dynamo.yaml # Dynamo platform healthy on kind +│ ├── assert-kgateway.yaml # kgateway healthy on kind │ └── assert-namespaces.yaml # Inference-specific namespaces exist ├── kind-training-kubeflow/ # Kind + H100 + training + kubeflow leaf suite │ ├── chainsaw-test.yaml # Training leaf health check orchestration │ ├── assert-crds.yaml # Training-specific CRDs installed +│ ├── assert-kubeflow-trainer.yaml # Kubeflow trainer healthy on kind │ └── assert-namespaces.yaml # Training-specific namespaces exist ├── offline/ # No cluster needed │ ├── chainsaw-test.yaml # Recipe + bundle generation @@ -86,18 +97,21 @@ tests/chainsaw/ai-conformance/ ├── chainsaw-test.yaml # Cluster health check orchestration ├── assert-namespaces.yaml # 9 namespaces exist ├── assert-crds.yaml # Critical CRDs installed - ├── assert-cert-manager.yaml # cert-manager healthy ├── assert-gpu-operator.yaml # GPU operator + DaemonSets healthy - ├── assert-monitoring.yaml # Prometheus stack healthy ├── assert-kube-system.yaml # AWS EFA 
healthy ├── assert-kgateway.yaml # kgateway healthy - ├── assert-skyhook.yaml # Skyhook operator healthy ├── assert-nvsentinel.yaml # NVSentinel healthy - ├── assert-dra-driver.yaml # DRA driver healthy - ├── assert-kai-scheduler.yaml # KAI scheduler healthy └── assert-dynamo.yaml # Dynamo platform healthy ``` +Ownership model: + +- `common/`: shared across the external cluster suite and both Kind GPU suites +- `kind-common/`: shared only by the Kind GPU suites +- `kind-inference-dynamo/`: inference-only Kind assertions +- `kind-training-kubeflow/`: training-only Kind assertions +- `cluster/`: external-cluster-only assertions + ## Prerequisites ### Offline tests diff --git a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml index 44613dae3..1cc8018c4 100644 --- a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml @@ -52,7 +52,7 @@ spec: description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml # ── GPU Operator ─────────────────────────────────────────────────── - name: assert-gpu-operator @@ -68,7 +68,7 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: assert-monitoring.yaml + file: ../common/assert-monitoring.yaml # ── kube-system (AWS EFA) ────────────────────────────────────────── - name: assert-kube-system @@ -89,7 +89,7 @@ spec: description: Verify Skyhook operator controller-manager is available. 
try: - assert: - file: assert-skyhook.yaml + file: ../common/assert-skyhook.yaml # ── NVSentinel ───────────────────────────────────────────────────── - name: assert-nvsentinel @@ -105,14 +105,14 @@ spec: assert: 600s try: - assert: - file: assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml # ── KAI Scheduler ────────────────────────────────────────────────── - name: assert-kai-scheduler description: Verify KAI scheduler is available. try: - assert: - file: assert-kai-scheduler.yaml + file: ../common/assert-kai-scheduler.yaml # ── Dynamo Platform ──────────────────────────────────────────────── - name: assert-dynamo diff --git a/tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml b/tests/chainsaw/ai-conformance/common/assert-cert-manager.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml rename to tests/chainsaw/ai-conformance/common/assert-cert-manager.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml b/tests/chainsaw/ai-conformance/common/assert-dra-driver.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml rename to tests/chainsaw/ai-conformance/common/assert-dra-driver.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml b/tests/chainsaw/ai-conformance/common/assert-kai-scheduler.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml rename to tests/chainsaw/ai-conformance/common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/common/assert-monitoring.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml rename to tests/chainsaw/ai-conformance/common/assert-monitoring.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml b/tests/chainsaw/ai-conformance/common/assert-skyhook.yaml 
similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml rename to tests/chainsaw/ai-conformance/common/assert-skyhook.yaml diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml new file mode 100644 index 000000000..62db5bbe8 --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml @@ -0,0 +1,65 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert that critical CRDs are installed for the kind inference + Dynamo stack. 
+ +# GPU Operator +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: clusterpolicies.nvidia.com +--- +# cert-manager +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: certificates.cert-manager.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: issuers.cert-manager.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: clusterissuers.cert-manager.io +--- +# kgateway-crds (Gateway API + Inference Extension) +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: gateways.gateway.networking.k8s.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: httproutes.gateway.networking.k8s.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: inferencepools.inference.networking.x-k8s.io +--- +# dynamo-crds +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: dynamocomponentdeployments.nvidia.com +--- +# Skyhook +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: skyhooks.skyhook.nvidia.com diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml new file mode 100644 index 000000000..d1f09f606 --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert Dynamo platform components are healthy on the kind inference stack. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dynamo-platform-dynamo-operator-controller-manager + namespace: dynamo-system +status: + (conditions[?type == 'Available']): + - status: "True" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grove-operator + namespace: dynamo-system +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml new file mode 100644 index 000000000..b3569f9a7 --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert kgateway controller is available on the kind inference + Dynamo stack. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: kgateway + namespace: kgateway-system +status: + (conditions[?type == 'Available']): + - status: "True" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference-gateway + namespace: kgateway-system +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 0428d39c1..1b1f701ad 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -45,14 +45,14 @@ spec: assert: 120s try: - assert: - file: ../cluster/assert-crds.yaml + file: assert-crds.yaml # ── cert-manager ─────────────────────────────────────────────────── - name: assert-cert-manager description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: ../cluster/assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml # ── GPU Operator ─────────────────────────────────────────────────── - name: assert-gpu-operator @@ -68,21 +68,21 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: ../cluster/assert-monitoring.yaml + file: ../common/assert-monitoring.yaml # ── kgateway ─────────────────────────────────────────────────────── - name: assert-kgateway description: Verify kgateway controller is available. try: - assert: - file: ../cluster/assert-kgateway.yaml + file: assert-kgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. 
try: - assert: - file: ../cluster/assert-skyhook.yaml + file: ../common/assert-skyhook.yaml # ── NVSentinel ───────────────────────────────────────────────────── - name: assert-nvsentinel @@ -98,7 +98,7 @@ spec: assert: 600s try: - assert: - file: ../cluster/assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml # ── Network Operator ─────────────────────────────────────────────── - name: assert-network-operator @@ -112,11 +112,11 @@ spec: description: Verify KAI scheduler is available. try: - assert: - file: ../cluster/assert-kai-scheduler.yaml + file: ../common/assert-kai-scheduler.yaml # ── Dynamo Platform ──────────────────────────────────────────────── - name: assert-dynamo description: Verify Dynamo operator, etcd, and NATS are healthy. try: - assert: - file: ../cluster/assert-dynamo.yaml + file: assert-dynamo.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/assert-kubeflow-trainer.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml rename to tests/chainsaw/ai-conformance/kind-training-kubeflow/assert-kubeflow-trainer.yaml diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 6dd22ece5..382d99104 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -49,7 +49,7 @@ spec: description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: ../cluster/assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml - name: assert-gpu-operator description: Verify GPU operator and managed components are healthy (kind profile — no driver DaemonSet). 
@@ -63,19 +63,19 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: ../cluster/assert-monitoring.yaml + file: ../common/assert-monitoring.yaml - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. try: - assert: - file: ../cluster/assert-skyhook.yaml + file: ../common/assert-skyhook.yaml - name: assert-kubeflow-trainer description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available. try: - assert: - file: ../cluster/assert-kubeflow-trainer.yaml + file: assert-kubeflow-trainer.yaml - name: assert-nvsentinel description: Verify NVSentinel controller and platform connector are healthy (kind profile). @@ -89,7 +89,7 @@ spec: assert: 600s try: - assert: - file: ../cluster/assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml - name: assert-network-operator description: Verify NVIDIA Network Operator is available. @@ -101,4 +101,4 @@ spec: description: Verify KAI scheduler is available. 
// chainsawTestFile is the minimal projection of a Chainsaw Test manifest
// needed to discover assertion-file references: only spec.steps is decoded
// and every other field of the manifest is ignored.
type chainsawTestFile struct {
	Spec struct {
		Steps []chainsawStep `yaml:"steps"`
	} `yaml:"spec"`
}

// chainsawStep mirrors the three operation lists a Chainsaw step may carry
// (try, catch, finally); assert operations can appear in any of them.
type chainsawStep struct {
	Try     []chainsawOperation `yaml:"try"`
	Catch   []chainsawOperation `yaml:"catch"`
	Finally []chainsawOperation `yaml:"finally"`
}

// chainsawOperation is a single operation within a step. Only assert
// operations are of interest here, so any other operation kind decodes
// with a nil Assert.
type chainsawOperation struct {
	Assert *chainsawAssert `yaml:"assert"`
}

// chainsawAssert holds the file path of an assert operation, as written in
// the manifest (relative to the directory containing chainsaw-test.yaml).
type chainsawAssert struct {
	File string `yaml:"file"`
}
} @@ -234,6 +271,60 @@ func parseAssertFiles(dir string) ([]resourceIdentity, error) { return resources, nil } +func referencedAssertFiles(dir string) ([]string, error) { + path := filepath.Join(dir, "chainsaw-test.yaml") + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, errors.Wrap(errors.ErrCodeInvalidRequest, fmt.Sprintf("failed to open %s", path), err) + } + defer f.Close() + + seen := make(map[string]bool) + var files []string + dec := yaml.NewDecoder(f) + for { + var testFile chainsawTestFile + if err := dec.Decode(&testFile); err != nil { + if stderrors.Is(err, io.EOF) { + break + } + return nil, errors.Wrap(errors.ErrCodeInvalidRequest, fmt.Sprintf("failed to parse %s", path), err) + } + for _, step := range testFile.Spec.Steps { + files = appendReferencedAssertFiles(files, seen, dir, step.Try) + files = appendReferencedAssertFiles(files, seen, dir, step.Catch) + files = appendReferencedAssertFiles(files, seen, dir, step.Finally) + } + } + + return files, nil +} + +func appendReferencedAssertFiles(files []string, seen map[string]bool, dir string, ops []chainsawOperation) []string { + for _, op := range ops { + if op.Assert == nil || op.Assert.File == "" { + continue + } + + name := filepath.Base(op.Assert.File) + if !strings.HasPrefix(name, "assert-") || !strings.HasSuffix(name, ".yaml") { + continue + } + + path := filepath.Clean(filepath.Join(dir, op.Assert.File)) + if seen[path] { + continue + } + seen[path] = true + files = append(files, path) + } + + return files +} + // parseYAMLFile decodes a multi-document YAML file into resource identities. 
func parseYAMLFile(path, sourceFile string) ([]resourceIdentity, error) { f, err := os.Open(path) diff --git a/tests/chainsaw/ai-conformance/main_test.go b/tests/chainsaw/ai-conformance/main_test.go new file mode 100644 index 000000000..ca7039655 --- /dev/null +++ b/tests/chainsaw/ai-conformance/main_test.go @@ -0,0 +1,228 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestParseAssertFilesIncludesReferencedAssertions(t *testing.T) { + t.Parallel() + + root := t.TempDir() + clusterDir := filepath.Join(root, "cluster") + sharedDir := filepath.Join(root, "common") + if err := os.MkdirAll(clusterDir, 0o755); err != nil { + t.Fatalf("mkdir cluster: %v", err) + } + if err := os.MkdirAll(sharedDir, 0o755); err != nil { + t.Fatalf("mkdir common: %v", err) + } + + writeTestFile(t, filepath.Join(clusterDir, "assert-local.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: local +`) + writeTestFile(t, filepath.Join(sharedDir, "assert-shared.yaml"), ` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shared + namespace: shared-ns +`) + writeTestFile(t, filepath.Join(clusterDir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: assert-local.yaml + - assert: + file: ../common/assert-shared.yaml +`) + + resources, err := 
parseAssertFiles(clusterDir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 2 { + t.Fatalf("resource count = %d, want 2", len(resources)) + } + + got := map[string]string{} + for _, resource := range resources { + got[resource.Metadata.Name] = resource.SourceFile + } + if got["local"] != "assert-local.yaml" { + t.Fatalf("local source = %q, want assert-local.yaml", got["local"]) + } + if got["shared"] != "assert-shared.yaml" { + t.Fatalf("shared source = %q, want assert-shared.yaml", got["shared"]) + } +} + +func TestParseAssertFilesWithoutChainsawTestKeepsDirectoryScanning(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "assert-only.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: only +`) + + resources, err := parseAssertFiles(dir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 1 { + t.Fatalf("resource count = %d, want 1", len(resources)) + } + if resources[0].Metadata.Name != "only" { + t.Fatalf("resource name = %q, want only", resources[0].Metadata.Name) + } +} + +func TestParseAssertFilesMultiDocumentChainsawTest(t *testing.T) { + t.Parallel() + + root := t.TempDir() + suiteDir := filepath.Join(root, "suite") + sharedDir := filepath.Join(root, "common") + if err := os.MkdirAll(suiteDir, 0o755); err != nil { + t.Fatalf("mkdir suite: %v", err) + } + if err := os.MkdirAll(sharedDir, 0o755); err != nil { + t.Fatalf("mkdir common: %v", err) + } + + writeTestFile(t, filepath.Join(sharedDir, "assert-first.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: first +`) + writeTestFile(t, filepath.Join(sharedDir, "assert-second.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: second +`) + // Multi-document chainsaw-test.yaml: two Test documents referencing different files. 
+ writeTestFile(t, filepath.Join(suiteDir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: ../common/assert-first.yaml +--- +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: ../common/assert-second.yaml +`) + + resources, err := parseAssertFiles(suiteDir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 2 { + t.Fatalf("resource count = %d, want 2", len(resources)) + } + + got := map[string]bool{} + for _, r := range resources { + got[r.Metadata.Name] = true + } + if !got["first"] { + t.Fatal("missing resource 'first' from first YAML document") + } + if !got["second"] { + t.Fatal("missing resource 'second' from second YAML document") + } +} + +func TestParseAssertFilesInvalidAssertionYAML(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "assert-bad.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: [invalid yaml +`) + + if _, err := parseAssertFiles(dir); err == nil { + t.Fatal("expected error for invalid assertion YAML, got nil") + } +} + +func TestReferencedAssertFilesInvalidChainsawYAML(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: [invalid yaml +`) + + if _, err := referencedAssertFiles(dir); err == nil { + t.Fatal("expected error for invalid chainsaw-test.yaml, got nil") + } +} + +func TestParseAssertFilesMissingReferencedAssertion(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: assert-missing.yaml +`) + + if _, err := parseAssertFiles(dir); err == nil { + t.Fatal("expected error for missing referenced assertion 
file, got nil") + } +} + +func writeTestFile(t *testing.T, path, content string) { + t.Helper() + + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +}