diff --git a/.github/actions/aicr-build/action.yml b/.github/actions/aicr-build/action.yml index c4bbee90..575bdd43 100644 --- a/.github/actions/aicr-build/action.yml +++ b/.github/actions/aicr-build/action.yml @@ -53,9 +53,14 @@ runs: # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but # does not set a node selector, so it can land on any GPU-capable node # including the control-plane (e.g., T4 smoke test). - timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { + # + # Timeout is intentionally generous (900s per attempt). H100 self-hosted + # runners transfer images over a shared Docker-in-Docker bridge; large + # CUDA base images (~250MB compressed) combined with I/O contention from + # parallel GPU operator pods regularly exceed the previous 600s limit. + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || { echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..." - timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" + timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" } - name: Build validator images and load into kind @@ -94,9 +99,12 @@ runs: USER nonroot ENTRYPOINT ["/${phase}"] DOCKERFILE - timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || { + # Validator images are small (~30MB distroless), but share the same + # Docker-in-Docker bridge as the smoke-test load above. 600s per + # attempt accommodates I/O queuing behind concurrent image pulls. + timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || { echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..." - timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" + timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" } done diff --git a/.github/actions/gpu-snapshot-validate/action.yml b/.github/actions/gpu-snapshot-validate/action.yml index 8a587bbd..e1ee3c14 100644 --- a/.github/actions/gpu-snapshot-validate/action.yml +++ b/.github/actions/gpu-snapshot-validate/action.yml @@ -59,3 +59,27 @@ runs: exit 1 fi echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}" + + - name: Debug snapshot Job + if: failure() + shell: bash + run: | + echo "=== Snapshot Job ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true + echo "=== Snapshot Pods ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + get pods -l app.kubernetes.io/name=aicr -o wide || true + echo "=== Snapshot Job describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true + echo "=== Snapshot Pod describe ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + describe pods -l app.kubernetes.io/name=aicr || true + echo "=== Snapshot current logs ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true + echo "=== Snapshot previous logs ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true + echo "=== Snapshot ConfigMap ===" + kubectl --context="kind-${{ inputs.cluster_name }}" -n default \ + get configmap aicr-snapshot -o yaml || true diff --git a/.github/workflows/gpu-h100-inference-test.yaml b/.github/workflows/gpu-h100-inference-test.yaml index 8e0e6e15..bb2fc784 100644 --- a/.github/workflows/gpu-h100-inference-test.yaml +++ b/.github/workflows/gpu-h100-inference-test.yaml @@ -57,19 +57,11 @@ jobs: - 'pkg/evidence/**' - '.github/actions/gpu-test-cleanup/**' - '.github/actions/load-versions/**' - - 'tests/manifests/dynamo-vllm-smoke-test.yaml' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' + - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - 'tests/chainsaw/ai-conformance/kind-inference-dynamo/**' - - 'tests/chainsaw/ai-conformance/cluster/assert-crds.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml' - 'recipes/mixins/platform-inference.yaml' - 'recipes/components/kgateway/**' - 'recipes/components/kgateway-crds/**' @@ -166,27 +158,19 @@ jobs: --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ --config tests/chainsaw/chainsaw-config.yaml + # --- CNCF AI Conformance validation --- + # Runs after the stack health checks so gateway and metrics validators + # see a settled inference stack. + - name: Verify expected resources exist run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ --debug - # --- CNCF AI Conformance validation --- - # Runs before Dynamo so conformance validators retain access to free GPUs. - # dra-support allocates a GPU via ResourceClaim, and gang-scheduling - # requires 2 free GPUs. Running the Dynamo smoke test first would consume - # GPU capacity and create avoidable test flakiness. - - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -202,146 +186,32 @@ jobs: --output=validation-result.yaml \ --evidence-dir=conformance-evidence - # --- Dynamo vLLM inference smoke test --- - # Runs after conformance: Dynamo's DRA ResourceClaim consumes GPU - # capacity, so it is safer to keep the shared conformance gate first. - - - name: Deploy Dynamo vLLM smoke test - run: | - # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo). - # The kai-scheduler chart creates default-parent-queue + default-queue on install, - # but Dynamo needs its own queue as a child of the parent. - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF' - apiVersion: scheduling.run.ai/v2 - kind: Queue - metadata: - name: dynamo - spec: - parentQueue: default-parent-queue - resources: - gpu: - quota: 0 - limit: -1 - overQuotaWeight: 1 - cpu: - quota: 0 - limit: -1 - overQuotaWeight: 1 - memory: - quota: 0 - limit: -1 - overQuotaWeight: 1 - EOF - - # Create DRA ResourceClaim for GPU allocation. - # Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled. - # The kai.scheduler/queue label is required for KAI scheduler to manage the claim. - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF' - apiVersion: resource.k8s.io/v1 - kind: ResourceClaim - metadata: - name: vllm-smoke-gpu-claim - namespace: dynamo-system - labels: - kai.scheduler/queue: dynamo - spec: - devices: - requests: - - name: gpu - exactly: - deviceClassName: gpu.nvidia.com - allocationMode: ExactCount - count: 1 - EOF - - kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \ - -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system - - echo "Waiting for DynamoGraphDeployment to be reconciled..." - for i in $(seq 1 120); do - PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - get dynamographdeployment vllm-smoke-test \ - -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null) - if [[ "${PHASE}" == "True" ]]; then - echo "DynamoGraphDeployment is ready." - break - fi - echo "Waiting for DGD readiness... (${i}/120)" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true - sleep 10 - done - - if [[ "${PHASE}" != "True" ]]; then - echo "::error::DynamoGraphDeployment did not become ready within 20 minutes" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true - exit 1 - fi - - echo "Dynamo pods:" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods - - - name: Validate Dynamo inference - run: | - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - port-forward svc/vllm-smoke-test-frontend 8000:8000 & - PF_PID=$! - sleep 3 + # Dynamo smoke is intentionally disabled for now. The vLLM runtime image + # adds significant latency and flakiness in Kind CI, and training has no + # matching smoke path yet. Reintroduce it later alongside a symmetric + # training smoke test if needed. + # --- Validation artifacts --- - cleanup() { kill "${PF_PID}" 2>/dev/null || true; } - trap cleanup EXIT - - echo "=== Waiting for /v1/models (model registration may take time after worker ready) ===" - for i in $(seq 1 30); do - MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}') - if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then - echo "Models available after ${i} attempt(s)." - break - fi - echo "Waiting for model registration... (${i}/30)" - sleep 10 - done - echo "${MODELS}" | jq . - if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then - echo "::error::No models reported by frontend after 5 minutes" - exit 1 - fi - - echo "=== Sending chat completion ===" - RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}') - echo "${RESPONSE}" | jq . - - CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content') - if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then - echo "::error::Empty response from vLLM" - exit 1 - fi - echo "Dynamo vLLM inference smoke test passed." - - # --- Evidence collection --- - - - name: Collect AI conformance evidence + # Collect a post-run resource snapshot regardless of whether conformance + # validation ran, so triage always has a cluster-state artifact. + - name: Collect validation artifacts if: >- - !cancelled() + always() + && !cancelled() && steps.bundle-install.outcome == 'success' - && (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure') + continue-on-error: true + shell: bash run: | + set -o pipefail + mkdir -p conformance-evidence go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \ - --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ - --debug + --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload conformance evidence + - name: Upload validation artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: @@ -364,20 +234,6 @@ jobs: kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true echo "=== Dynamo pods ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true - echo "=== DynamoGraphDeployment status ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true - echo "=== Dynamo vLLM frontend logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM frontend previous logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM worker logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true - echo "=== Dynamo vLLM worker previous logs ===" - kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ - logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true echo "=== Dynamo operator logs ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \ logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true @@ -419,17 +275,6 @@ jobs: echo "=== Node status ===" kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true - - name: Dynamo vLLM cleanup - if: always() - run: | - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \ - -f tests/manifests/dynamo-vllm-smoke-test.yaml \ - -n dynamo-system --ignore-not-found 2>/dev/null || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \ - vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true - kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \ - dynamo --ignore-not-found 2>/dev/null || true - - name: GPU Test Cleanup if: always() uses: ./.github/actions/gpu-test-cleanup diff --git a/.github/workflows/gpu-h100-training-test.yaml b/.github/workflows/gpu-h100-training-test.yaml index 25e58cee..90e518fa 100644 --- a/.github/workflows/gpu-h100-training-test.yaml +++ b/.github/workflows/gpu-h100-training-test.yaml @@ -59,13 +59,8 @@ jobs: - '.github/actions/load-versions/**' - 'tests/chainsaw/chainsaw-config.yaml' - 'tests/chainsaw/ai-conformance/main.go' + - 'tests/chainsaw/ai-conformance/common/**' - 'tests/chainsaw/ai-conformance/kind-common/**' - - 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml' - - 'tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml' - 'tests/chainsaw/ai-conformance/kind-training-kubeflow/**' - 'recipes/overlays/kind.yaml' - 'recipes/overlays/h100-kind-training.yaml' @@ -160,23 +155,19 @@ jobs: --test-dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ --config tests/chainsaw/chainsaw-config.yaml + # --- CNCF AI Conformance validation --- + # Runs last to ensure the DCGM → Prometheus → adapter pipeline + # has had time to bootstrap (pod-autoscaling check needs live metric data). + - name: Verify expected resources exist run: | go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ --debug - # --- CNCF AI Conformance validation --- - # Runs last to ensure the DCGM → Prometheus → adapter pipeline - # has had time to bootstrap (pod-autoscaling check needs live metric data). - - name: Validate CNCF AI Conformance id: validate-conformance run: | @@ -192,26 +183,28 @@ jobs: --output=validation-result.yaml \ --evidence-dir=conformance-evidence - # --- Evidence collection --- + # --- Validation artifacts --- - - name: Collect AI conformance evidence + # Collect a post-run resource snapshot regardless of whether conformance + # validation ran, so triage always has a cluster-state artifact. + - name: Collect validation artifacts if: >- - !cancelled() + always() + && !cancelled() && steps.bundle-install.outcome == 'success' - && (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure') + continue-on-error: true + shell: bash run: | + set -o pipefail + mkdir -p conformance-evidence go run ./tests/chainsaw/ai-conformance/ \ --dir tests/chainsaw/ai-conformance/kind-training-kubeflow \ - --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \ - --file tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml \ + --dir tests/chainsaw/ai-conformance/common \ + --dir tests/chainsaw/ai-conformance/kind-common \ --kubeconfig="${HOME}/.kube/config" \ - --debug + --debug | tee conformance-evidence/resource-existence-post.txt - - name: Upload conformance evidence + - name: Upload validation artifacts if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: diff --git a/tests/chainsaw/ai-conformance/README.md b/tests/chainsaw/ai-conformance/README.md index 3c2bd2ba..a3e2d7e6 100644 --- a/tests/chainsaw/ai-conformance/README.md +++ b/tests/chainsaw/ai-conformance/README.md @@ -4,6 +4,7 @@ This directory contains the Chainsaw suites used to validate AI conformance flow - `offline/`: no-cluster recipe and bundle generation checks - `cluster/`: deployed inference-stack health checks for the external cluster flow +- `common/`: cross-environment shared assertions used by the cluster suite and both Kind GPU suites - `kind-inference-dynamo/`: H100 Kind inference leaf-suite checks used by GPU CI - `kind-training-kubeflow/`: H100 Kind training leaf-suite checks used by GPU CI - `kind-common/`: shared Kind-only assertions consumed by both GPU CI leaf suites @@ -68,16 +69,26 @@ The Kind GPU workflows use these leaf recipes instead: ``` tests/chainsaw/ai-conformance/ ├── README.md +├── common/ # Shared across cluster + Kind GPU suites +│ ├── assert-cert-manager.yaml # cert-manager healthy +│ ├── assert-dra-driver.yaml # DRA driver healthy +│ ├── assert-kai-scheduler.yaml # KAI scheduler healthy +│ ├── assert-monitoring.yaml # Prometheus stack healthy +│ └── assert-skyhook.yaml # Skyhook operator healthy ├── kind-common/ # Shared Kind-only assertions │ ├── assert-gpu-operator.yaml # GPU operator healthy on kind │ ├── assert-network-operator.yaml # Network operator healthy on kind │ └── assert-nvsentinel.yaml # NVSentinel healthy on kind ├── kind-inference-dynamo/ # Kind + H100 + inference + dynamo leaf suite │ ├── chainsaw-test.yaml # Inference leaf health check orchestration +│ ├── assert-crds.yaml # Inference-specific CRDs installed +│ ├── assert-dynamo.yaml # Dynamo platform healthy on kind +│ ├── assert-kgateway.yaml # kgateway healthy on kind │ └── assert-namespaces.yaml # Inference-specific namespaces exist ├── kind-training-kubeflow/ # Kind + H100 + training + kubeflow leaf suite │ ├── chainsaw-test.yaml # Training leaf health check orchestration │ ├── assert-crds.yaml # Training-specific CRDs installed +│ ├── assert-kubeflow-trainer.yaml # Kubeflow trainer healthy on kind │ └── assert-namespaces.yaml # Training-specific namespaces exist ├── offline/ # No cluster needed │ ├── chainsaw-test.yaml # Recipe + bundle generation @@ -86,18 +97,21 @@ tests/chainsaw/ai-conformance/ ├── chainsaw-test.yaml # Cluster health check orchestration ├── assert-namespaces.yaml # 9 namespaces exist ├── assert-crds.yaml # Critical CRDs installed - ├── assert-cert-manager.yaml # cert-manager healthy ├── assert-gpu-operator.yaml # GPU operator + DaemonSets healthy - ├── assert-monitoring.yaml # Prometheus stack healthy ├── assert-kube-system.yaml # AWS EFA healthy ├── assert-kgateway.yaml # kgateway healthy - ├── assert-skyhook.yaml # Skyhook operator healthy ├── assert-nvsentinel.yaml # NVSentinel healthy - ├── assert-dra-driver.yaml # DRA driver healthy - ├── assert-kai-scheduler.yaml # KAI scheduler healthy └── assert-dynamo.yaml # Dynamo platform healthy ``` +Ownership model: + +- `common/`: shared across the external cluster suite and both Kind GPU suites +- `kind-common/`: shared only by the Kind GPU suites +- `kind-inference-dynamo/`: inference-only Kind assertions +- `kind-training-kubeflow/`: training-only Kind assertions +- `cluster/`: external-cluster-only assertions + ## Prerequisites ### Offline tests diff --git a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml index 44613dae..1cc8018c 100644 --- a/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/cluster/chainsaw-test.yaml @@ -52,7 +52,7 @@ spec: description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml # ── GPU Operator ─────────────────────────────────────────────────── - name: assert-gpu-operator @@ -68,7 +68,7 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: assert-monitoring.yaml + file: ../common/assert-monitoring.yaml # ── kube-system (AWS EFA) ────────────────────────────────────────── - name: assert-kube-system @@ -89,7 +89,7 @@ spec: description: Verify Skyhook operator controller-manager is available. try: - assert: - file: assert-skyhook.yaml + file: ../common/assert-skyhook.yaml # ── NVSentinel ───────────────────────────────────────────────────── - name: assert-nvsentinel @@ -105,14 +105,14 @@ spec: assert: 600s try: - assert: - file: assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml # ── KAI Scheduler ────────────────────────────────────────────────── - name: assert-kai-scheduler description: Verify KAI scheduler is available. try: - assert: - file: assert-kai-scheduler.yaml + file: ../common/assert-kai-scheduler.yaml # ── Dynamo Platform ──────────────────────────────────────────────── - name: assert-dynamo diff --git a/tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml b/tests/chainsaw/ai-conformance/common/assert-cert-manager.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml rename to tests/chainsaw/ai-conformance/common/assert-cert-manager.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml b/tests/chainsaw/ai-conformance/common/assert-dra-driver.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml rename to tests/chainsaw/ai-conformance/common/assert-dra-driver.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml b/tests/chainsaw/ai-conformance/common/assert-kai-scheduler.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml rename to tests/chainsaw/ai-conformance/common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml b/tests/chainsaw/ai-conformance/common/assert-monitoring.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml rename to tests/chainsaw/ai-conformance/common/assert-monitoring.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml b/tests/chainsaw/ai-conformance/common/assert-skyhook.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml rename to tests/chainsaw/ai-conformance/common/assert-skyhook.yaml diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml new file mode 100644 index 00000000..62db5bbe --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-crds.yaml @@ -0,0 +1,65 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert that critical CRDs are installed for the kind inference + Dynamo stack. + +# GPU Operator +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: clusterpolicies.nvidia.com +--- +# cert-manager +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: certificates.cert-manager.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: issuers.cert-manager.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: clusterissuers.cert-manager.io +--- +# kgateway-crds (Gateway API + Inference Extension) +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: gateways.gateway.networking.k8s.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: httproutes.gateway.networking.k8s.io +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: inferencepools.inference.networking.x-k8s.io +--- +# dynamo-crds +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: dynamocomponentdeployments.nvidia.com +--- +# Skyhook +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: skyhooks.skyhook.nvidia.com diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml new file mode 100644 index 00000000..d1f09f60 --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-dynamo.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert Dynamo platform components are healthy on the kind inference stack. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dynamo-platform-dynamo-operator-controller-manager + namespace: dynamo-system +status: + (conditions[?type == 'Available']): + - status: "True" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grove-operator + namespace: dynamo-system +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml new file mode 100644 index 00000000..b3569f9a --- /dev/null +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/assert-kgateway.yaml @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Assert kgateway controller is available on the kind inference + Dynamo stack. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kgateway + namespace: kgateway-system +status: + (conditions[?type == 'Available']): + - status: "True" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inference-gateway + namespace: kgateway-system +status: + (conditions[?type == 'Available']): + - status: "True" diff --git a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml index 0428d39c..1b1f701a 100644 --- a/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-inference-dynamo/chainsaw-test.yaml @@ -45,14 +45,14 @@ spec: assert: 120s try: - assert: - file: ../cluster/assert-crds.yaml + file: assert-crds.yaml # ── cert-manager ─────────────────────────────────────────────────── - name: assert-cert-manager description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: ../cluster/assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml # ── GPU Operator ─────────────────────────────────────────────────── - name: assert-gpu-operator @@ -68,21 +68,21 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: ../cluster/assert-monitoring.yaml + file: ../common/assert-monitoring.yaml # ── kgateway ─────────────────────────────────────────────────────── - name: assert-kgateway description: Verify kgateway controller is available. try: - assert: - file: ../cluster/assert-kgateway.yaml + file: assert-kgateway.yaml # ── Skyhook ──────────────────────────────────────────────────────── - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. try: - assert: - file: ../cluster/assert-skyhook.yaml + file: ../common/assert-skyhook.yaml # ── NVSentinel ───────────────────────────────────────────────────── - name: assert-nvsentinel @@ -98,7 +98,7 @@ spec: assert: 600s try: - assert: - file: ../cluster/assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml # ── Network Operator ─────────────────────────────────────────────── - name: assert-network-operator @@ -112,11 +112,11 @@ spec: description: Verify KAI scheduler is available. try: - assert: - file: ../cluster/assert-kai-scheduler.yaml + file: ../common/assert-kai-scheduler.yaml # ── Dynamo Platform ──────────────────────────────────────────────── - name: assert-dynamo description: Verify Dynamo operator, etcd, and NATS are healthy. try: - assert: - file: ../cluster/assert-dynamo.yaml + file: assert-dynamo.yaml diff --git a/tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/assert-kubeflow-trainer.yaml similarity index 100% rename from tests/chainsaw/ai-conformance/cluster/assert-kubeflow-trainer.yaml rename to tests/chainsaw/ai-conformance/kind-training-kubeflow/assert-kubeflow-trainer.yaml diff --git a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml index 6dd22ece..382d9910 100644 --- a/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml +++ b/tests/chainsaw/ai-conformance/kind-training-kubeflow/chainsaw-test.yaml @@ -49,7 +49,7 @@ spec: description: Verify cert-manager controller, webhook, and cainjector are available. try: - assert: - file: ../cluster/assert-cert-manager.yaml + file: ../common/assert-cert-manager.yaml - name: assert-gpu-operator description: Verify GPU operator and managed components are healthy (kind profile — no driver DaemonSet). @@ -63,19 +63,19 @@ spec: description: Verify kube-prometheus-stack, ephemeral storage metrics, and prometheus-adapter. try: - assert: - file: ../cluster/assert-monitoring.yaml + file: ../common/assert-monitoring.yaml - name: assert-skyhook description: Verify Skyhook operator controller-manager is available. try: - assert: - file: ../cluster/assert-skyhook.yaml + file: ../common/assert-skyhook.yaml - name: assert-kubeflow-trainer description: Verify Kubeflow Trainer controller, validating webhook, and TrainJob CRD are available. try: - assert: - file: ../cluster/assert-kubeflow-trainer.yaml + file: assert-kubeflow-trainer.yaml - name: assert-nvsentinel description: Verify NVSentinel controller and platform connector are healthy (kind profile). @@ -89,7 +89,7 @@ spec: assert: 600s try: - assert: - file: ../cluster/assert-dra-driver.yaml + file: ../common/assert-dra-driver.yaml - name: assert-network-operator description: Verify NVIDIA Network Operator is available. @@ -101,4 +101,4 @@ spec: description: Verify KAI scheduler is available. try: - assert: - file: ../cluster/assert-kai-scheduler.yaml + file: ../common/assert-kai-scheduler.yaml diff --git a/tests/chainsaw/ai-conformance/main.go b/tests/chainsaw/ai-conformance/main.go index dee42293..99bb8ed6 100644 --- a/tests/chainsaw/ai-conformance/main.go +++ b/tests/chainsaw/ai-conformance/main.go @@ -80,6 +80,26 @@ type checkResult struct { Version string // container image, label version, or CRD versions (best-effort) } +type chainsawTestFile struct { + Spec struct { + Steps []chainsawStep `yaml:"steps"` + } `yaml:"spec"` +} + +type chainsawStep struct { + Try []chainsawOperation `yaml:"try"` + Catch []chainsawOperation `yaml:"catch"` + Finally []chainsawOperation `yaml:"finally"` +} + +type chainsawOperation struct { + Assert *chainsawAssert `yaml:"assert"` +} + +type chainsawAssert struct { + File string `yaml:"file"` +} + func main() { cmd := &cli.Command{ Name: "ai-conformance-check", @@ -214,16 +234,33 @@ func parseAssertFiles(dir string) ([]resourceIdentity, error) { } var resources []resourceIdentity + parsedFiles := make(map[string]bool) for _, entry := range entries { name := entry.Name() if !strings.HasPrefix(name, "assert-") || !strings.HasSuffix(name, ".yaml") { continue } - path := filepath.Join(dir, name) - parsed, err := parseYAMLFile(path, name) - if err != nil { - return nil, err + path := filepath.Clean(filepath.Join(dir, name)) + parsedFiles[path] = true + parsed, parseErr := parseYAMLFile(path, name) + if parseErr != nil { + return nil, parseErr + } + resources = append(resources, parsed...) + } + + referencedFiles, err := referencedAssertFiles(dir) + if err != nil { + return nil, err + } + for _, path := range referencedFiles { + if parsedFiles[path] { + continue + } + parsed, parseErr := parseYAMLFile(path, filepath.Base(path)) + if parseErr != nil { + return nil, parseErr } resources = append(resources, parsed...) } @@ -234,6 +271,60 @@ func parseAssertFiles(dir string) ([]resourceIdentity, error) { return resources, nil } +func referencedAssertFiles(dir string) ([]string, error) { + path := filepath.Join(dir, "chainsaw-test.yaml") + f, err := os.Open(path) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, errors.Wrap(errors.ErrCodeInvalidRequest, fmt.Sprintf("failed to open %s", path), err) + } + defer f.Close() + + seen := make(map[string]bool) + var files []string + dec := yaml.NewDecoder(f) + for { + var testFile chainsawTestFile + if err := dec.Decode(&testFile); err != nil { + if stderrors.Is(err, io.EOF) { + break + } + return nil, errors.Wrap(errors.ErrCodeInvalidRequest, fmt.Sprintf("failed to parse %s", path), err) + } + for _, step := range testFile.Spec.Steps { + files = appendReferencedAssertFiles(files, seen, dir, step.Try) + files = appendReferencedAssertFiles(files, seen, dir, step.Catch) + files = appendReferencedAssertFiles(files, seen, dir, step.Finally) + } + } + + return files, nil +} + +func appendReferencedAssertFiles(files []string, seen map[string]bool, dir string, ops []chainsawOperation) []string { + for _, op := range ops { + if op.Assert == nil || op.Assert.File == "" { + continue + } + + name := filepath.Base(op.Assert.File) + if !strings.HasPrefix(name, "assert-") || !strings.HasSuffix(name, ".yaml") { + continue + } + + path := filepath.Clean(filepath.Join(dir, op.Assert.File)) + if seen[path] { + continue + } + seen[path] = true + files = append(files, path) + } + + return files +} + // parseYAMLFile decodes a multi-document YAML file into resource identities. func parseYAMLFile(path, sourceFile string) ([]resourceIdentity, error) { f, err := os.Open(path) diff --git a/tests/chainsaw/ai-conformance/main_test.go b/tests/chainsaw/ai-conformance/main_test.go new file mode 100644 index 00000000..ca703965 --- /dev/null +++ b/tests/chainsaw/ai-conformance/main_test.go @@ -0,0 +1,228 @@ +// Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestParseAssertFilesIncludesReferencedAssertions(t *testing.T) { + t.Parallel() + + root := t.TempDir() + clusterDir := filepath.Join(root, "cluster") + sharedDir := filepath.Join(root, "common") + if err := os.MkdirAll(clusterDir, 0o755); err != nil { + t.Fatalf("mkdir cluster: %v", err) + } + if err := os.MkdirAll(sharedDir, 0o755); err != nil { + t.Fatalf("mkdir common: %v", err) + } + + writeTestFile(t, filepath.Join(clusterDir, "assert-local.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: local +`) + writeTestFile(t, filepath.Join(sharedDir, "assert-shared.yaml"), ` +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shared + namespace: shared-ns +`) + writeTestFile(t, filepath.Join(clusterDir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: assert-local.yaml + - assert: + file: ../common/assert-shared.yaml +`) + + resources, err := parseAssertFiles(clusterDir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 2 { + t.Fatalf("resource count = %d, want 2", len(resources)) + } + + got := map[string]string{} + for _, resource := range resources { + got[resource.Metadata.Name] = resource.SourceFile + } + if got["local"] != "assert-local.yaml" { + t.Fatalf("local source = %q, want assert-local.yaml", got["local"]) + } + if got["shared"] != "assert-shared.yaml" { + t.Fatalf("shared source = %q, want assert-shared.yaml", got["shared"]) + } +} + +func TestParseAssertFilesWithoutChainsawTestKeepsDirectoryScanning(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "assert-only.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: only +`) + + resources, err := parseAssertFiles(dir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 1 { + t.Fatalf("resource count = %d, want 1", len(resources)) + } + if resources[0].Metadata.Name != "only" { + t.Fatalf("resource name = %q, want only", resources[0].Metadata.Name) + } +} + +func TestParseAssertFilesMultiDocumentChainsawTest(t *testing.T) { + t.Parallel() + + root := t.TempDir() + suiteDir := filepath.Join(root, "suite") + sharedDir := filepath.Join(root, "common") + if err := os.MkdirAll(suiteDir, 0o755); err != nil { + t.Fatalf("mkdir suite: %v", err) + } + if err := os.MkdirAll(sharedDir, 0o755); err != nil { + t.Fatalf("mkdir common: %v", err) + } + + writeTestFile(t, filepath.Join(sharedDir, "assert-first.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: first +`) + writeTestFile(t, filepath.Join(sharedDir, "assert-second.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: second +`) + // Multi-document chainsaw-test.yaml: two Test documents referencing different files. + writeTestFile(t, filepath.Join(suiteDir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: ../common/assert-first.yaml +--- +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: ../common/assert-second.yaml +`) + + resources, err := parseAssertFiles(suiteDir) + if err != nil { + t.Fatalf("parse assert files: %v", err) + } + if len(resources) != 2 { + t.Fatalf("resource count = %d, want 2", len(resources)) + } + + got := map[string]bool{} + for _, r := range resources { + got[r.Metadata.Name] = true + } + if !got["first"] { + t.Fatal("missing resource 'first' from first YAML document") + } + if !got["second"] { + t.Fatal("missing resource 'second' from second YAML document") + } +} + +func TestParseAssertFilesInvalidAssertionYAML(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "assert-bad.yaml"), ` +apiVersion: v1 +kind: Namespace +metadata: + name: [invalid yaml +`) + + if _, err := parseAssertFiles(dir); err == nil { + t.Fatal("expected error for invalid assertion YAML, got nil") + } +} + +func TestReferencedAssertFilesInvalidChainsawYAML(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: [invalid yaml +`) + + if _, err := referencedAssertFiles(dir); err == nil { + t.Fatal("expected error for invalid chainsaw-test.yaml, got nil") + } +} + +func TestParseAssertFilesMissingReferencedAssertion(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + writeTestFile(t, filepath.Join(dir, "chainsaw-test.yaml"), ` +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +spec: + steps: + - try: + - assert: + file: assert-missing.yaml +`) + + if _, err := parseAssertFiles(dir); err == nil { + t.Fatal("expected error for missing referenced assertion file, got nil") + } +} + +func writeTestFile(t *testing.T, path, content string) { + t.Helper() + + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write %s: %v", path, err) + } +}