NVIDIA · yuanchen8911 · Apr 16, 2026 · Apr 16, 2026
@@ -53,9 +53,14 @@ runs:
         # Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
         # does not set a node selector, so it can land on any GPU-capable node
         # including the control-plane (e.g., T4 smoke test).
-        timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
+        #
+        # Timeout is intentionally generous (900s per attempt). H100 self-hosted
+        # runners transfer images over a shared Docker-in-Docker bridge; large
+        # CUDA base images (~250MB compressed) combined with I/O contention from
+        # parallel GPU operator pods regularly push a single load past 600s.
+        timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
           echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
-          timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
+          timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
         }
 
     - name: Build validator images and load into kind
@@ -94,9 +99,12 @@ runs:
         USER nonroot
         ENTRYPOINT ["/${phase}"]
         DOCKERFILE
-          timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
+          # Validator images are small (~30MB distroless), but share the same
+          # Docker-in-Docker bridge as the smoke-test load above. 600s per
+          # attempt accommodates I/O queuing behind concurrent image pulls.
+          timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
             echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
-            timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
+            timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
           }
         done
 

@@ -59,3 +59,27 @@ runs:
           exit 1
         fi
         echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"
+
+    # Failure-only triage: dump the snapshot Job, its pods, logs and ConfigMap.
+    - name: Debug snapshot Job
+      if: failure()
+      shell: bash
+      env:
+        # Hand the input to the script through the environment rather than
+        # expanding it into the script text, so it is always treated as data.
+        CLUSTER_NAME: ${{ inputs.cluster_name }}
+      run: |
+        # Print a section banner, then query the kind cluster's default
+        # namespace. Failures are ignored so the dump stays best-effort.
+        dump() {
+          echo "=== $1 ==="
+          shift
+          kubectl --context="kind-${CLUSTER_NAME}" -n default "$@" || true
+        }
+        dump "Snapshot Job" get job aicr -o yaml
+        dump "Snapshot Pods" get pods -l app.kubernetes.io/name=aicr -o wide
+        dump "Snapshot Job describe" describe job aicr
+        dump "Snapshot Pod describe" describe pods -l app.kubernetes.io/name=aicr
+        dump "Snapshot current logs" logs -l app.kubernetes.io/name=aicr --all-containers --tail=200
+        dump "Snapshot previous logs" logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200
+        dump "Snapshot ConfigMap" get configmap aicr-snapshot -o yaml
@@ -57,19 +57,11 @@ jobs:
               - 'pkg/evidence/**'
               - '.github/actions/gpu-test-cleanup/**'
               - '.github/actions/load-versions/**'
-              - 'tests/manifests/dynamo-vllm-smoke-test.yaml'
               - 'tests/chainsaw/chainsaw-config.yaml'
               - 'tests/chainsaw/ai-conformance/main.go'
+              - 'tests/chainsaw/ai-conformance/common/**'
               - 'tests/chainsaw/ai-conformance/kind-common/**'
               - 'tests/chainsaw/ai-conformance/kind-inference-dynamo/**'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-crds.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml'
-              - 'tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml'
               - 'recipes/mixins/platform-inference.yaml'
               - 'recipes/components/kgateway/**'
               - 'recipes/components/kgateway-crds/**'
@@ -166,27 +158,19 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
             --config tests/chainsaw/chainsaw-config.yaml
 
+      # --- CNCF AI Conformance validation ---
+      # Runs after the stack health checks so gateway and metrics validators
+      # see a settled inference stack.
+
       - name: Verify expected resources exist
         run: |
           go run ./tests/chainsaw/ai-conformance/ \
             --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
+            --dir tests/chainsaw/ai-conformance/common \
+            --dir tests/chainsaw/ai-conformance/kind-common \
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
-      # --- CNCF AI Conformance validation ---
-      # Runs before Dynamo so conformance validators retain access to free GPUs.
-      # dra-support allocates a GPU via ResourceClaim, and gang-scheduling
-      # requires 2 free GPUs. Running the Dynamo smoke test first would consume
-      # GPU capacity and create avoidable test flakiness.
-
       - name: Validate CNCF AI Conformance
         id: validate-conformance
         run: |
@@ -202,146 +186,32 @@ jobs:
             --output=validation-result.yaml \
             --evidence-dir=conformance-evidence
 
-      # --- Dynamo vLLM inference smoke test ---
-      # Runs after conformance: Dynamo's DRA ResourceClaim consumes GPU
-      # capacity, so it is safer to keep the shared conformance gate first.
-
-      - name: Deploy Dynamo vLLM smoke test
-        run: |
-          # Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
-          # The kai-scheduler chart creates default-parent-queue + default-queue on install,
-          # but Dynamo needs its own queue as a child of the parent.
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
-          apiVersion: scheduling.run.ai/v2
-          kind: Queue
-          metadata:
-            name: dynamo
-          spec:
-            parentQueue: default-parent-queue
-            resources:
-              gpu:
-                quota: 0
-                limit: -1
-                overQuotaWeight: 1
-              cpu:
-                quota: 0
-                limit: -1
-                overQuotaWeight: 1
-              memory:
-                quota: 0
-                limit: -1
-                overQuotaWeight: 1
-          EOF
-
-          # Create DRA ResourceClaim for GPU allocation.
-          # Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled.
-          # The kai.scheduler/queue label is required for KAI scheduler to manage the claim.
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
-          apiVersion: resource.k8s.io/v1
-          kind: ResourceClaim
-          metadata:
-            name: vllm-smoke-gpu-claim
-            namespace: dynamo-system
-            labels:
-              kai.scheduler/queue: dynamo
-          spec:
-            devices:
-              requests:
-                - name: gpu
-                  exactly:
-                    deviceClassName: gpu.nvidia.com
-                    allocationMode: ExactCount
-                    count: 1
-          EOF
-
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-            -f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system
-
-          echo "Waiting for DynamoGraphDeployment to be reconciled..."
-          for i in $(seq 1 120); do
-            PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-              get dynamographdeployment vllm-smoke-test \
-              -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
-            if [[ "${PHASE}" == "True" ]]; then
-              echo "DynamoGraphDeployment is ready."
-              break
-            fi
-            echo "Waiting for DGD readiness... (${i}/120)"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true
-            sleep 10
-          done
-
-          if [[ "${PHASE}" != "True" ]]; then
-            echo "::error::DynamoGraphDeployment did not become ready within 20 minutes"
-            kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-              get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true
-            exit 1
-          fi
-
-          echo "Dynamo pods:"
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods
-
-      - name: Validate Dynamo inference
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            port-forward svc/vllm-smoke-test-frontend 8000:8000 &
-          PF_PID=$!
-          sleep 3
+      # Dynamo smoke is intentionally disabled for now. The vLLM runtime image
+      # adds significant latency and flakiness in Kind CI, and training has no
+      # matching smoke path yet. Reintroduce it later alongside a symmetric
+      # training smoke test if needed.
+      # --- Validation artifacts ---
 
-          cleanup() { kill "${PF_PID}" 2>/dev/null || true; }
-          trap cleanup EXIT
-
-          echo "=== Waiting for /v1/models (model registration may take time after worker ready) ==="
-          for i in $(seq 1 30); do
-            MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}')
-            if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
-              echo "Models available after ${i} attempt(s)."
-              break
-            fi
-            echo "Waiting for model registration... (${i}/30)"
-            sleep 10
-          done
-          echo "${MODELS}" | jq .
-          if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
-            echo "::error::No models reported by frontend after 5 minutes"
-            exit 1
-          fi
-
-          echo "=== Sending chat completion ==="
-          RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \
-            -H "Content-Type: application/json" \
-            -d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}')
-          echo "${RESPONSE}" | jq .
-
-          CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content')
-          if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then
-            echo "::error::Empty response from vLLM"
-            exit 1
-          fi
-          echo "Dynamo vLLM inference smoke test passed."
-
-      # --- Evidence collection ---
-
-      - name: Collect AI conformance evidence
+      # Collect a post-run resource snapshot for triage whenever the bundle
+      # installed and the run was not cancelled, even if conformance never ran.
+      - name: Collect validation artifacts
         if: >-
-          !cancelled()
+          always()
+          && !cancelled()
           && steps.bundle-install.outcome == 'success'
-          && (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure')
+        continue-on-error: true
+        shell: bash
         run: |
+          set -o pipefail
+          mkdir -p conformance-evidence
           go run ./tests/chainsaw/ai-conformance/ \
             --dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
-            --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
-            --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
+            --dir tests/chainsaw/ai-conformance/common \
+            --dir tests/chainsaw/ai-conformance/kind-common \
             --kubeconfig="${HOME}/.kube/config" \
-            --debug
+            --debug | tee conformance-evidence/resource-existence-post.txt
 
-      - name: Upload conformance evidence
+      - name: Upload validation artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
         with:
@@ -364,20 +234,6 @@ jobs:
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
           echo "=== Dynamo pods ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
-          echo "=== DynamoGraphDeployment status ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true
-          echo "=== Dynamo vLLM frontend logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true
-          echo "=== Dynamo vLLM frontend previous logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true
-          echo "=== Dynamo vLLM worker logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true
-          echo "=== Dynamo vLLM worker previous logs ==="
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
-            logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true
           echo "=== Dynamo operator logs ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
             logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
@@ -419,17 +275,6 @@ jobs:
           echo "=== Node status ==="
           kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true
 
-      - name: Dynamo vLLM cleanup
-        if: always()
-        run: |
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
-            -f tests/manifests/dynamo-vllm-smoke-test.yaml \
-            -n dynamo-system --ignore-not-found 2>/dev/null || true
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \
-            vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true
-          kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \
-            dynamo --ignore-not-found 2>/dev/null || true
-
       - name: GPU Test Cleanup
         if: always()
         uses: ./.github/actions/gpu-test-cleanup