Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions .github/actions/aicr-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,14 @@ runs:
# Load onto all nodes. The snapshot agent requests nvidia.com/gpu but
# does not set a node selector, so it can land on any GPU-capable node
# including the control-plane (e.g., T4 smoke test).
timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
#
# Timeout is intentionally generous (900s per attempt). H100 self-hosted
# runners transfer images over a shared Docker-in-Docker bridge; loading
# large CUDA base images (~250MB compressed) under I/O contention from
# parallel GPU operator pods regularly took longer than the previous
# 600s limit.
timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
Comment thread
mchmarny marked this conversation as resolved.
echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
timeout 600 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
timeout 900 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
Comment thread
yuanchen8911 marked this conversation as resolved.
}

- name: Build validator images and load into kind
Expand Down Expand Up @@ -94,9 +99,12 @@ runs:
USER nonroot
ENTRYPOINT ["/${phase}"]
DOCKERFILE
timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
# Validator images are small (~30MB distroless), but share the same
# Docker-in-Docker bridge as the smoke-test load above. 600s per
# attempt accommodates I/O queuing behind concurrent image pulls.
timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
timeout 600 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
}
done

Expand Down
24 changes: 24 additions & 0 deletions .github/actions/gpu-snapshot-validate/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,27 @@ runs:
exit 1
fi
echo "Snapshot correctly detected ${GPU_COUNT}x ${GPU_MODEL}"

- name: Debug snapshot Job
if: failure()
shell: bash
run: |
echo "=== Snapshot Job ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default get job aicr -o yaml || true
echo "=== Snapshot Pods ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
get pods -l app.kubernetes.io/name=aicr -o wide || true
echo "=== Snapshot Job describe ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default describe job aicr || true
echo "=== Snapshot Pod describe ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
describe pods -l app.kubernetes.io/name=aicr || true
echo "=== Snapshot current logs ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
logs -l app.kubernetes.io/name=aicr --all-containers --tail=200 || true
echo "=== Snapshot previous logs ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
logs -l app.kubernetes.io/name=aicr --all-containers --previous --tail=200 || true
echo "=== Snapshot ConfigMap ==="
kubectl --context="kind-${{ inputs.cluster_name }}" -n default \
get configmap aicr-snapshot -o yaml || true
Comment thread
yuanchen8911 marked this conversation as resolved.
205 changes: 25 additions & 180 deletions .github/workflows/gpu-h100-inference-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,11 @@ jobs:
- 'pkg/evidence/**'
- '.github/actions/gpu-test-cleanup/**'
- '.github/actions/load-versions/**'
- 'tests/manifests/dynamo-vllm-smoke-test.yaml'
- 'tests/chainsaw/chainsaw-config.yaml'
- 'tests/chainsaw/ai-conformance/main.go'
- 'tests/chainsaw/ai-conformance/common/**'
- 'tests/chainsaw/ai-conformance/kind-common/**'
- 'tests/chainsaw/ai-conformance/kind-inference-dynamo/**'
- 'tests/chainsaw/ai-conformance/cluster/assert-crds.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml'
- 'tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml'
- 'recipes/mixins/platform-inference.yaml'
- 'recipes/components/kgateway/**'
- 'recipes/components/kgateway-crds/**'
Expand Down Expand Up @@ -166,27 +158,19 @@ jobs:
--test-dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
--config tests/chainsaw/chainsaw-config.yaml

# --- CNCF AI Conformance validation ---
# Runs after the stack health checks so gateway and metrics validators
# see a settled inference stack.
Comment thread
mchmarny marked this conversation as resolved.

- name: Verify expected resources exist
run: |
go run ./tests/chainsaw/ai-conformance/ \
--dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
--file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
--dir tests/chainsaw/ai-conformance/common \
--dir tests/chainsaw/ai-conformance/kind-common \
--kubeconfig="${HOME}/.kube/config" \
--debug

# --- CNCF AI Conformance validation ---
# Runs before Dynamo so conformance validators retain access to free GPUs.
# dra-support allocates a GPU via ResourceClaim, and gang-scheduling
# requires 2 free GPUs. Running the Dynamo smoke test first would consume
# GPU capacity and create avoidable test flakiness.

- name: Validate CNCF AI Conformance
id: validate-conformance
run: |
Expand All @@ -202,146 +186,32 @@ jobs:
--output=validation-result.yaml \
--evidence-dir=conformance-evidence

# --- Dynamo vLLM inference smoke test ---
# Runs after conformance: Dynamo's DRA ResourceClaim consumes GPU
# capacity, so it is safer to keep the shared conformance gate first.

- name: Deploy Dynamo vLLM smoke test
run: |
# Create kai-scheduler queue for Dynamo (grove-operator sets kai.scheduler/queue=dynamo).
# The kai-scheduler chart creates default-parent-queue + default-queue on install,
# but Dynamo needs its own queue as a child of the parent.
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
name: dynamo
spec:
parentQueue: default-parent-queue
resources:
gpu:
quota: 0
limit: -1
overQuotaWeight: 1
cpu:
quota: 0
limit: -1
overQuotaWeight: 1
memory:
quota: 0
limit: -1
overQuotaWeight: 1
EOF

# Create DRA ResourceClaim for GPU allocation.
# Required on DRA-only clusters where device-plugin GPU requests cannot be scheduled.
# The kai.scheduler/queue label is required for KAI scheduler to manage the claim.
kubectl --context="kind-${KIND_CLUSTER_NAME}" apply -f - <<'EOF'
apiVersion: resource.k8s.io/v1
kind: ResourceClaim
metadata:
name: vllm-smoke-gpu-claim
namespace: dynamo-system
labels:
kai.scheduler/queue: dynamo
spec:
devices:
requests:
- name: gpu
exactly:
deviceClassName: gpu.nvidia.com
allocationMode: ExactCount
count: 1
EOF

kubectl --context="kind-${KIND_CLUSTER_NAME}" apply \
-f tests/manifests/dynamo-vllm-smoke-test.yaml -n dynamo-system

echo "Waiting for DynamoGraphDeployment to be reconciled..."
for i in $(seq 1 120); do
PHASE=$(kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
get dynamographdeployment vllm-smoke-test \
-o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null)
if [[ "${PHASE}" == "True" ]]; then
echo "DynamoGraphDeployment is ready."
break
fi
echo "Waiting for DGD readiness... (${i}/120)"
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods 2>/dev/null || true
sleep 10
done

if [[ "${PHASE}" != "True" ]]; then
echo "::error::DynamoGraphDeployment did not become ready within 20 minutes"
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
get dynamographdeployment vllm-smoke-test -o yaml 2>/dev/null || true
exit 1
fi

echo "Dynamo pods:"
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods

- name: Validate Dynamo inference
run: |
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
port-forward svc/vllm-smoke-test-frontend 8000:8000 &
PF_PID=$!
sleep 3
# Dynamo smoke is intentionally disabled for now. The vLLM runtime image
# adds significant latency and flakiness in Kind CI, and training has no
# matching smoke path yet. Reintroduce it later alongside a symmetric
# training smoke test if needed.
# --- Validation artifacts ---

cleanup() { kill "${PF_PID}" 2>/dev/null || true; }
trap cleanup EXIT

echo "=== Waiting for /v1/models (model registration may take time after worker ready) ==="
for i in $(seq 1 30); do
MODELS=$(curl -sf http://localhost:8000/v1/models 2>/dev/null || echo '{"data":[]}')
if echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
echo "Models available after ${i} attempt(s)."
break
fi
echo "Waiting for model registration... (${i}/30)"
sleep 10
done
echo "${MODELS}" | jq .
if ! echo "${MODELS}" | jq -e '.data | length > 0' >/dev/null 2>&1; then
echo "::error::No models reported by frontend after 5 minutes"
exit 1
fi

echo "=== Sending chat completion ==="
RESPONSE=$(curl -sf http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"model":"Qwen/Qwen3-0.6B","messages":[{"role":"user","content":"What is 2+2?"}],"max_tokens":30,"stream":false}')
echo "${RESPONSE}" | jq .

CONTENT=$(echo "${RESPONSE}" | jq -r '.choices[0].message.content')
if [[ -z "${CONTENT}" || "${CONTENT}" == "null" ]]; then
echo "::error::Empty response from vLLM"
exit 1
fi
echo "Dynamo vLLM inference smoke test passed."

# --- Evidence collection ---

- name: Collect AI conformance evidence
# Collect a post-run resource snapshot regardless of whether conformance
# validation ran, so triage has a cluster-state artifact. The step still
# requires a successful bundle install and a run that was not cancelled.
- name: Collect validation artifacts
if: >-
!cancelled()
always()
Comment thread
mchmarny marked this conversation as resolved.
&& !cancelled()
&& steps.bundle-install.outcome == 'success'
&& (steps.validate-conformance.outcome == 'success' || steps.validate-conformance.outcome == 'failure')
continue-on-error: true
shell: bash
run: |
set -o pipefail
mkdir -p conformance-evidence
go run ./tests/chainsaw/ai-conformance/ \
--dir tests/chainsaw/ai-conformance/kind-inference-dynamo \
--file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
--dir tests/chainsaw/ai-conformance/common \
--dir tests/chainsaw/ai-conformance/kind-common \
--kubeconfig="${HOME}/.kube/config" \
--debug
--debug | tee conformance-evidence/resource-existence-post.txt
Comment thread
coderabbitai[bot] marked this conversation as resolved.

- name: Upload conformance evidence
- name: Upload validation artifacts
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
Expand All @@ -364,20 +234,6 @@ jobs:
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n gpu-operator get events --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
echo "=== Dynamo pods ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get pods -o wide 2>/dev/null || true
echo "=== DynamoGraphDeployment status ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system get dynamographdeployment -o yaml 2>/dev/null || true
echo "=== Dynamo vLLM frontend logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
logs deployment/vllm-smoke-test-frontend --tail=200 2>/dev/null || true
echo "=== Dynamo vLLM frontend previous logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
logs deployment/vllm-smoke-test-frontend --previous --tail=200 2>/dev/null || true
echo "=== Dynamo vLLM worker logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
logs deployment/vllm-smoke-test-vllmdecodeworker --tail=200 2>/dev/null || true
echo "=== Dynamo vLLM worker previous logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
logs deployment/vllm-smoke-test-vllmdecodeworker --previous --tail=200 2>/dev/null || true
echo "=== Dynamo operator logs ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" -n dynamo-system \
logs deployment/dynamo-operator-controller-manager --tail=100 -c manager 2>/dev/null || true
Expand Down Expand Up @@ -419,17 +275,6 @@ jobs:
echo "=== Node status ==="
kubectl --context="kind-${KIND_CLUSTER_NAME}" get nodes -o wide 2>/dev/null || true

- name: Dynamo vLLM cleanup
if: always()
run: |
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete \
-f tests/manifests/dynamo-vllm-smoke-test.yaml \
-n dynamo-system --ignore-not-found 2>/dev/null || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete resourceclaim \
vllm-smoke-gpu-claim -n dynamo-system --ignore-not-found 2>/dev/null || true
kubectl --context="kind-${KIND_CLUSTER_NAME}" delete queue \
dynamo --ignore-not-found 2>/dev/null || true

- name: GPU Test Cleanup
if: always()
uses: ./.github/actions/gpu-test-cleanup
Expand Down
Loading
Loading