Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/cli/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ func runCNCFSubmission(ctx context.Context, evidenceDir string, features []strin
}
}

cncfTimeout := 20 * time.Minute //nolint:mnd // CNCF submission deploys GPU workloads and runs HPA tests
cncfTimeout := defaults.CNCFSubmissionTimeout
ctx, cancel := context.WithTimeout(ctx, cncfTimeout)
defer cancel()

Expand Down
4 changes: 3 additions & 1 deletion pkg/collector/os/sysctl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"path/filepath"
"strings"
"testing"
"time"

"github.com/NVIDIA/aicr/pkg/measurement"
)
Expand All @@ -34,7 +35,8 @@ func TestSysctlCollector_Collect_ContextCancellation(t *testing.T) {

// Start collection and cancel mid-way
go func() {
// Give it a moment to start walking
// Give it a moment to start walking before canceling
time.Sleep(10 * time.Millisecond)
cancel()
}()

Expand Down
53 changes: 53 additions & 0 deletions pkg/defaults/timeouts.go
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,59 @@ const (
FileParserMaxSize = 1 << 20 // 1MB
)

// Validator runtime class check timeout.
const (
	// RuntimeClassCheckTimeout is the timeout for verifying RuntimeClass
	// existence in the cluster during agent deployment. The deadline is
	// deliberately short so deployment fails fast when the RuntimeClass
	// is absent instead of hanging on an unreachable API server.
	RuntimeClassCheckTimeout = 5 * time.Second
)

// CNCF conformance submission timeout.
const (
	// CNCFSubmissionTimeout is the timeout for CNCF submission evidence
	// collection. CNCF submission deploys GPU workloads and runs HPA
	// tests, hence the generous 20-minute budget.
	CNCFSubmissionTimeout = 20 * time.Minute
)

// Retry poll intervals for validator wait loops. These pair with a
// context deadline in the callers' select loops; they control retry
// frequency, not the overall wait budget.
const (
	// TrainerControllerPollInterval is the retry interval when waiting
	// for the Kubeflow Trainer controller-manager to become ready.
	TrainerControllerPollInterval = 2 * time.Second

	// TrainingRuntimePollInterval is the retry interval when waiting
	// for a TrainingRuntime resource to become visible via the API.
	// NOTE(review): shorter than the controller interval — presumably
	// because API visibility settles quickly after creation; confirm.
	TrainingRuntimePollInterval = 500 * time.Millisecond
)

// Termination and truncation limits for validator output.
const (
	// TerminationLogMaxSize is the maximum size in bytes of the K8s
	// termination log message written to /dev/termination-log.
	// NOTE(review): 4096 presumably mirrors the kubelet's
	// terminationMessagePath size limit — confirm against K8s docs.
	TerminationLogMaxSize = 4096

	// ConfigMapStatusTruncateLen is the maximum length for ConfigMap
	// status data before truncation in autoscaler status collection.
	// Keeps evidence artifacts readable when the status blob is large.
	ConfigMapStatusTruncateLen = 2000

	// AutoscalerMaxEvents is the maximum number of autoscaler events
	// to capture when collecting cluster autoscaler evidence.
	AutoscalerMaxEvents = 10

	// MetricsDisplayLimit is the maximum number of custom metrics
	// resources to display in AI service metrics evidence.
	MetricsDisplayLimit = 20
)

// Well-known Kubernetes resource names shared across validators.
// Centralizing these avoids scattering the same string literals
// across validator implementations.
const (
	// GPUOperatorNamespace is the default namespace for the GPU operator.
	GPUOperatorNamespace = "gpu-operator"

	// KubeSystemNamespace is the standard kube-system namespace.
	KubeSystemNamespace = "kube-system"
)

// Attestation file size limits.
const (
// MaxSigstoreBundleSize is the maximum size in bytes for a .sigstore.json file.
Expand Down
3 changes: 2 additions & 1 deletion pkg/k8s/agent/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"strings"
"time"

"github.com/NVIDIA/aicr/pkg/defaults"
aicrerrors "github.com/NVIDIA/aicr/pkg/errors"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -157,7 +158,7 @@ func (d *Deployer) Cleanup(ctx context.Context, opts CleanupOptions) error {

// validateRuntimeClass checks that the specified RuntimeClass exists in the cluster.
func (d *Deployer) validateRuntimeClass(ctx context.Context) error {
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
ctx, cancel := context.WithTimeout(ctx, defaults.RuntimeClassCheckTimeout)
defer cancel()

_, err := d.clientset.NodeV1().RuntimeClasses().Get(ctx, d.config.RuntimeClassName, metav1.GetOptions{})
Expand Down
4 changes: 3 additions & 1 deletion pkg/logging/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"log/slog"
"os"
"strings"

"github.com/NVIDIA/aicr/pkg/errors"
)

// ANSI color codes
Expand Down Expand Up @@ -87,7 +89,7 @@ func (h *CLIHandler) Handle(_ context.Context, r slog.Record) error {
}

if _, err := fmt.Fprintln(h.writer, msg); err != nil {
return fmt.Errorf("failed to write log output: %w", err)
return errors.Wrap(errors.ErrCodeInternal, "failed to write log output", err)
}
return nil
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/version/version.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (v Version) String() string {
func ParseVersion(s string) (Version, error) {
// Check for empty string
if s == "" {
return Version{}, ErrEmptyVersion
return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "empty version string", ErrEmptyVersion)
}

// Strip 'v' prefix if present
Expand All @@ -107,7 +107,7 @@ func ParseVersion(s string) (Version, error) {
// Split by dots
parts := strings.Split(mainPart, ".")
if len(parts) > 3 {
return Version{}, ErrTooManyComponents
return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "too many version components", ErrTooManyComponents)
}

// Parse each component
Expand Down
4 changes: 2 additions & 2 deletions validators/conformance/ai_service_metrics_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,8 +185,8 @@ func checkAIServiceMetricsWithURL(ctx *validators.Context, promBaseURL string) e
}
var resources strings.Builder
limit := len(customMetricsResp.Resources)
if limit > 20 {
limit = 20
if limit > defaults.MetricsDisplayLimit {
limit = defaults.MetricsDisplayLimit
}
for i := 0; i < limit; i++ {
r := customMetricsResp.Resources[i]
Expand Down
8 changes: 4 additions & 4 deletions validators/conformance/cluster_autoscaling_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ func checkEKSAutoscaling(ctx *validators.Context) error {

// Check for Cluster Autoscaler deployment (optional — EKS may use Karpenter or managed scaling).
// Search common namespaces since Cluster Autoscaler can be deployed anywhere.
caNamespaces := []string{"kube-system", "cluster-autoscaler", "system"}
caNamespaces := []string{defaults.KubeSystemNamespace, "cluster-autoscaler", "system"}
caDeployNames := []string{"cluster-autoscaler", "cluster-autoscaler-aws-cluster-autoscaler"}
var caFound bool
for _, caNS := range caNamespaces {
Expand Down Expand Up @@ -642,8 +642,8 @@ func checkGKEAutoscaling(ctx *validators.Context) error {
if caErr == nil && caStatus != nil {
caStatusFound = true
statusData := caStatus.Data["status"]
if len(statusData) > 2000 {
statusData = statusData[:2000] + "\n... [truncated]"
if len(statusData) > defaults.ConfigMapStatusTruncateLen {
statusData = statusData[:defaults.ConfigMapStatusTruncateLen] + "\n... [truncated]"
}
recordRawTextArtifact(ctx, "Cluster Autoscaler Status",
"kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}'",
Expand All @@ -670,7 +670,7 @@ func checkGKEAutoscaling(ctx *validators.Context) error {
if evErr == nil {
var autoscalerEvents strings.Builder
count := 0
for i := len(events.Items) - 1; i >= 0 && count < 10; i-- {
for i := len(events.Items) - 1; i >= 0 && count < defaults.AutoscalerMaxEvents; i-- {
ev := events.Items[i]
if ev.Reason == "NotTriggerScaleUp" || ev.Reason == "ScaledUpGroup" ||
ev.Reason == "ScaleDown" || ev.Reason == "TriggeredScaleUp" {
Expand Down
7 changes: 4 additions & 3 deletions validators/conformance/gpu_operator_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package main
import (
"fmt"

"github.com/NVIDIA/aicr/pkg/defaults"
"github.com/NVIDIA/aicr/pkg/errors"
"github.com/NVIDIA/aicr/validators"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -33,12 +34,12 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error {

// 1. GPU Operator Deployment running
// Check by listing deployments with the gpu-operator label in the namespace.
deploys, err := ctx.Clientset.AppsV1().Deployments("gpu-operator").List(ctx.Ctx, metav1.ListOptions{
deploys, err := ctx.Clientset.AppsV1().Deployments(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{
LabelSelector: "app.kubernetes.io/name=gpu-operator",
})
if err != nil || len(deploys.Items) == 0 {
// Fallback: try exact name match.
if verifyErr := verifyDeploymentAvailable(ctx, "gpu-operator", "gpu-operator"); verifyErr != nil {
if verifyErr := verifyDeploymentAvailable(ctx, defaults.GPUOperatorNamespace, "gpu-operator"); verifyErr != nil {
return errors.Wrap(errors.ErrCodeNotFound, "GPU operator deployment not found (checked label app.kubernetes.io/name=gpu-operator and exact name)", verifyErr)
}
}
Expand All @@ -65,7 +66,7 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error {
}

// 3. DCGM exporter DaemonSet running
if err := verifyDaemonSetReady(ctx, "gpu-operator", "nvidia-dcgm-exporter"); err != nil {
if err := verifyDaemonSetReady(ctx, defaults.GPUOperatorNamespace, "nvidia-dcgm-exporter"); err != nil {
return errors.Wrap(errors.ErrCodeNotFound, "DCGM exporter check failed", err)
}

Expand Down
2 changes: 1 addition & 1 deletion validators/conformance/pod_autoscaling_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error {

// 2. GPU custom metrics have data (poll with retries — adapter relist is 30s)
metrics := []string{"gpu_utilization", "gpu_memory_used", "gpu_power_usage"}
namespaces := []string{"gpu-operator", "dynamo-system"}
namespaces := []string{defaults.GPUOperatorNamespace, "dynamo-system"}

var found bool
var foundPath string
Expand Down
4 changes: 2 additions & 2 deletions validators/conformance/secure_access_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn
}

// GPU operator pods.
operatorPods, err := ctx.Clientset.CoreV1().Pods("gpu-operator").List(ctx.Ctx, metav1.ListOptions{})
operatorPods, err := ctx.Clientset.CoreV1().Pods(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{})
if err != nil {
recordRawTextArtifact(ctx, "GPU operator pods", "kubectl get pods -n gpu-operator -o wide",
fmt.Sprintf("failed to list gpu-operator pods: %v", err))
Expand All @@ -207,7 +207,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn
}

// GPU operator DaemonSets.
daemonSets, err := ctx.Clientset.AppsV1().DaemonSets("gpu-operator").List(ctx.Ctx, metav1.ListOptions{})
daemonSets, err := ctx.Clientset.AppsV1().DaemonSets(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{})
if err != nil {
recordRawTextArtifact(ctx, "GPU operator DaemonSets", "kubectl get ds -n gpu-operator",
fmt.Sprintf("failed to list gpu-operator DaemonSets: %v", err))
Expand Down
4 changes: 2 additions & 2 deletions validators/deployment/gpu_operator_version.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ func findDeploymentConstraint(ctx *validators.Context, name string) (string, boo
}

func getGPUOperatorVersion(ctx context.Context, clientset kubernetes.Interface) (string, error) {
deploymentNames := []string{"gpu-operator", "nvidia-gpu-operator"}
namespaces := []string{"gpu-operator", "nvidia-gpu-operator", "kube-system"}
deploymentNames := []string{gpuOperatorNamespace, "nvidia-gpu-operator"}
namespaces := []string{gpuOperatorNamespace, "nvidia-gpu-operator", "kube-system"}

var lastErr error
for _, ns := range namespaces {
Expand Down
2 changes: 1 addition & 1 deletion validators/performance/nccl_all_reduce_bw_constraint.go
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ func waitForTrainingRuntime(ctx context.Context, dynamicClient dynamic.Interface
select {
case <-waitCtx.Done():
return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout, "timed out waiting for TrainingRuntime to be visible", waitCtx.Err())
case <-time.After(500 * time.Millisecond):
case <-time.After(defaults.TrainingRuntimePollInterval):
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion validators/performance/trainer_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ func waitForTrainerControllerReady(ctx context.Context, dynamicClient dynamic.In
case <-waitCtx.Done():
return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout,
"timed out waiting for Trainer controller-manager to become ready", waitCtx.Err())
case <-time.After(2 * time.Second):
case <-time.After(defaults.TrainerControllerPollInterval):
}
}
}
Expand Down
5 changes: 3 additions & 2 deletions validators/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"os"
"path/filepath"

"github.com/NVIDIA/aicr/pkg/defaults"
aicrerrors "github.com/NVIDIA/aicr/pkg/errors"
)

Expand Down Expand Up @@ -104,8 +105,8 @@ func runCheck(checkFn CheckFunc) int {
func writeTerminationLog(format string, args ...any) {
msg := fmt.Sprintf(format, args...)
slog.Error("FAIL", "message", msg)
if len(msg) > 4096 {
msg = msg[:4096]
if len(msg) > defaults.TerminationLogMaxSize {
msg = msg[:defaults.TerminationLogMaxSize]
}
_ = os.WriteFile(filepath.Clean(terminationLogPath), []byte(msg), 0o600) //nolint:gosec // Fixed path, not user-controlled
}
Loading