From f27428c66fd763211b406f170d4a239bb6757188 Mon Sep 17 00:00:00 2001 From: Mark Chmarny Date: Thu, 16 Apr 2026 04:33:36 -0700 Subject: [PATCH] refactor: replace magic literals and fmt.Errorf with named constants and pkg/errors Addresses findings from a full codebase review: - Replace fmt.Errorf in pkg/logging with errors.Wrap - Wrap bare sentinel error returns in pkg/version with error codes - Extract 11 named constants to pkg/defaults (timeouts, limits, K8s namespaces) - Replace hardcoded timeout values with defaults constants across validators - Replace hardcoded K8s resource names with defaults.GPUOperatorNamespace - Fix context cancellation race in sysctl_test.go --- pkg/cli/validate.go | 2 +- pkg/collector/os/sysctl_test.go | 4 +- pkg/defaults/timeouts.go | 53 +++++++++++++++++++ pkg/k8s/agent/deployer.go | 3 +- pkg/logging/cli.go | 4 +- pkg/version/version.go | 4 +- .../conformance/ai_service_metrics_check.go | 4 +- .../conformance/cluster_autoscaling_check.go | 8 +-- .../conformance/gpu_operator_health_check.go | 7 +-- .../conformance/pod_autoscaling_check.go | 2 +- validators/conformance/secure_access_check.go | 4 +- validators/deployment/gpu_operator_version.go | 4 +- .../nccl_all_reduce_bw_constraint.go | 2 +- validators/performance/trainer_lifecycle.go | 2 +- validators/runner.go | 5 +- 15 files changed, 84 insertions(+), 24 deletions(-) diff --git a/pkg/cli/validate.go b/pkg/cli/validate.go index 3064e0e1e..762e4478d 100644 --- a/pkg/cli/validate.go +++ b/pkg/cli/validate.go @@ -588,7 +588,7 @@ func runCNCFSubmission(ctx context.Context, evidenceDir string, features []strin } } - cncfTimeout := 20 * time.Minute //nolint:mnd // CNCF submission deploys GPU workloads and runs HPA tests + cncfTimeout := defaults.CNCFSubmissionTimeout ctx, cancel := context.WithTimeout(ctx, cncfTimeout) defer cancel() diff --git a/pkg/collector/os/sysctl_test.go b/pkg/collector/os/sysctl_test.go index 0f0e62c75..1bc3c7dfb 100644 --- a/pkg/collector/os/sysctl_test.go +++ b/pkg/collector/os/sysctl_test.go @@ -21,6 +21,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/NVIDIA/aicr/pkg/measurement" ) @@ -34,7 +35,8 @@ func TestSysctlCollector_Collect_ContextCancellation(t *testing.T) { // Start collection and cancel mid-way go func() { - // Give it a moment to start walking + // Give it a moment to start walking before canceling + time.Sleep(10 * time.Millisecond) cancel() }() diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go index 940e64abc..93e8ed743 100644 --- a/pkg/defaults/timeouts.go +++ b/pkg/defaults/timeouts.go @@ -369,6 +369,59 @@ const ( FileParserMaxSize = 1 << 20 // 1MB ) +// Validator runtime class check timeout. +const ( + // RuntimeClassCheckTimeout is the timeout for verifying RuntimeClass + // existence in the cluster during agent deployment. + RuntimeClassCheckTimeout = 5 * time.Second +) + +// CNCF conformance submission timeout. +const ( + // CNCFSubmissionTimeout is the timeout for CNCF submission evidence + // collection. CNCF submission deploys GPU workloads and runs HPA tests. + CNCFSubmissionTimeout = 20 * time.Minute +) + +// Retry poll intervals for validator wait loops. +const ( + // TrainerControllerPollInterval is the retry interval when waiting + // for the Kubeflow Trainer controller-manager to become ready. + TrainerControllerPollInterval = 2 * time.Second + + // TrainingRuntimePollInterval is the retry interval when waiting + // for a TrainingRuntime resource to become visible via the API. + TrainingRuntimePollInterval = 500 * time.Millisecond +) + +// Termination and truncation limits for validator output. +const ( + // TerminationLogMaxSize is the maximum size in bytes of the K8s + // termination log message written to /dev/termination-log. + TerminationLogMaxSize = 4096 + + // ConfigMapStatusTruncateLen is the maximum length for ConfigMap + // status data before truncation in autoscaler status collection. + ConfigMapStatusTruncateLen = 2000 + + // AutoscalerMaxEvents is the maximum number of autoscaler events + // to capture when collecting cluster autoscaler evidence. + AutoscalerMaxEvents = 10 + + // MetricsDisplayLimit is the maximum number of custom metrics + // resources to display in AI service metrics evidence. + MetricsDisplayLimit = 20 +) + +// Well-known Kubernetes resource names shared across validators. +const ( + // GPUOperatorNamespace is the default namespace for the GPU operator. + GPUOperatorNamespace = "gpu-operator" + + // KubeSystemNamespace is the standard kube-system namespace. + KubeSystemNamespace = "kube-system" +) + // Attestation file size limits. const ( // MaxSigstoreBundleSize is the maximum size in bytes for a .sigstore.json file. diff --git a/pkg/k8s/agent/deployer.go b/pkg/k8s/agent/deployer.go index bcfff0946..36c1d83f8 100644 --- a/pkg/k8s/agent/deployer.go +++ b/pkg/k8s/agent/deployer.go @@ -21,6 +21,7 @@ import ( "strings" "time" + "github.com/NVIDIA/aicr/pkg/defaults" aicrerrors "github.com/NVIDIA/aicr/pkg/errors" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -157,7 +158,7 @@ func (d *Deployer) Cleanup(ctx context.Context, opts CleanupOptions) error { // validateRuntimeClass checks that the specified RuntimeClass exists in the cluster. func (d *Deployer) validateRuntimeClass(ctx context.Context) error { - ctx, cancel := context.WithTimeout(ctx, 5*time.Second) + ctx, cancel := context.WithTimeout(ctx, defaults.RuntimeClassCheckTimeout) defer cancel() _, err := d.clientset.NodeV1().RuntimeClasses().Get(ctx, d.config.RuntimeClassName, metav1.GetOptions{}) diff --git a/pkg/logging/cli.go b/pkg/logging/cli.go index 0747b0342..d5a4c89e1 100644 --- a/pkg/logging/cli.go +++ b/pkg/logging/cli.go @@ -21,6 +21,8 @@ import ( "log/slog" "os" "strings" + + "github.com/NVIDIA/aicr/pkg/errors" ) // ANSI color codes @@ -87,7 +89,7 @@ func (h *CLIHandler) Handle(_ context.Context, r slog.Record) error { } if _, err := fmt.Fprintln(h.writer, msg); err != nil { - return fmt.Errorf("failed to write log output: %w", err) + return errors.Wrap(errors.ErrCodeInternal, "failed to write log output", err) } return nil } diff --git a/pkg/version/version.go b/pkg/version/version.go index 8beb96656..5e2cfb1e2 100644 --- a/pkg/version/version.go +++ b/pkg/version/version.go @@ -81,7 +81,7 @@ func (v Version) String() string { func ParseVersion(s string) (Version, error) { // Check for empty string if s == "" { - return Version{}, ErrEmptyVersion + return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "empty version string", ErrEmptyVersion) } // Strip 'v' prefix if present @@ -107,7 +107,7 @@ func ParseVersion(s string) (Version, error) { // Split by dots parts := strings.Split(mainPart, ".") if len(parts) > 3 { - return Version{}, ErrTooManyComponents + return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "too many version components", ErrTooManyComponents) } // Parse each component diff --git a/validators/conformance/ai_service_metrics_check.go b/validators/conformance/ai_service_metrics_check.go index b30dfcedf..004acbdd8 100644 --- a/validators/conformance/ai_service_metrics_check.go +++ b/validators/conformance/ai_service_metrics_check.go @@ -185,8 +185,8 @@ func checkAIServiceMetricsWithURL(ctx *validators.Context, promBaseURL string) e } var resources strings.Builder limit := len(customMetricsResp.Resources) - if limit > 20 { - limit = 20 + if limit > defaults.MetricsDisplayLimit { + limit = defaults.MetricsDisplayLimit } for i := 0; i < limit; i++ { r := customMetricsResp.Resources[i] diff --git a/validators/conformance/cluster_autoscaling_check.go b/validators/conformance/cluster_autoscaling_check.go index 11c4b09fa..ef41f0ed5 100644 --- a/validators/conformance/cluster_autoscaling_check.go +++ b/validators/conformance/cluster_autoscaling_check.go @@ -554,7 +554,7 @@ func checkEKSAutoscaling(ctx *validators.Context) error { // Check for Cluster Autoscaler deployment (optional — EKS may use Karpenter or managed scaling). // Search common namespaces since Cluster Autoscaler can be deployed anywhere. - caNamespaces := []string{"kube-system", "cluster-autoscaler", "system"} + caNamespaces := []string{defaults.KubeSystemNamespace, "cluster-autoscaler", "system"} caDeployNames := []string{"cluster-autoscaler", "cluster-autoscaler-aws-cluster-autoscaler"} var caFound bool for _, caNS := range caNamespaces { @@ -642,8 +642,8 @@ func checkGKEAutoscaling(ctx *validators.Context) error { if caErr == nil && caStatus != nil { caStatusFound = true statusData := caStatus.Data["status"] - if len(statusData) > 2000 { - statusData = statusData[:2000] + "\n... [truncated]" + if len(statusData) > defaults.ConfigMapStatusTruncateLen { + statusData = statusData[:defaults.ConfigMapStatusTruncateLen] + "\n... [truncated]" } recordRawTextArtifact(ctx, "Cluster Autoscaler Status", "kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}'", @@ -670,7 +670,7 @@ func checkGKEAutoscaling(ctx *validators.Context) error { if evErr == nil { var autoscalerEvents strings.Builder count := 0 - for i := len(events.Items) - 1; i >= 0 && count < 10; i-- { + for i := len(events.Items) - 1; i >= 0 && count < defaults.AutoscalerMaxEvents; i-- { ev := events.Items[i] if ev.Reason == "NotTriggerScaleUp" || ev.Reason == "ScaledUpGroup" || ev.Reason == "ScaleDown" || ev.Reason == "TriggeredScaleUp" { diff --git a/validators/conformance/gpu_operator_health_check.go b/validators/conformance/gpu_operator_health_check.go index dc8584dfa..07f702a13 100644 --- a/validators/conformance/gpu_operator_health_check.go +++ b/validators/conformance/gpu_operator_health_check.go @@ -17,6 +17,7 @@ package main import ( "fmt" + "github.com/NVIDIA/aicr/pkg/defaults" "github.com/NVIDIA/aicr/pkg/errors" "github.com/NVIDIA/aicr/validators" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -33,12 +34,12 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error { // 1. GPU Operator Deployment running // Check by listing deployments with the gpu-operator label in the namespace. - deploys, err := ctx.Clientset.AppsV1().Deployments("gpu-operator").List(ctx.Ctx, metav1.ListOptions{ + deploys, err := ctx.Clientset.AppsV1().Deployments(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{ LabelSelector: "app.kubernetes.io/name=gpu-operator", }) if err != nil || len(deploys.Items) == 0 { // Fallback: try exact name match. - if verifyErr := verifyDeploymentAvailable(ctx, "gpu-operator", "gpu-operator"); verifyErr != nil { + if verifyErr := verifyDeploymentAvailable(ctx, defaults.GPUOperatorNamespace, "gpu-operator"); verifyErr != nil { return errors.Wrap(errors.ErrCodeNotFound, "GPU operator deployment not found (checked label app.kubernetes.io/name=gpu-operator and exact name)", verifyErr) } } @@ -65,7 +66,7 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error { } // 3. DCGM exporter DaemonSet running - if err := verifyDaemonSetReady(ctx, "gpu-operator", "nvidia-dcgm-exporter"); err != nil { + if err := verifyDaemonSetReady(ctx, defaults.GPUOperatorNamespace, "nvidia-dcgm-exporter"); err != nil { return errors.Wrap(errors.ErrCodeNotFound, "DCGM exporter check failed", err) } diff --git a/validators/conformance/pod_autoscaling_check.go b/validators/conformance/pod_autoscaling_check.go index 1df79bb5a..efe143bb6 100644 --- a/validators/conformance/pod_autoscaling_check.go +++ b/validators/conformance/pod_autoscaling_check.go @@ -101,7 +101,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error { // 2. GPU custom metrics have data (poll with retries — adapter relist is 30s) metrics := []string{"gpu_utilization", "gpu_memory_used", "gpu_power_usage"} - namespaces := []string{"gpu-operator", "dynamo-system"} + namespaces := []string{defaults.GPUOperatorNamespace, "dynamo-system"} var found bool var foundPath string diff --git a/validators/conformance/secure_access_check.go b/validators/conformance/secure_access_check.go index bed3ad454..82ecd8f34 100644 --- a/validators/conformance/secure_access_check.go +++ b/validators/conformance/secure_access_check.go @@ -193,7 +193,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn } // GPU operator pods. - operatorPods, err := ctx.Clientset.CoreV1().Pods("gpu-operator").List(ctx.Ctx, metav1.ListOptions{}) + operatorPods, err := ctx.Clientset.CoreV1().Pods(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{}) if err != nil { recordRawTextArtifact(ctx, "GPU operator pods", "kubectl get pods -n gpu-operator -o wide", fmt.Sprintf("failed to list gpu-operator pods: %v", err)) @@ -207,7 +207,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn } // GPU operator DaemonSets. - daemonSets, err := ctx.Clientset.AppsV1().DaemonSets("gpu-operator").List(ctx.Ctx, metav1.ListOptions{}) + daemonSets, err := ctx.Clientset.AppsV1().DaemonSets(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{}) if err != nil { recordRawTextArtifact(ctx, "GPU operator DaemonSets", "kubectl get ds -n gpu-operator", fmt.Sprintf("failed to list gpu-operator DaemonSets: %v", err)) diff --git a/validators/deployment/gpu_operator_version.go b/validators/deployment/gpu_operator_version.go index 4a08cf8a3..a46df7f91 100644 --- a/validators/deployment/gpu_operator_version.go +++ b/validators/deployment/gpu_operator_version.go @@ -79,8 +79,8 @@ func findDeploymentConstraint(ctx *validators.Context, name string) (string, boo } func getGPUOperatorVersion(ctx context.Context, clientset kubernetes.Interface) (string, error) { - deploymentNames := []string{"gpu-operator", "nvidia-gpu-operator"} - namespaces := []string{"gpu-operator", "nvidia-gpu-operator", "kube-system"} + deploymentNames := []string{gpuOperatorNamespace, "nvidia-gpu-operator"} + namespaces := []string{gpuOperatorNamespace, "nvidia-gpu-operator", "kube-system"} var lastErr error for _, ns := range namespaces { diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go index 686061ebf..a43dd1682 100644 --- a/validators/performance/nccl_all_reduce_bw_constraint.go +++ b/validators/performance/nccl_all_reduce_bw_constraint.go @@ -594,7 +594,7 @@ func waitForTrainingRuntime(ctx context.Context, dynamicClient dynamic.Interface select { case <-waitCtx.Done(): return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout, "timed out waiting for TrainingRuntime to be visible", waitCtx.Err()) - case <-time.After(500 * time.Millisecond): + case <-time.After(defaults.TrainingRuntimePollInterval): } } } diff --git a/validators/performance/trainer_lifecycle.go b/validators/performance/trainer_lifecycle.go index 73395aa2f..5497dc8d8 100644 --- a/validators/performance/trainer_lifecycle.go +++ b/validators/performance/trainer_lifecycle.go @@ -267,7 +267,7 @@ func waitForTrainerControllerReady(ctx context.Context, dynamicClient dynamic.In case <-waitCtx.Done(): return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout, "timed out waiting for Trainer controller-manager to become ready", waitCtx.Err()) - case <-time.After(2 * time.Second): + case <-time.After(defaults.TrainerControllerPollInterval): } } } diff --git a/validators/runner.go b/validators/runner.go index 8743cd73c..7f50c00d9 100644 --- a/validators/runner.go +++ b/validators/runner.go @@ -21,6 +21,7 @@ import ( "os" "path/filepath" + "github.com/NVIDIA/aicr/pkg/defaults" aicrerrors "github.com/NVIDIA/aicr/pkg/errors" ) @@ -104,8 +105,8 @@ func runCheck(checkFn CheckFunc) int { func writeTerminationLog(format string, args ...any) { msg := fmt.Sprintf(format, args...) slog.Error("FAIL", "message", msg) - if len(msg) > 4096 { - msg = msg[:4096] + if len(msg) > defaults.TerminationLogMaxSize { + msg = msg[:defaults.TerminationLogMaxSize] } _ = os.WriteFile(filepath.Clean(terminationLogPath), []byte(msg), 0o600) //nolint:gosec // Fixed path, not user-controlled }