From f27428c66fd763211b406f170d4a239bb6757188 Mon Sep 17 00:00:00 2001
From: Mark Chmarny <mark@chmarny.com>
Date: Thu, 16 Apr 2026 04:33:36 -0700
Subject: [PATCH] refactor: replace magic literals and fmt.Errorf with named
 constants and pkg/errors

Addresses findings from a full codebase review:

- Replace fmt.Errorf in pkg/logging with errors.Wrap
- Wrap bare sentinel error returns in pkg/version with error codes
- Extract 11 named constants to pkg/defaults (timeouts, limits, K8s namespaces)
- Replace hardcoded timeout values with defaults constants across validators
- Replace hardcoded K8s resource names with defaults.GPUOperatorNamespace
- Fix context cancellation race in sysctl_test.go
---
 pkg/cli/validate.go                           |  2 +-
 pkg/collector/os/sysctl_test.go               |  4 +-
 pkg/defaults/timeouts.go                      | 53 +++++++++++++++++++
 pkg/k8s/agent/deployer.go                     |  3 +-
 pkg/logging/cli.go                            |  4 +-
 pkg/version/version.go                        |  4 +-
 .../conformance/ai_service_metrics_check.go   |  4 +-
 .../conformance/cluster_autoscaling_check.go  |  8 +--
 .../conformance/gpu_operator_health_check.go  |  7 +--
 .../conformance/pod_autoscaling_check.go      |  2 +-
 validators/conformance/secure_access_check.go |  4 +-
 validators/deployment/gpu_operator_version.go |  4 +-
 .../nccl_all_reduce_bw_constraint.go          |  2 +-
 validators/performance/trainer_lifecycle.go   |  2 +-
 validators/runner.go                          |  5 +-
 15 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/pkg/cli/validate.go b/pkg/cli/validate.go
index 3064e0e1e..762e4478d 100644
--- a/pkg/cli/validate.go
+++ b/pkg/cli/validate.go
@@ -588,7 +588,7 @@ func runCNCFSubmission(ctx context.Context, evidenceDir string, features []strin
 		}
 	}
 
-	cncfTimeout := 20 * time.Minute //nolint:mnd // CNCF submission deploys GPU workloads and runs HPA tests
+	cncfTimeout := defaults.CNCFSubmissionTimeout
 	ctx, cancel := context.WithTimeout(ctx, cncfTimeout)
 	defer cancel()
 
diff --git a/pkg/collector/os/sysctl_test.go b/pkg/collector/os/sysctl_test.go
index 0f0e62c75..1bc3c7dfb 100644
--- a/pkg/collector/os/sysctl_test.go
+++ b/pkg/collector/os/sysctl_test.go
@@ -21,6 +21,7 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 
 	"github.com/NVIDIA/aicr/pkg/measurement"
 )
@@ -34,7 +35,8 @@ func TestSysctlCollector_Collect_ContextCancellation(t *testing.T) {
 
 	// Start collection and cancel mid-way
 	go func() {
-		// Give it a moment to start walking
+		// Give it a moment to start walking before canceling
+		time.Sleep(10 * time.Millisecond)
 		cancel()
 	}()
 
diff --git a/pkg/defaults/timeouts.go b/pkg/defaults/timeouts.go
index 940e64abc..93e8ed743 100644
--- a/pkg/defaults/timeouts.go
+++ b/pkg/defaults/timeouts.go
@@ -369,6 +369,59 @@ const (
 	FileParserMaxSize = 1 << 20 // 1MB
 )
 
+// Validator runtime class check timeout.
+const (
+	// RuntimeClassCheckTimeout is the timeout for verifying RuntimeClass
+	// existence in the cluster during agent deployment.
+	RuntimeClassCheckTimeout = 5 * time.Second
+)
+
+// CNCF conformance submission timeout.
+const (
+	// CNCFSubmissionTimeout is the timeout for CNCF submission evidence
+	// collection. CNCF submission deploys GPU workloads and runs HPA tests.
+	CNCFSubmissionTimeout = 20 * time.Minute
+)
+
+// Retry poll intervals for validator wait loops.
+const (
+	// TrainerControllerPollInterval is the retry interval when waiting
+	// for the Kubeflow Trainer controller-manager to become ready.
+	TrainerControllerPollInterval = 2 * time.Second
+
+	// TrainingRuntimePollInterval is the retry interval when waiting
+	// for a TrainingRuntime resource to become visible via the API.
+	TrainingRuntimePollInterval = 500 * time.Millisecond
+)
+
+// Termination and truncation limits for validator output.
+const (
+	// TerminationLogMaxSize is the maximum size in bytes of the K8s
+	// termination log message written to /dev/termination-log.
+	TerminationLogMaxSize = 4096
+
+	// ConfigMapStatusTruncateLen is the maximum length for ConfigMap
+	// status data before truncation in autoscaler status collection.
+	ConfigMapStatusTruncateLen = 2000
+
+	// AutoscalerMaxEvents is the maximum number of autoscaler events
+	// to capture when collecting cluster autoscaler evidence.
+	AutoscalerMaxEvents = 10
+
+	// MetricsDisplayLimit is the maximum number of custom metrics
+	// resources to display in AI service metrics evidence.
+	MetricsDisplayLimit = 20
+)
+
+// Well-known Kubernetes resource names shared across validators.
+const (
+	// GPUOperatorNamespace is the default namespace for the GPU operator.
+	GPUOperatorNamespace = "gpu-operator"
+
+	// KubeSystemNamespace is the standard kube-system namespace.
+	KubeSystemNamespace = "kube-system"
+)
+
 // Attestation file size limits.
 const (
 	// MaxSigstoreBundleSize is the maximum size in bytes for a .sigstore.json file.
diff --git a/pkg/k8s/agent/deployer.go b/pkg/k8s/agent/deployer.go
index bcfff0946..36c1d83f8 100644
--- a/pkg/k8s/agent/deployer.go
+++ b/pkg/k8s/agent/deployer.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/NVIDIA/aicr/pkg/defaults"
 	aicrerrors "github.com/NVIDIA/aicr/pkg/errors"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -157,7 +158,7 @@ func (d *Deployer) Cleanup(ctx context.Context, opts CleanupOptions) error {
 
 // validateRuntimeClass checks that the specified RuntimeClass exists in the cluster.
 func (d *Deployer) validateRuntimeClass(ctx context.Context) error {
-	ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
+	ctx, cancel := context.WithTimeout(ctx, defaults.RuntimeClassCheckTimeout)
 	defer cancel()
 
 	_, err := d.clientset.NodeV1().RuntimeClasses().Get(ctx, d.config.RuntimeClassName, metav1.GetOptions{})
diff --git a/pkg/logging/cli.go b/pkg/logging/cli.go
index 0747b0342..d5a4c89e1 100644
--- a/pkg/logging/cli.go
+++ b/pkg/logging/cli.go
@@ -21,6 +21,8 @@ import (
 	"log/slog"
 	"os"
 	"strings"
+
+	"github.com/NVIDIA/aicr/pkg/errors"
 )
 
 // ANSI color codes
@@ -87,7 +89,7 @@ func (h *CLIHandler) Handle(_ context.Context, r slog.Record) error {
 	}
 
 	if _, err := fmt.Fprintln(h.writer, msg); err != nil {
-		return fmt.Errorf("failed to write log output: %w", err)
+		return errors.Wrap(errors.ErrCodeInternal, "failed to write log output", err)
 	}
 	return nil
 }
diff --git a/pkg/version/version.go b/pkg/version/version.go
index 8beb96656..5e2cfb1e2 100644
--- a/pkg/version/version.go
+++ b/pkg/version/version.go
@@ -81,7 +81,7 @@ func (v Version) String() string {
 func ParseVersion(s string) (Version, error) {
 	// Check for empty string
 	if s == "" {
-		return Version{}, ErrEmptyVersion
+		return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "empty version string", ErrEmptyVersion)
 	}
 
 	// Strip 'v' prefix if present
@@ -107,7 +107,7 @@ func ParseVersion(s string) (Version, error) {
 	// Split by dots
 	parts := strings.Split(mainPart, ".")
 	if len(parts) > 3 {
-		return Version{}, ErrTooManyComponents
+		return Version{}, pkgerrors.Wrap(pkgerrors.ErrCodeInvalidRequest, "too many version components", ErrTooManyComponents)
 	}
 
 	// Parse each component
diff --git a/validators/conformance/ai_service_metrics_check.go b/validators/conformance/ai_service_metrics_check.go
index b30dfcedf..004acbdd8 100644
--- a/validators/conformance/ai_service_metrics_check.go
+++ b/validators/conformance/ai_service_metrics_check.go
@@ -185,8 +185,8 @@ func checkAIServiceMetricsWithURL(ctx *validators.Context, promBaseURL string) e
 	}
 	var resources strings.Builder
 	limit := len(customMetricsResp.Resources)
-	if limit > 20 {
-		limit = 20
+	if limit > defaults.MetricsDisplayLimit {
+		limit = defaults.MetricsDisplayLimit
 	}
 	for i := 0; i < limit; i++ {
 		r := customMetricsResp.Resources[i]
diff --git a/validators/conformance/cluster_autoscaling_check.go b/validators/conformance/cluster_autoscaling_check.go
index 11c4b09fa..ef41f0ed5 100644
--- a/validators/conformance/cluster_autoscaling_check.go
+++ b/validators/conformance/cluster_autoscaling_check.go
@@ -554,7 +554,7 @@ func checkEKSAutoscaling(ctx *validators.Context) error {
 
 	// Check for Cluster Autoscaler deployment (optional — EKS may use Karpenter or managed scaling).
 	// Search common namespaces since Cluster Autoscaler can be deployed anywhere.
-	caNamespaces := []string{"kube-system", "cluster-autoscaler", "system"}
+	caNamespaces := []string{defaults.KubeSystemNamespace, "cluster-autoscaler", "system"}
 	caDeployNames := []string{"cluster-autoscaler", "cluster-autoscaler-aws-cluster-autoscaler"}
 	var caFound bool
 	for _, caNS := range caNamespaces {
@@ -642,8 +642,8 @@ func checkGKEAutoscaling(ctx *validators.Context) error {
 	if caErr == nil && caStatus != nil {
 		caStatusFound = true
 		statusData := caStatus.Data["status"]
-		if len(statusData) > 2000 {
-			statusData = statusData[:2000] + "\n... [truncated]"
+		if len(statusData) > defaults.ConfigMapStatusTruncateLen {
+			statusData = statusData[:defaults.ConfigMapStatusTruncateLen] + "\n... [truncated]"
 		}
 		recordRawTextArtifact(ctx, "Cluster Autoscaler Status",
 			"kubectl get configmap cluster-autoscaler-status -n kube-system -o jsonpath='{.data.status}'",
@@ -670,7 +670,7 @@ func checkGKEAutoscaling(ctx *validators.Context) error {
 	if evErr == nil {
 		var autoscalerEvents strings.Builder
 		count := 0
-		for i := len(events.Items) - 1; i >= 0 && count < 10; i-- {
+		for i := len(events.Items) - 1; i >= 0 && count < defaults.AutoscalerMaxEvents; i-- {
 			ev := events.Items[i]
 			if ev.Reason == "NotTriggerScaleUp" || ev.Reason == "ScaledUpGroup" ||
 				ev.Reason == "ScaleDown" || ev.Reason == "TriggeredScaleUp" {
diff --git a/validators/conformance/gpu_operator_health_check.go b/validators/conformance/gpu_operator_health_check.go
index dc8584dfa..07f702a13 100644
--- a/validators/conformance/gpu_operator_health_check.go
+++ b/validators/conformance/gpu_operator_health_check.go
@@ -17,6 +17,7 @@ package main
 import (
 	"fmt"
 
+	"github.com/NVIDIA/aicr/pkg/defaults"
 	"github.com/NVIDIA/aicr/pkg/errors"
 	"github.com/NVIDIA/aicr/validators"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -33,12 +34,12 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error {
 
 	// 1. GPU Operator Deployment running
 	// Check by listing deployments with the gpu-operator label in the namespace.
-	deploys, err := ctx.Clientset.AppsV1().Deployments("gpu-operator").List(ctx.Ctx, metav1.ListOptions{
+	deploys, err := ctx.Clientset.AppsV1().Deployments(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{
 		LabelSelector: "app.kubernetes.io/name=gpu-operator",
 	})
 	if err != nil || len(deploys.Items) == 0 {
 		// Fallback: try exact name match.
-		if verifyErr := verifyDeploymentAvailable(ctx, "gpu-operator", "gpu-operator"); verifyErr != nil {
+		if verifyErr := verifyDeploymentAvailable(ctx, defaults.GPUOperatorNamespace, "gpu-operator"); verifyErr != nil {
 			return errors.Wrap(errors.ErrCodeNotFound, "GPU operator deployment not found (checked label app.kubernetes.io/name=gpu-operator and exact name)", verifyErr)
 		}
 	}
@@ -65,7 +66,7 @@ func CheckGPUOperatorHealth(ctx *validators.Context) error {
 	}
 
 	// 3. DCGM exporter DaemonSet running
-	if err := verifyDaemonSetReady(ctx, "gpu-operator", "nvidia-dcgm-exporter"); err != nil {
+	if err := verifyDaemonSetReady(ctx, defaults.GPUOperatorNamespace, "nvidia-dcgm-exporter"); err != nil {
 		return errors.Wrap(errors.ErrCodeNotFound, "DCGM exporter check failed", err)
 	}
 
diff --git a/validators/conformance/pod_autoscaling_check.go b/validators/conformance/pod_autoscaling_check.go
index 1df79bb5a..efe143bb6 100644
--- a/validators/conformance/pod_autoscaling_check.go
+++ b/validators/conformance/pod_autoscaling_check.go
@@ -101,7 +101,7 @@ func CheckPodAutoscaling(ctx *validators.Context) error {
 
 	// 2. GPU custom metrics have data (poll with retries — adapter relist is 30s)
 	metrics := []string{"gpu_utilization", "gpu_memory_used", "gpu_power_usage"}
-	namespaces := []string{"gpu-operator", "dynamo-system"}
+	namespaces := []string{defaults.GPUOperatorNamespace, "dynamo-system"}
 
 	var found bool
 	var foundPath string
diff --git a/validators/conformance/secure_access_check.go b/validators/conformance/secure_access_check.go
index bed3ad454..82ecd8f34 100644
--- a/validators/conformance/secure_access_check.go
+++ b/validators/conformance/secure_access_check.go
@@ -193,7 +193,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn
 	}
 
 	// GPU operator pods.
-	operatorPods, err := ctx.Clientset.CoreV1().Pods("gpu-operator").List(ctx.Ctx, metav1.ListOptions{})
+	operatorPods, err := ctx.Clientset.CoreV1().Pods(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{})
 	if err != nil {
 		recordRawTextArtifact(ctx, "GPU operator pods", "kubectl get pods -n gpu-operator -o wide",
 			fmt.Sprintf("failed to list gpu-operator pods: %v", err))
@@ -207,7 +207,7 @@ func collectSecureAccessBaselineArtifacts(ctx *validators.Context, dynClient dyn
 	}
 
 	// GPU operator DaemonSets.
-	daemonSets, err := ctx.Clientset.AppsV1().DaemonSets("gpu-operator").List(ctx.Ctx, metav1.ListOptions{})
+	daemonSets, err := ctx.Clientset.AppsV1().DaemonSets(defaults.GPUOperatorNamespace).List(ctx.Ctx, metav1.ListOptions{})
 	if err != nil {
 		recordRawTextArtifact(ctx, "GPU operator DaemonSets", "kubectl get ds -n gpu-operator",
 			fmt.Sprintf("failed to list gpu-operator DaemonSets: %v", err))
diff --git a/validators/deployment/gpu_operator_version.go b/validators/deployment/gpu_operator_version.go
index 4a08cf8a3..a46df7f91 100644
--- a/validators/deployment/gpu_operator_version.go
+++ b/validators/deployment/gpu_operator_version.go
@@ -79,8 +79,8 @@ func findDeploymentConstraint(ctx *validators.Context, name string) (string, boo
 }
 
 func getGPUOperatorVersion(ctx context.Context, clientset kubernetes.Interface) (string, error) {
-	deploymentNames := []string{"gpu-operator", "nvidia-gpu-operator"}
-	namespaces := []string{"gpu-operator", "nvidia-gpu-operator", "kube-system"}
+	deploymentNames := []string{gpuOperatorNamespace, "nvidia-gpu-operator"}
+	namespaces := []string{gpuOperatorNamespace, "nvidia-gpu-operator", "kube-system"}
 
 	var lastErr error
 	for _, ns := range namespaces {
diff --git a/validators/performance/nccl_all_reduce_bw_constraint.go b/validators/performance/nccl_all_reduce_bw_constraint.go
index 686061ebf..a43dd1682 100644
--- a/validators/performance/nccl_all_reduce_bw_constraint.go
+++ b/validators/performance/nccl_all_reduce_bw_constraint.go
@@ -594,7 +594,7 @@ func waitForTrainingRuntime(ctx context.Context, dynamicClient dynamic.Interface
 		select {
 		case <-waitCtx.Done():
 			return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout, "timed out waiting for TrainingRuntime to be visible", waitCtx.Err())
-		case <-time.After(500 * time.Millisecond):
+		case <-time.After(defaults.TrainingRuntimePollInterval):
 		}
 	}
 }
diff --git a/validators/performance/trainer_lifecycle.go b/validators/performance/trainer_lifecycle.go
index 73395aa2f..5497dc8d8 100644
--- a/validators/performance/trainer_lifecycle.go
+++ b/validators/performance/trainer_lifecycle.go
@@ -267,7 +267,7 @@ func waitForTrainerControllerReady(ctx context.Context, dynamicClient dynamic.In
 		case <-waitCtx.Done():
 			return aicrErrors.Wrap(aicrErrors.ErrCodeTimeout,
 				"timed out waiting for Trainer controller-manager to become ready", waitCtx.Err())
-		case <-time.After(2 * time.Second):
+		case <-time.After(defaults.TrainerControllerPollInterval):
 		}
 	}
 }
diff --git a/validators/runner.go b/validators/runner.go
index 8743cd73c..7f50c00d9 100644
--- a/validators/runner.go
+++ b/validators/runner.go
@@ -21,6 +21,7 @@ import (
 	"os"
 	"path/filepath"
 
+	"github.com/NVIDIA/aicr/pkg/defaults"
 	aicrerrors "github.com/NVIDIA/aicr/pkg/errors"
 )
 
@@ -104,8 +105,8 @@ func runCheck(checkFn CheckFunc) int {
 func writeTerminationLog(format string, args ...any) {
 	msg := fmt.Sprintf(format, args...)
 	slog.Error("FAIL", "message", msg)
-	if len(msg) > 4096 {
-		msg = msg[:4096]
+	if len(msg) > defaults.TerminationLogMaxSize {
+		msg = msg[:defaults.TerminationLogMaxSize]
 	}
 	_ = os.WriteFile(filepath.Clean(terminationLogPath), []byte(msg), 0o600) //nolint:gosec // Fixed path, not user-controlled
 }