From 6ef702822976b62e55626bf4e2095f287c754f31 Mon Sep 17 00:00:00 2001
From: Antti-Ville Suni
Date: Wed, 29 Apr 2026 12:44:30 +0000
Subject: [PATCH] enable idle pre-emption by default

---
 chart/values.yaml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/chart/values.yaml b/chart/values.yaml
index a859bd76d..2527f6a41 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -86,14 +86,15 @@ admission:
 # GPU Preemption Configuration
 # Enables utilization-based preemption of idle GPU workloads
 gpuPreemption:
-  enabled: false
-  metricsEndpoint: "" # AMD GPU metrics exporter URL (required when enabled)
-  pollingInterval: "15s" # How often the scraper polls the metrics endpoint
-  defaultThreshold: "5" # Default utilization threshold (%) below which a workload is idle
-  defaultGracePeriod: "10m" # Default idle duration before preemption eligibility
-  defaultPolicy: "OnPressure" # Default preemption policy (OnPressure or Always)
-  defaultAggregation: "Max" # Default multi-pod aggregation (Min, Max, or Avg)
-  defaultTTL: "24h" # Default TTL for terminal GpuWorkload CRs ("0" = retain forever)
+  enabled: true
+  # AMD GPU metrics exporter URL (required when enabled); replace with the actual in-cluster URL of your exporter.
+  metricsEndpoint: "http://lgtm-stack.otel-lgtm-stack.svc.cluster.local:9090/federate?match[]=gpu_gfx_activity"
+  pollingInterval: "5s"  # How often the scraper polls the metrics endpoint
+  defaultThreshold: "5"  # Default utilization threshold (%) below which a workload is idle
+  defaultGracePeriod: "10m"  # Default idle duration before preemption eligibility
+  defaultPolicy: "OnPressure"  # Default preemption policy (OnPressure or Always)
+  defaultAggregation: "Max"  # Default multi-pod aggregation (Min, Max, or Avg)
+  defaultTTL: "24h"  # Default TTL for terminal GpuWorkload CRs ("0" = retain forever)
 
 # Optional KaiwoConfig - global operator configuration
 # If provided, a KaiwoConfig resource will be created