From 889eea8240058a054bb3d582add83dc26b244ce7 Mon Sep 17 00:00:00 2001 From: stxkxs Date: Thu, 21 May 2026 19:29:56 -0700 Subject: [PATCH] feat: add a dedicated tainted node pool for sandbox workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `sandbox` Karpenter NodePool to karpenter-resources, the node substrate for the Managed Agents self-hosted sandbox workers. The pool carries an `agents.stxkxs.io/sandbox` taint and label, so sandbox worker pods land on dedicated nodes — agent tool execution is kept off the shared `default` pool. Consolidation is `WhenEmpty` (not `WhenEmptyOrUnderutilized`): a node is reclaimed once it drains its last session, and is never consolidated while a session is still running. This policy governs consolidation only; other disruption sources (e.g. spot interruption or drift) can still affect a node with a running session. It reuses the `default` EC2NodeClass and is sized for bursty, ephemeral work (`c`/`m` instances, fast 30s empty-node consolidation). The default NodePool and the per-environment overlays are unchanged — the overlays patch the `default` pool by name, so `sandbox` passes through untouched. --- .../karpenter-resources/base/nodepool.yaml | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/addons/operations/karpenter-resources/base/nodepool.yaml b/addons/operations/karpenter-resources/base/nodepool.yaml index fc232e2..8549ba0 100644 --- a/addons/operations/karpenter-resources/base/nodepool.yaml +++ b/addons/operations/karpenter-resources/base/nodepool.yaml @@ -33,3 +33,54 @@ spec: consolidateAfter: 1m budgets: - nodes: "20%" + +--- +# Dedicated, tainted node pool for self-hosted sandbox workers. +# +# Agent tool execution runs on these nodes, kept off the shared `default` +# pool by the `agents.stxkxs.io/sandbox` taint — sandbox worker pods carry +# the matching toleration + nodeSelector. Consolidation is `WhenEmpty` so a +# node draining its last session is reclaimed without disrupting a node +# that still has a session running.
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: sandbox  # pool name; distinct from the `default` pool
+spec:
+  template:
+    metadata:
+      labels:
+        agents.stxkxs.io/sandbox: "true"  # node label; quoted so it stays a string
+    spec:
+      taints:
+        - key: agents.stxkxs.io/sandbox  # same key as the node label above
+          value: "true"
+          effect: NoSchedule  # pods without a matching toleration are not scheduled here
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: default  # EC2NodeClass is defined outside this hunk
+      requirements:
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: ["spot", "on-demand"]  # NOTE(review): spot nodes can be interrupted mid-session - confirm acceptable
+        - key: kubernetes.io/arch
+          operator: In
+          values: ["amd64"]  # x86-64 only
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values: ["c", "m"]  # `c` and `m` instance categories only
+        - key: karpenter.k8s.aws/instance-generation
+          operator: Gt
+          values: ["4"]  # strictly greater than 4, i.e. generation 5 and newer
+        - key: karpenter.k8s.aws/instance-size
+          operator: In
+          values: ["medium", "large", "xlarge", "2xlarge"]  # nothing larger than 2xlarge
+      limits:
+        cpu: 100  # cap on total CPU across all nodes in this pool
+        memory: 200Gi  # cap on total memory across all nodes in this pool
+      disruption:
+        consolidationPolicy: WhenEmpty  # consolidate empty nodes only, never underutilized ones
+        consolidateAfter: 30s  # how long a node must stay empty before it is reclaimed
+        budgets:
+          - nodes: "20%"  # NOTE(review): no `reasons` set, so presumably covers every budgeted disruption reason - confirm