diff --git a/.github/workflows/e2e-suite.yaml b/.github/workflows/e2e-suite.yaml new file mode 100644 index 0000000..dce5b8a --- /dev/null +++ b/.github/workflows/e2e-suite.yaml @@ -0,0 +1,109 @@ +# Reusable e2e workflow (workflow_call): shared setup (build image, kind, deploy +# fluence base), then run ONE test suite — a directory under test/e2e/. The +# suite's tests are DISCOVERED (every NN-*.sh, run in sorted order); adding a test +# is just dropping a file in the directory, no workflow edit. If the suite needs +# special preparation it provides a setup.sh in its directory, which is run before +# the tests (the gang suite has none; the quantum suite installs the qpu add-on). +name: e2e-suite +on: + workflow_call: + inputs: + suite: + description: "test suite directory name under test/e2e/ (e.g. gang, quantum)" + required: true + type: string + +env: + IMAGE: vanessa/fluence:test + +jobs: + run: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build fluence image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: false + load: true + tags: ${{ env.IMAGE }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Create k8s Kind Cluster + uses: helm/kind-action@v1.10.0 + with: + version: v0.32.0 # required for gang + node_image: kindest/node:v1.36.1 + config: ./deploy/kind-config.yaml + + - name: Free Disk Space (Ubuntu) + run: | + sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \ + /opt/hostedtoolcache/CodeQL + sudo apt-get clean + df -h + + - name: Load docker images + run: | + cluster=$(kind get clusters) + kind load --name "$cluster" docker-image ${{ env.IMAGE }} + + - name: Deploy fluence (base) + run: | + kubectl apply -f deploy/fluence-test.yaml + kubectl rollout status -n kube-system deployment/fluence --timeout=180s + POD="" + for i in $(seq 1 60); do + POD=$(kubectl -n kube-system get pods -l app=fluence \ + -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) + [ -n "$POD" ] && break + sleep 2 + done + [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } + echo "Using pod: $POD" + sleep 5 + kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" || true + kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' + + # Per-suite special setup, if the suite directory provides one. + - name: Suite setup (${{ inputs.suite }}) + run: | + s="test/e2e/${{ inputs.suite }}/setup.sh" + if [ -f "$s" ]; then + echo "running $s" + bash "$s" + else + echo "no setup.sh for suite '${{ inputs.suite }}' — skipping" + fi + + # Discover and run every NN-*.sh in the suite directory, in sorted order. 
+ - name: Run suite (${{ inputs.suite }}) + run: | + dir="test/e2e/${{ inputs.suite }}" + [ -d "$dir" ] || { echo "ERROR: no such suite dir: $dir"; exit 1; } + shopt -s nullglob + tests=("$dir"/[0-9]*.sh) + [ ${#tests[@]} -gt 0 ] || { echo "ERROR: no NN-*.sh tests in $dir"; exit 1; } + IFS=$'\n' tests=($(sort <<<"${tests[*]}")); unset IFS + echo "discovered ${#tests[@]} test(s) in $dir:" + printf ' %s\n' "${tests[@]}" + for t in "${tests[@]}"; do + echo "::group::$t" + bash "$t" + echo "::endgroup::" + done + + - name: Dump diagnostics on failure + if: failure() + run: | + kubectl get pods -A -o wide + kubectl logs -n kube-system deployment/fluence || true + kubectl logs -n kube-system deployment/fluence-webhook || true diff --git a/.github/workflows/e2e-tests.yaml b/.github/workflows/e2e-tests.yaml index a6c1266..4b405f6 100644 --- a/.github/workflows/e2e-tests.yaml +++ b/.github/workflows/e2e-tests.yaml @@ -8,140 +8,15 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -env: - KIND_VERSION: v0.32.0 - IMAGE: vanessa/fluence:test - jobs: + # Fan out the suites as parallel jobs, each a call into the reusable workflow. + # The shared setup (build, kind, deploy) lives once in e2e-suite.yaml; the + # matrix runs gang and quantum concurrently. e2e: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build fluence image - uses: docker/build-push-action@v6 - with: - context: . 
- file: ./Dockerfile - push: false - load: true - tags: ${{ env.IMAGE }} - cache-from: type=gha - cache-to: type=gha,mode=max - - - name: Create k8s Kind Cluster - uses: helm/kind-action@v1.10.0 - with: - version: v0.32.0 # required for gang - node_image: kindest/node:v1.36.1 - config: ./deploy/kind-config.yaml - - - name: Free Disk Space (Ubuntu) - run: | - echo "=== Disk space before cleanup ===" - df -h - - # Remove large software runtimes and tools - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /opt/ghc - sudo rm -rf /opt/hostedtoolcache/CodeQL - - # Clean package caches - sudo apt-get clean - echo "=== Disk space after cleanup ===" - df -h - - - name: Load docker images - run: | - kind get clusters - cluster=$(kind get clusters) - kind load --name $cluster docker-image vanessa/fluence:test - - - name: Deploy fluence (base) - run: | - kubectl apply -f deploy/fluence-test.yaml - kubectl rollout status -n kube-system deployment/fluence --timeout=180s - # rollout status can return while the OLD ReplicaSet's pod is still - # Running (terminating). Selecting by phase=Running alone can grab that - # stale pod, which then 404s on exec/logs. Wait until exactly one - # fluence pod remains, and require it to be Ready and not terminating. 
- POD="" - for i in $(seq 1 60); do - # names of pods that are Ready AND have no deletionTimestamp (not terminating) - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- ls /tmp/ - kubectl -n kube-system logs "$POD" - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{": cpu="}{.status.allocatable.cpu}{" mem="}{.status.allocatable.memory}{"\n"}{end}' - - - name: E2E - classical gang - run: bash test/e2e/01-classical-gang.sh - - - name: Deploy quantum add-on - run: | - # Includes the device plugin and oriented to testing container - kubectl apply -f deploy/fluence-resources-test.yaml - kubectl rollout restart -n kube-system deployment/fluence - kubectl rollout status -n kube-system deployment/fluence --timeout=60s - for i in $(seq 1 60); do - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' - kubectl get nodes -o jsonpath='{range .items[*]}{.status.allocatable}{"\n"}{end}' | grep -q 'fluxion.flux-framework.org/qpu' && break - sleep 1 - done - # After a rollout restart BOTH the old and new pods are briefly Running. - # Select only a Ready pod with no deletionTimestamp (i.e. the new one, - # not the terminating old one) so exec/logs don't 404. 
- POD="" - for i in $(seq 1 60); do - POD=$(kubectl -n kube-system get pods -l app=fluence \ - -o go-template='{{range .items}}{{if not .metadata.deletionTimestamp}}{{$name := .metadata.name}}{{range .status.conditions}}{{if and (eq .type "Ready") (eq .status "True")}}{{$name}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}' 2>/dev/null | head -1 || true) - [ -n "$POD" ] && break - sleep 2 - done - [ -n "$POD" ] || { echo "ERROR: no Ready non-terminating fluence pod found after restart"; kubectl -n kube-system get pods -l app=fluence -o wide; exit 1; } - echo "Using pod: $POD" - # Brief sleep to let the container runtime stabilize before exec - sleep 5 - kubectl -n kube-system exec "$POD" -- /bin/bash -c "cat /tmp/fluence-graph-*.json" - - - name: Wait for webhook - run: | - - # wait for the deployment AND for the caBundle to be populated on the webhook config - kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=120s - for i in $(seq 1 30); do - cab=$(kubectl get mutatingwebhookconfiguration fluence-webhook \ - -o jsonpath='{.webhooks[0].clientConfig.caBundle}' 2>/dev/null) - [ -n "$cab" ] && break - sleep 2 - done - # let TLS serving settle after caBundle patch - sleep 3 - - - name: E2E - quantum placement - run: bash test/e2e/02-quantum-placement.sh - - #- name: E2E - restart recovery (no double-book) - # run: bash test/e2e/03-restart-recovery.sh - - - name: E2E - sidecar ungate - run: bash test/e2e/04-sidecar-ungate.sh - - - name: Dump diagnostics on failure - if: failure() - run: | - kubectl get pods -A -o wide - kubectl logs -n kube-system deployment/fluence + strategy: + fail-fast: false # one suite failing should not cancel the other + matrix: + suite: [gang, quantum] + uses: ./.github/workflows/e2e-suite.yaml + with: + suite: ${{ matrix.suite }} diff --git a/Makefile b/Makefile index 1160cb4..5e2c050 100644 --- a/Makefile +++ b/Makefile @@ -27,8 +27,8 @@ build: ## Build all binaries (scheduler needs flux-sched; 
helpers are pure Go) .PHONY: python python: - docker build -f python/Dockerfile -t ghcr.io/converged-computing/fluence-sidecar:latest ./python - docker push ghcr.io/converged-computing/fluence-sidecar:latest + docker build -f python/Dockerfile -t vanessa/fluence-sidecar:latest ./python + docker push vanessa/fluence-sidecar:latest # kind load docker-image ghcr.io/converged-computing/fluence-sidecar:latest .PHONY: test @@ -55,13 +55,16 @@ test-image-deploy: test-image kubectl patch podgroup training -n default --type=merge -p '{"metadata":{"finalizers":null}}' || true kubectl delete deployments --all kubectl delete pods --all - kubectl delete -f deploy/fluence-test.yaml + kubectl delete -f deploy/fluence-test.yaml || true kubectl delete pods --all +.PHONY: test-deploy-recreate +test-deploy-recreate: test-image-deploy + kubectl apply -f deploy/fluence-pull-test.yaml .PHONY: deploy deploy: ## Install RBAC + scheduler into kube-system kubectl apply -f deploy/fluence.yaml .PHONY: help help: diff --git a/README.md b/README.md index 3ee668f..d299757 100644 --- a/README.md +++ b/README.md @@ -194,10 +194,10 @@ ceiling. Types come from the same config as the graph, so they can't drift. ### `sidecars/` — quantum coordination sidecars -Vendor-specific sidecar containers injected by the webhook into leader pods -of quantum workflow groups. Each sidecar discovers the QPU task submitted by -the leader, polls the vendor queue, and ungates worker pods when the task -reaches position==1. +Vendor-specific sidecar containers injected by the webhook into the producer pod +of a shared quantum workflow group. Each sidecar discovers the QPU task submitted +by the producer, polls the vendor queue, and ungates the consumer pods when the +task reaches position==1. ```console sidecars/ @@ -221,7 +221,7 @@ spec: ``` Fluence creates the PodGroup, injects the sidecar, creates per-namespace -RBAC, and gates all non-leader pods. 
See `sidecars/braket/design.md` for +RBAC, and gates the consumer pods. See `sidecars/braket/design.md` for the full design including the SDK interceptor, queue position polling, and the two-queue problem motivation. @@ -369,88 +369,90 @@ Submission is **not** done by the scheduler — the workload container holds the user's credentials and submits via qrmi-go. Fluence only schedules and hands off the backend. (When we control local quantum devices this will change.) -### 3. Quantum workflow groups (leader + workers) +### 3. Quantum workflow groups (producer + consumers) -A quantum workflow group is one pod that **submits** quantum work (the leader) -plus N pods that **wait** for the result (the workers). All pods share a group -label; Fluence co-schedules them, gives the leader a sidecar that watches the -vendor queue, and gates the workers so they consume no node resources during the -(long, variable) QPU queue wait — releasing them only when the task reaches -`queue_position == 1`. +A quantum workflow group is a gang whose members share **one** quantum task: +one pod **submits** the work (the producer) and N−1 pods **wait** for the result +(the consumers). All pods share a group label and run the *same* image; Fluence +co-schedules them, gives the producer a sidecar that watches the vendor queue, and +gates the consumers so they consume no node resources during the (long, variable) +QPU queue wait — releasing them only when the task reaches `queue_position == 1`. ```yaml -# Every pod in the group carries the same group label + schedulerName: fluence +# Every pod in the group carries the same group label + schedulerName: fluence, +# and opts into shared coordination. metadata: labels: fluence.flux-framework.org/group: my-qaoa-workflow + annotations: + fluence.flux-framework.org/coordination: shared spec: schedulerName: fluence ``` -#### How the leader is chosen — two mechanisms +#### Coordination modes -There are two ways Fluence decides which pod is the leader. 
They are mutually -exclusive per group; pick the one that matches how your workload is built. +`fluence.flux-framework.org/coordination` selects how the gang is coordinated; it +defaults to `independent`. -**(a) Explicit role (recommended for leader/worker workflows).** Each pod -declares its role with an annotation. This is **authoritative**: admission order -is never consulted, and the same value is injected into the container as -`FLUENCE_ROLE` so your application reads the exact role Fluence used — the two -can never disagree. +- **`shared`** — the gang shares ONE quantum task. Fluence promotes one member to + producer and gates the rest as consumers (see below). Use this for a coordinated + workflow where the classical post-processing should start together as the single + result lands. +- **`independent`** (default) — every member does its own quantum work: its own + real submit, its own queue wait, no gating. N members run N tasks. This is the + honest default; Fluence never invents coordination you did not ask for, and + never dedups tasks meant to be distinct. -```yaml -metadata: - labels: - fluence.flux-framework.org/group: my-qaoa-workflow - annotations: - fluence.flux-framework.org/role: leader # or: worker -``` +#### How the producer is chosen -Use this when the leader and workers are **different** (the leader submits the -quantum task and runs the sidecar; workers process results). The leader gets the -interceptor + sidecar; workers are gated. Because the decision is declared, it is -race-free regardless of which pod the API server admits first. Your container can -branch on `$FLUENCE_ROLE` (e.g. `leader` → submit; `worker` → wait). +In `shared` mode the producer is the member the Job controller stamps with +`batch.kubernetes.io/job-completion-index: "0"` — so an **indexed Job** gives +deterministic, race-free election from a single identical template (every pod has +the same image and group label; only the index differs). 
This serves two contracts +with no extra configuration: -**(b) Admission order (default when no role annotation is present).** If pods -carry the group label but **no** role annotation, the **first pod admitted** -becomes the leader and every subsequent pod is a worker. This suits a -*homogeneous* pod-template gang (Deployment/Job/StatefulSet) where every replica -is byte-identical — any one of them can lead, so "first admitted" is a fine -tiebreaker. It is **not** suitable for a heterogeneous leader/worker workflow: -since admission order is nondeterministic, a worker pod could be admitted first -and wrongly elected leader. Use mechanism (a) for that case. +- an **explicit-role script** that branches on the completion index (index 0 + submits; others wait and consume the result), and +- an **identical script** where every pod calls submit — the producer's submit is + real, and each consumer's submit is transparently returned the producer's task + (the shared-result dedup), so the code need not branch at all. -> Rule of thumb: identical replicas → admission order is fine. Distinct -> leader/worker pods → use the explicit `role` annotation. +For loose pods with no completion index, the first pod admitted claims the producer +slot; an indexed Job is recommended when you need determinism. #### What Fluence does -Regardless of mechanism, the leader gets the sidecar and a PodGroup is created -(`minCount: 1`); workers get a `quantum.braket/ready` scheduling gate and consume -no node resources during the QPU queue wait. When the sidecar observes -`queue_position == 1`, it patches the task ARN onto each worker's annotations and -removes their gates atomically with setting the `fluence-quantum-classical` -priority class so they reschedule promptly. +In `shared` mode the producer gets the interceptor (real mode) + sidecar and its +own group-of-one PodGroup `<group>-producer` (`minCount: 1`), so it schedules +alone and runs the single real submit; it is never gated. 
The consumers join the +`<group>` gang (`minCount: N−1`), get a `quantum.braket/ready` scheduling gate, and +consume no node resources during the QPU queue wait. When the sidecar observes +`queue_position == 1`, it stamps the producer's task id onto each consumer +(surfaced as `FLUENCE_QUANTUM_JOB_ID`) and removes their gates atomically with +setting the `fluence-quantum-classical` priority class so they reschedule promptly. +The producer is one of the N members, so the application runs exactly N times — +never N+1, and there is no separate submitter pod. Per-namespace RBAC (`fluence-sidecar` ServiceAccount/Role/RoleBinding) and the -interceptor ConfigMap are created automatically by the webhook on first use — no +interceptor staging are created automatically by the webhook on first use — no manual setup required. ```bash -# Just apply your pods with the group label (+ optional role annotation) and +# Apply your pods with the group label + coordination annotation + # schedulerName: fluence. RBAC is created for you. kubectl apply -f my-quantum-workflow.yaml ``` -#### A note on the homogeneous "all submit" case +#### A note on the independent "all submit" case -A group where *every* pod submits its own quantum task (no leader/worker split) -is possible but rarely what you want: N independent submissions land in the -vendor queue and run at uncoordinated times, so there is no coordination benefit -from grouping them — you would just have N standalone quantum pods. For a single -quantum submission, use a standalone pod (no group label, see §2). For a -coordinated workflow, use the leader/worker form above with an explicit role. +`coordination: independent` (the default) means *every* pod submits its own +quantum task: N independent submissions land in the vendor queue and run at +uncoordinated times. That is correct and sometimes exactly what you want (N +distinct circuits), but it offers no coordination benefit from grouping — it is +equivalent to N standalone quantum pods. 
For a single quantum submission, use a +standalone pod (no group label, see §2). For a coordinated workflow that shares +one result, use `coordination: shared` above. ### Notes diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index ea2669a..1a6709d 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -12,9 +12,11 @@ package main import ( "context" "crypto/tls" + "flag" "log" "net/http" "os" + "strings" "time" "github.com/converged-computing/fluence/pkg/cluster" @@ -38,6 +40,29 @@ func main() { cfgName := env("WEBHOOK_CONFIG", "fluence-webhook") addr := env("WEBHOOK_ADDR", ":8443") + // Handler selection. By default ALL registered handlers are enabled. The + // operator may restrict the active set with --handlers (comma-separated) or + // the FLUENCE_HANDLERS env var, e.g. --handlers=fluxion,gang to run without + // quantum. An empty value means all enabled. Unknown names are warned about + // but not fatal (so config survives a handler being renamed/removed). + handlersFlag := flag.String("handlers", env("FLUENCE_HANDLERS", ""), + "comma-separated handlers in dispatch order (default: fluxion,quantum,gang). e.g. fluxion,gang disables quantum") + flag.Parse() + + var requested []string + if *handlersFlag != "" { + for _, n := range strings.Split(*handlersFlag, ",") { + if n = strings.TrimSpace(n); n != "" { + requested = append(requested, n) + } + } + } + active, unknown := webhook.SetActiveHandlers(requested) + for _, n := range unknown { + log.Printf("[fluence-webhook] WARNING: unknown handler %q — ignoring", n) + } + log.Printf("[fluence-webhook] active handlers (in dispatch order): %v", active) + dnsNames := []string{ svc + "." + ns + ".svc", svc + "." 
+ ns + ".svc.cluster.local", @@ -87,7 +112,6 @@ func main() { mutator := &webhook.Mutator{ AttributeKeys: attrKeys, Clientset: client, - SidecarImage: env("FLUENCE_SIDECAR_IMAGE", ""), } log.Printf("[fluence-webhook] env contract injected into fluxion pods: %v", mutator.EnvVarNames()) diff --git a/deploy/fluence-pull-test.yaml b/deploy/fluence-pull-test.yaml new file mode 100644 index 0000000..8e42158 --- /dev/null +++ b/deploy/fluence-pull-test.yaml @@ -0,0 +1,287 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluence + namespace: kube-system +--- +# Bind the built-in scheduler roles so fluence (a full kube-scheduler build) has +# every list/watch the scheduling framework needs (nodes, pods, PV/PVC, CSI, +# storageclasses, resourceclaims/slices, volumeattachments, events, etc.). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-kube-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-as-volume-scheduler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:volume-scheduler +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Delegated authentication: read the auth configmap in kube-system. This is the +# fix for the "extension-apiserver-authentication ... forbidden" errors. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluence-extension-apiserver-authentication-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +# Extras the built-in scheduler role does not grant: the alpha PodGroup/Workload +# API (gang), and leader-election leases under our scheduler name. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: fluence-extra +rules: + - apiGroups: ["scheduling.k8s.io"] + resources: ["podgroups", "workloads", "podgroups/status", "workloads/status"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update", "list", "watch"] + # PreBind stamps the allocated backend onto the pod as an annotation; the + # built-in system:kube-scheduler role only allows patching pods/status, not + # the pod object, so grant it here. + - apiGroups: [""] + resources: ["pods"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] + # The webhook self-manages its TLS by patching its own config's caBundle. + - apiGroups: ["admissionregistration.k8s.io"] + resources: ["mutatingwebhookconfigurations"] + verbs: ["get", "list", "watch", "patch"] + # The webhook creates per-namespace sidecar RBAC on demand when a leader + # pod is admitted, so users do not need to apply RBAC manually. 
+ - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get", "create"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "create"] + - apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "create"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: fluence-extra +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: fluence-extra +subjects: + - kind: ServiceAccount + name: fluence + namespace: kube-system +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluence-scheduler-config + namespace: kube-system +data: + scheduler-config.yaml: | + apiVersion: kubescheduler.config.k8s.io/v1 + kind: KubeSchedulerConfiguration + leaderElection: + leaderElect: false + profiles: + - schedulerName: fluence + plugins: + # multiPoint wires Fluence into every extension point its Go type + # implements: PreFilter, Filter, and PreBind (which stamps the backend + # annotation). Listing points individually risks omitting one — that is + # exactly what left PreBind unwired and the backend annotation unset. + multiPoint: + enabled: [{name: Fluence}] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence + namespace: kube-system + labels: {app: fluence} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence} + template: + metadata: + labels: {app: fluence} + spec: + serviceAccountName: fluence + containers: + - name: fluence + image: vanessa/fluence:test + imagePullPolicy: Always + command: + - /bin/fluence + - --config=/etc/fluence/scheduler-config.yaml + # fluence is its own scheduler binary, so it needs the gang gates set + # here (the cluster-level kube-scheduler gates don't apply to it). + # Without these its PodGroup/GangScheduling plugin is inactive, pods + # schedule with no gang semantics, and PodGroup status stays Pending. 
+ - --feature-gates=GenericWorkload=true,GangScheduling=true + - --v=4 + env: + # Path to the resources config (e.g. quantum backends). Unset/empty + # file -> classical-only graph. Supplied by the quantum add-on. + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + volumeMounts: + - name: config + mountPath: /etc/fluence + volumes: + - name: config + projected: + sources: + - configMap: {name: fluence-scheduler-config} + - configMap: {name: fluence-resources, optional: true} +--- +# Mutating webhook: injects scheduler-chosen values into pods at creation time +# (currently a downward-API QRMI_BACKEND env for quantum pods). It self-manages +# TLS — generates a CA + serving cert at startup and patches the caBundle below — +# so no cert-manager and no committed keys. failurePolicy Ignore keeps a webhook +# outage from blocking pod creation cluster-wide. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fluence-webhook + namespace: kube-system + labels: {app: fluence-webhook} +spec: + replicas: 1 + selector: + matchLabels: {app: fluence-webhook} + template: + metadata: + labels: {app: fluence-webhook} + spec: + serviceAccountName: fluence + containers: + - name: webhook + image: vanessa/fluence:test + imagePullPolicy: Always + command: ["/bin/fluence-webhook"] + # The webhook derives the FLUXION_* env contract (FLUXION_VENDOR, + # FLUXION_QRMI_TYPE, ...) from the resource graph's attribute keys, so + # it needs the same graph the scheduler and device plugin read. Without + # this it injects only FLUXION_BACKEND, and the sidecar can't route to + # a provider (which keys on qrmi_type). 
+ env: + - name: FLUENCE_RESOURCES + value: /etc/fluence/resources.yaml + ports: + - containerPort: 8443 + readinessProbe: + httpGet: {path: /healthz, port: 8443, scheme: HTTPS} + initialDelaySeconds: 2 + volumeMounts: + - name: config + mountPath: /etc/fluence + volumes: + - name: config + projected: + sources: + - configMap: {name: fluence-resources, optional: true} +--- +apiVersion: v1 +kind: Service +metadata: + name: fluence-webhook + namespace: kube-system +spec: + selector: {app: fluence-webhook} + ports: + - port: 443 + targetPort: 8443 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: fluence-webhook +webhooks: + - name: pods.fluence.flux-framework.org + admissionReviewVersions: ["v1"] + sideEffects: None + failurePolicy: Ignore # never block pod creation if the webhook is down + # caBundle is filled in at runtime by the webhook patching this object. + clientConfig: + service: + name: fluence-webhook + namespace: kube-system + path: /mutate + port: 443 + rules: + - apiGroups: [""] + apiVersions: ["v1"] + operations: ["CREATE"] + resources: ["pods"] + scope: Namespaced + # Don't intercept system pods (and avoid bootstrap coupling). + namespaceSelector: + matchExpressions: + - key: kubernetes.io/metadata.name + operator: NotIn + values: ["kube-system"] +# fluence-sidecar.yaml +# +# RBAC and supporting resources for the Fluence quantum sidecar. +# +# The sidecar runs inside a leader pod and needs: +# - patch/annotate on pods in its own namespace (to ungate workers and +# propagate the task ARN annotation) +# +# The sidecar ServiceAccount is namespace-scoped — it only has permissions +# in the namespace where the workflow runs. The webhook sets +# spec.serviceAccountName on the leader pod to fluence-sidecar. 
+# +# The fluence Python package is staged into user containers by an init +# container (Model C): the webhook injects an init container from the +# sidecar image that copies the package + sitecustomize into a shared +# volume on the user container's PYTHONPATH. No ConfigMap, no user install. +# +# Apply with: +# kubectl apply -f deploy/fluence-sidecar.yaml + + +--- +# PriorityClass for classical pods paired with quantum work. +# Applied to worker pods by the webhook when they are gated. +# When ungated, high priority triggers preemption of lower-priority work +# so workers get nodes immediately as the QPU result arrives. +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: fluence-quantum-classical + labels: + app: fluence +value: 1000000 +globalDefault: false +preemptionPolicy: PreemptLowerPriority +description: "High priority for classical pods paired with quantum work. Set by Fluence webhook." diff --git a/deploy/fluence-test.yaml b/deploy/fluence-test.yaml index 6d1dace..ab61a91 100644 --- a/deploy/fluence-test.yaml +++ b/deploy/fluence-test.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] @@ -146,6 +148,13 @@ spec: # Without these its PodGroup/GangScheduling plugin is inactive, pods # schedule with no gang semantics, and PodGroup status stays Pending. - --feature-gates=GenericWorkload=true,GangScheduling=true + # Re-attempt unschedulable pods more often than the 5m default. 
In the + # contention experiment a gang that loses the initial race for nodes is + # marked Unschedulable; this is how soon it is re-tried after capacity + # frees (the event-driven QueueingHint is best-effort; this is the + # backstop that bounds worst-case requeue latency). 30s keeps contended + # gangs draining promptly without thrashing the queue. + - --pod-max-in-unschedulable-pods-duration=30s - --v=4 env: # Path to the resources config (e.g. quantum backends). Unset/empty diff --git a/deploy/fluence.yaml b/deploy/fluence.yaml index b856268..7d71386 100644 --- a/deploy/fluence.yaml +++ b/deploy/fluence.yaml @@ -67,7 +67,9 @@ rules: # the pod object, so grant it here. - apiGroups: [""] resources: ["pods"] - verbs: ["get", "list", "watch", "patch", "update"] + # create/delete: the webhook creates the one-off quantum submitter pod + # (ensureSubmitterPod) and the scheduler reaps it during gang cleanup. + verbs: ["get", "list", "watch", "create", "patch", "update", "delete"] # The webhook self-manages its TLS by patching its own config's caBundle. 
- apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] diff --git a/deploy/kind-config.yaml b/deploy/kind-config.yaml index c94e070..ec310bc 100644 --- a/deploy/kind-config.yaml +++ b/deploy/kind-config.yaml @@ -32,4 +32,4 @@ nodes: - name: feature-gates value: "GenericWorkload=true" - role: worker - - role: worker + - role: worker \ No newline at end of file diff --git a/docs/coordination-handler-design.md b/docs/coordination-handler-design.md new file mode 100644 index 0000000..cdcfd38 --- /dev/null +++ b/docs/coordination-handler-design.md @@ -0,0 +1,387 @@ +# Coordination handlers: producer/consumer gang split (no separate submitter) + +> **Status: implemented.** This design is live in `pkg/webhook/handlers/quantum.go` +> (the coordination router + `mutateProducer`/`mutateConsumer`/`coordinationMode`/ +> `isProducer`), `pkg/webhook/handlers/gang.go` (classical gangs defer quantum +> pods to the quantum handler), and `pkg/fluence/fluence.go` (reconcile reaps the +> `<group>-producer` PodGroup, never the producer pod — it is a real member). +> Unit tests are in `pkg/webhook/handlers/quantum_test.go`; structural e2e in +> `test/e2e/quantum/02–04`. Coordination is **role-aware**: the webhook stamps +> `FLUENCE_COORDINATION_ROLE` (producer/consumer) and hands consumers the +> producer's task id (`FLUENCE_QUANTUM_JOB_ID`); the workload branches on the role +> (producer submits, consumer fetches the shared result by id). No submit +> interception, no faux flag — that earlier mechanism has been removed. + +## Why this replaces the submitter-pod model + +The `add-sidecar-interface` branch coordinates a quantum gang by creating a +*separate* one-off submitter pod (`<group>-submitter`) that runs the user's +application image to do the real submit, then ungates a gang of N faux-submitting +members.
That works, but it runs the user's application **N+1 times** for an +N-gang: once in the submitter (a full run whose post-processing nobody consumes) +plus once in each of the N members. The redundant run is not an implementation +wart — it is a symptom of modeling quantum work as a producer/consumer split +while pretending one image plays both roles, selected at runtime by a faux flag. + +This design keeps the split (it is correct) but removes the separate pod: the +**producer is one of the N members**, promoted at admission, so the application +runs exactly **N times** — the needed number — with exactly **one real submit**. + +The core thesis is unchanged: Fluence is a generic gang scheduler (native gangs +since k8s 1.36), and per-resource nuance lives in handlers. This is entirely a +change to the `quantum` handler plus a one-line deferral in the `gang` handler. + +## The fundamental constraint + +A quantum task's content (the circuit) comes from user code, so **the pod that +defines a task must run to submit it**. Therefore, per pod, *submit* and *gate* +are mutually exclusive — a pod either runs (and can submit) or is gated (and +cannot). Gating only ever buys resource savings for pods that **do not submit**: +pods that consume a result someone else produced. + +That partitions a quantum gang into two kinds, decided per pod: + +- **producer** — runs its code, submits its own task, holds a node through the + queue wait. Not gateable, ever. +- **consumer** — never submits; reads the producer's result. Fully gateable until + that result is ready. + +## Coordination modes (user-facing contract) + +Identical pod templates (a Job/Deployment) are genuinely ambiguous between "one +shared task, fan the result out to N pods" and "N independent tasks." 
Fluence +cannot infer this; the user declares it with one annotation on the pod template: + +```yaml +metadata: + annotations: + fluence.flux-framework.org/coordination: shared # or: independent +``` + +| mode | meaning | who submits | gating | app runs | real submits | +|------|---------|-------------|--------|----------|--------------| +| `independent` (default) | N pods each do their own quantum work | every pod | none possible (all are producers) | N | N | +| `shared` | one task; N pods consume the result | producer only | consumers gated until task ready | N | 1 | + +`coordination` is an open enum so future designs (e.g. `scatter` — index-paired +task↔pod, §6.2 of the quantum doc) slot in as new modes without changing the +mechanism. Default is `independent`: never invent coordination the user did not +ask for, and never dedup tasks that were meant to be distinct. + +### What each mode does to resources, honestly + +- **shared**: the producer (1 node) holds its node through the queue wait; + consumers (N−1) consume **zero** node resources while gated, then start at + position==1. Idle cost during the wait ≈ 1 node, vs N for a traditional gang. +- **independent**: every pod is a producer, so every pod holds its node through + its own queue wait — N nodes idle. There is nothing to coordinate (no shared + result), so this is not a Fluence deficiency; it is the physics of "N + independent tasks," and it is the user's explicit choice. The only way to + reclaim even the producer's node in either mode is a resumable `.result()` + (replay), which is deliberately **out of scope + for v1** (one idle node is cheap; replay imposes a replay-safe-code contract). + +## Producer election + +Exactly one member must be the producer. Election is deterministic for the +recommended workload and best-effort otherwise: + +- **Indexed Job (recommended):** the pod carries + `batch.kubernetes.io/job-completion-index`. **Index `0` is the producer**; + every other index is a consumer.
Deterministic, race-free, no recorded state — + the controller already stamped the index, and identical templates yield + differentiated behavior purely from it. This is why an indexed Job is the right + shape and is what the experiments use. +- **Non-indexed gang (Deployment / raw grouped pods):** first arrival claims the + producer slot by creating the producer PodGroup (create-if-absent); later pods + find it present and become consumers. Best-effort (racy under simultaneous + admission); documented, with indexed Job recommended for determinism. + +## The two-group split + +| | producer (index 0) | consumers (indices 1..N−1) | +|---|---|---| +| PodGroup | `<group>-producer`, `minCount=1` | `<group>`, `minCount=N−1` | +| schedules | immediately, alone | atomically as a gang, **after ungate** | +| gate | none | `quantum.braket/ready` + preempting priority | +| interceptor | staged (tags the real submit) | **not staged** (a consumer never submits) | +| sidecar | yes — polls the task, ungates `<group>` at position==1 | no | +| app run | full; submits the one real task | full; reads role=consumer and fetches the shared result by id (no submit) | + +`minCount=1` on the producer group is what removes the deadlock that forced a +separate submitter: a single-member group schedules alone, so the producer runs +during the wait while the `minCount=N−1` consumer group sits gated. The two +groups have independent minCounts; neither blocks the other. The consumer group +keeps a real gang `minCount` (N−1), so **gang scheduling is preserved and +demonstrable** (experiment requirement 1). + +Coordination is role-aware rather than interception-based: the consumer is told +`FLUENCE_COORDINATION_ROLE=consumer` and handed the producer's task id +(`FLUENCE_QUANTUM_JOB_ID`, stamped by the sidecar at ungate), and the workload +fetches the shared result by that id instead of submitting. One real task, N +consumers, each app run once, in full — and no SDK submit-interception.
+ +## Gate / ungate flow (shared mode) + +``` +1. Producer (index 0) admitted -> own group-of-one, ungated, sidecar attached + (FLUENCE_GANG_GROUP=<group>), interceptor in REAL mode. + Consumers (1..N-1) admitted -> group <group> (minCount N-1), GATED, + role=consumer, depends-on producer=<group>-producer. + +2. Scheduler places the producer immediately (minCount=1). It runs the user app, + .run() submits the ONE real task (tagged fluence-pod-uid). + +3. Producer sidecar discovers the task by tag, polls queue position. + +4. At position==1 (or RUNNING): for each gated pod in <group>: + annotate fluence.flux-framework.org/quantum-job-id=<task-id> + remove the quantum.braket/ready gate (priority already set at admission) + +5. Consumer group (now ungated, minCount N-1) gang-schedules atomically and + starts as the quantum result arrives. Each consumer reads role=consumer and + fetches the producer's task by FLUENCE_QUANTUM_JOB_ID (.result() returns the + shared result); app post-processes. No consumer submits. +``` + +`independent` mode skips all of this: each pod is its own group-of-one, ungated, +real submit, optional observe-only sidecar — i.e. today's standalone path applied +per pod. + +--- + +## Patch + +All changes are in `pkg/webhook/handlers/`. The webhook core, the `fluxion` +handler, `dependency.go`, `sidecar.go`, and the Python interceptor/sidecar are +**unchanged**. + +### `gang.go` — defer on quantum pods (removes the ordering dependency) + +The gang handler currently calls `EnsurePodGroup` unconditionally and relies on +idempotency to coexist with the quantum handler. With the two-group split the +quantum handler owns *both* quantum PodGroups (and the producer's group differs +from its admission-time label), so the gang handler must not also gang quantum +pods. Make it skip them: + +```go +func (h *gangHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { + // Classical gangs only.
A pod that requests the quantum resource is gang- + // scheduled by the quantum handler (which owns the producer/consumer split); + // handling it here too would create a second, conflicting PodGroup. + if spec.PodRequestsResource(pod, QuantumResource) { + return false + } + return webhook.GroupName(pod) != "" +} +``` + +### `quantum.go` — replace `Mutate` and the submitter machinery + +**Add** these constants (near the existing const block): + +```go +const ( + // CoordinationAnnotation selects how a quantum gang is coordinated. Open enum + // so new designs (e.g. "scatter") add a mode without changing the mechanism. + CoordinationAnnotation = "fluence.flux-framework.org/coordination" + // CoordinationShared: one real task; the producer (index 0) submits, the + // other members are gated consumers that dedup to the producer's task. + CoordinationShared = "shared" + // CoordinationIndependent (default): every member does its own quantum work; + // no coordination, no gating, each holds its node through its own queue wait. + CoordinationIndependent = "independent" + + // ProducerGroupSuffix names the producer's own group-of-one: -producer + // (minCount 1) so it schedules alone and never deadlocks against the gated + // consumer gang. + ProducerGroupSuffix = "-producer" + + // CompletionIndexAnnotation is the indexed-Job completion index the Job + // controller stamps on each pod; index "0" is the producer (deterministic + // election, no recorded state). + CompletionIndexAnnotation = "batch.kubernetes.io/job-completion-index" + // ProducerIndex is the completion index promoted to producer. + ProducerIndex = "0" +) +``` + +Keep `GangGroupEnv` (`FLUENCE_GANG_GROUP`) — it now tells the **producer's** +sidecar which consumer group to ungate. **Delete** the separate-submitter +constants and helpers: `SubmitterAnnotation`, `GangGroupAnnotation`, +`SubmitterGroupSuffix`, `SubmitterPodSuffix`, and the functions +`mutateSubmitter` and `ensureSubmitterPod`. 
Everything else in the file +(`resolveGroup`, `resolveGangSize`, `ownerReplicaSetN`, `countGroupPods`, +`linkGroupOps`, the role/job-id env section, the sidecar section) is reused unchanged. + +**Replace** `Mutate` with the coordination router plus two small role functions: + +```go +func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { + g := resolveGroup(pod) + n := resolveGangSize(ctx, m, pod, g) + mode := coordinationMode(pod) + observe := spec.Label(pod, ObserveLabel) == "true" + + // No coordination: a standalone quantum pod, or an explicitly independent + // member. The REAL submit happens in THIS pod; sidecar only for observe-only + // telemetry. (independent mode routes every member here -> N standalone + // producers, each owning its task and its own queue wait.) + if mode != CoordinationShared || g == "" || n <= 1 { + ops := interceptorOps(pod) + if observe { + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + ops = append(ops, sc.ContainerOps(pod, true, nil)...) + } + log.Printf("[fluence-webhook] quantum %s/%s mode=%s (standalone/independent, observe=%v)", + pod.Namespace, pod.Name, mode, observe) + return ops + } + + // shared mode: promote one member to producer; the rest are gated consumers. + if isProducer(ctx, m, pod, g) { + return h.mutateProducer(ctx, m, pod, g) + } + return h.mutateConsumer(ctx, m, pod, g, n) +} + +// mutateProducer: index-0 member. Its own group-of-one (minCount 1) so it +// schedules alone and runs the REAL submit; sidecar polls the task and ungates +// the consumer group. NOT gated, no faux. The producer is one of the N members, +// so the application is NOT run an extra time. +func (h *quantumHandler) mutateProducer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) []spec.Op { + pg := group + ProducerGroupSuffix + m.EnsurePodGroup(ctx, pod.Namespace, pg, pod.Name, 1) + ops := linkGroupOps(pod, pg) + ops = append(ops, interceptorOps(pod)...) 
// tags the real submit + ops = append(ops, roleEnvOps(pod, RoleProducer)...) // FLUENCE_COORDINATION_ROLE=producer + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + // Tell the sidecar which consumer group (the base group) to list + ungate. + ops = append(ops, sc.ContainerOps(pod, false, []corev1.EnvVar{{Name: GangGroupEnv, Value: group}})...) + log.Printf("[fluence-webhook] quantum producer %s/%s — group %s (ungates %q)", + pod.Namespace, pod.Name, pg, group) + return ops +} + +// mutateConsumer: a non-producer member. Joins the consumer gang +// (minCount N-1), is gated until the producer's task is ready, and is told its +// role (FLUENCE_COORDINATION_ROLE=consumer) + the producer's task id +// (FLUENCE_QUANTUM_JOB_ID). A consumer fetches the shared result by id; it never +// submits, so it gets neither the interceptor nor a faux flag. +func (h *quantumHandler) mutateConsumer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string, n int32) []spec.Op { + m.EnsurePodGroup(ctx, pod.Namespace, group, pod.Name, n-1) + ops := linkGroupOps(pod, group) + dep := Dependency{Kind: DependencyKindQuantumSubmit, Producer: group + ProducerGroupSuffix, Gate: QuantumGate} + ops = append(ops, dep.applyOps(pod)...) // gate + preempting priority + depends-on + ops = append(ops, consumerEnvOps(pod)...) // role=consumer + FLUENCE_QUANTUM_JOB_ID + log.Printf("[fluence-webhook] quantum consumer %s/%s — group %s minCount=%d, gated (role=consumer)", + pod.Namespace, pod.Name, group, n-1) + return ops +} + +// coordinationMode reads the coordination annotation; default independent. +func coordinationMode(pod *corev1.Pod) string { + if v := spec.Annotation(pod, CoordinationAnnotation); v != "" { + return v + } + return CoordinationIndependent +} + +// isProducer decides whether THIS pod is the gang's single producer. Indexed Job +// (recommended): completion index 0 is the producer — deterministic, race-free. 
+// Otherwise: first arrival claims the producer slot by the absence of the +// producer PodGroup (best-effort; prefer an indexed Job). +func isProducer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) bool { + if idx, ok := pod.Annotations[CompletionIndexAnnotation]; ok { + return idx == ProducerIndex + } + c := m.Client() + if c == nil { + return true // tests / no client: treat as producer + } + pg := group + ProducerGroupSuffix + if _, err := c.SchedulingV1alpha2().PodGroups(pod.Namespace).Get(ctx, pg, metav1.GetOptions{}); err == nil { + return false // already claimed by an earlier arrival + } + return true +} +``` + +Note: `pod.Annotations` may be nil; the `idx, ok := pod.Annotations[...]` form is +nil-safe in Go (indexing a nil map yields the zero value, `ok=false`). + +### Sidecar (Python) — no change + +The producer's sidecar already resolves the vendor at runtime from +`FLUXION_BACKEND`, discovers the task by the `fluence-pod-uid` tag, polls queue +position, and ungates the group named by `FLUENCE_GANG_GROUP` (now the consumer +group) at position==1, stamping `quantum-job-id` on each consumer before removing +its gate. That is exactly the existing flow with the producer in place of the +submitter pod. + +--- + +## Experiments + +Two requirements, both demonstrable on a kind cluster with the mock path and +on a real cluster with Braket. + +### Requirement 1 — Fluence still gang-schedules + +Unchanged classical-gang coverage plus a shared-mode assertion: + +- **Classical gang (regression):** keep `test/e2e/gang/*`. A `minCount=N` classical + PodGroup schedules all-or-nothing. This proves the generic gang machinery is + intact (the change only adds a quantum-pod deferral to `gang.Applies`). +- **Shared consumer gang (new assertion):** submit a `coordination: shared` + indexed Job of N. 
Assert: exactly one `<group>-producer` PodGroup (minCount 1) and + one `<group>` PodGroup (minCount N−1); the producer runs while the N−1 consumers + are `SchedulingGated`; after ungate the N−1 schedule **together** (gang), not + one-by-one. This proves gang scheduling still holds for the consumer group. + +### Requirement 2 — Both modes work and shared beats a traditional gang + +The metric that isolates the win is **classical node-seconds consumed during the +quantum queue wait** (lower is better), alongside correctness checks. + +Three arms, same N, same workload (the QAOA sampler), same backend: + +| arm | how | expected node-seconds during queue wait | correctness | +|-----|-----|------------------------------------------|-------------| +| **traditional gang** (baseline) | N pods all running, each waits the full queue (no Fluence coordination — e.g. a plain native gang, or `independent` with the same N) | ≈ **N × T_queue** | N pods each run; if they each submit, N real tasks | +| **shared** (new) | `coordination: shared` indexed Job, N pods | ≈ **1 × T_queue** (producer only; consumers gated) | **1** real task; all N pods produce the **same** result; app runs N times, never N+1 | +| **independent** (new) | `coordination: independent` indexed Job, N pods | ≈ **N × T_queue** (no coordination possible) | N distinct tasks/results; correct and the user's explicit choice (reported as the honest baseline, **not** claimed as an improvement) | + +Headline comparison is **shared vs traditional**: same observable result to the N +pods, but shared idles ~1 node through the queue wait instead of N, saving +≈ (N−1) × T_queue node-seconds, and runs the application N times rather than N+1 +(the submitter-pod model's extra run is gone). + +Instrumentation (reuse the Experiment 2 harness): +- per-pod `TIMING` lines → derive each pod's gated interval vs running interval; + sum running-but-pre-result node-seconds per arm. +- producer's sidecar logs queue position over time → T_queue.
+- assert real-submit count: shared = 1 (one tagged task on the backend), + independent/traditional = N (count tagged tasks). +- assert shared correctness: all N pods log the **same** task id / result hash. + +Suggested location: a new `experiments/4-coordination/` modeled on +`experiments/2-gang/` (it already measures idle reclamation), parameterized by the +`coordination` annotation and N, emitting node-seconds-during-wait, real-submit +count, and result-agreement per arm. Plot node-seconds vs N for the three arms: +traditional and independent rise ~linearly in N; shared stays ~flat at one node. + +### Build/run notes + +- The producer/consumer split needs no new image: producers and consumers run the + same role-aware sampler; the branch is `FLUENCE_COORDINATION_ROLE` + (producer submits; consumer fetches the shared result by `FLUENCE_QUANTUM_JOB_ID`). +- Use an **indexed** Job (`completionMode: Indexed`, `parallelism == completions == N`) + so producer election is deterministic (index 0) and `resolveGangSize` reads N + from the owner. Stamp `fluence.flux-framework.org/coordination` in the pod + template's annotations. +- kind/mock runs exercise the structural assertions (groups, gating, ungate + ordering) without a backend; real-Braket runs add the node-seconds and + real-submit-count measurements. diff --git a/docs/handlers.md b/docs/handlers.md new file mode 100644 index 0000000..ee70519 --- /dev/null +++ b/docs/handlers.md @@ -0,0 +1,83 @@ +# Webhook handlers & sidecar architecture + +Fluence's value is not creating gangs (Kubernetes 1.36 native gang scheduling +already does that). It is **customizing the gang on the fly based on the +resources a pod requests** — e.g. a shared quantum gang becomes a size-1 +producer gang plus a size-(N-1) consumer gang, with the producer running a +sidecar that ungates its consumers when the quantum task is ready. 
+ +## Handlers + +Each handler is an interface implementation (`pkg/webhook/handler.go`): + +```go +type Handler interface { + Name() string + Applies(ctx, m MutatorAPI, pod) bool + Mutate(ctx, m MutatorAPI, pod) []spec.Op +} +``` + +Handlers self-register by name (`init()` -> `webhook.Register`); a blank import +of the handlers package makes them AVAILABLE. The core never names a handler. + +**Ordering = the active list.** There is no per-handler priority. The active +handler list is BOTH the selection and the dispatch order: + +```go +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} +``` + +Dispatch walks this list in order. `gang` is last because it is last in the +list — the fallback that applies common defaults (honor `group-size`, else +owner-derived N) only if no earlier handler already shaped the gang. A +custom-resource handler is inserted into the list before `gang` to shape its own +gang first. To change the order, or disable a handler, pass a different list. + +## Enabling/disabling handlers + +By default ALL registered handlers are enabled. Restrict the active set on the +webhook command: + +``` +fluence-webhook --handlers=fluxion,gang # run without quantum +FLUENCE_HANDLERS=fluxion,quantum,gang fluence-webhook +``` + +Empty = the default list. The list is the order: `--handlers=gang,fluxion` runs +gang first; omitting a name disables it. Unknown names are dropped with a warning. + +(The handler set lives in the WEBHOOK, which mutates pods. `cmd/fluence` is the +scheduler plugin and runs no handlers.) + +## Sidecar interface + +The coordination sidecar is a handler-owned capability, not a core one. Handlers +that need a sidecar use `handlers.Sidecar`: + +```go +type Sidecar interface { + EnsureRBAC(ctx, namespace) + InterceptorOps(pod) []spec.Op + ContainerOps(pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op +} +``` + +The default `coreSidecar` delegates to the core's staging primitives.
The quantum +handler uses it today; a custom handler can supply its own implementation +(different image, env, gating) without touching the core or other handlers. The +core's `MutatorAPI` keeps the staging primitives only so the default +implementation can delegate — handlers do not call them directly. + +## Group size resolution (the default gang handler) + +`minCount` (the atomic-schedule count) resolves as: + +1. explicit `fluence.flux-framework.org/group-size` annotation — honored verbatim + (the override; e.g. a quantum split sets it directly); +2. else the owning indexed Job's `parallelism` (== MiniCluster size N); +3. else 1, logged. + +This is a common default available to every gang; handler-specific annotations +(quantum coordination mode, completion index, etc.) live in their handlers and are not +required by the core. diff --git a/docs/quantum-scheduling.md b/docs/quantum-scheduling.md index a6967ba..de32220 100644 --- a/docs/quantum-scheduling.md +++ b/docs/quantum-scheduling.md @@ -5,15 +5,15 @@ Hybrid quantum-classical workflows submit work to two independent queues: the Kubernetes scheduler (classical compute) and a QPU vendor API (quantum execution). Classical pods waste node resources while waiting for QPU queue -results. Fluence's coordination system thus gates classical worker pods until +results. Fluence's coordination system thus gates classical consumer pods until the QPU task is one position from executing, then releases them with high priority so they preempt lower-priority work and start immediately as the QPU result arrives. Yes, it could be the case the one task in the queue before -it takes a long time, but I think this is an improved approach than having worker +it takes a long time, but I think this is an improvement over having consumer pods running (and waiting) for a much longer queue.
This only is important -given that you have gangs, or leader worker designs where some leader is launching -the quantum work and otherwise the workers would be waiting and doing nothing -(and wasting resources). +given that you have gangs, or producer/consumer designs where one member is +launching the quantum work and otherwise the other members would be waiting and +doing nothing (and wasting resources). ## 1. The Two-Queue Problem @@ -67,11 +67,13 @@ queue wait — which is worse than the original problem. The design combines four mechanisms: -1. **SDK interceptor** — tags every QPU task with the pod UID -2. **Fluence webhook** — gates worker pods, injects sidecar into quantum pods +1. **SDK interceptor** — tags every submitted QPU task with the pod UID so the + sidecar can find it (staged only on pods that submit) +2. **Fluence webhook** — splits a shared quantum gang into one producer and N-1 + gated consumers; injects the sidecar into the producer 3. **Sidecar controller** — discovers the QPU task, polls queue position, - ungates workers when position==1 -4. **High-priority ungating** — workers preempt lower-priority work at the + ungates the consumers when position==1 +4. **High-priority ungating** — consumers preempt lower-priority work at the last responsible moment ### 3.0 When Fluence acts: the decision matrix @@ -84,19 +86,26 @@ determine what Fluence does: work and there is a vendor backend behind it. - **G (gang?)** — does the pod carry `fluence.flux-framework.org/group`? 
-| | not quantum | quantum | -|--------------|------------------------|----------------------------------------------------------------| -| **not gang** | group of 1 (nothing) | inject provider interceptor + env; **sidecar only in observe-only mode if telemetry requested** (no workers to ungate) | -| **gang** | gang-schedule only | leader: interceptor + env + sidecar (gates + ungates workers); workers: gate only | +A third property applies only to quantum gangs: the **coordination mode** +(`fluence.flux-framework.org/coordination`, default `independent`). In `shared` +mode the gang produces ONE quantum task that all members share; in `independent` +mode every member does its own quantum work. + +| | not quantum | quantum | +|--------------|------------------------|--------------------------------------------------------------------------------| +| **not gang** | group of 1 (nothing) | inject provider interceptor + env; **sidecar only in observe-only mode if telemetry requested** (nothing to ungate) | +| **gang** (independent) | gang-schedule only | every member is a standalone producer: interceptor + env, real submit, no gate | +| **gang** (shared) | — | producer (index 0): interceptor + env + sidecar, real submit, not gated, group-of-one `<group>-producer`, role=producer; consumers: gate + role=consumer + producer's task id, gang `<group>` (minCount N-1) | The crucial rule: **sidecar/interceptor injection is triggered by the quantum resource request, not the group label.** The group label only controls gang -scheduling and worker gating. A group leader that requests no quantum resource -(e.g. a classical pod that happens to set `BRAKET_DEVICE` itself) is just -gang-scheduled — Fluence injects no sidecar, because there is no quantum work -for it to coordinate. `BRAKET_DEVICE` (or any direct device selection by the -user) is the signal that Fluence is *not* scheduling the quantum resource; -`fluxion.flux-framework.org/qpu` is the signal that it is.
+scheduling and (in shared mode) the producer/consumer split. A grouped pod that +requests no quantum resource (e.g. a classical pod that happens to set +`BRAKET_DEVICE` itself) is just gang-scheduled — Fluence injects no sidecar, +because there is no quantum work for it to coordinate. `BRAKET_DEVICE` (or any +direct device selection by the user) is the signal that Fluence is *not* +scheduling the quantum resource; `fluxion.flux-framework.org/qpu` is the signal +that it is. ### 3.1 User interface @@ -106,10 +115,19 @@ The user labels all pods in a workflow group with: metadata: labels: fluence.flux-framework.org/group: my-workflow + annotations: + # only for a quantum gang that shares ONE task across members: + fluence.flux-framework.org/coordination: shared spec: schedulerName: fluence ``` +`coordination` defaults to `independent` (every member does its own quantum +work). Set it to `shared` when the members should share a single quantum task — +then Fluence promotes one member (the indexed-Job completion index 0) to producer +and gates the rest as consumers. The user authors no roles and no submitter pod; +the split is derived from the completion index the Job controller already stamps. + I initially started with having the user create a PodGroup object, and I found that annoying. I do not want to require a PodGroup object when an annotation is easier, and then I have fine-grained control of what the groups looks like. Fluence can handle @@ -117,7 +135,7 @@ everything else automatically. The namespace distinction: - `fluence.flux-framework.org/*` — Fluence scheduler-plugin concerns - (group label, leader annotation, gate name) + (group label, coordination mode, gate name) - `fluxion.flux-framework.org/*` — Fluxion resource-graph concerns (extended resource types, backend attribute env vars) @@ -140,31 +158,57 @@ The three handlers (`pkg/webhook/handlers/`): (backend + attributes) sourced from the annotations the scheduler writes in PreBind. 
Generic to all Fluxion resources. -**`gang` (`gang.go`)** — applies when the pod carries the group label. Creates a -Fluence-owned PodGroup (`minCount: 1`) on first admission, records that first -pod as the admission-order leader, and stamps `spec.schedulingGroup.podGroupName` -on every pod in the group so the scheduler gangs them. The user only ever sets -the LABEL; the webhook translates it into the native field, so the user never -creates a PodGroup or knows it exists. Knows nothing about quantum — a purely -classical gang is fully handled here, with no sidecar. +**`gang` (`gang.go`)** — applies when the pod carries the group label **and does +not request the quantum resource** (a quantum pod is gang-scheduled by the quantum +handler instead, which owns the producer/consumer split). Creates a Fluence-owned +PodGroup on first admission and stamps `spec.schedulingGroup.podGroupName` on +every pod in the group so the scheduler gangs them. The user only ever sets the +LABEL; the webhook translates it into the native field, so the user never creates +a PodGroup or knows it exists. Knows nothing about quantum — a purely classical +gang is fully handled here, with no sidecar. **`quantum` (`quantum.go`)** — the only handler that knows about quantum -resources, gates, and observe semantics. Applies to a pod in either role: -- **submitter** (requests `fluxion.flux-framework.org/qpu`): a group leader, or - a standalone quantum pod. Always gets the interceptor staged (so its task is - tagged). Gets the **sidecar** only when there is coordination to do — it is a - group leader (workers to ungate) or observe-only telemetry is requested. -- **worker** (a non-leader member of a group whose recorded leader is a quantum - pod): gets the `quantum.braket/ready` scheduling gate, entering - `SchedulingGated` state — invisible to Fluxion, consuming no resources — until - the leader's sidecar ungates it. - -Role is decided by **admission order**, not resource request. 
In a pod-template -gang (Deployment/Job/StatefulSet) every pod is identical — same group label, -every pod requests the quantum resource — so the leader is simply the first pod -admitted (recorded on the PodGroup); every other pod is a worker, regardless of -its own request. The gate holds workers at PreEnqueue, so the scheduler does not -run PreFilter for them (and `groupPods` excludes gated pods) until ungated. +resources, gates, coordination, and observe semantics. A quantum task's circuit +comes from user code, so the pod that defines a task must RUN to submit it: submit +and gate are mutually exclusive per pod, and gating only helps pods that do not +submit. The handler therefore routes each quantum pod to one of three roles: +- **standalone / independent** (a lone quantum pod, or any member of a gang in + the default `independent` mode): gets the interceptor staged (real mode) so its + own task is tagged, performs its own real submit, is never gated, and gets the + sidecar only when observe-only telemetry is requested. Independent mode means N + members run N tasks and hold N node-waits — honest physics, the user's explicit + default. +- **producer** (in `shared` mode, the completion index 0 member): its own + group-of-one `<group>-producer` (minCount 1) so it schedules alone and runs the + SINGLE real submit; interceptor in real mode; gets the **sidecar**, told which + consumer group to ungate (`FLUENCE_GANG_GROUP`); never gated. The producer is + one of the N members, so the application runs exactly N times — never N+1. +- **consumer** (in `shared` mode, the other N-1 members): joins the `<group>` + gang (minCount N-1), gets the `quantum.braket/ready` scheduling gate (entering + `SchedulingGated` — invisible to Fluxion, consuming no resources — until the + producer's sidecar ungates it), and is told its role + (`FLUENCE_COORDINATION_ROLE=consumer`) and the producer's task id + (`FLUENCE_QUANTUM_JOB_ID`, stamped at ungate). 
A consumer does **not** submit — + it fetches the shared result by that id — so it gets neither the interceptor nor + any faux flag. + +Role is decided by the **completion index**, not resource request or admission +order. In an indexed Job every pod is identical — same group label, same image, +every pod requests the quantum resource — so the producer is simply the pod the +Job controller stamps with `batch.kubernetes.io/job-completion-index: "0"`; every +other index is a consumer. (For loose pods with no completion index, the first +arrival claims the producer slot by the absence of the `<group>-producer` +PodGroup; an indexed Job is recommended for deterministic election.) The two +groups carry independent minCounts (producer=1, consumers=N-1), which is what lets +the producer schedule and submit while the consumers stay gated — no deadlock, and +no separate submitter pod. + +The workload is **role-aware**: every shared-mode pod is told its role positively +via `FLUENCE_COORDINATION_ROLE` (the webhook's election is the single source of +truth), and the application branches on it — the producer submits, a consumer +fetches the shared result by `FLUENCE_QUANTUM_JOB_ID`. The same image plays both +roles with one cheap branch; there is no submit-interception magic and no faux +flag. ### 3.3 Interceptor and Model C delivery @@ -207,7 +251,11 @@ def patched_run(self, task_specification, *args, **kwargs): This is completely transparent to the user application — no code changes, no package install, no vendor SDK added to the user image (the hook patches whatever SDK the user already has). -leader pod, sharing its AWS credentials and network namespace. + +### 3.4 Sidecar controller + +The sidecar runs as a container alongside the producer pod, sharing its AWS +credentials and network namespace. ```console 1. READ FLUXION_ARN, FLUENCE_POD_UID from env @@ -221,7 +269,7 @@ leader pod, sharing its AWS credentials and network namespace. 
On timeout: fall back to time-window heuristic (tasks submitted after pod start time on the same device). -3. DISCOVER worker pods: +3. DISCOVER consumer pods: List pods in namespace with fluence.flux-framework.org/group label matching this pod's group, having quantum.braket/ready gate present. @@ -229,7 +277,7 @@ leader pod, sharing its AWS credentials and network namespace. Log position for experiment instrumentation. 5. WHEN is_ready_to_ungate(task) (position == 1 OR state == RUNNING): - For each worker pod: + For each consumer pod: kubectl annotate pod fluence.flux-framework.org/quantum-job-id= kubectl patch pod --type=json \ -p='[{"op":"add","path":"/spec/priorityClassName", @@ -240,7 +288,7 @@ leader pod, sharing its AWS credentials and network namespace. ``` The priority class and gate removal are applied atomically in one patch. -This ensures workers enter the scheduling queue with high priority +This ensures consumers enter the scheduling queue with high priority immediately, without a window where they are ungated but low-priority. ### 3.5 Priority and preemption @@ -250,10 +298,10 @@ by the sidecar at ungate time, not by the webhook at pod creation. Setting it at creation time causes an admission controller conflict (priority integer already defaulted to 0). -When workers are ungated with high priority, Kubernetes preemption evicts +When consumers are ungated with high priority, Kubernetes preemption evicts lower-priority pods to make room. Fluence's pod deletion informer catches these evictions, calls `Cancel(jobid)` in Fluxion, and frees the graph -vertices so Fluxion can allocate them to the incoming high-priority workers. +vertices so Fluxion can allocate them to the incoming high-priority consumers. ### 3.6 Classical allocation follows quantum execution order @@ -298,7 +346,7 @@ Provider: find_my_task(pod_uid, ...) 
# search by the fluence-pod-uid tag → opaque Task is_ready_to_ungate(task) # decision primitive: position==1 OR running queue_position(task) # optional richer telemetry; None if unavailable - job_id(task) # cross-vendor id handed to workers (NOT the ARN) + job_id(task) # cross-vendor id handed to consumers (NOT the ARN) ``` Vendor-specific identifiers (a Braket task ARN, an IBM job id, a GCP operation @@ -320,7 +368,7 @@ matching provider). Nothing else changes — no build script, no concatenation. #### Observe-only (telemetry) mode -A quantum pod that is *not* a gang (a single quantum pod, no workers to ungate) +A quantum pod that is *not* a gang (a single quantum pod, no consumers to ungate) gets the interceptor and env only — no sidecar — by default, so no surprise machinery is injected. Telemetry is opt-in via the label `fluence.flux-framework.org/observe: "true"`, surfaced to the sidecar as @@ -345,7 +393,7 @@ singleton and gang runs. ### 5.1 Preemption disrupts lower-priority work -At position==1, workers preempt running lower-priority pods. This work is +At position==1, consumers preempt running lower-priority pods. This work is re-queued and eventually runs, but there is a disruption cost. A future design using a `MatchReserveAt(time_at, spec)` Fluxion primitive — where `time_at` is supplied by the QPU vendor via an ETA or task-start event — @@ -364,7 +412,7 @@ heuristic (e.g. a time window) rather than the tag mechanism. ### 5.3 Single task per workflow -The sidecar tracks one QPU task ARN per leader pod. Parameter-shift gradient +The sidecar tracks one QPU task ARN per producer pod. Parameter-shift gradient estimation and other multi-circuit workflows require tracking a set of ARNs. See the scatter design issue for the proposed extension. @@ -388,7 +436,7 @@ function to be exposed through the Go bindings with a `starttime` parameter. 
For workflows with N independent QPU tasks each paired with one classical pod, an index-based pairing mechanism (`fluence.flux-framework.org/index`) -would allow the sidecar to ungate specific worker pods when their specific +would allow the sidecar to ungate specific consumer pods when their specific task reaches position==1. See the open scatter design issue. ### 6.3 Vendor task-start events @@ -401,7 +449,7 @@ precise ungating. ### 6.4 PostFilter topology-aware preemption A custom Fluence `PostFilter` plugin would ask Fluxion which graph vertices -are blocking a high-priority worker pod, then target preemption at exactly +are blocking a high-priority consumer pod, then target preemption at exactly those pods — rather than the default Kubernetes preemption which picks lowest-priority pods regardless of graph topology. This ensures preemption always produces a valid Fluxion allocation. diff --git a/examples/quantum-pod.yaml b/examples/quantum-pod.yaml index a619df9..b5dfbc9 100644 --- a/examples/quantum-pod.yaml +++ b/examples/quantum-pod.yaml @@ -2,7 +2,7 @@ # via resources (the fluence device plugin advertises fluxion.flux-framework.org/qpu # on every node, so NodeResourcesFit is satisfied). Fluence's PreFilter matches # the request against the resource graph and picks a backend, the webhook injects -# QRMI_BACKEND (the allocated backend) automatically, and note we can add other +# FLUXION_BACKEND (the allocated backend) automatically, and note we can add other # envars here in the future. I chose a webhook because I think this is going to # be a requirement, and the pod is immutable after creation. # Then the container submits via qrmi-go (the separate qrmi-sampler image). 
@@ -27,4 +27,4 @@ spec: requests: fluxion.flux-framework.org/qpu: "1" limits: - fluxion.flux-framework.org/qpu: "1" \ No newline at end of file + fluxion.flux-framework.org/qpu: "1" diff --git a/examples/test/e2e/gang/multi-gang-contention.yaml b/examples/test/e2e/gang/multi-gang-contention.yaml new file mode 100644 index 0000000..14b0fd8 --- /dev/null +++ b/examples/test/e2e/gang/multi-gang-contention.yaml @@ -0,0 +1,40 @@ +# Two gangs that cannot both place: fluxion allocates one core per slot, so two +# 2-pod gangs need 4 cores, but the cluster graphs ~3 (3 workers, ~1 core each). One gang places entirely; the loser stays FULLY pending +# (all-or-nothing), never partial. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-a +spec: + replicas: 2 + selector: {matchLabels: {app: gang-a}} + template: + metadata: + labels: {app: gang-a, fluence.flux-framework.org/group: gang-a} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "1"}} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang-b +spec: + replicas: 2 + selector: {matchLabels: {app: gang-b}} + template: + metadata: + labels: {app: gang-b, fluence.flux-framework.org/group: gang-b} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + containers: + - name: w + image: busybox + command: ["sleep", "3600"] + resources: {requests: {cpu: "1"}} diff --git a/examples/test/e2e/gang/multi-gang-requeue.yaml b/examples/test/e2e/gang/multi-gang-requeue.yaml new file mode 100644 index 0000000..a8e8636 --- /dev/null +++ b/examples/test/e2e/gang/multi-gang-requeue.yaml @@ -0,0 +1,48 @@ +# Requeue-on-capacity + gang-atomicity test (test/e2e/gang/09). +# gang-win: a 2-pod gang that runs a SHORT job and COMPLETES (pods -> Succeeded), +# freeing its nodes. 
+# gang-wait: a 2-pod gang needing the same nodes; loses the initial race and sits +# Unschedulable. When gang-win completes, gang-wait must be re-attempted +# (via the shortened unschedulable-recheck timeout) and place atomically. +# On a 3-worker (~3-core) cluster the two 2-pod gangs (4 cores) cannot co-run. +apiVersion: batch/v1 +kind: Job +metadata: + name: gang-win +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-win} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 30"] # completes, frees nodes + resources: {requests: {cpu: "1"}} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: gang-wait +spec: + completions: 2 + parallelism: 2 + completionMode: Indexed + template: + metadata: + labels: {fluence.flux-framework.org/group: gang-wait} + annotations: {fluence.flux-framework.org/group-size: "2"} + spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: w + image: busybox + command: ["sh","-c","sleep 10"] + resources: {requests: {cpu: "1"}} \ No newline at end of file diff --git a/examples/test/e2e/gang/multi-gang.yaml b/examples/test/e2e/gang/multi-gang.yaml new file mode 100644 index 0000000..9bfa67c --- /dev/null +++ b/examples/test/e2e/gang/multi-gang.yaml @@ -0,0 +1,25 @@ +# Multi-pod gang via the WEBHOOK path (the path the experiments use +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gang3 +spec: + replicas: 2 + selector: + matchLabels: {app: gang3} + template: + metadata: + labels: + app: gang3 + fluence.flux-framework.org/group: gang3 + annotations: + fluence.flux-framework.org/group-size: "2" + spec: + schedulerName: fluence + containers: + - name: worker + image: busybox + command: ["sleep", "3600"] + resources: + requests: + cpu: "1" diff --git a/examples/single-podgroup.yaml 
b/examples/test/e2e/gang/single-podgroup.yaml similarity index 100% rename from examples/single-podgroup.yaml rename to examples/test/e2e/gang/single-podgroup.yaml diff --git a/examples/test/e2e/quantum/quantum-gang-pods.yaml b/examples/test/e2e/quantum/quantum-gang-pods.yaml new file mode 100644 index 0000000..aacce44 --- /dev/null +++ b/examples/test/e2e/quantum/quantum-gang-pods.yaml @@ -0,0 +1,62 @@ +# Shared-coordination quantum gang for the e2e (producer/consumer, no submitter). +# +# Two identical pods, both requesting the quantum resource, in group "qgang" with +# coordination=shared. The user authors NO roles and NO submitter pod. The webhook +# splits the gang by completion index: +# qgang-0 (index 0) -> PRODUCER: its own group-of-one "qgang-producer" +# (minCount 1), real submit, sidecar, NOT gated. It is a +# real member, so the app runs N times, never N+1. +# qgang-1 (index 1+) -> CONSUMER: the "qgang" gang (minCount N-1=1), gated on +# quantum.braket/ready + preempting priority, NO interceptor; +# told role=consumer (FLUENCE_COORDINATION_ROLE); it fetches the producer's result by id. +# +# These are raw pods (not a Job) so the e2e can reference stable names; the +# completion-index annotation is set manually to make producer election +# deterministic (a real workload uses an indexed Job, which the controller stamps +# with batch.kubernetes.io/job-completion-index automatically). group-size makes N +# deterministic for raw pods, which have no owning Job to derive it from. busybox +# stands in for the quantum app; the interceptor staging fails soft (no python), +# which is fine for the structural assertions in 02/03/04. 
+apiVersion: v1 +kind: Pod +metadata: + name: qgang-0 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" + fluence.flux-framework.org/coordination: shared + batch.kubernetes.io/job-completion-index: "0" # -> producer +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} +--- +apiVersion: v1 +kind: Pod +metadata: + name: qgang-1 + labels: + app: qgang + fluence.flux-framework.org/group: qgang + annotations: + fluence.flux-framework.org/group-size: "2" + fluence.flux-framework.org/coordination: shared + batch.kubernetes.io/job-completion-index: "1" # -> consumer +spec: + schedulerName: fluence + restartPolicy: Never + containers: + - name: app + image: busybox + command: ["sh", "-c", "echo gang member; sleep 600"] + resources: + requests: {fluxion.flux-framework.org/qpu: "1"} + limits: {fluxion.flux-framework.org/qpu: "1"} diff --git a/examples/test/e2e/quantum-pod-mock.yaml b/examples/test/e2e/quantum/quantum-pod-mock.yaml similarity index 100% rename from examples/test/e2e/quantum-pod-mock.yaml rename to examples/test/e2e/quantum/quantum-pod-mock.yaml diff --git a/examples/test/e2e/sidecar-mock-pods.yaml b/examples/test/e2e/sidecar-mock-pods.yaml deleted file mode 100644 index fb223a7..0000000 --- a/examples/test/e2e/sidecar-mock-pods.yaml +++ /dev/null @@ -1,64 +0,0 @@ ---- -# Leader pod — first admitted, webhook creates PodGroup, injects sidecar, creates RBAC -# User only needs schedulerName: fluence and the quantum-group label. -# No PodGroup object needed — Fluence creates it. 
-apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-leader - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: mock-quantum-app - image: busybox - command: - - sh - - -c - - | - echo "mock-quantum-app: running" - echo "arn:aws:braket:us-east-1:123456:quantum-task/mock-abc123" \ - > /tmp/task-arn - echo "mock-quantum-app: task ARN written" - sleep 3600 - resources: - requests: - fluxion.flux-framework.org/qpu: "1" - limits: - fluxion.flux-framework.org/qpu: "1" - ---- -# Worker pod — classical (no QPU). Gated by the webhook because it is a -# non-leader member of a group whose leader is a quantum pod. -apiVersion: v1 -kind: Pod -metadata: - name: sidecar-test-worker - labels: - app: fluence-sidecar-test - fluence.flux-framework.org/group: sidecar-test-group -spec: - schedulerName: fluence - restartPolicy: Never - containers: - - name: classical-worker - image: busybox - command: - - sh - - -c - - | - echo "classical-worker: started" - echo "TASK_ARN=$BRAKET_TASK_ARN" - sleep 10 - env: - - name: FLUENCE_QUANTUM_JOB_ID - valueFrom: - fieldRef: - fieldPath: metadata.annotations['fluence.flux-framework.org/quantum-job-id'] - resources: - requests: - cpu: "100m" - memory: "128Mi" diff --git a/pkg/fluence/fluence.go b/pkg/fluence/fluence.go index a1a10e1..6c3dc13 100644 --- a/pkg/fluence/fluence.go +++ b/pkg/fluence/fluence.go @@ -77,14 +77,61 @@ type Fluence struct { mu sync.Mutex // placement maps a group key to its allocation (nodes, backend, jobids). placement map[string]groupAlloc + // excludedNodes maps a group key to the set of nodes that are GENUINELY + // INCOMPATIBLE with that group (PostFilter saw UnschedulableAndUnresolvable + // from another plugin: a taint, affinity, or constraint Fluxion's graph does + // not model). PreFilter feeds them back as an RFC 31 negated-hostlist + // constraint so the re-match is steered onto other nodes. 
Nodes that were + // merely BUSY are deliberately NOT recorded here (excluding them would turn + // transient contention into permanent group failure). The set only grows for a + // group, so the exclusion-driven re-match is finite, and it is cleared on + // teardown. Guarded by mu. + excludedNodes map[string]map[string]bool } var ( - _ fwk.PreFilterPlugin = (*Fluence)(nil) - _ fwk.FilterPlugin = (*Fluence)(nil) - _ fwk.PreBindPlugin = (*Fluence)(nil) + _ fwk.PreFilterPlugin = (*Fluence)(nil) + _ fwk.FilterPlugin = (*Fluence)(nil) + _ fwk.PostFilterPlugin = (*Fluence)(nil) + _ fwk.ReservePlugin = (*Fluence)(nil) + _ fwk.PreBindPlugin = (*Fluence)(nil) ) +// schedulableNodes returns only the nodes a normal pod could actually be placed +// on, so the Fluxion graph never offers a node that Kubernetes will then reject +// in Filter. Two kinds are dropped: +// +// - cordoned nodes (spec.unschedulable), and +// - nodes carrying a NoSchedule/NoExecute taint (e.g. the control-plane's +// node-role.kubernetes.io/control-plane:NoSchedule). +// +// Without this, Fluxion can place a gang slot on the control-plane (it looks like +// a valid virtual=false compute node to the graph), the pod is then rejected by +// TaintToleration with UnschedulableAndUnresolvable, and PostFilter abandons the +// whole allocation — on a small cluster that strands the gang permanently. We do +// not attempt to honor specific tolerations here: gang workloads in this setup do +// not tolerate node taints, so any NoSchedule/NoExecute taint means "not for us". 
+func schedulableNodes(nodes []corev1.Node) []corev1.Node { + out := make([]corev1.Node, 0, len(nodes)) + for _, n := range nodes { + if n.Spec.Unschedulable { + continue + } + tainted := false + for _, t := range n.Spec.Taints { + if t.Effect == corev1.TaintEffectNoSchedule || t.Effect == corev1.TaintEffectNoExecute { + tainted = true + break + } + } + if tainted { + continue + } + out = append(out, n) + } + return out +} + // New builds the plugin: discover cluster nodes, optionally inject quantum // resources, write the JGF graph, initialize the Fluxion matcher, and register // the delete handlers that cancel allocations when their owning object is gone. @@ -129,7 +176,7 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error } } - jgfBytes, err := cluster.BuildGraph(nodeList.Items, opts) + jgfBytes, err := cluster.BuildGraph(schedulableNodes(nodeList.Items), opts) if err != nil { return nil, fmt.Errorf("build resource graph: %w", err) } @@ -161,10 +208,11 @@ func New(ctx context.Context, _ runtime.Object, h fwk.Handle) (fwk.Plugin, error fluxion.Init(tmp.Name(), os.Getenv("FLUENCE_MATCH_POLICY"), "") f := &Fluence{ - handle: h, - matcher: fluxion, - knownDevices: knownDevices, - placement: map[string]groupAlloc{}, + handle: h, + matcher: fluxion, + knownDevices: knownDevices, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, } f.registerCancelHandlers() // Periodic + startup reconcile of completed Fluence-created PodGroups, so a @@ -251,7 +299,15 @@ func (f *Fluence) PreFilter( return nil, fwk.AsStatus(err) } - specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices) + f.mu.Lock() + excluded := make([]string, 0, len(f.excludedNodes[group])) + for n := range f.excludedNodes[group] { + excluded = append(excluded, n) + } + f.mu.Unlock() + sort.Strings(excluded) // deterministic constraint for stable matching/logs + + specs, err := placement.JobspecsForGroup(group, pods, f.knownDevices, 
excluded) if err != nil { return nil, fwk.AsStatus(err) } @@ -390,6 +446,103 @@ func (f *Fluence) Filter( return fwk.NewStatus(fwk.Unschedulable, "node not in fluxion allocation for this group") } +// PostFilter runs when a pod could not be scheduled after Filter — for a Fluence +// group, this means the cached Fluxion allocation's nodes did not all survive the +// other scheduler plugins' Filter checks. Without intervention the group would +// retry forever against the same cached allocation while the Fluxion reservation +// leaked, because PreFilter short-circuits on the cache and nothing else releases +// it on a scheduling failure. +// +// We always abandon the failed allocation here (cancel the Fluxion jobids, drop +// the cached placement) so the next PreFilter re-matches fresh. The careful part +// is WHICH nodes we then permanently exclude from the group's future matches, +// because a group reaches PostFilter for two very different reasons and they must +// be handled oppositely (see fwk.Code docs): +// +// - UnschedulableAndUnresolvable: the node genuinely cannot host this pod and +// re-trying it is pointless (a taint the pod does not tolerate, node affinity +// mismatch, a constraint Fluxion's graph does not model). EXCLUDE it; the +// next PreFilter feeds the exclusion set back as an RFC 31 negated-hostlist +// constraint so Fluxion is steered onto other nodes. +// +// - Unschedulable (plain): the node could host the pod, just not at this +// instant (it is momentarily full). This is TRANSIENT. Do NOT exclude it — +// excluding a merely-busy node converts ordinary contention into permanent +// group failure, and in a saturated cluster (a gang that needs the whole node +// set) it strands the gang forever even though it would fit once a node frees. 
+// +// So contention excludes nothing and the group recovers by waiting/retrying; +// only durable incompatibility accumulates in excludedNodes (cleared on group +// teardown), which keeps the exclusion-driven re-match finite and correct. +func (f *Fluence) PostFilter( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + filteredNodeStatusMap fwk.NodeToStatusReader, +) (*fwk.PostFilterResult, *fwk.Status) { + group := groupKey(pod) + + f.mu.Lock() + alloc, ok := f.placement[group] + if !ok { + // No cached allocation for this group — nothing of ours to reconcile. + // (Another plugin's PostFilter, or a non-group pod.) + f.mu.Unlock() + return nil, fwk.NewStatus(fwk.Unschedulable) + } + // Exclude ONLY nodes that are genuinely incompatible with this pod, never + // nodes that were merely busy this cycle. The framework gives us a per-node + // status: UnschedulableAndUnresolvable means the node cannot host the pod and + // re-trying it is pointless (a taint the pod does not tolerate, node affinity + // mismatch, a constraint Fluxion's graph does not model) -> exclude it so the + // re-match is steered elsewhere. A plain Unschedulable means the node could + // host the pod but not right now (it is momentarily full) -> do NOT exclude + // it; it must stay eligible so the group can land there once capacity frees. + // + // This is the whole point: a group enters PostFilter for many reasons, and + // "the cluster is just full at this instant" is the common one. Permanently + // banning the busy nodes (the old whole-allocation exclusion) turned transient + // contention into permanent group failure — exactly backwards. Now contention + // excludes nothing; the group simply abandons this cycle's reservation and + // retries the same nodes when they free. 
+ if f.excludedNodes[group] == nil { + f.excludedNodes[group] = map[string]bool{} + } + var incompatible, busy []string + for _, n := range alloc.place.Nodes { + var code fwk.Code + if filteredNodeStatusMap != nil { + if st := filteredNodeStatusMap.Get(n); st != nil { + code = st.Code() + } + } + if code == fwk.UnschedulableAndUnresolvable { + f.excludedNodes[group][n] = true + incompatible = append(incompatible, n) + } else { + // plain Unschedulable, Success, or unknown/nil -> transient, keep. + busy = append(busy, n) + } + } + excludedCount := len(f.excludedNodes[group]) + jobids := alloc.jobids + delete(f.placement, group) + f.mu.Unlock() + + // Release the Fluxion reservation for the abandoned allocation so the graph + // does not leak it while the group retries. + f.cancelJobids(jobids) + + log.Printf("[fluence] group %s unschedulable: abandoning allocation (jobids %v); "+ + "incompatible(excluded)=%v busy(retryable, NOT excluded)=%v; %d node(s) excluded total", + group, jobids, incompatible, busy, excludedCount) + + // Returning Unschedulable (no nominated node) lets the pod be requeued; the + // next PreFilter re-matches (with any incompatible nodes excluded, but busy + // nodes still in play). Fluxion, not PostFilter preemption, chooses placement. + return nil, fwk.NewStatus(fwk.Unschedulable) +} + // PreBindPreFlight runs before PreBind. It returns Success when we have a cached // allocation for the pod's group (so PreBind can record the jobid, and stamp the // backend for a quantum pod), and Skip otherwise. @@ -408,12 +561,59 @@ func (f *Fluence) PreBindPreFlight( return nil, fwk.NewStatus(fwk.Success) } +// Reserve stamps the chosen backend (and matched attributes) onto the pod as +// early as possible — at reservation, in the scheduling cycle — rather than in +// PreBind. 
The webhook injects FLUXION_BACKEND (and FLUXION_) as a +// downward-API env sourced from these annotations; downward-API env is resolved +// by the kubelet when the container starts and is NOT updated afterward, so the +// annotation must be present well before the container starts. PreBind runs in +// the (asynchronous) binding cycle, milliseconds before Bind, which races the +// kubelet — Reserve runs earlier and synchronously, giving the annotation time +// to propagate so the value reliably surfaces in the container. +func (f *Fluence) Reserve( + ctx context.Context, + state fwk.CycleState, + pod *corev1.Pod, + nodeName string, +) *fwk.Status { + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) + } + return fwk.NewStatus(fwk.Success) +} + +// Unreserve is a no-op: a stale backend annotation from a reservation that was +// later rejected is harmless (it is overwritten on the next attempt and the +// value is correct for the allocation that produced it), and clearing it would +// cost an extra API call. Required to satisfy fwk.ReservePlugin. +func (f *Fluence) Unreserve(ctx context.Context, state fwk.CycleState, pod *corev1.Pod, nodeName string) { +} + +// stampBackend writes the allocated backend name and matched attributes onto the +// pod (idempotent merge patch). No-op when there is no cached allocation or the +// allocation carries no backend (classical, non-quantum gangs). 
+func (f *Fluence) stampBackend(ctx context.Context, pod *corev1.Pod) error { + f.mu.Lock() + alloc, ok := f.placement[groupKey(pod)] + f.mu.Unlock() + if !ok || alloc.place.Backend == "" { + return nil + } + ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} + for k, v := range alloc.place.BackendAttributes { + ann[placement.AttributeAnnotationPrefix+k] = v + } + log.Printf("[fluence] group %s -> backend %q attrs %v (reserve-stamped, nodes %v)", + groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, alloc.place.Nodes) + return f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann) +} + // PreBind records, in the commit phase, the durable state for this group: -// - the Fluxion jobid onto the owning object (the PodGroup for a gang, else the -// pod) so the allocation can be cancelled when that object is deleted; -// - for a quantum group, the allocated backend onto the pod, which the webhook- -// injected downward-API env surfaces as QRMI_BACKEND (container env is -// immutable post-creation, so the value must travel via an annotation). +// the Fluxion jobid onto the owning object (the PodGroup for a gang, else the +// pod) so the allocation can be cancelled when that object is deleted. The +// backend annotation is stamped earlier, in Reserve (see stampBackend), because +// the webhook-injected downward-API env (FLUXION_BACKEND) must be present before +// the container starts; PreBind is too late and races the kubelet. func (f *Fluence) PreBind( ctx context.Context, state fwk.CycleState, @@ -430,20 +630,10 @@ func (f *Fluence) PreBind( if err := f.recordJobIDs(ctx, pod, alloc.jobids); err != nil { return fwk.AsStatus(fmt.Errorf("record jobids: %w", err)) } - if alloc.place.Backend != "" { - // Stamp the backend name and all matched attributes in one patch. The - // webhook injects a normalized env per annotation so the workload reads - // exactly what it matched (backend + region/qubits/...). 
- ann := map[string]string{placement.BackendAnnotation: alloc.place.Backend} - for k, v := range alloc.place.BackendAttributes { - ann[placement.AttributeAnnotationPrefix+k] = v - } - log.Printf("[fluence] group %s -> backend %q attrs %v (nodes %v, jobids %v)", - groupKey(pod), alloc.place.Backend, alloc.place.BackendAttributes, - alloc.place.Nodes, alloc.jobids) - if err := f.patchPodAnnotations(ctx, pod.Namespace, pod.Name, ann); err != nil { - return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) - } + // Backstop: if Reserve was skipped for any reason, ensure the backend is + // stamped before bind anyway (idempotent). + if err := f.stampBackend(ctx, pod); err != nil { + return fwk.AsStatus(fmt.Errorf("stamp backend annotations: %w", err)) } return fwk.NewStatus(fwk.Success) } @@ -637,6 +827,20 @@ func (f *Fluence) reconcileGroup(ctx context.Context, namespace, group string) { } log.Printf("fluence: reconciled completed gang %s/%s — deleted Fluence-created PodGroup, allocation freed", namespace, group) + + // Producer-group cleanup: in shared coordination the gang is split into the + // consumer group (this group) and the producer's group-of-one + // -producer (a Fluence-created PodGroup, minCount 1). The producer POD + // is a real member of the user's workload (indexed-Job index 0), so we must + // NOT delete it — only its Fluence-created PodGroup, as a backstop to free its + // allocation (its own reconcile pass also reaps it once the producer pod is + // terminal). Skip when this group is itself a producer group, to avoid + // recursing on -producer-producer. 
+ if !strings.HasSuffix(group, producerGroupSuffix) { + pg := group + producerGroupSuffix + _ = f.handle.ClientSet().SchedulingV1alpha2().PodGroups(namespace).Delete(ctx, pg, metav1.DeleteOptions{}) + log.Printf("fluence: reaped producer PodGroup %s/%s for gang %s", namespace, pg, group) + } } // reconcileGraceForEmpty is how long a Fluence-created PodGroup with no live @@ -648,6 +852,12 @@ const reconcileGraceForEmpty = 2 * time.Minute // package (the scheduler must not depend on the webhook). Kept in sync with it. const webhookGroupLabel = "fluence.flux-framework.org/group" +// producerGroupSuffix mirrors handlers.ProducerGroupSuffix: in shared +// coordination the producer (indexed-Job index 0) is its own group-of-one named +// {group}-producer. Duplicated here to avoid importing the webhook handlers package +// into the scheduler plugin; keep the two in sync. +const producerGroupSuffix = "-producer" + // onPodGroupDeleted frees the gang's allocation when its PodGroup is deleted. func (f *Fluence) onPodGroupDeleted(obj interface{}) { pg, ok := obj.(*schedv1a2.PodGroup) @@ -718,6 +928,7 @@ func (f *Fluence) cancelGroup(key string, ann map[string]string) { f.mu.Lock() delete(f.placement, key) + delete(f.excludedNodes, key) // drop accumulated exclusions so a future group reusing the name starts clean f.mu.Unlock() } diff --git a/pkg/fluence/fluence_test.go b/pkg/fluence/fluence_test.go index 998e1a7..5228f97 100644 --- a/pkg/fluence/fluence_test.go +++ b/pkg/fluence/fluence_test.go @@ -1,6 +1,7 @@ package fluence import ( + "context" "errors" "testing" @@ -12,6 +13,7 @@ import ( schedv1a2 "k8s.io/api/scheduling/v1alpha2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" + fwk "k8s.io/kube-scheduler/framework" ) // fakeMatcher records Cancel calls so cancel behavior can be asserted without @@ -46,7 +48,11 @@ func (m *fakeMatcher) Cancel(jobid uint64) error { } func newTestFluence(m matcher) *Fluence { - return &Fluence{matcher: m, placement:
map[string]groupAlloc{}} + return &Fluence{ + matcher: m, + placement: map[string]groupAlloc{}, + excludedNodes: map[string]map[string]bool{}, + } } func ann(jobid string) map[string]string { @@ -345,3 +351,205 @@ func twoSpecs() []*jobspec.Jobspec { {Version: 9999}, } } + +// --- PostFilter allocation reconciliation ----------------------------------- + +// fakeNodeStatus is a minimal fwk.NodeToStatusReader for PostFilter tests: it +// maps node name -> status code so a test can mark some nodes incompatible +// (UnschedulableAndUnresolvable) and others merely busy (Unschedulable). +type fakeNodeStatus map[string]fwk.Code + +func (s fakeNodeStatus) Get(node string) *fwk.Status { + if c, ok := s[node]; ok { + return fwk.NewStatus(c) + } + return nil +} +func (s fakeNodeStatus) NodesForStatusCode(fwk.NodeInfoLister, fwk.Code) ([]fwk.NodeInfo, error) { + return nil, nil +} + +// PostFilter abandons the failed allocation (cancel jobids, drop cache) and +// excludes ONLY genuinely-incompatible nodes (UnschedulableAndUnresolvable). +// A node that was merely busy (plain Unschedulable) MUST stay eligible. +func TestPostFilterExcludesOnlyIncompatibleNodes(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b", "node-c"}}, + jobids: []uint64{11, 12}, + } + pod := groupedPod("default", "training-0", "training", nil) + + // node-a incompatible (taint); node-b busy; node-c survived Filter. 
+ status := fakeNodeStatus{ + "node-a": fwk.UnschedulableAndUnresolvable, + "node-b": fwk.Unschedulable, + "node-c": fwk.Success, + } + + _, st := f.PostFilter(context.Background(), nil, pod, status) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable status, got %v", st) + } + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted after PostFilter") + } + if len(m.cancelled) != 2 { + t.Fatalf("expected both jobids cancelled, got %v", m.cancelled) + } + excl := f.excludedNodes[key] + if !excl["node-a"] { + t.Fatalf("incompatible node-a should be excluded, set=%v", excl) + } + if excl["node-b"] || excl["node-c"] { + t.Fatalf("busy/ok nodes must NOT be excluded (would strand a saturated gang), set=%v", excl) + } + if len(excl) != 1 { + t.Fatalf("expected exactly 1 excluded node, got %v", excl) + } +} + +// A group blocked purely by contention (every node merely busy) excludes NOTHING +// so it can retry the same nodes once they free — the saturated-cluster property. +func TestPostFilterContentionExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{ + place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, + jobids: []uint64{1}, + } + pod := groupedPod("default", "training-0", "training", nil) + status := fakeNodeStatus{"node-a": fwk.Unschedulable, "node-b": fwk.Unschedulable} + + f.PostFilter(context.Background(), nil, pod, status) + + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("a purely-busy group must exclude no nodes, got %v", f.excludedNodes[key]) + } + if _, still := f.placement[key]; still { + t.Fatal("placement cache should be deleted even when nothing is excluded") + } + if len(m.cancelled) != 1 { + t.Fatalf("expected the jobid cancelled, got %v", m.cancelled) + } +} + +// A nil status map (e.g. 
all nodes filtered out upstream) must be safe and +// exclude nothing rather than panic or ban the whole allocation. +func TestPostFilterNilStatusMapExcludesNothing(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{7}} + pod := groupedPod("default", "training-0", "training", nil) + + _, st := f.PostFilter(context.Background(), nil, pod, nil) + if st == nil || st.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", st) + } + if len(f.excludedNodes[key]) != 0 { + t.Fatalf("nil status map must exclude nothing, got %v", f.excludedNodes[key]) + } +} + +// Incompatible nodes accumulate across attempts; busy ones never do. +func TestPostFilterAccumulatesIncompatibleAcrossAttempts(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + pod := groupedPod("default", "training-0", "training", nil) + + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-a", "node-b"}}, jobids: []uint64{1}} + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-a": fwk.UnschedulableAndUnresolvable, "node-b": fwk.Unschedulable}) + f.placement[key] = groupAlloc{place: placement.Placement{Nodes: []string{"node-c", "node-d"}}, jobids: []uint64{2}} + f.PostFilter(context.Background(), nil, pod, fakeNodeStatus{"node-c": fwk.UnschedulableAndUnresolvable, "node-d": fwk.Unschedulable}) + + excl := f.excludedNodes[key] + for _, n := range []string{"node-a", "node-c"} { + if !excl[n] { + t.Fatalf("incompatible %s should accumulate, got %v", n, excl) + } + } + if excl["node-b"] || excl["node-d"] { + t.Fatalf("busy nodes must never accumulate, got %v", excl) + } + if len(excl) != 2 { + t.Fatalf("exclusion set should be the 2 incompatible nodes, got %v", excl) + } +} + +// PostFilter on a group with no cached allocation (not ours, or already cleared) +// is a 
safe no-op: no panic, no cancel, returns Unschedulable. +func TestPostFilterUnknownGroupNoop(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + pod := groupedPod("default", "stranger-0", "stranger", nil) + + _, status := f.PostFilter(context.Background(), nil, pod, nil) + if status == nil || status.Code() != fwk.Unschedulable { + t.Fatalf("expected Unschedulable, got %v", status) + } + if len(m.cancelled) != 0 { + t.Fatalf("nothing should be cancelled for an unknown group, got %v", m.cancelled) + } + if len(f.excludedNodes) != 0 { + t.Fatalf("no exclusion set should be created for an unknown group, got %v", f.excludedNodes) + } +} + +// Teardown (cancelGroup) must clear the exclusion set so a future group reusing +// the same key does not inherit stale exclusions. +func TestCancelGroupClearsExclusions(t *testing.T) { + m := &fakeMatcher{} + f := newTestFluence(m) + key := "default/training" + f.placement[key] = groupAlloc{jobids: []uint64{9}} + f.excludedNodes[key] = map[string]bool{"node-a": true} + + f.cancelGroup(key, ann("9")) + + if _, still := f.excludedNodes[key]; still { + t.Fatal("exclusion set should be cleared on teardown") + } +} + +// schedulableNodes must drop control-plane (NoSchedule taint), NoExecute-tainted, +// and cordoned nodes, keeping only nodes a normal gang pod can actually land on. +// This keeps the Fluxion graph from offering nodes Kubernetes will reject in +// Filter (which, with whole-allocation PostFilter exclusion, strands the gang). 
+func TestSchedulableNodesDropsTaintedAndCordoned(t *testing.T) { + node := func(name string, unsched bool, effects ...corev1.TaintEffect) corev1.Node { + n := corev1.Node{} + n.Name = name + n.Spec.Unschedulable = unsched + for _, e := range effects { + n.Spec.Taints = append(n.Spec.Taints, corev1.Taint{Key: "k", Effect: e}) + } + return n + } + in := []corev1.Node{ + node("worker-1", false), + node("worker-2", false), + node("control-plane", false, corev1.TaintEffectNoSchedule), + node("draining", false, corev1.TaintEffectNoExecute), + node("cordoned", true), + node("prefer-only", false, corev1.TaintEffectPreferNoSchedule), // soft taint: keep + } + got := schedulableNodes(in) + gotNames := map[string]bool{} + for _, n := range got { + gotNames[n.Name] = true + } + want := []string{"worker-1", "worker-2", "prefer-only"} + if len(got) != len(want) { + t.Fatalf("expected %d schedulable nodes %v, got %d %v", len(want), want, len(got), gotNames) + } + for _, w := range want { + if !gotNames[w] { + t.Fatalf("expected %s kept, got set %v", w, gotNames) + } + } +} diff --git a/pkg/placement/placement.go b/pkg/placement/placement.go index 554f319..c7f76de 100644 --- a/pkg/placement/placement.go +++ b/pkg/placement/placement.go @@ -214,14 +214,36 @@ func withEntries(counts map[string]int) []jobspec.Resource { // allocation (duration 0 runs to graph end) plus an RFC 31 property constraint // selecting the eligible node set. properties is the AND-set of composed // key=value property strings a matched node must carry. -func systemAttributes(properties []string) map[string]interface{} { +func systemAttributes(properties []string, excludeNodes []string) map[string]interface{} { + // Base property constraint (the eligible-node property AND-set). 
+ constraints := map[string]interface{}{ + "properties": properties, + } + // When a group has had a placement rejected by other scheduler plugins + // (taints, affinity, volume topology that Fluxion's graph does not model), + // PostFilter accumulates the rejected hostnames and we AND in an RFC 31 + // negated hostlist so the re-match is forced onto untried nodes. RFC 31 is + // JsonLogic-style ({operator:[values]}, one operator per object), so to AND + // two operators we nest them under an explicit `and`. We only do this when + // there is something to exclude, so the no-exclusion jobspec is byte-for-byte + // what it was before (and existing tests/behavior are unchanged). + if len(excludeNodes) > 0 { + constraints = map[string]interface{}{ + "and": []interface{}{ + map[string]interface{}{"properties": properties}, + map[string]interface{}{ + "not": []interface{}{ + map[string]interface{}{"hostlist": excludeNodes}, + }, + }, + }, + } + } return map[string]interface{}{ "system": map[string]interface{}{ // duration 0 => hold the allocation until we explicitly Cancel. - "duration": 0, - "constraints": map[string]interface{}{ - "properties": properties, - }, + "duration": 0, + "constraints": constraints, }, } } @@ -229,7 +251,7 @@ func systemAttributes(properties []string) map[string]interface{} { // computeJobspec builds the physical-compute jobspec for a group: one slot per // pod holding the compute resources, constrained to virtual=false nodes. This is // the only jobspec for a group that requests no virtual devices. 
-func computeJobspec(groupName string, slots int, compute map[string]int) *jobspec.Jobspec { +func computeJobspec(groupName string, slots int, compute map[string]int, excludeNodes []string) *jobspec.Jobspec { return &jobspec.Jobspec{ Version: 9999, Resources: []jobspec.Resource{{ @@ -238,7 +260,7 @@ func computeJobspec(groupName string, slots int, compute map[string]int) *jobspe Label: "default", With: withEntries(compute), }}, - Attributes: systemAttributes([]string{VirtualPropertyFalse}), + Attributes: systemAttributes([]string{VirtualPropertyFalse}, excludeNodes), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "default", @@ -272,7 +294,7 @@ func deviceJobspec(groupName, deviceType string, count int, extraProps []string) Label: "device", With: []jobspec.Resource{{Type: "node", Count: count}}, }}, - Attributes: systemAttributes(props), + Attributes: systemAttributes(props, nil), Tasks: []jobspec.Task{{ Command: []string{groupName}, Slot: "device", @@ -299,6 +321,7 @@ func JobspecsForGroup( groupName string, pods []corev1.Pod, knownDevices map[string]bool, + excludeNodes []string, ) ([]*jobspec.Jobspec, error) { if len(pods) == 0 { return nil, fmt.Errorf("pod group %q has no pods", groupName) @@ -321,7 +344,7 @@ func JobspecsForGroup( } } - specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute)} + specs := []*jobspec.Jobspec{computeJobspec(groupName, len(pods), compute, excludeNodes)} // Deterministic device order for stable output. 
deviceTypes := make([]string, 0, len(devices)) diff --git a/pkg/placement/placement_test.go b/pkg/placement/placement_test.go index 33786c8..fe68917 100644 --- a/pkg/placement/placement_test.go +++ b/pkg/placement/placement_test.go @@ -64,7 +64,7 @@ func TestClassicalSingleMatch(t *testing.T) { podWith("p0", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), podWith("p1", corev1.ResourceList{corev1.ResourceCPU: qty(4), "nvidia.com/gpu": qty(1)}), } - specs, err := JobspecsForGroup("grp", pods, nil) + specs, err := JobspecsForGroup("grp", pods, nil, nil) if err != nil { t.Fatal(err) } @@ -101,7 +101,7 @@ func TestGroupDeviceMatchWhenLeaderNotFirst(t *testing.T) { }) // Leader deliberately placed last. pods := []corev1.Pod{worker, worker, leader} - specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qgrp", pods, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -132,7 +132,7 @@ func qpuPodWithRequires(name string, requires map[string]string) corev1.Pod { // constraints, nothing extra (over-constraining would break unconstrained runs). func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { p := qpuPodWithRequires("q", nil) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -145,7 +145,7 @@ func TestNoRequireAnnotationsAddsNoConstraints(t *testing.T) { // Exactly one require- constraint. 
func TestSingleRequireConstraint(t *testing.T) { p := qpuPodWithRequires("q", map[string]string{"qrmi_type": "braket-gate"}) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -169,7 +169,7 @@ func TestMultipleRequireConstraintsAreDeduped(t *testing.T) { // a worker that happens to repeat one of the same require- annotations worker := qpuPodWithRequires("w0", map[string]string{"vendor": "amazon"}) specs, err := JobspecsForGroup("g", []corev1.Pod{leader, worker}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -211,7 +211,7 @@ func TestRequireAnnotationConstrainsDevice(t *testing.T) { leader.Annotations[RequireAnnotationPrefix+"vendor"] = "amazon" specs, err := JobspecsForGroup("qgrp", []corev1.Pod{leader}, - map[string]bool{"qpu": true}) + map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -232,7 +232,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { FluxionResourcePrefix + "qpu": qty(1), }) known := map[string]bool{"qpu": true} - specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known) + specs, err := JobspecsForGroup("qgrp", []corev1.Pod{p}, known, nil) if err != nil { t.Fatal(err) } @@ -274,7 +274,7 @@ func TestDeviceProducesSecondMatch(t *testing.T) { // node), so there are two matches: compute (core=1, virtual=false) and device. func TestDeviceOnlyStillForcesCompute(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "qpu": qty(1)}) - specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("qonly", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -289,7 +289,7 @@ func TestDeviceOnlyStillForcesCompute(t *testing.T) { // Requesting a device type the graph does not model is a hard error. 
func TestUnknownDeviceErrors(t *testing.T) { p := podWith("q", corev1.ResourceList{FluxionResourcePrefix + "fpga": qty(1)}) - _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}) + _, err := JobspecsForGroup("grp", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err == nil { t.Fatal("expected an error for an unmodeled device type") } @@ -301,7 +301,7 @@ func TestHoldDurationZero(t *testing.T) { corev1.ResourceCPU: qty(1), FluxionResourcePrefix + "qpu": qty(1), }) - specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}) + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, map[string]bool{"qpu": true}, nil) if err != nil { t.Fatal(err) } @@ -366,3 +366,76 @@ func TestPlacementUnmarkedNodeIsCompute(t *testing.T) { t.Fatalf("unmarked node should not be a backend, got %q", p.Backend) } } + +// When excludeNodes is non-empty, the compute jobspec's constraint must AND the +// base properties with an RFC 31 negated hostlist, so a re-match avoids the +// rejected nodes. When empty, the constraint must be the plain properties form +// (byte-for-byte the pre-exclusion behavior). 
+func TestExcludeNodesAddsNegatedHostlist(t *testing.T) { + p := podWith("p", corev1.ResourceList{corev1.ResourceCPU: qty(1)}) + + // no exclusion -> plain properties, no `and`/`not` + specs, err := JobspecsForGroup("g", []corev1.Pod{p}, nil, nil) + if err != nil { + t.Fatal(err) + } + cons := computeConstraints(t, specs[0]) + if _, hasAnd := cons["and"]; hasAnd { + t.Fatalf("no-exclusion constraint must not use `and`: %#v", cons) + } + if _, hasProps := cons["properties"]; !hasProps { + t.Fatalf("no-exclusion constraint must have plain properties: %#v", cons) + } + + // with exclusion -> and[ properties, not[ hostlist ] ] + specs, err = JobspecsForGroup("g", []corev1.Pod{p}, nil, []string{"node-b", "node-c"}) + if err != nil { + t.Fatal(err) + } + cons = computeConstraints(t, specs[0]) + andTerms, ok := cons["and"].([]interface{}) + if !ok || len(andTerms) != 2 { + t.Fatalf("exclusion constraint must be `and` of 2 terms: %#v", cons) + } + // find the not/hostlist term + foundHostlist := false + for _, term := range andTerms { + tm, _ := term.(map[string]interface{}) + notTerm, ok := tm["not"].([]interface{}) + if !ok || len(notTerm) == 0 { + continue + } + inner, _ := notTerm[0].(map[string]interface{}) + hl, ok := inner["hostlist"].([]string) + if !ok { + // json round-trip may make it []interface{}; accept both + if hlAny, ok2 := inner["hostlist"].([]interface{}); ok2 { + if len(hlAny) == 2 { + foundHostlist = true + } + } + continue + } + if len(hl) == 2 { + foundHostlist = true + } + } + if !foundHostlist { + t.Fatalf("exclusion constraint must contain not[hostlist[2 nodes]]: %#v", cons) + } +} + +// computeConstraints digs out attributes.system.constraints from the compute +// jobspec (the first spec; device specs do not carry node exclusions). 
+func computeConstraints(t *testing.T, spec *jobspec.Jobspec) map[string]interface{} { + t.Helper() + sys, ok := spec.Attributes["system"].(map[string]interface{}) + if !ok { + t.Fatalf("no system attributes: %#v", spec.Attributes) + } + cons, ok := sys["constraints"].(map[string]interface{}) + if !ok { + t.Fatalf("no constraints: %#v", sys) + } + return cons +} diff --git a/pkg/webhook/handler.go b/pkg/webhook/handler.go index 82a1227..61b97b1 100644 --- a/pkg/webhook/handler.go +++ b/pkg/webhook/handler.go @@ -25,34 +25,32 @@ type MutatorAPI interface { // InjectedEnv is the FLUXION_* env contract the scheduler/webhook supplies. InjectedEnv() []corev1.EnvVar - // PodGroup operations (gang scheduling). Group identity is the value of the - // group label, which the core treats as an opaque string. - PodGroupLeader(ctx context.Context, namespace, group string) string - EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) - RecordLeader(ctx context.Context, namespace, group, leaderPod string) - - // EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/Binding - // the sidecar needs. - EnsureSidecarRBAC(ctx context.Context, namespace string) - - // InterceptorOps stages the fluence package into the quantum container via an - // init container + shared volume on PYTHONPATH (Model C). SidecarContainerOps - // adds the sidecar container (observe=true => observe-only telemetry mode). - InterceptorOps(pod *corev1.Pod) []spec.Op - SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op + // EnsurePodGroup creates the group's PodGroup with the given gang minCount if + // it does not already exist (idempotent). Group identity is the opaque value + // of the group label. creatorPod is recorded only as the PodGroup's creator + // reference; the core ascribes no role semantics to it. 
+ EnsurePodGroup(ctx context.Context, namespace, group, creatorPod string, minCount int32) } // Handler inspects a pod and, when it applies, contributes JSON patch ops. A pod // flows through every registered handler whose Applies returns true; their ops // are concatenated. Applies is fully general — it receives the pod and the -// MutatorAPI, so a handler may consult cluster state (e.g. resolve a group's -// leader) in deciding whether it applies. +// MutatorAPI, so a handler may consult cluster state in deciding whether it +// applies. type Handler interface { Name() string Applies(ctx context.Context, m MutatorAPI, pod *corev1.Pod) bool Mutate(ctx context.Context, m MutatorAPI, pod *corev1.Pod) []spec.Op } +// DefaultHandlerOrder is the active set AND the dispatch order when the operator +// passes no --handlers flag. Order matters: specific handlers run before the +// generic gang fallback, so "gang" is LAST — it applies default gang sizing +// (group-size annotation or owner-derived N) only if no earlier handler already +// shaped the gang. To change the order or disable a handler, pass a different +// list (e.g. --handlers=fluxion,gang drops quantum). +var DefaultHandlerOrder = []string{"fluxion", "quantum", "gang"} + // ── registration ──────────────────────────────────────────────────────────────── // // Handlers self-register via Register() from their package's init(). The core @@ -60,15 +58,57 @@ type Handler interface { // webhook server wiring) is what populates the registry. This keeps the core // domain-agnostic: adding or removing a handler does not touch core code. -var registry []Handler +// available maps a handler's Name() to the handler. Populated by Register() from +// each handler package's init(). This is the set of handlers that EXIST; which +// ones actually run, and in what order, is decided by activeOrder. +var available = map[string]Handler{} + +// activeOrder is the ordered list of handler names to dispatch. 
It is BOTH the +// selection (names not present are disabled) and the order (dispatch follows the +// slice). Defaults to DefaultHandlerOrder; overridden by SetActiveHandlers. +var activeOrder = append([]string(nil), DefaultHandlerOrder...) -// Register adds a handler to the global set. Called from handler packages' -// init(). Order of registration is the order handlers run. +// Register adds a handler to the available set under its Name(). Called from +// handler packages' init(). func Register(h Handler) { - registry = append(registry, h) + available[h.Name()] = h +} + +// SetActiveHandlers sets the active, ordered handler list (the --handlers value). +// Empty/nil restores DefaultHandlerOrder. Names with no registered handler are +// dropped and returned as `unknown` so the caller can warn. Order is preserved +// exactly as given — the list is the dispatch order. +func SetActiveHandlers(names []string) (active, unknown []string) { + if len(names) == 0 { + activeOrder = append([]string(nil), DefaultHandlerOrder...) + return activeOrder, nil + } + var ordered []string + for _, n := range names { + if _, ok := available[n]; ok { + ordered = append(ordered, n) + } else { + unknown = append(unknown, n) + } + } + activeOrder = ordered + return activeOrder, unknown +} + +// ActiveHandlerNames returns the active dispatch order (for logging at startup). +func ActiveHandlerNames() []string { + return append([]string(nil), activeOrder...) } -// registered returns the registered handlers (the live registry). +// registered returns the active handlers, resolved from activeOrder, in order. +// Names in the order with no registered handler are skipped (already warned at +// SetActiveHandlers time). 
func registered() []Handler { - return registry + out := make([]Handler, 0, len(activeOrder)) + for _, n := range activeOrder { + if h, ok := available[n]; ok { + out = append(out, h) + } + } + return out } diff --git a/pkg/webhook/handlers/dependency.go b/pkg/webhook/handlers/dependency.go new file mode 100644 index 0000000..d25d598 --- /dev/null +++ b/pkg/webhook/handlers/dependency.go @@ -0,0 +1,131 @@ +package handlers + +import ( + "github.com/converged-computing/fluence/pkg/webhook/spec" + + corev1 "k8s.io/api/core/v1" +) + +// Dependency is Fluence's GENERAL "this set of pods must wait for a producer to +// be ready" primitive. It is deliberately NOT quantum-specific: quantum is the +// first resource type to use it (a gang waits for a quantum submission to reach +// the device queue), but the same primitive applies to any resource type whose +// readiness is produced out-of-band — a license server, a data stage-in job, a +// warmed cache, another gang, etc. +// +// A Dependency has three parts, each carried as a pod annotation so the +// relationship lives at the GROUP level (not duplicated as bespoke per-resource +// fields) and is readable by both the webhook (at admission) and the scheduler +// (in its reconcile loop): +// +// - Kind: what KIND of readiness this is (the resource type's name). The +// producer side knows how to satisfy this kind; the consumer side +// only knows it must wait. Quantum's kind is "quantum-submit". +// - Producer: the identity of the thing that will signal ready. For quantum it +// is the submitter's (base) group; generally it is whatever the +// kind's handler records as the satisfier. +// - Gate: the scheduling gate held on the dependent (consumer) pods until +// the producer signals ready. Removing the gate is the "ungate" +// and is performed by whatever observes the producer's readiness +// (the quantum sidecar for kind=quantum-submit; the scheduler's +// reconcile loop for kinds whose readiness is in-cluster, e.g. 
+// "another gang is Running"). +// +// The webhook PRODUCES a Dependency (gates the consumers, stamps the +// annotations); REMOVING the gate is owned by the observer best placed to see +// the producer's readiness. That split — declare here, observe elsewhere — is +// what keeps the primitive general: a new resource type adds a Kind and an +// observer and reuses the gating/annotation machinery unchanged. +type Dependency struct { + Kind string // resource-type readiness kind, e.g. "quantum-submit" + Producer string // identity of the readiness producer (e.g. the base group) + Gate string // scheduling gate held on dependents until ready +} + +// Dependency annotation keys (stamped on the dependent pods). Generic — no +// quantum in the names, so any resource type reuses them. +const ( + // DependsOnKindAnnotation names the readiness kind the dependent waits for. + DependsOnKindAnnotation = "fluence.flux-framework.org/depends-on-kind" + // DependsOnProducerAnnotation names the producer expected to signal ready. + DependsOnProducerAnnotation = "fluence.flux-framework.org/depends-on-producer" + // DependsOnGateAnnotation records which scheduling gate encodes the wait, so + // an observer knows exactly which gate to remove when the producer is ready. + DependsOnGateAnnotation = "fluence.flux-framework.org/depends-on-gate" +) + +// applyOps gates the dependent pod and stamps the dependency annotations so the +// relationship is self-describing on the pod. It reuses the gate machinery +// (gateWithName) verbatim — the gate is the universal "held until ready" +// mechanism regardless of resource type — so a new Kind costs only its readiness +// observer, not new gating code. +func (d Dependency) applyOps(pod *corev1.Pod) []spec.Op { + ops := gateWithName(pod, d.Gate) + ops = append(ops, annotateOp(pod, DependsOnKindAnnotation, d.Kind)...) + ops = append(ops, annotateOp(pod, DependsOnProducerAnnotation, d.Producer)...) 
+ ops = append(ops, annotateOp(pod, DependsOnGateAnnotation, d.Gate)...) + return ops +} + +// DependencyOf reads a dependent pod's declared Dependency, or ok=false if it +// carries none. The scheduler's reconcile loop and the sidecar use this to learn +// what a gated pod is waiting for without hardcoding a kind. +func DependencyOf(pod *corev1.Pod) (Dependency, bool) { + kind := spec.Annotation(pod, DependsOnKindAnnotation) + if kind == "" { + return Dependency{}, false + } + return Dependency{ + Kind: kind, + Producer: spec.Annotation(pod, DependsOnProducerAnnotation), + Gate: spec.Annotation(pod, DependsOnGateAnnotation), + }, true +} + +// annotateOp adds a single metadata annotation (creating the annotations map if +// the pod has none). The key is JSON-Pointer-escaped so slashes are handled. +func annotateOp(pod *corev1.Pod, key, value string) []spec.Op { + if value == "" { + return nil + } + if pod.Annotations == nil { + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations", + Value: map[string]string{key: value}, + }} + } + return []spec.Op{{ + Op: "add", + Path: "/metadata/annotations/" + escapeJSONPointer(key), + Value: value, + }} +} + +// gateWithName adds a named scheduling gate (idempotent) and raises priority for +// the held pod, generalizing the quantum gating to ANY gate name so the +// dependency primitive is not tied to the quantum gate. 
+func gateWithName(pod *corev1.Pod, gateName string) []spec.Op { + for _, g := range pod.Spec.SchedulingGates { + if g.Name == gateName { + return nil + } + } + var ops []spec.Op + gate := corev1.PodSchedulingGate{Name: gateName} + if len(pod.Spec.SchedulingGates) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) + } + // Gated dependents schedule reliably once ungated only if they outrank other + // pending work; priorityClassName is immutable post-creation so it must be + // set now. Don't override a user's explicit class. spec.priority is cleared + // to null so the priority admission controller recomputes it from the class + // (add-null is valid whether the field is absent, 0, or set). + if pod.Spec.PriorityClassName == "" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + } + return ops +} diff --git a/pkg/webhook/handlers/gang.go b/pkg/webhook/handlers/gang.go index a6c6126..8ba83f3 100644 --- a/pkg/webhook/handlers/gang.go +++ b/pkg/webhook/handlers/gang.go @@ -2,11 +2,14 @@ package handlers import ( "context" + "log" + "strconv" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func init() { @@ -14,7 +17,7 @@ func init() { } // gangHandler gang-schedules pods that carry the group label: it creates a -// Fluence-owned PodGroup (first pod admitted becomes the recorded leader) and +// Fluence-owned PodGroup and // links every pod to it via spec.schedulingGroup.podGroupName, which is the // field the scheduler gangs by. It knows nothing about quantum — a purely // classical gang is fully handled here, with no sidecar. 
@@ -23,20 +26,88 @@ type gangHandler struct{} func (h *gangHandler) Name() string { return "gang" } func (h *gangHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { + // Classical gangs only. A pod that requests the quantum resource is gang- + // scheduled by the quantum handler, which owns the producer/consumer split and + // creates both the -producer and PodGroups itself; handling it + // here too would create a second, conflicting PodGroup for the group. + if spec.PodRequestsResource(pod, QuantumResource) { + return false + } return webhook.GroupName(pod) != "" } func (h *gangHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { g := webhook.GroupName(pod) - // First pod admitted in the group creates the PodGroup and is recorded as - // the admission-order leader. All pods are linked to the group. - if m.PodGroupLeader(ctx, pod.Namespace, g) == "" { - m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name) - m.RecordLeader(ctx, pod.Namespace, g, pod.Name) - } + // Ensure the group's PodGroup exists with the resolved gang size, and link + // this pod to it. EnsurePodGroup is idempotent (no-ops if the PodGroup + // already exists — e.g. created by an earlier, more specific handler), so we + // call it unconditionally. The gang handler knows nothing about quantum or + // submitters; that is the quantum handler's concern. + // minCount = full gang size N (group-size annotation, else owner-derived); + // see resolveMinCount. + m.EnsurePodGroup(ctx, pod.Namespace, g, pod.Name, resolveMinCount(ctx, m, pod)) return schedulingGroupOps(pod, g) } +// resolveMinCount determines the gang's atomic-schedule size N: +// 1. explicit group-size annotation -> honor it verbatim. This is the override +// for when minCount must differ from the parent's replica count (e.g. the +// quantum leader/worker split, where the gang's N is expressed directly). +// 2. 
otherwise derive from the OWNING object: a Flux Operator MiniCluster pod
+//     is owned by an indexed Job whose parallelism == completions == size == N.
+//     (The operator sets Parallelism = Completions = MiniCluster.Spec.Size.)
+//  3. otherwise default to 1, logged — never silently size a multi-pod gang to 1.
+//
+// A group-size annotation that is present but is not a positive 32-bit integer is
+// logged and ignored; resolution then continues with the owner-derived size.
+//
+// The leader/worker (quantum) split is orthogonal and unchanged: it is driven by
+// QuantumResource in the quantum handler. minCount is always the
+// FULL gang N regardless of which pods get gated.
+func resolveMinCount(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 {
+	// 1. explicit override. Indexing a nil annotations map yields "", so no nil
+	// guard is needed.
+	if raw := pod.Annotations[webhook.GroupSizeAnnotation]; raw != "" {
+		// bitSize 32 makes ParseInt reject anything outside the int32 range, so
+		// an oversized value is refused instead of being wrapped by the int32
+		// conversion (Atoi followed by int32() turned 4294967297 into 1).
+		if v, err := strconv.ParseInt(raw, 10, 32); err == nil && v > 0 {
+			return int32(v)
+		}
+		// Surface the bad value: silently skipping it would make a typo look
+		// like a missing annotation.
+		log.Printf("[fluence-webhook] group %s: ignoring invalid %s=%q (want a positive 32-bit integer)",
+			webhook.GroupName(pod), webhook.GroupSizeAnnotation, raw)
+	}
+	// 2. derive from the owning Job's parallelism
+	if n := ownerJobN(ctx, m, pod); n > 0 {
+		return n
+	}
+	// 3. no signal: a single-pod gang. Log so a missing size on a multi-pod
+	// workload is visible rather than a silent gang-of-1.
+	log.Printf("[fluence-webhook] group %s: no group-size annotation and no owning Job parallelism; defaulting minCount=1", webhook.GroupName(pod))
+	return 1
+}
+
+// ownerJobN returns the parallelism (== size N) of the indexed Job that owns the
+// pod, or 0 if there is no such owner. The Flux Operator sets a MiniCluster's
+// Job Parallelism == Completions == size, so this is the full gang size N.
+// Shared by the gang handler (classical: minCount = N) and the quantum handler
+// (split: leader group = 1, worker group = N-1).
+func ownerJobN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "Job" { + continue + } + job, err := c.BatchV1().Jobs(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if job.Spec.Parallelism != nil && *job.Spec.Parallelism > 0 { + return *job.Spec.Parallelism + } + if job.Spec.Completions != nil && *job.Spec.Completions > 0 { + return *job.Spec.Completions + } + } + return 0 +} + // schedulingGroupOps links a pod to its PodGroup via the native 1.36 field // spec.schedulingGroup.podGroupName. Idempotent if already linked. func schedulingGroupOps(pod *corev1.Pod, group string) []spec.Op { diff --git a/pkg/webhook/handlers/gang_test.go b/pkg/webhook/handlers/gang_test.go new file mode 100644 index 0000000..ac027f8 --- /dev/null +++ b/pkg/webhook/handlers/gang_test.go @@ -0,0 +1,153 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// Tests for gang PodGroup minCount: the whole gang (full N) must schedule +// atomically. Regression guard for the bug where every PodGroup was created +// with minCount=1, so a multi-pod gang was "satisfied" by a single pod and the +// rest were stranded (partial placement). +package handlers + +import ( + "context" + "testing" + + "strconv" + + "github.com/converged-computing/fluence/pkg/webhook" + + corev1 "k8s.io/api/core/v1" + + batchv1 "k8s.io/api/batch/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes/fake" +) + +// minCountOf runs the gang handler for the leader pod of a group and returns the +// minCount of the PodGroup the webhook created. 
+func minCountOf(t *testing.T, pod *corev1.Pod) int32 {
+	t.Helper()
+	// With no seed objects the shared helper builds the same empty fake
+	// clientset, and both helpers then apply identical PodGroup checks.
+	return minCountWithClient(t, pod)
+}
+
+// minCountWithClient runs the gang handler with a pre-seeded clientset (so the
+// owning Job exists) and returns the created PodGroup's minCount. It fails the
+// test, rather than panicking on a nil dereference, when the PodGroup is missing
+// or carries no gang scheduling policy.
+func minCountWithClient(t *testing.T, pod *corev1.Pod, objs ...interface{}) int32 {
+	t.Helper()
+	ctx := context.Background()
+	cs := fake.NewSimpleClientset(toRuntime(objs)...)
+	m := &webhook.Mutator{Clientset: cs}
+	m.Mutate(ctx, pod)
+	pg, err := cs.SchedulingV1alpha2().PodGroups(pod.Namespace).
+		Get(ctx, webhook.GroupName(pod), metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("PodGroup not created: %v", err)
+	}
+	if pg.Spec.SchedulingPolicy.Gang == nil {
+		t.Fatal("PodGroup has no gang scheduling policy")
+	}
+	return pg.Spec.SchedulingPolicy.Gang.MinCount
+}
+
+// jobWithParallelism builds a Job fixture whose Parallelism and Completions are
+// both n. Each field gets its own variable so the two pointers do not alias.
+func jobWithParallelism(ns, name string, n int32) *batchv1.Job {
+	parallelism, completions := n, n
+	return &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &parallelism, Completions: &completions},
+	}
+}
+
+// ownedBy appends an owner reference of the given kind and name to the pod.
+func ownedBy(pod *corev1.Pod, kind, name string) {
+	pod.OwnerReferences = append(pod.OwnerReferences,
+		metav1.OwnerReference{Kind: kind, Name: name})
+}
+
+// No annotation, but the pod is owned by an indexed Job with parallelism N
+// (the Flux Operator MiniCluster case: Parallelism == Completions == size == N).
+// minCount must come from the Job.
+func TestGangMinCountDerivedFromOwningJob(t *testing.T) {
+	pod := cpuPod("fluence")
+	pod.Namespace = "default"
+	pod.Labels = map[string]string{webhook.GroupLabel: "mc-gang"}
+	ownedBy(pod, "Job", "mc-gang-job")
+	got := minCountWithClient(t, pod, jobWithParallelism("default", "mc-gang-job", 4))
+	if got != 4 {
+		t.Errorf("owner-derived: minCount=%d, want 4 (from Job parallelism)", got)
+	}
+}
+
+// The explicit annotation OVERRIDES the owning Job's parallelism (the override
+// exists precisely because minCount may differ from the parent replica count).
+func TestGangMinCountAnnotationOverridesOwner(t *testing.T) {
+	pod := cpuPod("fluence")
+	pod.Namespace = "default"
+	pod.Labels = map[string]string{webhook.GroupLabel: "ovr-gang"}
+	pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"}
+	ownedBy(pod, "Job", "ovr-gang-job")
+	got := minCountWithClient(t, pod, jobWithParallelism("default", "ovr-gang-job", 8))
+	if got != 2 {
+		t.Errorf("annotation override: minCount=%d, want 2 (annotation wins over Job=8)", got)
+	}
+}
+
+// atoi32 parses a decimal test literal as an int32. It panics on malformed or
+// out-of-range input so a typo in a test table fails loudly instead of quietly
+// comparing against 0.
+func atoi32(s string) int32 {
+	v, err := strconv.ParseInt(s, 10, 32)
+	if err != nil {
+		panic("atoi32: " + err.Error())
+	}
+	return int32(v)
+}
+
+// toRuntime converts seed fixtures to runtime.Objects for the fake clientset. A
+// value that is not a runtime.Object is a mistake in the test itself, so it
+// panics rather than dropping the fixture and letting the test run without it.
+func toRuntime(objs []interface{}) []runtime.Object {
+	out := make([]runtime.Object, 0, len(objs))
+	for _, o := range objs {
+		ro, ok := o.(runtime.Object)
+		if !ok {
+			panic("toRuntime: seed object does not implement runtime.Object")
+		}
+		out = append(out, ro)
+	}
+	return out
+}
+
+// A classical gang of size N must get minCount = N so the whole group schedules
+// atomically (this is the core multi-gang fix).
+func TestGangMinCountEqualsGroupSize(t *testing.T) {
+	for _, n := range []string{"2", "4", "8"} {
+		// One subtest per size so a failure names the size that broke. The
+		// subtests run sequentially, so capturing n in the closure is safe.
+		t.Run("group-size="+n, func(t *testing.T) {
+			pod := cpuPod("fluence")
+			pod.Namespace = "default"
+			pod.Labels = map[string]string{webhook.GroupLabel: "g-" + n}
+			pod.Annotations = map[string]string{webhook.GroupSizeAnnotation: n}
+			got := minCountOf(t, pod)
+			want := atoi32(n)
+			if got != want {
+				t.Errorf("group-size=%s: minCount=%d, want %d", n, got, want)
+			}
+		})
+	}
+}
+
+// No group-size annotation -> minCount falls back to 1 (single-pod gang).
+func TestGangMinCountDefaultsToOne(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "g-default"} + if got := minCountOf(t, pod); got != 1 { + t.Errorf("absent group-size: minCount=%d, want 1", got) + } +} + +// group-size is the authoritative gang minCount: a workload that sets it to N +// gets minCount=N (the whole gang schedules atomically), regardless of any owner +// replica count. In the gang+submitter model the full workload IS the gang — +// there is no N-1 worker split. +func TestGangMinCountHonorsGroupSize(t *testing.T) { + pod := cpuPod("fluence") + pod.Namespace = "default" + pod.Labels = map[string]string{webhook.GroupLabel: "q-gang"} + pod.Annotations = map[string]string{ + webhook.GroupSizeAnnotation: "4", // full gang size + } + if got := minCountOf(t, pod); got != 4 { + t.Errorf("group-size gang: minCount=%d, want 4 (full N)", got) + } +} diff --git a/pkg/webhook/handlers/handlers_test.go b/pkg/webhook/handlers/handlers_test.go index 04d0e02..dee0746 100644 --- a/pkg/webhook/handlers/handlers_test.go +++ b/pkg/webhook/handlers/handlers_test.go @@ -2,6 +2,7 @@ package handlers import ( "context" + "strings" "testing" "github.com/converged-computing/fluence/pkg/placement" @@ -9,10 +10,7 @@ import ( "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" - schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" ) // ── fixtures ──────────────────────────────────────────────────────────────────── @@ -83,16 +81,29 @@ func hasGateOp(ops []spec.Op) bool { return false } +// hasDropQuantumResourceOp reports whether ops remove the Fluxion quantum +// resource from a container's requests or limits (the consumer qpu strip). 
+func hasDropQuantumResourceOp(ops []spec.Op) bool { + for _, op := range ops { + if op.Op == "remove" && strings.HasSuffix(op.Path, "qpu") && + (strings.Contains(op.Path, "/resources/requests/") || + strings.Contains(op.Path, "/resources/limits/")) { + return true + } + } + return false +} + func hasSidecarOp(ops []spec.Op) bool { for _, op := range ops { switch v := op.Value.(type) { case corev1.Container: - if v.Name == "fluence-sidecar" { + if v.Name == SidecarContainerName { return true } case []corev1.Container: for _, c := range v { - if c.Name == "fluence-sidecar" { + if c.Name == SidecarContainerName { return true } } @@ -127,238 +138,6 @@ func TestMutateSkipsNonFluxion(t *testing.T) { } } -// ── quantum handler: submitter ────────────────────────────────────────────────── - -func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { - m := &webhook.Mutator{AttributeKeys: []string{"region"}} - ops := m.Mutate(context.Background(), qpuPod("fluence")) - names := opEnvNames(ops) - if !contains(names, "FLUXION_BACKEND") { - t.Errorf("want FLUXION_BACKEND, got %v", names) - } - if !contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { - t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) - } - if hasSidecarOp(ops) { - t.Error("standalone quantum pod should not get a sidecar") - } - if hasGateOp(ops) { - t.Error("standalone quantum pod should not be gated") - } -} - -func TestObserveLabelInjectsSidecar(t *testing.T) { - m := &webhook.Mutator{} - pod := qpuPod("fluence") - pod.Labels = map[string]string{ObserveLabel: "true"} - ops := m.Mutate(context.Background(), pod) - if !hasSidecarOp(ops) { - t.Error("observe-labeled quantum pod should get the sidecar") - } - if hasGateOp(ops) { - t.Error("observe-only pod should not be gated") - } -} - -// ── quantum handler: worker gating ────────────────────────────────────────────── - -func quantumGroupFixture(ns, group, leaderName string) *fake.Clientset { - pg := 
&schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leaderName}, - }, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leaderName - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - return fake.NewSimpleClientset(pg, leaderPod) -} - -func TestClassicalWorkerInQuantumGroupIsGated(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - worker := cpuPod("fluence") - worker.Name = "qaoa-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Errorf("classical worker in a quantum group should be gated; ops=%v", ops) - } - if hasSidecarOp(ops) { - t.Error("worker should not get a sidecar") - } -} - -func TestClassicalGangWorkerNotGated(t *testing.T) { - ns, group, leader := "default", "classical", "classical-leader" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, - } - leaderPod := cpuPod("fluence") - leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - worker := cpuPod("fluence") - worker.Name = "classical-worker-0" - worker.Namespace = ns - worker.Labels = map[string]string{webhook.GroupLabel: group} - - if hasGateOp(m.Mutate(context.Background(), worker)) { - t.Error("worker in a classical gang must NOT be gated (would deadlock)") - } -} - -// Pod-template gang: every pod requests QPU; only the recorded leader gets the -// sidecar, the rest are gated workers (role by admission order, not request). 
-func TestPodTemplateGangSecondPodIsWorker(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-abc123" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: leader}}, - } - leaderPod := qpuPod("fluence") - leaderPod.Name = leader - leaderPod.Namespace = ns - leaderPod.Labels = map[string]string{webhook.GroupLabel: group} - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg, leaderPod)} - - second := qpuPod("fluence") // identical spec, requests QPU - second.Name = "qaoa-def456" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - - ops := m.Mutate(context.Background(), second) - if !hasGateOp(ops) { - t.Error("second pod in a pod-template gang must be gated as a worker") - } - if hasSidecarOp(ops) { - t.Error("second pod must NOT get a sidecar (it is a worker)") - } -} - -// ── quantum handler: explicit role annotation ────────────────────────────────── -// -// These cover the fluence.flux-framework.org/role annotation, which makes the -// leader/worker split EXPLICIT rather than inferred by admission order. When the -// annotation is present it is authoritative; the same value is echoed to the -// container as FLUENCE_ROLE so the app reads the role Fluence used. - -// roledQPUPod is a QPU-requesting pod in a group with an explicit role. -func roledQPUPod(ns, group, name, role string) *corev1.Pod { - p := qpuPod("fluence") - p.Name = name - p.Namespace = ns - p.Labels = map[string]string{webhook.GroupLabel: group} - p.Annotations = map[string]string{webhook.RoleAnnotation: role} - return p -} - -// An explicitly-declared leader gets the sidecar and is NOT gated — even though -// no leader is recorded on the PodGroup (admission order never consulted). 
-func TestExplicitLeaderGetsSidecarNotGated(t *testing.T) { - ns, group := "default", "qaoa" - // fixture with NO LeaderAnnotation recorded — proves we don't rely on it. - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - leader := roledQPUPod(ns, group, "qaoa-leader", RoleLeader) - ops := m.Mutate(context.Background(), leader) - if hasGateOp(ops) { - t.Error("explicit leader must NOT be gated") - } - if !hasSidecarOp(ops) { - t.Error("explicit leader must get the sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("leader must get FLUENCE_ROLE injected for the app to read") - } -} - -// An explicitly-declared worker is gated and gets no sidecar — even if it -// requests the QPU resource itself and even if it (wrongly) appears as the -// recorded leader. The annotation overrides both. -func TestExplicitWorkerIsGatedRegardlessOfAdmission(t *testing.T) { - ns, group := "default", "qaoa" - // Adversarial fixture: record THIS worker's own name as the admission-order - // leader. The explicit role:worker must still win and gate it. 
- worker := roledQPUPod(ns, group, "qaoa-worker-0", RoleWorker) - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns, - Annotations: map[string]string{webhook.LeaderAnnotation: worker.Name}}, - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - ops := m.Mutate(context.Background(), worker) - if !hasGateOp(ops) { - t.Error("explicit worker must be gated even if mis-recorded as the admission-order leader") - } - if hasSidecarOp(ops) { - t.Error("explicit worker must NOT get a sidecar") - } - if !contains(opEnvNames(ops), "FLUENCE_ROLE") { - t.Error("worker must get FLUENCE_ROLE injected so the app knows it is a worker") - } -} - -// A heterogeneous gang declared with explicit roles resolves to exactly one -// leader (sidecar, ungated) and the rest workers (gated) — independent of the -// order in which the webhook admits the pods. This is the property a -// leader/worker quantum gang needs and that admission order cannot guarantee. 
-func TestExplicitRolesResolveRegardlessOfOrder(t *testing.T) { - ns, group := "default", "qaoa" - pg := &schedulingv1alpha2.PodGroup{ - ObjectMeta: metav1.ObjectMeta{Name: group, Namespace: ns}, // no recorded leader - } - m := &webhook.Mutator{Clientset: fake.NewSimpleClientset(pg)} - - pods := []*corev1.Pod{ - roledQPUPod(ns, group, "w0", RoleWorker), - roledQPUPod(ns, group, "leader", RoleLeader), - roledQPUPod(ns, group, "w1", RoleWorker), - } - var leaders, workers int - for _, p := range pods { // any admission order - ops := m.Mutate(context.Background(), p) - switch { - case hasSidecarOp(ops) && !hasGateOp(ops): - leaders++ - case hasGateOp(ops) && !hasSidecarOp(ops): - workers++ - default: - t.Fatalf("pod %s resolved to neither a clean leader nor worker", p.Name) - } - } - if leaders != 1 || workers != 2 { - t.Fatalf("want 1 leader + 2 workers, got %d leaders / %d workers", leaders, workers) - } -} - -// Backwards compatibility: with NO role annotation, the leader is still chosen -// by admission order (the recorded PodGroup leader), exactly as before. 
-func TestNoRoleAnnotationFallsBackToAdmissionOrder(t *testing.T) { - ns, group, leader := "default", "qaoa", "qaoa-leader" - m := &webhook.Mutator{Clientset: quantumGroupFixture(ns, group, leader)} - - // a second pod with no role annotation, not the recorded leader -> worker - second := qpuPod("fluence") - second.Name = "qaoa-second" - second.Namespace = ns - second.Labels = map[string]string{webhook.GroupLabel: group} - if !hasGateOp(m.Mutate(context.Background(), second)) { - t.Error("without a role annotation, a non-leader group member must be gated by admission order") - } -} - // ── gang handler: scheduling group linkage ────────────────────────────────────── func TestGangStampsSchedulingGroup(t *testing.T) { diff --git a/pkg/webhook/handlers/quantum.go b/pkg/webhook/handlers/quantum.go index 97fbfa6..26a09b0 100644 --- a/pkg/webhook/handlers/quantum.go +++ b/pkg/webhook/handlers/quantum.go @@ -4,11 +4,16 @@ import ( "context" "fmt" "log" + "os" + "strconv" + "strings" "github.com/converged-computing/fluence/pkg/webhook" "github.com/converged-computing/fluence/pkg/webhook/spec" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -16,188 +21,604 @@ func init() { webhook.Register(&quantumHandler{}) } -// Quantum-specific policy. The webhook core knows NONE of these — they live -// only here, in the quantum handler. +// Quantum-specific policy. The webhook core knows NONE of these — they live only +// here, in the quantum handler. +// +// Model (producer/consumer split, no separate submitter pod). A quantum task's +// circuit comes from user code, so the pod that defines a task must RUN to submit +// it — submit and gate are mutually exclusive per pod. Gating therefore only +// helps pods that do NOT submit. 
A quantum gang in CoordinationShared mode is +// split, per pod, into two roles decided at admission: +// +// - PRODUCER (one member, the indexed-Job completion index 0): its own +// group-of-one -producer (minCount 1) so it schedules alone and runs +// the SINGLE real submit; staged with the interceptor in REAL (tag) mode and +// given the sidecar, which polls the task and ungates the consumers at +// position==1. NOT gated. The producer is one of the N members, so the +// application is run exactly N times — never N+1. +// - CONSUMERS (the other N-1 members): the gang (minCount N-1), each +// gated on quantum.braket/ready + preempting priority, told its role via +// FLUENCE_COORDINATION_ROLE=consumer and handed the producer's task id via +// FLUENCE_QUANTUM_JOB_ID. A consumer does NOT submit; it fetches the shared +// result by that id. Ungated together when the producer's task is ready. +// +// In CoordinationIndependent mode (the default) there is no shared result to +// coordinate: every member is its own standalone producer (real submit, no gate), +// each owning its task and its own queue wait. A lone quantum pod (no group) is +// always standalone. const ( - // QuantumResource is the Fluxion resource a pod requests when it wants - // Fluence to schedule quantum work. Requesting it is the trigger for sidecar - // + interceptor injection. + // QuantumResource is the Fluxion resource a pod requests to ask Fluence to + // schedule quantum work. Requesting it is the sole trigger for this handler. QuantumResource = "fluxion.flux-framework.org/qpu" - // QuantumGate holds a classical worker until the leader's quantum task is - // ready (the sidecar removes it). + // QuantumGate holds a consumer pod unscheduled until the producer's task is + // ready (the producer's sidecar removes it). 
QuantumGate = "quantum.braket/ready" - // ObserveLabel opts a standalone quantum pod into observe-only telemetry: - // the sidecar is injected and polls queue position but ungates nothing. + // ObserveLabel opts a STANDALONE quantum pod (a group of one) into + // observe-only telemetry: the sidecar is injected and polls queue position + // but ungates nothing. ObserveLabel = "fluence.flux-framework.org/observe" - // Role values for webhook.RoleAnnotation. - RoleLeader = "leader" - RoleWorker = "worker" + // DependencyKindQuantumSubmit is the readiness Kind for the quantum resource + // type: consumer pods wait for a quantum submission to reach the device queue. + // First concrete instance of the general Dependency primitive (dependency.go). + DependencyKindQuantumSubmit = "quantum-submit" + + // CoordinationAnnotation selects how a quantum gang is coordinated. It is an + // open enum so future designs (e.g. index-paired "scatter") add a mode + // without changing the mechanism. + CoordinationAnnotation = "fluence.flux-framework.org/coordination" + + // CoordinationShared: one real task; the producer (index 0) submits and the + // other members are gated consumers that fetch the producer's result. Each + // member is told its role via FLUENCE_COORDINATION_ROLE; a role-aware workload + // branches on it (producer submits, consumer fetches by FLUENCE_QUANTUM_JOB_ID). + CoordinationShared = "shared" + + // CoordinationIndependent (default): every member does its own quantum work; + // no coordination, no gating. Never invent coordination the user did not ask + // for, and never dedup tasks meant to be distinct. + CoordinationIndependent = "independent" + + // ProducerGroupSuffix names the producer's own group-of-one: -producer + // (minCount 1) so it schedules alone and never deadlocks against the gated + // consumer gang. 
+ ProducerGroupSuffix = "-producer" + + // CompletionIndexAnnotation is the indexed-Job completion index the Job + // controller stamps on each pod; index "0" is the producer (deterministic + // election with no recorded state). + CompletionIndexAnnotation = "batch.kubernetes.io/job-completion-index" + + // ProducerIndex is the completion index promoted to producer. + ProducerIndex = "0" + + // GangGroupEnv tells the producer's sidecar which consumer group label to list + // and ungate when the task is ready. + GangGroupEnv = "FLUENCE_GANG_GROUP" ) -// quantumHandler coordinates quantum-classical workflows. It applies to a pod -// in either role: -// - the quantum submitter (requests QuantumResource): inject the interceptor, -// plus the sidecar when there is coordination to do (group leader, or -// observe-only telemetry requested); -// - a classical worker (a non-leader member of a group whose leader is a -// quantum pod): gate it until the leader's task is ready. -// -// This is the only place in the webhook that knows about quantum resources, -// gates, or observe semantics. +// quantumHandler splits a shared quantum gang into a single producer (real +// submit + sidecar) and N-1 gated, role-aware consumers, or runs every member +// standalone in independent mode (see the package-level model comment). It is the +// only place in the webhook that knows about quantum resources, gates, +// coordination, or observe semantics. type quantumHandler struct{} func (h *quantumHandler) Name() string { return "quantum" } +// Applies to any pod requesting the quantum resource. Producers, consumers, and +// standalone quantum pods all request it; nothing without the resource needs +// quantum handling, so this is the single, unambiguous trigger. 
func (h *quantumHandler) Applies(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - if spec.PodRequestsResource(pod, QuantumResource) { - return true + return spec.PodRequestsResource(pod, QuantumResource) +} + +func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { + g := resolveGroup(pod) + n := resolveGangSize(ctx, m, pod, g) + mode := coordinationMode(pod) + observe := spec.Label(pod, ObserveLabel) == "true" + + // No coordination: a standalone quantum pod, or an explicitly independent + // member. The REAL submit happens in THIS pod; the sidecar is added only for + // observe-only telemetry. (independent mode routes every member here -> N + // standalone producers, each owning its task and its own queue wait.) + if mode != CoordinationShared || g == "" || n <= 1 { + ops := interceptorOps(pod) + if observe { + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + ops = append(ops, sc.ContainerOps(pod, true, nil)...) + } + log.Printf("[fluence-webhook] quantum %s/%s mode=%s (standalone/independent, observe=%v)", + pod.Namespace, pod.Name, mode, observe) + return ops + } + + // shared mode: promote one member to producer; the rest are gated consumers. + if isProducer(ctx, m, pod, g) { + return h.mutateProducer(ctx, m, pod, g) + } + return h.mutateConsumer(ctx, m, pod, g, n) +} + +// mutateProducer wires the single producer member (indexed-Job completion index +// 0): its own group-of-one -producer (minCount 1) so it schedules alone +// and runs the REAL submit, the interceptor in tag mode, RBAC, and the sidecar +// told which consumer group to ungate (FLUENCE_GANG_GROUP). The producer is one +// of the N members, so the application is NOT run an extra time. Never gated. 
+func (h *quantumHandler) mutateProducer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) []spec.Op { + pg := group + ProducerGroupSuffix + m.EnsurePodGroup(ctx, pod.Namespace, pg, pod.Name, 1) + ops := linkGroupOps(pod, pg) + ops = append(ops, interceptorOps(pod)...) // tag mode: the producer submits for real + ops = append(ops, roleEnvOps(pod, RoleProducer)...) // FLUENCE_COORDINATION_ROLE=producer + sc := sidecarFor(m) + sc.EnsureRBAC(ctx, pod.Namespace) + extra := []corev1.EnvVar{{Name: GangGroupEnv, Value: group}} + ops = append(ops, sc.ContainerOps(pod, false, extra)...) + log.Printf("[fluence-webhook] quantum producer %s/%s — group %s (ungates consumers %q)", + pod.Namespace, pod.Name, pg, group) + return ops +} + +// mutateConsumer wires a non-producer member: it joins the consumer gang +// (minCount N-1) and is gated until the producer's task is ready. It is told its +// role (FLUENCE_COORDINATION_ROLE=consumer) and handed the producer's task id +// (FLUENCE_QUANTUM_JOB_ID, stamped on the pod by the sidecar at ungate). A +// role-aware consumer reads those and fetches the shared result instead of +// submitting — so the consumer never calls the vendor submit, and needs neither +// the interceptor nor a faux flag. +func (h *quantumHandler) mutateConsumer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string, n int32) []spec.Op { + m.EnsurePodGroup(ctx, pod.Namespace, group, pod.Name, n-1) + ops := linkGroupOps(pod, group) + // Express the wait as the GENERAL dependency primitive: this consumer depends + // on the quantum submission produced by -producer, held by the quantum + // gate. applyOps gates the pod, raises priority, and stamps depends-on-*. + dep := Dependency{Kind: DependencyKindQuantumSubmit, Producer: group + ProducerGroupSuffix, Gate: QuantumGate} + ops = append(ops, dep.applyOps(pod)...) + ops = append(ops, consumerEnvOps(pod)...) 
+ // A gated consumer never runs the QPU task — it only fetches the producer's + // shared result — so it must not hold the Fluxion quantum resource. Leaving it + // would make Fluxion allocate a qpu per consumer, capping the gang at the + // backend's graph qpu count and, on a single-slot real QPU, leaving the + // consumers unschedulable. Applies() already routed this pod on the request, so + // stripping it here is safe. + ops = append(ops, dropQuantumResourceOps(pod)...) + log.Printf("[fluence-webhook] quantum consumer %s/%s — group %s minCount=%d, gated (role=consumer, qpu stripped)", + pod.Namespace, pod.Name, group, n-1) + return ops +} + +// dropQuantumResourceOps removes the Fluxion quantum resource from a consumer's +// containers (requests and limits), returning the patch ops and mutating pod in +// place. Only entries that are present are removed (a JSON-patch remove on a +// missing path would fail). The sidecar container is never a consumer concern. +func dropQuantumResourceOps(pod *corev1.Pod) []spec.Op { + rn := corev1.ResourceName(QuantumResource) + // JSON Pointer escaping for the resource key: '~' -> '~0', '/' -> '~1'. 
+ key := strings.ReplaceAll(strings.ReplaceAll(QuantumResource, "~", "~0"), "/", "~1") + var ops []spec.Op + for i, c := range pod.Spec.Containers { + if c.Name == SidecarContainerName { + continue + } + if _, ok := c.Resources.Requests[rn]; ok { + ops = append(ops, spec.Op{Op: "remove", + Path: fmt.Sprintf("/spec/containers/%d/resources/requests/%s", i, key)}) + delete(pod.Spec.Containers[i].Resources.Requests, rn) + } + if _, ok := c.Resources.Limits[rn]; ok { + ops = append(ops, spec.Op{Op: "remove", + Path: fmt.Sprintf("/spec/containers/%d/resources/limits/%s", i, key)}) + delete(pod.Spec.Containers[i].Resources.Limits, rn) + } } - // An explicitly-declared worker applies (so it gets gated) even if it - // doesn't request the quantum resource and the leader isn't recorded yet — - // this removes the admission-order race for explicitly-roled gangs. - if webhook.Role(pod) == RoleWorker && webhook.GroupName(pod) != "" { - return true + return ops +} + +// coordinationMode reads the coordination annotation; default independent. +func coordinationMode(pod *corev1.Pod) string { + if v := spec.Annotation(pod, CoordinationAnnotation); v != "" { + return v } - return h.isWorkerOfQuantumGroup(ctx, m, pod) + return CoordinationIndependent } -// isWorkerOfQuantumGroup reports whether pod is a non-leader member of a group -// whose recorded leader is a quantum (QuantumResource-requesting) pod. Workers -// are classical and do not request the resource themselves, so their role is a -// property of group membership, resolved against cluster state. -func (h *quantumHandler) isWorkerOfQuantumGroup(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) bool { - g := webhook.GroupName(pod) - if g == "" || m.Client() == nil { - return false +// isProducer decides whether THIS pod is the gang's single producer. Indexed Job +// (recommended): completion index 0 is the producer — deterministic, race-free, +// no recorded state. 
Otherwise: first arrival claims the producer slot by the +// absence of the producer PodGroup (best-effort under concurrent admission; +// prefer an indexed Job for determinism). Indexing a nil annotations map yields +// ok=false, so the indexed branch is nil-safe. +func isProducer(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) bool { + if idx, ok := pod.Annotations[CompletionIndexAnnotation]; ok { + return idx == ProducerIndex } - leader := m.PodGroupLeader(ctx, pod.Namespace, g) - if leader == "" || leader == pod.Name { - return false + c := m.Client() + if c == nil { + return true // tests / no client: treat as producer } - lp, err := m.Client().CoreV1().Pods(pod.Namespace).Get(ctx, leader, metav1.GetOptions{}) - if err != nil { - return false + pg := group + ProducerGroupSuffix + if _, err := c.SchedulingV1alpha2().PodGroups(pod.Namespace).Get(ctx, pg, metav1.GetOptions{}); err == nil { + return false // already claimed by an earlier arrival } - return spec.PodRequestsResource(lp, QuantumResource) + return true } -func (h *quantumHandler) Mutate(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) []spec.Op { - g := webhook.GroupName(pod) - - // Determine role. An explicit role annotation is AUTHORITATIVE: the workload - // declares which pod leads and which wait, and Fluence honors it directly — - // no admission-order race, and the same value is echoed to the app as - // FLUENCE_ROLE so the webhook's notion of leader and the application's notion - // cannot disagree. When the annotation is absent, fall back to the legacy - // behavior: role is decided by admission order (the first pod admitted in the - // group, recorded on the PodGroup by the gang handler). The admission-order - // path suits a homogeneous pod-template gang where every pod is identical; - // the explicit annotation suits a heterogeneous leader/worker gang. 
- role := webhook.Role(pod) - var isWorker bool - switch role { - case RoleWorker: - isWorker = true - case RoleLeader: - isWorker = false - default: - if g != "" { - leader := m.PodGroupLeader(ctx, pod.Namespace, g) - isWorker = leader != "" && leader != pod.Name - } - } - - if g != "" && isWorker { - log.Printf("[fluence-webhook] quantum worker %s/%s (role=%q) — gating", - pod.Namespace, pod.Name, role) - ops := gateOps(pod) - ops = append(ops, roleEnvOps(pod, RoleWorker)...) - return ops +// resolveGroup returns the gang group identity: the explicit group label, else +// the owning controller's name (Job/ReplicaSet/StatefulSet — a Deployment's pods +// are owned by a ReplicaSet), else "" (a loose quantum pod with no group, which +// is treated as a standalone group of one). +func resolveGroup(pod *corev1.Pod) string { + if g := webhook.GroupName(pod); g != "" { + return g + } + for _, ref := range pod.OwnerReferences { + switch ref.Kind { + case "Job", "ReplicaSet", "StatefulSet": + return ref.Name + } } + return "" +} - // Submitter/leader role: recorded or declared group leader, or a standalone - // quantum pod. Always gets the interceptor (so its task is tagged). It gets - // the SIDECAR only when there is coordination to do: it is a group leader - // (workers to ungate), or observe-only telemetry is requested. - isLeader := g != "" - observe := spec.Label(pod, ObserveLabel) == "true" +// resolveGangSize returns the full gang size N: the explicit group-size +// annotation (authoritative override), else the owner's replica count (Job +// parallelism/completions, ReplicaSet replicas), else a count of pods already +// carrying the group label (best-effort for loose grouped pods; admission-order +// dependent, which is why the annotation is preferred), else 1. 
+func resolveGangSize(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod, group string) int32 { + if pod.Annotations != nil { + if v, err := strconv.Atoi(pod.Annotations[webhook.GroupSizeAnnotation]); err == nil && v > 0 { + return int32(v) + } + } + if n := ownerJobN(ctx, m, pod); n > 0 { + return n + } + if n := ownerReplicaSetN(ctx, m, pod); n > 0 { + return n + } + if group != "" { + if n := countGroupPods(ctx, m, pod.Namespace, group); n > 0 { + return n + } + } + return 1 +} - log.Printf("[fluence-webhook] quantum pod %s/%s — interceptor (leader=%v role=%q observe=%v)", - pod.Namespace, pod.Name, isLeader, role, observe) +// ownerReplicaSetN returns the replica count of the ReplicaSet that owns the pod +// (the Deployment case: Deployment -> ReplicaSet -> Pod), or 0 if none. +func ownerReplicaSetN(ctx context.Context, m webhook.MutatorAPI, pod *corev1.Pod) int32 { + c := m.Client() + if c == nil { + return 0 + } + for _, ref := range pod.OwnerReferences { + if ref.Kind != "ReplicaSet" { + continue + } + rs, err := c.AppsV1().ReplicaSets(pod.Namespace).Get(ctx, ref.Name, metav1.GetOptions{}) + if err != nil { + return 0 + } + if rs.Spec.Replicas != nil && *rs.Spec.Replicas > 0 { + return *rs.Spec.Replicas + } + } + return 0 +} - ops := m.InterceptorOps(pod) - ops = append(ops, roleEnvOps(pod, RoleLeader)...) - if isLeader || observe { - m.EnsureSidecarRBAC(ctx, pod.Namespace) - ops = append(ops, m.SidecarContainerOps(pod, observe)...) +// countGroupPods counts pods already carrying the group label (best-effort gang +// size for loose grouped pods that have neither a group-size annotation nor an +// owning controller). Admission-order dependent — prefer the group-size +// annotation when the exact size must be guaranteed. 
+func countGroupPods(ctx context.Context, m webhook.MutatorAPI, namespace, group string) int32 { + c := m.Client() + if c == nil { + return 0 } - return ops + list, err := c.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: webhook.GroupLabel + "=" + group, + }) + if err != nil { + return 0 + } + return int32(len(list.Items)) } -// roleEnvOps injects FLUENCE_ROLE into every (non-sidecar) container so the -// application reads its gang role from the same source of truth the webhook -// used. effectiveRole is what the webhook decided (leader/worker), used only -// when the pod carries no explicit role annotation; when the annotation is -// present we source the value from it via the downward API so the two always -// agree. Unlike InterceptorOps, this is NOT limited to Fluxion-resource -// containers — worker containers do not request the quantum resource but still -// need to know they are workers. -func roleEnvOps(pod *corev1.Pod, effectiveRole string) []spec.Op { - var value corev1.EnvVar - if webhook.Role(pod) != "" { - value = spec.AnnotationEnv("FLUENCE_ROLE", webhook.RoleAnnotation) - } else { - value = corev1.EnvVar{Name: "FLUENCE_ROLE", Value: effectiveRole} +// linkGroupOps ensures the gang pod carries the group label (so the producer's +// sidecar can list it) and is linked to the gang PodGroup via +// spec.schedulingGroup.podGroupName. Idempotent. 
+func linkGroupOps(pod *corev1.Pod, group string) []spec.Op { + var ops []spec.Op + if webhook.GroupName(pod) != group { + if pod.Labels == nil { + ops = append(ops, spec.Op{Op: "add", Path: "/metadata/labels", + Value: map[string]string{webhook.GroupLabel: group}}) + } else { + ops = append(ops, spec.Op{Op: "add", + Path: "/metadata/labels/" + escapeJSONPointer(webhook.GroupLabel), + Value: group}) + } } + if pod.Spec.SchedulingGroup == nil || pod.Spec.SchedulingGroup.PodGroupName == nil || + *pod.Spec.SchedulingGroup.PodGroupName != group { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGroup", + Value: map[string]string{"podGroupName": group}}) + } + return ops +} + +// escapeJSONPointer escapes "~" and "/" for use in a JSON Pointer path segment. +func escapeJSONPointer(s string) string { + s = strings.ReplaceAll(s, "~", "~0") + s = strings.ReplaceAll(s, "/", "~1") + return s +} + +const QuantumClassicalPriorityClass = "fluence-quantum-classical" + +// ── coordination role (producer / consumer) ───────────────────────────────────── +// +// In a shared gang each member is told its role positively, so the application +// branches on it instead of relying on any submit-interception magic: +// producer submits the one real task (and is tagged so the sidecar finds it); +// consumer does NOT submit — it reads the producer's task id and fetches the +// shared result (e.g. via the vendor's S3-backed result API). +// The role is decided at admission by isProducer (completion index 0, else the +// producer-group claim) and surfaced as FLUENCE_COORDINATION_ROLE. Because the +// election is the webhook's, this env is the single source of truth — the +// container never re-derives its role from the Job index (which loose, non-Job +// pods don't even have). + +const ( + // CoordinationRoleEnv carries the pod's role in a shared gang. A role-aware + // workload branches on it: RoleProducer submits, RoleConsumer fetches the + // shared result by id. 
Unset for standalone/independent pods (they all submit). + CoordinationRoleEnv = "FLUENCE_COORDINATION_ROLE" + RoleProducer = "producer" + RoleConsumer = "consumer" + + // QuantumJobIDAnnotation is the vendor-neutral task id the ungating sidecar + // stamps on each consumer (mirrors python/fluence/ungate.py JOB_ID_ANNOTATION), + // BEFORE removing the gate. Surfaced into FLUENCE_QUANTUM_JOB_ID via the + // downward API so a consumer can fetch the producer's result by id. + QuantumJobIDAnnotation = "fluence.flux-framework.org/quantum-job-id" + + // QuantumJobIDEnv is the env a consumer reads for the producer's task id. + QuantumJobIDEnv = "FLUENCE_QUANTUM_JOB_ID" +) + +// roleEnvOps sets FLUENCE_COORDINATION_ROLE= on each non-sidecar container. +func roleEnvOps(pod *corev1.Pod, role string) []spec.Op { + return setContainerEnvOps(pod, corev1.EnvVar{Name: CoordinationRoleEnv, Value: role}) +} + +// consumerEnvOps tells a consumer its role and hands it the producer's task id +// (FLUENCE_QUANTUM_JOB_ID, downward API from the annotation the ungating sidecar +// stamps). A consumer never submits, so it gets neither the interceptor nor any +// faux flag — just its role and the id to fetch the shared result with. +func consumerEnvOps(pod *corev1.Pod) []spec.Op { + ops := roleEnvOps(pod, RoleConsumer) + ops = append(ops, setContainerEnvOps(pod, spec.AnnotationEnv(QuantumJobIDEnv, QuantumJobIDAnnotation))...) + return ops +} + +// setContainerEnvOps appends env var e to every non-sidecar container that does +// not already define it, returning the patch ops and mutating pod in place. 
+func setContainerEnvOps(pod *corev1.Pod, e corev1.EnvVar) []spec.Op { var ops []spec.Op for i, c := range pod.Spec.Containers { - if c.Name == "fluence-sidecar" || spec.HasEnv(c, "FLUENCE_ROLE") { + if c.Name == SidecarContainerName || spec.HasEnv(c, e.Name) { continue } if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{value}}) + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{e}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{e} } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: value}) + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: e}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, e) } - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, value) } return ops } -// gateOps adds the quantum scheduling gate (idempotent). -const QuantumClassicalPriorityClass = "fluence-quantum-classical" +// Sidecar implementation — quantum-owned, NOT core. +// +// The fluence coordination sidecar (its container, name, RBAC, image, and the +// Python interceptor staging) is specific to the quantum integration: it polls a +// vendor queue and ungates workers. None of this belongs on the webhook core, +// which stays domain-agnostic and only exposes generic primitives (Client, +// InjectedEnv, EnsurePodGroup). The core invokes each handler's generic Mutate; +// a handler does its own create/edit side-effects (here: RBAC, ConfigMaps, +// container injection) through the generic client. +// +// These are package-level functions (not methods on the core *Mutator) operating +// on the generic webhook.MutatorAPI. coreSidecar (see sidecar.go) delegates to +// them; a future non-quantum handler that needs a different sidecar supplies its +// own Sidecar implementation and its own container name/image. 
-func gateOps(pod *corev1.Pod) []spec.Op { - for _, g := range pod.Spec.SchedulingGates { - if g.Name == QuantumGate { - return nil +const ( + // SidecarContainerName is the injected sidecar container's name. Owned here + // (not a global core const) because the container is quantum-specific. + SidecarContainerName = "fluence-sidecar" + + // SidecarServiceAccount is the ServiceAccount (and Role/RoleBinding) name the + // sidecar uses to patch pods and read PodGroups. + SidecarServiceAccount = "fluence-sidecar" + + // defaultSidecarImage is used when FLUENCE_SIDECAR_IMAGE is not set. Owned by + // the quantum integration; the deployment may override it via the env var. + defaultSidecarImage = "vanessa/fluence-sidecar:latest" + + // StageVolumeName / StageMountPath: the shared emptyDir the init container + // stages the fluence Python package into, mounted into workload containers + // and prepended to PYTHONPATH (Model C delivery). + StageVolumeName = "fluence-pkg" + StageMountPath = "/opt/fluence-staged" +) + +// sidecarImage resolves the sidecar image: the FLUENCE_SIDECAR_IMAGE override +// (deployment config) or the quantum default. Read here so image config is owned +// by the integration that uses it, not the core. +func sidecarImage() string { + if v := os.Getenv("FLUENCE_SIDECAR_IMAGE"); v != "" { + return v + } + return defaultSidecarImage +} + +// ensureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding +// the sidecar uses to patch pods and read PodGroups. Idempotent (create-if-absent). 
+func ensureSidecarRBAC(ctx context.Context, m webhook.MutatorAPI, namespace string) { + c := m.Client() + if c == nil { + return + } + lbl := map[string]string{"app": SidecarServiceAccount} + + if _, err := c.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} + if _, err := c.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) } } + if _, err := c.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Rules: []rbacv1.PolicyRule{ + {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, + {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, + }, + } + if _, err := c.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create Role %s/%s: %v", namespace, SidecarServiceAccount, err) + } + } + if _, err := c.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { + rb := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, + Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, + RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, + } + if _, err := c.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { + log.Printf("[fluence-webhook] could not create RoleBinding 
%s/%s: %v", namespace, SidecarServiceAccount, err) + } + } +} + +// interceptorOps stages the fluence Python package (Model C): an init container +// copies it into a shared emptyDir, mounted into every workload container +// (skipping the sidecar) with PYTHONPATH + FLUENCE_POD_UID, so Python auto-imports +// the interceptor via sitecustomize, which tags the vendor submit so the sidecar +// can find the task. Added to producers and standalone/independent pods (the ones +// that actually submit); consumers don't submit, so they don't get it. +func interceptorOps(pod *corev1.Pod) []spec.Op { var ops []spec.Op - gate := corev1.PodSchedulingGate{Name: QuantumGate} - if len(pod.Spec.SchedulingGates) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates", Value: []corev1.PodSchedulingGate{gate}}) + + vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} + if len(pod.Spec.Volumes) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/schedulingGates/-", Value: gate}) - } - // Give gated classical workers a raised priority so they schedule reliably - // once ungated. priorityClassName is immutable post-creation, so it MUST be - // set here at admission, not at ungate time. Only set it if the pod doesn't - // already declare one (don't overwrite a user's class). - if pod.Spec.PriorityClassName == "" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/priorityClassName", Value: QuantumClassicalPriorityClass}) - // Clear spec.priority so the priority admission controller recomputes it - // from the class. The controller errors only when spec.priority is - // non-nil AND differs from the class value; setting it to null avoids - // that in every case. 
We use add-with-null (not remove): a JSON Patch - // "remove" of an absent path is a hard error, and whether the API has - // already defaulted spec.priority differs across clusters/k8s versions - // (it broke in CI but not on GKE, or vice versa). add-null is valid - // whether the field is absent, 0, or set. - ops = append(ops, spec.Op{Op: "add", Path: "/spec/priority", EmitNull: true}) + ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) + } + + initc := corev1.Container{ + Name: "fluence-stage", + Image: sidecarImage(), + ImagePullPolicy: corev1.PullAlways, + Command: []string{"sh", "-c", + fmt.Sprintf("python3 -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, + VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, + } + if len(pod.Spec.InitContainers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) + } + + mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: true} + pythonpath := corev1.EnvVar{Name: "PYTHONPATH", Value: StageMountPath} + uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") + for i, c := range pod.Spec.Containers { + if c.Name == SidecarContainerName { + continue + } + if len(c.VolumeMounts) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) + } + if !spec.HasEnv(c, "PYTHONPATH") { + if len(c.Env) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) + pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} + } else { + ops = append(ops, spec.Op{Op: 
"add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) + } + } + if !spec.HasEnv(c, "FLUENCE_POD_UID") { + ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) + pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) + } + } + return ops +} + +// sidecarContainerOps adds the fluence sidecar container (pod identity env, the +// generic FLUXION_* contract from InjectedEnv, the observe flag, handler-supplied +// extraEnv, and the workload's secret/configMap-sourced credentials) and sets the +// sidecar ServiceAccount. observe=true selects observe-only telemetry mode. +func sidecarContainerOps(m webhook.MutatorAPI, pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op { + var ops []spec.Op + env := []corev1.EnvVar{ + spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), + spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), + spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), + spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+webhook.GroupLabel+"']"), + } + env = append(env, m.InjectedEnv()...) + if observe { + env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) + } + env = append(env, extraEnv...) + // Copy the workload container's secret/configMap-sourced env onto the sidecar + // so it can talk to the same backend (domain-agnostic: we propagate whatever + // the workload pulls from a secret/configMap; existing FLUENCE_/FLUXION_ names + // are not overwritten). 
+ if len(pod.Spec.Containers) > 0 { + have := map[string]bool{} + for _, e := range env { + have[e.Name] = true + } + for _, e := range pod.Spec.Containers[0].Env { + if have[e.Name] || e.ValueFrom == nil { + continue + } + if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { + env = append(env, e) + } + } + } + sidecar := corev1.Container{ + Name: SidecarContainerName, Image: sidecarImage(), ImagePullPolicy: corev1.PullAlways, + Env: env, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), corev1.ResourceMemory: resource.MustParse("256Mi"), + }}, + } + if len(pod.Spec.Containers) == 0 { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) + } else { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) + } + if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { + ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) } return ops } diff --git a/pkg/webhook/handlers/quantum_test.go b/pkg/webhook/handlers/quantum_test.go new file mode 100644 index 0000000..10a000d --- /dev/null +++ b/pkg/webhook/handlers/quantum_test.go @@ -0,0 +1,521 @@ +/* +Copyright 2024 Lawrence Livermore National Security, LLC + (c.f. AUTHORS, NOTICE.LLNS, COPYING) +SPDX-License-Identifier: Apache-2.0 +*/ + +// quantum_test.go — all tests for the quantum handler: the producer/consumer +// shared-coordination split (no separate submitter pod), independent mode, +// the coordination role + job-id handoff, the sidecar wiring, the Dependency primitive, and the +// standalone/observe paths. Shared fixtures (qpuPod, cpuPod, op helpers) live in +// handlers_test.go. 
+package handlers + +import ( + "context" + "testing" + + "github.com/converged-computing/fluence/pkg/webhook" + "github.com/converged-computing/fluence/pkg/webhook/spec" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +// ── standalone / observe ──────────────────────────────────────────────────────── + +func TestSingleQuantumGetsInterceptorNoSidecar(t *testing.T) { + m := &webhook.Mutator{AttributeKeys: []string{"region"}} + ops := m.Mutate(context.Background(), qpuPod("fluence")) + names := opEnvNames(ops) + if !contains(names, "FLUXION_BACKEND") { + t.Errorf("want FLUXION_BACKEND, got %v", names) + } + if !contains(names, "PYTHONPATH") || !contains(names, "FLUENCE_POD_UID") { + t.Errorf("want interceptor env (PYTHONPATH, FLUENCE_POD_UID), got %v", names) + } + if hasSidecarOp(ops) { + t.Error("standalone quantum pod should not get a sidecar") + } + if hasGateOp(ops) { + t.Error("standalone quantum pod should not be gated") + } +} + +func TestObserveLabelInjectsSidecar(t *testing.T) { + m := &webhook.Mutator{} + pod := qpuPod("fluence") + pod.Labels = map[string]string{ObserveLabel: "true"} + ops := m.Mutate(context.Background(), pod) + if !hasSidecarOp(ops) { + t.Error("observe-labeled quantum pod should get the sidecar") + } + if hasGateOp(ops) { + t.Error("observe-only pod should not be gated") + } +} + +// ── shared coordination: producer / consumer split ────────────────────────────── + +// sharedQPUPod is a quantum workload pod (requests the resource) in a group, +// owned by a Job of parallelism N, with coordination=shared and a completion +// index. Index "0" is the producer; any other index is a consumer. This is the +// real shape: an indexed Job whose identical template yields differentiated +// roles purely from the completion index. 
+func sharedQPUPod(ns, group, name, job, index string) *corev1.Pod { + p := qpuPod("fluence") + p.Name = name + p.Namespace = ns + p.Labels = map[string]string{webhook.GroupLabel: group} + p.Annotations = map[string]string{ + CoordinationAnnotation: CoordinationShared, + CompletionIndexAnnotation: index, + } + p.OwnerReferences = []metav1.OwnerReference{{Kind: "Job", Name: job}} + return p +} + +// gangQPUPod is a quantum workload pod in a group owned by a Job, with NO +// coordination annotation — i.e. the default (independent) mode. +func gangQPUPod(ns, group, name, job string) *corev1.Pod { + p := qpuPod("fluence") + p.Name = name + p.Namespace = ns + p.Labels = map[string]string{webhook.GroupLabel: group} + p.OwnerReferences = []metav1.OwnerReference{{Kind: "Job", Name: job}} + return p +} + +// mincount returns the gang minCount of the named PodGroup, or ok=false. +func mincount(t *testing.T, cs *fake.Clientset, ns, group string) (int32, bool) { + t.Helper() + pg, err := cs.SchedulingV1alpha2().PodGroups(ns).Get(context.Background(), group, metav1.GetOptions{}) + if err != nil || pg.Spec.SchedulingPolicy.Gang == nil { + return 0, false + } + return pg.Spec.SchedulingPolicy.Gang.MinCount, true +} + +// A shared-mode CONSUMER (completion index != 0, owned by Job parallelism=3) is +// gated, told its role (FLUENCE_COORDINATION_ROLE=consumer), joins the +// consumer gang at minCount N-1 (the split), and gets NO sidecar (it is gated). +// No separate submitter pod is ever created — the producer is one of the N members. 
+func TestSharedConsumerGatedRoleAndSplit(t *testing.T) {
+	ns, group, job := "default", "qg", "qg-job"
+	par := int32(3)
+	cs := fake.NewSimpleClientset(&batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &par, Completions: &par}})
+	m := &webhook.Mutator{Clientset: cs}
+
+	// Completion index "1" (anything other than "0") makes this pod a consumer.
+	ops := m.Mutate(context.Background(), sharedQPUPod(ns, group, "qg-1", job, "1"))
+
+	if !hasGateOp(ops) {
+		t.Error("consumer must be gated")
+	}
+	if hasSidecarOp(ops) {
+		t.Error("consumer (gated) must NOT get a sidecar")
+	}
+	if !hasDropQuantumResourceOp(ops) {
+		t.Error("consumer (gated, never runs the QPU) must have its qpu resource stripped")
+	}
+	if e, ok := envOp(ops, CoordinationRoleEnv); !ok || e.Value != RoleConsumer {
+		t.Errorf("consumer must get %s=%s", CoordinationRoleEnv, RoleConsumer)
+	}
+	// Consumer gang is minCount N-1 (the producer/consumer split).
+	if mc, ok := mincount(t, cs, ns, group); !ok || mc != 2 {
+		t.Errorf("consumer PodGroup minCount=%d (ok=%v), want 2 (N-1 split)", mc, ok)
+	}
+	// No separate submitter pod is created. The List error is checked first: if
+	// it were ignored, a failed call could leave pods nil and reading pods.Items
+	// would panic instead of reporting a test failure.
+	pods, err := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("listing pods in %s: %v", ns, err)
+	}
+	if len(pods.Items) != 0 {
+		t.Errorf("shared mode must NOT spawn a separate submitter pod; found %d pods", len(pods.Items))
+	}
+}
+
+// The shared-mode PRODUCER (completion index 0) is wired as the real coordinator:
+// its own group-of-one (group+ProducerGroupSuffix) at minCount 1, the real
+// sidecar, not gated, role=producer, and told which consumer group to ungate via
+// FLUENCE_GANG_GROUP. It is one of the N members — no extra pod is created.
+func TestSharedProducerWiredAsRealSidecar(t *testing.T) {
+	ns, group, job := "default", "qg2", "qg2-job"
+	par := int32(2)
+	cs := fake.NewSimpleClientset(&batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &par, Completions: &par}})
+	m := &webhook.Mutator{Clientset: cs}
+
+	// Completion index "0" makes this pod the producer.
+	ops := m.Mutate(context.Background(), sharedQPUPod(ns, group, "qg2-0", job, "0"))
+
+	if !hasSidecarOp(ops) {
+		t.Error("producer must get the real sidecar")
+	}
+	if hasGateOp(ops) {
+		t.Error("producer must NOT be gated")
+	}
+	if hasDropQuantumResourceOp(ops) {
+		t.Error("producer must KEEP its qpu resource (it runs the real submit)")
+	}
+	if e, ok := envOp(ops, CoordinationRoleEnv); !ok || e.Value != RoleProducer {
+		t.Errorf("producer must get %s=%s", CoordinationRoleEnv, RoleProducer)
+	}
+	if _, ok := envOp(ops, QuantumJobIDEnv); ok {
+		t.Error("producer must NOT get FLUENCE_QUANTUM_JOB_ID (it submits its own task)")
+	}
+	// FLUENCE_GANG_GROUP (the consumer group to ungate) is on the sidecar.
+	var sidecar *corev1.Container
+	for _, op := range ops {
+		if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName {
+			cc := c
+			sidecar = &cc
+		}
+	}
+	if sidecar == nil {
+		t.Fatal("no sidecar container on producer")
+	}
+	var gotGang bool
+	for _, e := range sidecar.Env {
+		if e.Name == GangGroupEnv && e.Value == group {
+			gotGang = true
+		}
+	}
+	if !gotGang {
+		t.Errorf("producer sidecar must get %s=%q", GangGroupEnv, group)
+	}
+	// Producer is its own group-of-one (minCount 1).
+	if mc, ok := mincount(t, cs, ns, group+ProducerGroupSuffix); !ok || mc != 1 {
+		t.Errorf("producer PodGroup %s minCount=%d (ok=%v), want 1", group+ProducerGroupSuffix, mc, ok)
+	}
+	// No separate submitter pod. The List error is checked first so a failed
+	// call fails the test instead of panicking on a nil list.
+	pods, err := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("listing pods in %s: %v", ns, err)
+	}
+	if len(pods.Items) != 0 {
+		t.Errorf("producer is a member, not a spawned pod; found %d pods", len(pods.Items))
+	}
+}
+
+// Shared mode never creates an extra pod: a full gang (producer index 0 +
+// consumers) is N members, so the application runs exactly N times (not N+1 as
+// the old submitter-pod model did).
+func TestSharedGangNoSeparateSubmitterPod(t *testing.T) {
+	ns, group, job := "default", "qauto", "qauto-job"
+	par := int32(2)
+	cs := fake.NewSimpleClientset(&batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &par, Completions: &par}})
+	m := &webhook.Mutator{Clientset: cs}
+
+	m.Mutate(context.Background(), sharedQPUPod(ns, group, "qauto-0", job, "0")) // producer
+	m.Mutate(context.Background(), sharedQPUPod(ns, group, "qauto-1", job, "1")) // consumer
+
+	// Check the List error before touching pods.Items: an ignored failure could
+	// leave pods nil and turn the assertion into a panic.
+	pods, err := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("listing pods in %s: %v", ns, err)
+	}
+	if len(pods.Items) != 0 {
+		t.Errorf("shared mode must not create any pods (no submitter); found %d", len(pods.Items))
+	}
+	// Both groups exist with the right minCounts.
+	if mc, ok := mincount(t, cs, ns, group+ProducerGroupSuffix); !ok || mc != 1 {
+		t.Errorf("producer group minCount=%d (ok=%v), want 1", mc, ok)
+	}
+	if mc, ok := mincount(t, cs, ns, group); !ok || mc != 1 {
+		t.Errorf("consumer group minCount=%d (ok=%v), want N-1=1", mc, ok)
+	}
+}
+
+// ── independent mode (default) ──────────────────────────────────────────────────
+
+// A grouped quantum pod with no coordination annotation is INDEPENDENT (default):
+// it does its own real submit, is not gated, carries no coordination role, and
+// triggers no group split and no submitter pod.
+func TestIndependentGroupedQuantumIsStandalone(t *testing.T) {
+	ns, group, job := "default", "indep", "indep-job"
+	par := int32(3)
+	cs := fake.NewSimpleClientset(&batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &par, Completions: &par}})
+	m := &webhook.Mutator{Clientset: cs}
+
+	ops := m.Mutate(context.Background(), gangQPUPod(ns, group, "indep-0", job))
+
+	if hasGateOp(ops) {
+		t.Error("independent member must not be gated")
+	}
+	if _, ok := envOp(ops, CoordinationRoleEnv); ok {
+		t.Error("independent member must not get a coordination role env")
+	}
+	if _, ok := mincount(t, cs, ns, group+ProducerGroupSuffix); ok {
+		t.Error("independent mode must not create a producer group")
+	}
+	// Check the List error before touching pods.Items: an ignored failure could
+	// leave pods nil and turn the assertion into a panic.
+	pods, err := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("listing pods in %s: %v", ns, err)
+	}
+	if len(pods.Items) != 0 {
+		t.Error("independent mode must not spawn a submitter pod")
+	}
+}
+
+// A standalone quantum pod (no group, no owner → group of one) does its own real
+// submit: interceptor staged, but no gating, no coordination role, no submitter.
+func TestStandaloneQuantumIsReal(t *testing.T) {
+	ns := "default"
+	cs := fake.NewSimpleClientset()
+	m := &webhook.Mutator{Clientset: cs}
+
+	pod := qpuPod("fluence")
+	pod.Name = "solo"
+	pod.Namespace = ns
+
+	ops := m.Mutate(context.Background(), pod)
+	if hasGateOp(ops) {
+		t.Error("standalone quantum pod must not be gated")
+	}
+	if _, ok := envOp(ops, CoordinationRoleEnv); ok {
+		t.Error("standalone quantum pod must not get a coordination role env")
+	}
+	// Same guard as above: fail cleanly on a List error rather than panic.
+	pods, err := cs.CoreV1().Pods(ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("listing pods in %s: %v", ns, err)
+	}
+	if len(pods.Items) != 0 {
+		t.Error("standalone quantum pod must not spawn a submitter")
+	}
+}
+
+// Even with coordination=shared, a group of one (Job parallelism 1) has no
+// consumers to coordinate, so it falls through to the standalone real-submit path.
+func TestSharedGroupOfOneIsStandalone(t *testing.T) {
+	ns, group, job := "default", "one", "one-job"
+	single := int32(1)
+	owner := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &single, Completions: &single},
+	}
+	cs := fake.NewSimpleClientset(owner)
+	mutator := &webhook.Mutator{Clientset: cs}
+
+	ops := mutator.Mutate(context.Background(), sharedQPUPod(ns, group, "one-0", job, "0"))
+
+	if hasGateOp(ops) {
+		t.Error("shared group-of-one must not be gated")
+	}
+	if _, exists := mincount(t, cs, ns, group+ProducerGroupSuffix); exists {
+		t.Error("shared group-of-one must not create a producer group")
+	}
+}
+
+// ── role + dependency ────────────────────────────────────────────────────
+
+// envOp scans the ops for an env var called name and returns the first match.
+// An op value may be a single corev1.EnvVar or a []corev1.EnvVar; any other
+// value type is ignored.
+func envOp(ops []spec.Op, name string) (corev1.EnvVar, bool) {
+	for _, op := range ops {
+		// Normalize both op shapes to a slice so one search loop serves both.
+		var candidates []corev1.EnvVar
+		switch v := op.Value.(type) {
+		case corev1.EnvVar:
+			candidates = []corev1.EnvVar{v}
+		case []corev1.EnvVar:
+			candidates = v
+		}
+		for _, env := range candidates {
+			if env.Name == name {
+				return env, true
+			}
+		}
+	}
+	return corev1.EnvVar{}, false
+}
+
+// annotationOps gathers every annotation key=value pair the ops would stamp,
+// later ops overwriting earlier ones for the same key.
+func annotationOps(ops []spec.Op) map[string]string {
+	const (
+		wholeMapPath = "/metadata/annotations"
+		keyPrefix    = "/metadata/annotations/"
+	)
+	stamped := map[string]string{}
+	for _, op := range ops {
+		switch {
+		case op.Path == wholeMapPath:
+			// whole-map add: the value is the complete annotations map
+			if all, ok := op.Value.(map[string]string); ok {
+				for k, v := range all {
+					stamped[k] = v
+				}
+			}
+		case len(op.Path) > len(keyPrefix) && op.Path[:len(keyPrefix)] == keyPrefix:
+			// single-key add: /metadata/annotations/<escaped key> -> string value
+			if val, ok := op.Value.(string); ok {
+				stamped[unescapeJSONPointer(op.Path[len(keyPrefix):])] = val
+			}
+		}
+	}
+	return stamped
+}
+
+// unescapeJSONPointer reverses escapeJSONPointer for assertion readability.
+func unescapeJSONPointer(s string) string {
+	// Single left-to-right pass: "~1" decodes to "/" and "~0" to "~". Each
+	// escape is consumed as a unit, so "~01" yields "~1" (never "/").
+	//
+	// Bytes are copied into a byte buffer rather than appended via
+	// string(s[i]): converting a byte to string treats it as a code point and
+	// re-encodes values >= 0x80, which would corrupt multi-byte UTF-8 input.
+	out := make([]byte, 0, len(s))
+	for i := 0; i < len(s); i++ {
+		if s[i] == '~' && i+1 < len(s) {
+			switch s[i+1] {
+			case '1':
+				out = append(out, '/')
+				i++
+				continue
+			case '0':
+				out = append(out, '~')
+				i++
+				continue
+			}
+		}
+		// Anything else (including a lone or trailing '~') is copied verbatim.
+		out = append(out, s[i])
+	}
+	return string(out)
+}
+
+// A shared-mode consumer is expressed as a general Dependency: gated, stamped
+// with depends-on-{kind,producer,gate}, and the producer it waits on is the
+// group+ProducerGroupSuffix group.
+func TestQuantumConsumerIsGeneralDependency(t *testing.T) {
+	ns, group, job := "default", "depq", "depq-job"
+	par := int32(3)
+	cs := fake.NewSimpleClientset(&batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &par, Completions: &par}})
+	m := &webhook.Mutator{Clientset: cs}
+
+	ops := m.Mutate(context.Background(), sharedQPUPod(ns, group, "depq-1", job, "1"))
+
+	if !hasGateOp(ops) {
+		t.Errorf("consumer not gated by the dependency (ops: %+v)", ops)
+	}
+	ann := annotationOps(ops)
+	if ann[DependsOnKindAnnotation] != DependencyKindQuantumSubmit {
+		t.Errorf("depends-on-kind=%q, want %q", ann[DependsOnKindAnnotation], DependencyKindQuantumSubmit)
+	}
+	if ann[DependsOnProducerAnnotation] != group+ProducerGroupSuffix {
+		t.Errorf("depends-on-producer=%q, want %q (the producer group)", ann[DependsOnProducerAnnotation], group+ProducerGroupSuffix)
+	}
+	if ann[DependsOnGateAnnotation] != QuantumGate {
+		t.Errorf("depends-on-gate=%q, want %q", ann[DependsOnGateAnnotation], QuantumGate)
+	}
+}
+
+// DependencyOf round-trips the stamped annotations back into a Dependency, so a
+// scheduler/sidecar observer can read what a gated pod waits for.
+func TestDependencyOfRoundTrip(t *testing.T) {
+	stamped := map[string]string{
+		DependsOnKindAnnotation:     DependencyKindQuantumSubmit,
+		DependsOnProducerAnnotation: "grp",
+		DependsOnGateAnnotation:     QuantumGate,
+	}
+	pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Annotations: stamped}}
+
+	dep, found := DependencyOf(pod)
+	roundTripped := found &&
+		dep.Kind == DependencyKindQuantumSubmit &&
+		dep.Producer == "grp" &&
+		dep.Gate == QuantumGate
+	if !roundTripped {
+		t.Errorf("DependencyOf=%+v ok=%v, want quantum-submit/grp/%s", dep, found, QuantumGate)
+	}
+
+	// A pod with no annotations at all carries no dependency.
+	if _, found := DependencyOf(&corev1.Pod{}); found {
+		t.Errorf("DependencyOf on a pod with no dependency should be ok=false")
+	}
+}
+
+// The consumer is role-aware: it gets FLUENCE_COORDINATION_ROLE=consumer and the
+// producer's task id via the FLUENCE_QUANTUM_JOB_ID downward-API env, and it is
+// NOT staged with the interceptor (a consumer never submits, so it needs neither
+// the interceptor nor any faux flag). The user's script branches on the role.
+func TestQuantumConsumerStagedWithRole(t *testing.T) {
+	ns, group, job := "default", "roleq", "roleq-job"
+	members := int32(2)
+	owner := &batchv1.Job{
+		ObjectMeta: metav1.ObjectMeta{Name: job, Namespace: ns},
+		Spec:       batchv1.JobSpec{Parallelism: &members, Completions: &members},
+	}
+	mutator := &webhook.Mutator{Clientset: fake.NewSimpleClientset(owner)}
+
+	ops := mutator.Mutate(context.Background(), sharedQPUPod(ns, group, "roleq-1", job, "1"))
+
+	// Role surfaced to the container.
+	role, hasRole := envOp(ops, CoordinationRoleEnv)
+	if !hasRole || role.Value != RoleConsumer {
+		t.Errorf("consumer missing %s=%s (got %+v, ok=%v)", CoordinationRoleEnv, RoleConsumer, role, hasRole)
+	}
+
+	// A consumer never submits, so it is NOT staged with the interceptor.
+	if _, staged := envOp(ops, "PYTHONPATH"); staged {
+		t.Error("consumer must NOT be staged with the interceptor (it does not submit)")
+	}
+
+	// Producer's task id sourced from the annotation the ungating sidecar stamps.
+	jobID, hasJobID := envOp(ops, QuantumJobIDEnv)
+	if !hasJobID {
+		t.Fatalf("consumer missing %s env", QuantumJobIDEnv)
+	}
+	wantPath := "metadata.annotations['" + QuantumJobIDAnnotation + "']"
+	src := jobID.ValueFrom
+	if src == nil || src.FieldRef == nil || src.FieldRef.FieldPath != wantPath {
+		t.Errorf("%s should be a downward-API ref to %s, got %+v", QuantumJobIDEnv, QuantumJobIDAnnotation, jobID)
+	}
+}
+
+// Classical override below the replica count: group-size=2 on a gang owned by a
+// Job(parallelism=5) must yield minCount=2 (the override), not 5. With a cluster
+// sized to 2, the gang reaches quorum and runs; if the override were dropped the
+// gang would wait forever for 5 (the e2e hang that fails CI).
+func TestClassicalOverrideBelowReplicaCount(t *testing.T) {
+	ns, group, job := "default", "ovr2", "ovr2-job"
+
+	member := cpuPod("fluence")
+	member.Namespace = ns
+	member.Labels = map[string]string{webhook.GroupLabel: group}
+	member.Annotations = map[string]string{webhook.GroupSizeAnnotation: "2"}
+	ownedBy(member, "Job", job)
+
+	if got := minCountWithClient(t, member, jobWithParallelism(ns, job, 5)); got != 2 {
+		t.Errorf("override below replicas: minCount=%d, want 2 (override wins over Job=5)", got)
+	}
+}
+
+// ── sidecar wiring ──────────────────────────────────────────────────────────────
+
+// The sidecar inherits the workload's secret/configMap-sourced credentials so it
+// can talk to the same backend, but NOT plain-value env. (Moved from the core
+// webhook package: sidecar construction is now quantum-owned.)
+func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) {
+	m := &webhook.Mutator{Clientset: fake.NewSimpleClientset()}
+	secretRef := &corev1.EnvVarSource{
+		SecretKeyRef: &corev1.SecretKeySelector{
+			LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"},
+			Key:                  "AWS_ACCESS_KEY_ID",
+		}}
+	pod := &corev1.Pod{Spec: corev1.PodSpec{Containers: []corev1.Container{{
+		Name: "app",
+		Env: []corev1.EnvVar{
+			{Name: "PLAIN_VALUE", Value: "x"},                 // plain value: NOT copied
+			{Name: "AWS_ACCESS_KEY_ID", ValueFrom: secretRef}, // secret-sourced: copied
+		},
+	}}}}
+
+	// Pick the sidecar container out of the generated ops.
+	var sidecar *corev1.Container
+	for _, op := range sidecarContainerOps(m, pod, false, nil) {
+		if c, ok := op.Value.(corev1.Container); ok && c.Name == SidecarContainerName {
+			found := c
+			sidecar = &found
+		}
+	}
+	if sidecar == nil {
+		t.Fatal("no sidecar container added")
+	}
+
+	var gotSecret, gotPlain bool
+	for _, e := range sidecar.Env {
+		switch e.Name {
+		case "AWS_ACCESS_KEY_ID":
+			if e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil {
+				gotSecret = true
+			}
+		case "PLAIN_VALUE":
+			gotPlain = true
+		}
+	}
+	if !gotSecret {
+		t.Error("sidecar should inherit the workload's secret-sourced AWS creds")
+	}
+	if gotPlain {
+		t.Error("sidecar should NOT copy plain-value workload env")
+	}
+}
+
+// The producer member of a shared gang requests the quantum resource (it runs the
+// real submit). Sanity check that the helper builds a quantum pod.
+func TestSharedProducerRequestsQuantumResource(t *testing.T) {
+	producer := sharedQPUPod("default", "g", "g-0", "g-job", "0")
+	if !spec.PodRequestsResource(producer, QuantumResource) {
+		t.Error("producer must request the quantum resource (it runs the real submit)")
+	}
+}
diff --git a/pkg/webhook/handlers/registry_test.go b/pkg/webhook/handlers/registry_test.go
new file mode 100644
index 0000000..346d786
--- /dev/null
+++ b/pkg/webhook/handlers/registry_test.go
@@ -0,0 +1,82 @@
+/*
+Copyright 2024 Lawrence Livermore National Security, LLC
+ (c.f. AUTHORS, NOTICE.LLNS, COPYING)
+SPDX-License-Identifier: Apache-2.0
+*/
+
+// Registry behavior: dispatch order comes from the active handler list (not a
+// per-handler Order), and the list both selects and orders handlers.
+package handlers
+
+import (
+	"context"
+	"testing"
+
+	"github.com/converged-computing/fluence/pkg/webhook"
+	"github.com/converged-computing/fluence/pkg/webhook/spec"
+
+	"k8s.io/client-go/kubernetes/fake"
+)
+
+// The default active order ships gang LAST so it only applies default gang
+// sizing when no earlier handler shaped the gang.
+func TestDefaultOrderGangLast(t *testing.T) {
+	defer webhook.SetActiveHandlers(nil)
+	order, _ := webhook.SetActiveHandlers(nil) // nil restores and returns the default
+	if len(order) == 0 {
+		t.Fatal("no active handlers")
+	}
+	if last := order[len(order)-1]; last != "gang" {
+		t.Errorf("gang must be last in default order; got %v", order)
+	}
+	// the default order must be exactly fluxion, quantum, gang
+	expected := []string{"fluxion", "quantum", "gang"}
+	if len(order) != len(expected) {
+		t.Fatalf("default order = %v, want %v", order, expected)
+	}
+	for i, name := range expected {
+		if order[i] != name {
+			t.Errorf("default order = %v, want %v", order, expected)
+			break
+		}
+	}
+}
+
+// The active list IS the order: passing a custom order reorders dispatch, and
+// unknown names are reported, not silently kept.
+func TestActiveListSetsOrderAndReportsUnknown(t *testing.T) {
+	defer webhook.SetActiveHandlers(nil)
+	kept, rejected := webhook.SetActiveHandlers([]string{"gang", "fluxion", "bogus"})
+	if !(len(kept) == 2 && kept[0] == "gang" && kept[1] == "fluxion") {
+		t.Errorf("active = %v, want [gang fluxion] in that order", kept)
+	}
+	if !(len(rejected) == 1 && rejected[0] == "bogus") {
+		t.Errorf("unknown = %v, want [bogus]", rejected)
+	}
+}
+
+// Dropping a handler from the list disables it: a quantum pod with quantum
+// omitted gets no interceptor ops (only fluxion/gang act).
+func TestOmittedHandlerDoesNotDispatch(t *testing.T) {
+	defer webhook.SetActiveHandlers(nil)
+	ctx, mutator := context.Background(), &webhook.Mutator{Clientset: fake.NewSimpleClientset()}
+
+	webhook.SetActiveHandlers(nil) // default: quantum present
+	if withQuantum := mutator.Mutate(ctx, qpuPod("fluence")); !hasInterceptor(withQuantum) {
+		t.Fatal("with quantum active, expected interceptor (init container) ops")
+	}
+
+	webhook.SetActiveHandlers([]string{"fluxion", "gang"}) // quantum omitted
+	if withoutQuantum := mutator.Mutate(ctx, qpuPod("fluence")); hasInterceptor(withoutQuantum) {
+		t.Error("with quantum omitted, interceptor ops must NOT be present")
+	}
+}
+
+func hasInterceptor(ops []spec.Op) bool {
+	for i := range ops {
+		switch ops[i].Path {
+		case "/spec/initContainers", "/spec/initContainers/-":
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/webhook/handlers/sidecar.go b/pkg/webhook/handlers/sidecar.go
new file mode 100644
index 0000000..d105a7c
--- /dev/null
+++ b/pkg/webhook/handlers/sidecar.go
@@ -0,0 +1,57 @@
+package handlers
+
+import (
+	"context"
+
+	"github.com/converged-computing/fluence/pkg/webhook"
+	"github.com/converged-computing/fluence/pkg/webhook/spec"
+
+	corev1 "k8s.io/api/core/v1"
+)
+
+// Sidecar is the capability a handler uses to attach a coordination sidecar to a
+// pod. It is NOT part of the webhook core's MutatorAPI: only handlers that need
+// a sidecar (today, quantum) depend on it, and a handler may supply its own
+// implementation to customize delivery. The default implementation
+// (coreSidecar) delegates to this package's own helpers (ensureSidecarRBAC,
+// interceptorOps, sidecarContainerOps) rather than to the webhook core.
+//
+// The interface is the extension seam: one general sidecar contract usable by
+// any handler and customizable per handler. A future custom-resource handler
+// can implement Sidecar differently (different image, env, gating) without
+// touching the core or other handlers.
+type Sidecar interface {
+	// EnsureRBAC provisions the per-namespace ServiceAccount/Role/Binding the
+	// sidecar needs to read/patch pods and podgroups.
+	EnsureRBAC(ctx context.Context, namespace string)
+	// InterceptorOps stages the in-pod interceptor (Model C) into the workload
+	// containers (init container + shared volume on PYTHONPATH).
+	InterceptorOps(pod *corev1.Pod) []spec.Op
+	// ContainerOps adds the sidecar container. observe=true selects observe-only
+	// telemetry mode (no ungating). extraEnv carries handler-computed,
+	// domain-specific env (e.g. the quantum handler's FLUENCE_EXPECTED_WORKERS =
+	// N-1 and FLUENCE_WORKER_GROUP_BASE) so the core never has to know about
+	// leader/worker concepts — the handler that owns the split owns those values.
+	ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op
+}
+
+// coreSidecar is the default Sidecar. It delegates to the quantum-owned sidecar
+// implementation (see sidecar_impl.go), which uses only the generic MutatorAPI
+// (Client, InjectedEnv). The webhook core no longer carries any sidecar logic; a
+// custom handler could supply its own Sidecar with a different container/image.
+type coreSidecar struct{ m webhook.MutatorAPI } // m is passed to the RBAC and container helpers
+
+func (s coreSidecar) EnsureRBAC(ctx context.Context, namespace string) {
+	ensureSidecarRBAC(ctx, s.m, namespace) // no return value: this method reports nothing to the caller
+}
+func (s coreSidecar) InterceptorOps(pod *corev1.Pod) []spec.Op {
+	return interceptorOps(pod) // receiver unused: the helper takes only the pod
+}
+func (s coreSidecar) ContainerOps(pod *corev1.Pod, observe bool, extraEnv []corev1.EnvVar) []spec.Op {
+	return sidecarContainerOps(s.m, pod, observe, extraEnv) // observe and extraEnv forwarded unchanged
+}
+
+// sidecarFor returns the Sidecar a handler should use. Centralized so the choice
+// of implementation (and any future per-handler customization) lives in one
+// place. Today every sidecar-using handler gets the coreSidecar default.
+func sidecarFor(m webhook.MutatorAPI) Sidecar { return coreSidecar{m: m} } diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 20a7288..b39bec1 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -1,11 +1,11 @@ // Package webhook is fluence's mutating admission webhook. // // The core here is domain-agnostic plumbing: it owns the Mutator, the handler -// dispatcher, per-namespace PodGroup/RBAC provisioning, the Model C package -// staging (init container + shared volume on PYTHONPATH), the HTTP entrypoint, -// and self-managed TLS. It knows nothing about quantum, Braket, gate names, or -// observe labels — that policy lives entirely in the handlers (pkg/webhook/ -// handlers), which self-register via Register(). +// dispatcher, per-namespace PodGroup provisioning, the HTTP entrypoint, and +// self-managed TLS. It knows nothing about quantum, Braket, gate names, sidecars, +// RBAC, or interceptor staging — that policy and machinery lives entirely in the +// handlers (pkg/webhook/handlers), which self-register via Register() and perform +// their own create/edit side-effects through the generic MutatorAPI. // // The webhook self-manages TLS via a self-signed CA patched into the // MutatingWebhookConfiguration caBundle at startup. @@ -32,9 +32,7 @@ import ( admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" - rbacv1 "k8s.io/api/rbac/v1" schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -52,34 +50,12 @@ const ( // meaning to it (a handler decides what a group means). GroupLabel = "fluence.flux-framework.org/group" - // LeaderAnnotation records the admission-order leader on a PodGroup. - LeaderAnnotation = "fluence.flux-framework.org/leader" - - // RoleAnnotation, set by the workload on each pod, explicitly declares the - // pod's gang role ("leader" or "worker"). 
When present it is AUTHORITATIVE: - // the quantum handler gates workers and gives the leader the sidecar based - // on this value, instead of inferring the leader by admission order. The - // same value is injected into the container env as FLUENCE_ROLE so the - // application reads its role from the same source of truth Fluence used. - // When absent, role falls back to admission order (backwards compatible). - RoleAnnotation = "fluence.flux-framework.org/role" - - // ExpectedWorkersAnnotation, set by the workload on the leader pod, tells the - // sidecar how many gated workers to wait for before ungating. The count is - // known at admission (the workload declares it) even though worker names are - // not, so it travels as a static sidecar env var. The core treats it as an - // opaque string and ascribes no meaning to it beyond propagation. - ExpectedWorkersAnnotation = "fluence.flux-framework.org/expected-workers" - - // Sidecar/staging infrastructure (generic — not quantum-specific). - SidecarImage = "ghcr.io/converged-computing/fluence-sidecar:latest" - SidecarServiceAccount = "fluence-sidecar" - - // StageVolumeName / StageMountPath: the shared emptyDir the init container - // stages the fluence Python package into, mounted into the user container and - // prepended to PYTHONPATH (Model C delivery). - StageVolumeName = "fluence-pkg" - StageMountPath = "/opt/fluence-staged" + // GroupSizeAnnotation is the gang member count N, set by the workload on each + // pod. It is the authoritative override for the PodGroup gang minCount when + // the size cannot (or should not) be derived from the owning controller — and + // for loose grouped pods where counting at admission is unreliable. The core + // treats it as an opaque integer string. 
+ GroupSizeAnnotation = "fluence.flux-framework.org/group-size" ) // ── Mutator ───────────────────────────────────────────────────────────────────── @@ -87,31 +63,14 @@ const ( type Mutator struct { AttributeKeys []string Clientset kubernetes.Interface - SidecarImage string } // compile-time check that *Mutator satisfies the handler capability interface. var _ MutatorAPI = (*Mutator)(nil) -func (m *Mutator) sidecarImage() string { - if m.SidecarImage != "" { - return m.SidecarImage - } - return SidecarImage -} - // GroupName returns the value of GroupLabel on the pod, or "". func GroupName(pod *corev1.Pod) string { return spec.Label(pod, GroupLabel) } -// Role returns the explicit gang role declared on the pod via RoleAnnotation -// ("leader"/"worker"), or "" if unset (caller falls back to admission order). -func Role(pod *corev1.Pod) string { return spec.Annotation(pod, RoleAnnotation) } - -func resourceQuantity(s string) *resource.Quantity { - q := resource.MustParse(s) - return &q -} - // ── MutatorAPI: capabilities exposed to handlers ──────────────────────────────── // Client implements MutatorAPI: returns the Kubernetes client (nil in tests). @@ -138,29 +97,13 @@ func (m *Mutator) EnvVarNames() []string { return names } -// PodGroupLeader returns the recorded admission-order leader for the group, or -// "". Retries briefly to absorb the concurrent leader/worker admission race. 
-func (m *Mutator) PodGroupLeader(ctx context.Context, namespace, group string) string { - if m.Clientset == nil || group == "" { - return "" - } - for i := 0; i < 3; i++ { - pg, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Get(ctx, group, metav1.GetOptions{}) - if err != nil { - return "" - } - if pg.Annotations != nil && pg.Annotations[LeaderAnnotation] != "" { - return pg.Annotations[LeaderAnnotation] - } - if i < 2 { - time.Sleep(100 * time.Millisecond) - } +// EnsurePodGroup creates a Fluence-owned PodGroup with gang minCount = the full +// gang size N (the whole group schedules atomically) if absent. minCount<=0 +// falls back to 1. +func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string, minCount int32) { + if minCount <= 0 { + minCount = 1 } - return "" -} - -// EnsurePodGroup creates a Fluence-owned PodGroup (minCount:1) if absent. -func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPod string) { if m.Clientset == nil { return } @@ -179,205 +122,17 @@ func (m *Mutator) EnsurePodGroup(ctx context.Context, namespace, group, leaderPo }, Spec: schedulingv1alpha2.PodGroupSpec{ SchedulingPolicy: schedulingv1alpha2.PodGroupSchedulingPolicy{ - Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: 1}, + Gang: &schedulingv1alpha2.GangSchedulingPolicy{MinCount: minCount}, }, }, } if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Create(ctx, pg, metav1.CreateOptions{}); err != nil { log.Printf("[fluence-webhook] could not create PodGroup %s/%s: %v", namespace, group, err) } else { - log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=1)", namespace, group) - } -} - -// RecordLeader records leaderPod as the group's admission-order leader. 
-func (m *Mutator) RecordLeader(ctx context.Context, namespace, group, leaderPod string) { - if m.Clientset == nil || group == "" { - return - } - patch := fmt.Sprintf(`{"metadata":{"annotations":{%q:%q}}}`, LeaderAnnotation, leaderPod) - if _, err := m.Clientset.SchedulingV1alpha2().PodGroups(namespace).Patch( - ctx, group, types.MergePatchType, []byte(patch), metav1.PatchOptions{}); err != nil { - log.Printf("[fluence-webhook] could not record leader on PodGroup %s/%s: %v", namespace, group, err) - } -} - -// EnsureSidecarRBAC provisions the per-namespace ServiceAccount/Role/RoleBinding -// the sidecar uses to patch pods and read PodGroups. -func (m *Mutator) EnsureSidecarRBAC(ctx context.Context, namespace string) { - if m.Clientset == nil { - return - } - lbl := map[string]string{"app": "fluence-sidecar"} - - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}} - if _, err := m.Clientset.CoreV1().ServiceAccounts(namespace).Create(ctx, sa, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create ServiceAccount %s/%s: %v", namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - role := &rbacv1.Role{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Rules: []rbacv1.PolicyRule{ - {APIGroups: []string{""}, Resources: []string{"pods"}, Verbs: []string{"get", "list", "patch", "update"}}, - {APIGroups: []string{"scheduling.k8s.io"}, Resources: []string{"podgroups"}, Verbs: []string{"get", "list"}}, - }, - } - if _, err := m.Clientset.RbacV1().Roles(namespace).Create(ctx, role, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create Role %s/%s: %v", 
namespace, SidecarServiceAccount, err) - } - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Get(ctx, SidecarServiceAccount, metav1.GetOptions{}); err != nil { - rb := &rbacv1.RoleBinding{ - ObjectMeta: metav1.ObjectMeta{Name: SidecarServiceAccount, Namespace: namespace, Labels: lbl}, - Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: SidecarServiceAccount, Namespace: namespace}}, - RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: SidecarServiceAccount}, - } - if _, err := m.Clientset.RbacV1().RoleBindings(namespace).Create(ctx, rb, metav1.CreateOptions{}); err != nil { - log.Printf("[fluence-webhook] could not create RoleBinding %s/%s: %v", namespace, SidecarServiceAccount, err) - } + log.Printf("[fluence-webhook] created PodGroup %s/%s (minCount=%d)", namespace, group, minCount) } } -// InterceptorOps implements Model C delivery. It injects an init container (the -// sidecar image) that stages the fluence Python package into a shared emptyDir, -// mounts that volume into every Fluxion-resource container, and prepends it to -// PYTHONPATH plus sets FLUENCE_POD_UID. Python auto-imports the staged -// sitecustomize on startup, which runs the interceptor — no user code changes, -// no PYTHONSTARTUP (which only fires interactively), no vendor SDK on our side. -func (m *Mutator) InterceptorOps(pod *corev1.Pod) []spec.Op { - var ops []spec.Op - - // Shared volume. - vol := corev1.Volume{Name: StageVolumeName, VolumeSource: corev1.VolumeSource{EmptyDir: &corev1.EmptyDirVolumeSource{}}} - if len(pod.Spec.Volumes) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes", Value: []corev1.Volume{vol}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/volumes/-", Value: vol}) - } - - // Init container that stages the package into the shared volume. - // - // Fail-soft: the interceptor is best-effort, so its delivery must be too. 
We - // wrap the stage command so a failure (bad image, missing python, package - // problem) leaves the shared volume empty and exits 0 rather than blocking - // the user's pod with Init:Error. An empty staged dir simply means the - // interceptor does not run — the user application is unaffected. (This also - // lets CI use a minimal placeholder sidecar image for placement-only tests.) - initc := corev1.Container{ - Name: "fluence-stage", - Image: m.sidecarImage(), - ImagePullPolicy: corev1.PullAlways, - Command: []string{"sh", "-c", - fmt.Sprintf("python -m fluence.stage %s || echo '[fluence] staging skipped (interceptor unavailable)'", StageMountPath)}, - VolumeMounts: []corev1.VolumeMount{{Name: StageVolumeName, MountPath: StageMountPath}}, - } - if len(pod.Spec.InitContainers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers", Value: []corev1.Container{initc}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/initContainers/-", Value: initc}) - } - - // Mount the staged volume + set PYTHONPATH and FLUENCE_POD_UID on each - // Fluxion-resource container. 
- mount := corev1.VolumeMount{Name: StageVolumeName, MountPath: StageMountPath, ReadOnly: true} - pythonpath := corev1.EnvVar{Name: "PYTHONPATH", Value: StageMountPath} - uid := spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid") - for i, c := range pod.Spec.Containers { - if !spec.RequestsFluxionResource(c) { - continue - } - if len(c.VolumeMounts) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts", i), Value: []corev1.VolumeMount{mount}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/volumeMounts/-", i), Value: mount}) - } - if !spec.HasEnv(c, "PYTHONPATH") { - if len(c.Env) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env", i), Value: []corev1.EnvVar{pythonpath}}) - pod.Spec.Containers[i].Env = []corev1.EnvVar{pythonpath} - } else { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: pythonpath}) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, pythonpath) - } - } - if !spec.HasEnv(c, "FLUENCE_POD_UID") { - ops = append(ops, spec.Op{Op: "add", Path: fmt.Sprintf("/spec/containers/%d/env/-", i), Value: uid}) - pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, uid) - } - } - return ops -} - -// SidecarContainerOps adds the fluence-sidecar container and sets its -// ServiceAccount. observe=true selects observe-only telemetry mode. -func (m *Mutator) SidecarContainerOps(pod *corev1.Pod, observe bool) []spec.Op { - var ops []spec.Op - // The sidecar resolves its vendor provider at runtime from the backend the - // scheduler chose. It gets the same FLUXION_* contract as the workload - // containers (FLUXION_BACKEND + attribute vars like FLUXION_VENDOR), sourced - // via the downward API from the scheduler's annotations — so the values - // resolve once the scheduler writes them, after admission. 
- env := []corev1.EnvVar{ - spec.FieldEnv("FLUENCE_POD_UID", "metadata.uid"), - spec.FieldEnv("FLUENCE_POD_NAME", "metadata.name"), - spec.FieldEnv("FLUENCE_NAMESPACE", "metadata.namespace"), - spec.FieldEnv("FLUENCE_GROUP", "metadata.labels['"+GroupLabel+"']"), - } - env = append(env, m.InjectedEnv()...) - if observe { - env = append(env, corev1.EnvVar{Name: "FLUENCE_OBSERVE", Value: "true"}) - } - // The gang size is known at admission (the leader carries it), even though - // the worker NAMES are not yet. Propagate the expected worker count to the - // sidecar as a static env var so it can wait until it has discovered that - // many gated workers before ungating, rather than ungating a partial set. - // Read from a generic annotation so the core stays domain-agnostic; the - // workload manifest sets it (e.g. from its own N_WORKERS). - if pod.Annotations != nil { - if n := pod.Annotations[ExpectedWorkersAnnotation]; n != "" { - env = append(env, corev1.EnvVar{Name: "FLUENCE_EXPECTED_WORKERS", Value: n}) - } - } - // The sidecar talks to the same backend the workload does (e.g. to find the - // task and read its queue position), so it needs the same credentials. Copy - // the workload container's secret/configmap-sourced env onto the sidecar. - // This stays domain-agnostic: we don't know or name the provider's creds, we - // just propagate whatever the workload pulls from a secret/configMap (e.g. - // AWS_*, IBM tokens). Existing FLUENCE_/FLUXION_ names are not overwritten. 
- if len(pod.Spec.Containers) > 0 { - have := map[string]bool{} - for _, e := range env { - have[e.Name] = true - } - for _, e := range pod.Spec.Containers[0].Env { - if have[e.Name] || e.ValueFrom == nil { - continue - } - if e.ValueFrom.SecretKeyRef != nil || e.ValueFrom.ConfigMapKeyRef != nil { - env = append(env, e) - } - } - } - sidecar := corev1.Container{ - Name: "fluence-sidecar", Image: m.sidecarImage(), ImagePullPolicy: corev1.PullAlways, - Env: env, - Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{ - corev1.ResourceCPU: *resourceQuantity("100m"), corev1.ResourceMemory: *resourceQuantity("256Mi"), - }}, - } - if len(pod.Spec.Containers) == 0 { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers", Value: []corev1.Container{sidecar}}) - } else { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/containers/-", Value: sidecar}) - } - if pod.Spec.ServiceAccountName == "" || pod.Spec.ServiceAccountName == "default" { - ops = append(ops, spec.Op{Op: "add", Path: "/spec/serviceAccountName", Value: SidecarServiceAccount}) - } - return ops -} - // ── Dispatcher ────────────────────────────────────────────────────────────────── // Mutate dispatches the pod to every registered handler and concatenates the diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 26983d4..9af6c9c 100644 --- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -2,8 +2,6 @@ package webhook import ( "testing" - - corev1 "k8s.io/api/core/v1" ) // EnvVarNames returns the FLUXION_* contract names (used by the scheduler plugin @@ -22,47 +20,3 @@ func TestEnvVarNames(t *testing.T) { } } } - -func TestSidecarInheritsWorkloadSecretEnv(t *testing.T) { - m := &Mutator{} - pod := &corev1.Pod{ - Spec: corev1.PodSpec{ - Containers: []corev1.Container{{ - Name: "gang", - Env: []corev1.EnvVar{ - {Name: "GANG_ROLE", Value: "leader"}, // plain value: NOT copied - {Name: "AWS_ACCESS_KEY_ID", ValueFrom: &corev1.EnvVarSource{ - 
SecretKeyRef: &corev1.SecretKeySelector{ - LocalObjectReference: corev1.LocalObjectReference{Name: "aws-braket-credentials"}, - Key: "AWS_ACCESS_KEY_ID", - }}}, - }, - }}, - }, - } - ops := m.SidecarContainerOps(pod, false) - var sidecar *corev1.Container - for _, op := range ops { - if c, ok := op.Value.(corev1.Container); ok && c.Name == "fluence-sidecar" { - sidecar = &c - } - } - if sidecar == nil { - t.Fatal("no sidecar container added") - } - var gotSecret, gotPlain bool - for _, e := range sidecar.Env { - if e.Name == "AWS_ACCESS_KEY_ID" && e.ValueFrom != nil && e.ValueFrom.SecretKeyRef != nil { - gotSecret = true - } - if e.Name == "GANG_ROLE" { - gotPlain = true - } - } - if !gotSecret { - t.Error("sidecar should inherit the workload's secret-sourced AWS creds") - } - if gotPlain { - t.Error("sidecar should NOT copy plain-value workload env like GANG_ROLE") - } -} diff --git a/python/Dockerfile b/python/Dockerfile index 5cff209..03bf153 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -1,14 +1,6 @@ # Fluence quantum coordination sidecar image. -# -# Bakes the `fluence` Python package in, so the SAME image serves three roles -# (versions locked together — they are built from this one source tree): -# 1. sidecar container — runs `fluence-sidecar` (the coordination loop) -# 2. init container — runs `python -m fluence.stage ` to copy the -# pure-Python package + sitecustomize into a shared -# volume that the webhook mounts onto the user -# container's PYTHONPATH (Model C delivery) -# 3. (the staged copy) — the user container imports the staged package via -# sitecustomize; no install required in the user image +# TODO organize into subdirectories when we have >1 image +#sitecustomize; no install required in the user image FROM python:3.11-slim LABEL org.opencontainers.image.source="https://github.com/converged-computing/fluence" @@ -27,11 +19,8 @@ COPY . 
/app # Install the package with the vendor SDKs the SIDECAR needs for its own API # calls (task discovery / queue polling). The interceptor staged into the user # container carries NONE of these — it patches whatever SDK the user already has. -RUN pip install --no-cache-dir ".[all]" +RUN pip install --no-cache-dir ".[all]" && ln -s $(which python3) /usr/bin/python -ENV FLUENCE_TASK_DISCOVERY_TIMEOUT=300 +ENV FLUENCE_TASK_DISCOVERY_TIMEOUT=300000 ENV FLUENCE_POLL_INTERVAL=30 - -# Default entrypoint is the sidecar loop; the init container overrides the -# command with `python -m fluence.stage `. CMD ["fluence-sidecar"] diff --git a/python/fluence/providers/base.py b/python/fluence/providers/base.py index dca4429..561bca2 100644 --- a/python/fluence/providers/base.py +++ b/python/fluence/providers/base.py @@ -80,7 +80,7 @@ def find_my_task(self, pod_uid: str, backend: str, timeout: int) -> "Task | None raise NotImplementedError def is_ready_to_ungate(self, task: "Task") -> bool: - """True when workers should be ungated — queue position == 1 or the task + """True when the gang should be ungated — queue position == 1 or the task is already RUNNING/terminal. Always implementable.""" raise NotImplementedError @@ -134,4 +134,4 @@ def resolve_from_env() -> "Provider | None": for k, v in os.environ.items(): if k.startswith("FLUXION_"): attrs[k[len("FLUXION_"):].lower()] = v - return resolve(attrs) + return resolve(attrs) \ No newline at end of file diff --git a/python/fluence/providers/braket.py b/python/fluence/providers/braket.py index 23bd9fc..d6e6ea9 100644 --- a/python/fluence/providers/braket.py +++ b/python/fluence/providers/braket.py @@ -51,6 +51,11 @@ def install_interceptor(self, pod_uid: str) -> bool: original_run = AwsDevice.run def patched_run(self, task_specification, *args, **kwargs): + # Tag the submission with the pod-uid so the sidecar can find this task + # in the queue. 
The interceptor is staged only on pods that actually + # submit (producers and standalone/independent pods); consumers are + # role-aware (FLUENCE_COORDINATION_ROLE=consumer) and never call run(), + # so there is no submit to intercept and no faux mode to select. if pod_uid: tags = kwargs.get("tags", {}) tags[TAG_KEY] = pod_uid @@ -226,4 +231,4 @@ def job_id(self, task: BraketTask) -> str: PROVIDER = BraketProvider() -register(PROVIDER) +register(PROVIDER) \ No newline at end of file diff --git a/python/fluence/sidecar.py b/python/fluence/sidecar.py index 098574b..d0724e5 100644 --- a/python/fluence/sidecar.py +++ b/python/fluence/sidecar.py @@ -1,18 +1,19 @@ """ fluence.sidecar — provider-agnostic quantum coordination sidecar main loop. -Injected by the Fluence webhook into the quantum-submitting pod. Resolves its -vendor at runtime from the backend annotation, discovers the task the user -application submitted (tagged by the interceptor), polls readiness, and either -ungates gated workers (gang mode) or just logs the queue-position series -(observe-only mode). +Injected by the Fluence webhook into the one-off SUBMITTER pod (gang + submitter +model — there is no leader/worker split). Resolves its vendor at runtime from the +backend annotation, discovers the task the user application submitted (tagged by +the interceptor), polls readiness, and either ungates the gated GANG group (gang +mode) or just logs the queue-position series (observe-only mode). Entry point: `fluence-sidecar` console script (see pyproject.toml) -> main(). 
Environment (injected by the Fluence webhook): FLUENCE_POD_UID UID of this pod (matches interceptor tag) FLUENCE_NAMESPACE Kubernetes namespace - FLUENCE_GATED_PODS comma-separated gated worker names + FLUENCE_GANG_GROUP group label of the gated gang to ungate + FLUENCE_GATED_PODS optional explicit comma-separated gang pod names FLUENCE_OBSERVE "true" for observe-only telemetry mode FLUXION_BACKEND / FLUXION_VENDOR scheduler-chosen backend / vendor FLUENCE_TASK_DISCOVERY_TIMEOUT seconds to wait for discovery (default 300) @@ -30,6 +31,7 @@ from fluence.ungate import ungate_pods, gated_pods_from_env, namespace_from_env, wait_for_gated_pods + def _poll(provider, task, poll_interval, ungate): mode = "gang" if ungate else "observe-only" log(f"{mode} mode: polling queue position") @@ -52,18 +54,22 @@ def main(): pod_uid = os.environ.get("FLUENCE_POD_UID", "") pod_name = os.environ.get("FLUENCE_POD_NAME", "") group = os.environ.get("FLUENCE_GROUP", "") + # Gang + submitter model: this sidecar runs in the one-off SUBMITTER pod + # (its own group-of-one, -submitter). The gated workload it must ungate + # is the GANG group, named by FLUENCE_GANG_GROUP (set by the webhook). There + # is no leader/worker split and no -workers subgroup. 
+ gang_group = os.environ.get("FLUENCE_GANG_GROUP", "") backend = os.environ.get("FLUXION_BACKEND", "") observe = os.environ.get("FLUENCE_OBSERVE", "").lower() == "true" discovery_timeout = int(os.environ.get("FLUENCE_TASK_DISCOVERY_TIMEOUT", 300)) poll_interval = int(os.environ.get("FLUENCE_POLL_INTERVAL", 30)) - expected_workers = int(os.environ.get("FLUENCE_EXPECTED_WORKERS", 0)) ungate_timeout = int(os.environ.get("FLUENCE_UNGATE_TIMEOUT", 120)) namespace = namespace_from_env() - log("starting fluence quantum sidecar") + log("starting fluence quantum submitter sidecar") log(f" pod_uid={pod_uid} namespace={namespace} group={group} " - f"backend={backend} observe={observe} expected_workers={expected_workers}") + f"gang_group={gang_group} backend={backend} observe={observe}") provider = resolve_from_env() if provider is None: @@ -75,8 +81,9 @@ def main(): if task is None: log("ERROR: could not discover quantum task") if not observe: - ungate_pods(wait_for_gated_pods(namespace, group, expected_workers, - exclude=pod_name, timeout=ungate_timeout), + # Fail open: ungate the gang so it is not stranded forever. + ungate_pods(wait_for_gated_pods(namespace, gang_group, exclude=pod_name, + timeout=ungate_timeout), "", namespace) sys.exit(1) @@ -89,19 +96,18 @@ def main(): log("observe-only run complete") return - # Wait until all expected gated workers are present (gang is submitted - # together), then ungate them. expected_workers is N-1, propagated by the - # webhook from the leader at admission; if unset we ungate whatever is found. + # Ungate the gang: discover the gated pods in the gang group and remove their + # gate, stamping the job-id so each can fetch results by id. The gang pods are + # created up front (Job/Deployment), so they are present by submit time. 
gated_pods = gated_pods_from_env() or wait_for_gated_pods( - namespace, group, expected_workers, exclude=pod_name, - timeout=ungate_timeout) - log(f"ungating {len(gated_pods)} worker(s): {gated_pods}") + namespace, gang_group, exclude=pod_name, timeout=ungate_timeout) + log(f"ungating {len(gated_pods)} gang pod(s): {gated_pods}") n_ok = ungate_pods(gated_pods, job_id, namespace) if n_ok == len(gated_pods): - log(f"done — {n_ok} worker(s) ungated") + log(f"done — {n_ok} gang pod(s) ungated") else: - log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} worker(s) — see errors above") + log(f"WARNING: ungated only {n_ok}/{len(gated_pods)} gang pod(s) — see errors above") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/python/fluence/ungate.py b/python/fluence/ungate.py index 1019ead..a40e662 100644 --- a/python/fluence/ungate.py +++ b/python/fluence/ungate.py @@ -84,10 +84,10 @@ def gated_pods_from_env(): def discover_gated_pods(namespace, group, exclude=""): """ Find the names of pods in the same group that still carry the quantum - scheduling gate (i.e. the workers this sidecar's leader must ungate). + scheduling gate (i.e. the gang pods this submitter must ungate). - The leader's sidecar is created before the workers are admitted, so the gated - set cannot be known at admission time and must be discovered at runtime. We + The submitter is created alongside the gang, so the gated set is discovered + at runtime rather than known at admission. We list pods by the group label and keep those with the QUANTUM_GATE_NAME gate still present, excluding the leader pod itself. 
""" @@ -114,31 +114,24 @@ def discover_gated_pods(namespace, group, exclude=""): return names -def wait_for_gated_pods(namespace, group, expected, exclude="", timeout=120, - interval=3): +def wait_for_gated_pods(namespace, group, exclude="", timeout=120, interval=3): """ - Wait until at least `expected` gated workers have been discovered in the - group, or `timeout` seconds elapse. The gang is submitted together, so all - workers appear quickly; the timeout is a backstop against a crashed/never- - admitted worker so the sidecar never hangs. Returns the discovered list - (which may be short of `expected` if the timeout fired). + Wait until at least one gated gang pod is discovered in the group (the gang + is created up front, so its pods appear quickly), then return all currently + gated pods. The timeout is a backstop so the submitter never hangs if the + gang never appears. Returns the discovered list (possibly empty on timeout). """ deadline = time.time() + timeout found = [] while time.time() < deadline: found = discover_gated_pods(namespace, group, exclude=exclude) - if expected and len(found) >= expected: - log(f"all {expected} gated worker(s) present") + if found: return found - if not expected: - # No expected count known — return whatever is present now. 
- return found - log(f"waiting for gated workers: {len(found)}/{expected}") + log("waiting for gated gang pods to appear") time.sleep(interval) - log(f"WARNING: timed out waiting for gated workers " - f"({len(found)}/{expected}); ungating what is present") + log("WARNING: timed out waiting for gated gang pods; none found") return found def namespace_from_env(): - return os.environ.get("FLUENCE_NAMESPACE", "default") + return os.environ.get("FLUENCE_NAMESPACE", "default") \ No newline at end of file diff --git a/test/e2e/02-quantum-placement.sh b/test/e2e/02-quantum-placement.sh deleted file mode 100644 index 17897a3..0000000 --- a/test/e2e/02-quantum-placement.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env bash -# Quantum placement: a qpu pod is matched to a backend and the webhook injects QRMI_BACKEND. -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" -ANN="fluence.flux-framework.org/backend" - -log "TEST 2: quantum placement and backend handoff" -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml - -wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" - -# fluence must have stamped the chosen backend annotation. -backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" -[ -n "$backend" ] || (show_webhook sampler-mock && fail "backend annotation ($ANN) was not set by fluence") -log "fluence chose backend: $backend" - -# The webhook must have surfaced it as QRMI_BACKEND inside the container. 
-out="$(kubectl logs sampler-mock || true)" -echo "$out" | grep -q "BACKEND=${backend}" \ - || (show_webhook sampler-mock && fail "QRMI_BACKEND in container ('$out') does not match annotation ($backend)") - -log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as QRMI_BACKEND" -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/03-restart-recovery.sh b/test/e2e/03-restart-recovery.sh index 20c1be9..c26980f 100644 --- a/test/e2e/03-restart-recovery.sh +++ b/test/e2e/03-restart-recovery.sh @@ -9,7 +9,7 @@ ANN="fluence.flux-framework.org/backend" log "TEST 3: restart does not double-book an exclusive backend" # 1. Schedule the first qpu pod and capture its backend. -kubectl apply -f examples/test/e2e/quantum-pod-mock.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml wait_pod_phase sampler-mock "$NS" Running 120 || fail "sampler-mock did not reach Running" backend="$(kubectl get pod sampler-mock -n "$NS" -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" [ -n "$backend" ] || fail "first pod has no backend annotation" @@ -26,7 +26,7 @@ wait_pod_phase sampler-mock "$NS" Running 30 || fail "first pod not Running afte # 4. A second pod requesting the same exclusive qpu must NOT get the same backend. # If recovery worked, the backend is occupied and the second pod stays Pending. 
-kubectl apply -f examples/test/e2e/quantum-pod-mock-2.yaml +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml if assert_stays_pending sampler-mock-2 "$NS" 45; then log "PASS: second qpu pod stayed Pending; backend '$backend' was not double-booked" else @@ -38,5 +38,5 @@ else fi fi -kubectl delete -f examples/test/e2e/quantum-pod-mock-2.yaml --wait=false || true -kubectl delete -f examples/test/e2e/quantum-pod-mock.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock-2.yaml --wait=false || true +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/04-sidecar-ungate.sh b/test/e2e/04-sidecar-ungate.sh deleted file mode 100644 index 9ffefc8..0000000 --- a/test/e2e/04-sidecar-ungate.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env bash -# Sidecar webhook test. -# -# Verifies that when a PodGroup of size > 1 with QPU resources is submitted: -# 1. The webhook creates fluence-sidecar RBAC in the namespace automatically -# 2. The leader pod gets the sidecar container injected -# 3. The worker pod gets the quantum.braket/ready scheduling gate added -# 4. The worker pod gets fluence-quantum-classical priority class set -# -# Does NOT test the sidecar itself (task discovery, interceptor, -# queue position polling). Those require real AWS credentials and are covered -# by sidecars/providers/braket/test/integration.sh which is run locally. -set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" - -log "TEST 4: sidecar webhook — RBAC creation, gate injection, sidecar injection" - -kubectl apply -f examples/test/e2e/sidecar-mock-pods.yaml - -# Give webhook time to process the leader pod admission -sleep 3 - -# Print webhook logs — always show these so we can see what happened -log "--- webhook logs ---" -kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true -log "--- end webhook logs ---" - -# 1. 
Webhook should have created fluence-sidecar ServiceAccount -log "checking webhook created fluence-sidecar ServiceAccount..." -for i in $(seq 1 30); do - kubectl get serviceaccount fluence-sidecar -n default > /dev/null 2>&1 && break - sleep 2 -done -kubectl get serviceaccount fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar ServiceAccount" -log " fluence-sidecar ServiceAccount created" - -# 2. Webhook should have created fluence-sidecar Role -kubectl get role fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar Role" -log " fluence-sidecar Role created" - -# 3. Webhook should have created fluence-sidecar RoleBinding -kubectl get rolebinding fluence-sidecar -n default \ - || fail "webhook did not create fluence-sidecar RoleBinding" -log " fluence-sidecar RoleBinding created" - -# 4. Leader pod should have the fluence-stage init container injected (Model C: -# it stages the fluence Python package into a shared volume on PYTHONPATH). -log "checking webhook injected the fluence-stage init container..." -wait_pod_phase sidecar-test-leader Running 120 \ - || { kubectl describe pod sidecar-test-leader; fail "sidecar-test-leader did not reach Running"; } -initc=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.initContainers[*].name}') -echo "$initc" | grep -q "fluence-stage" \ - || fail "fluence-stage init container not injected (initContainers: $initc)" -log " fluence-stage init container injected" - -# 5. Leader pod should have the sidecar container injected -log "checking sidecar injected into leader pod..." -containers=$(kubectl get pod sidecar-test-leader \ - -o jsonpath='{.spec.containers[*].name}') -echo "$containers" | grep -q "fluence-sidecar" \ - || fail "fluence-sidecar container not injected into leader (containers: $containers)" -log " fluence-sidecar container injected into leader" - -# 6. 
Worker pod should have scheduling gate added by webhook -gate=$(kubectl get pod sidecar-test-worker \ - -o jsonpath='{.spec.schedulingGates[0].name}') -[ "$gate" = "quantum.braket/ready" ] \ - || fail "worker pod does not have quantum.braket/ready gate (got: $gate)" -log " quantum.braket/ready gate set on worker" - -# 7. Worker pod should have the fluence-quantum-classical priority class set by -# the webhook at admission (so it schedules reliably once ungated). -pc=$(kubectl get pod sidecar-test-worker -o jsonpath='{.spec.priorityClassName}') -[ "$pc" = "fluence-quantum-classical" ] \ - || fail "worker pod missing fluence-quantum-classical priority class (got: $pc)" -log " fluence-quantum-classical priority class set on worker" - -log "PASS: webhook correctly created RBAC, injected sidecar, gated worker" -log "NOTE: fluence-quantum-classical priority is set by the webhook at admission (immutable post-creation)" -log "NOTE: braket sidecar integration test (SDK intercept, tag discovery," -log " queue polling) is in sidecars/providers/braket/test/integration.sh" - -# Only clean up pods and PodGroup — RBAC is namespace infrastructure -# that persists for future quantum workflows in this namespace -kubectl delete -f examples/test/e2e/sidecar-mock-pods.yaml diff --git a/test/e2e/01-classical-gang.sh b/test/e2e/gang/01-classical-gang.sh old mode 100644 new mode 100755 similarity index 71% rename from test/e2e/01-classical-gang.sh rename to test/e2e/gang/01-classical-gang.sh index d2018ac..1ebfc64 --- a/test/e2e/01-classical-gang.sh +++ b/test/e2e/gang/01-classical-gang.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash # Classical gang scheduling: a PodGroup of 2 must be placed all-or-nothing on real nodes. set -euo pipefail -HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE}/lib.sh" +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" log "TEST 1: classical gang scheduling" -kubectl apply -f examples/single-podgroup.yaml +kubectl apply -f examples/test/e2e/gang/single-podgroup.yaml # All pods in the 'training' deployment must reach Running (scheduled + started). # Wait for the pod to EXIST before waiting for Ready — kubectl wait errors out @@ -25,5 +25,9 @@ count="$(kubectl get pods -l app=training --no-headers | wc -l | tr -d ' ')" [ "$count" = "1" ] || fail "expected 2 training pods, got $count" log "PASS: classical gang placed all $count pods via fluence" -kubectl delete -f examples/single-podgroup.yaml --wait=false || true +kubectl delete -f examples/test/e2e/gang/single-podgroup.yaml --wait=false || true kubectl patch podgroup training --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +# Wait for the pods to actually be gone before the next test runs — otherwise a +# terminating 'training' pod (same name/labels reused by other scenarios) can be +# misread as the next test's placement. +kubectl wait --for=delete pod -l app=training --timeout=60s 2>/dev/null || true diff --git a/test/e2e/gang/02-postfilter-rematch.sh b/test/e2e/gang/02-postfilter-rematch.sh new file mode 100755 index 0000000..f74c87b --- /dev/null +++ b/test/e2e/gang/02-postfilter-rematch.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# PostFilter re-match: when another scheduler plugin (TaintToleration) rejects a +# node Fluxion allocated, Fluence must abandon that allocation, exclude the node, +# and re-match onto an untainted node. Safety: the gang's RUNNING pod must NEVER +# bind to the tainted node. +# +# This test is self-isolating: it uses its own workload name (pf-rematch) and +# labels, distinct from the other e2e scenarios, and ensures a clean slate first, +# so a pod left over (terminating) from a previous test can never be mistaken for +# this test's placement. It also ignores terminating pods when asserting. 
+set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +NAME=pf-rematch +SEL="app=${NAME}" + +log "TEST 5: PostFilter abandons a taint-rejected allocation and re-matches" + +# --- clean slate: no leftover pods from earlier tests under our name ---------- +kubectl delete deployment "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl delete podgroup "$NAME" --ignore-not-found >/dev/null 2>&1 || true +kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l "$SEL" --timeout=60s >/dev/null 2>&1 || true +# Defensive: a prior test's workload left running would occupy the only +# untainted worker and make this test fail with a (correct) fluxion +# allocate -1 for lack of capacity. Ensure none lingers. +kubectl delete deployment training --ignore-not-found --wait=false >/dev/null 2>&1 || true +kubectl wait --for=delete pod -l app=training --timeout=60s >/dev/null 2>&1 || true + +TAINTED="$(kubectl get nodes -l '!node-role.kubernetes.io/control-plane' \ + -o jsonpath='{.items[0].metadata.name}')" +[ -n "$TAINTED" ] || fail "no worker node found to taint" +log "tainting node $TAINTED with fluence-e2e=blocked:NoSchedule" +kubectl taint nodes "$TAINTED" fluence-e2e=blocked:NoSchedule --overwrite + +cleanup() { + kubectl taint nodes "$TAINTED" fluence-e2e- 2>/dev/null || true + kubectl delete deployment "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl delete podgroup "$NAME" --ignore-not-found --wait=false 2>/dev/null || true + kubectl patch podgroup "$NAME" --type=merge \ + -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +} +trap cleanup EXIT + +# --- our own workload (distinct name/labels; does NOT tolerate the taint) ------ +kubectl apply -f - <" for empty fields, so an empty deletionTimestamp + # shows as "", NOT "". Treat "" as empty for both columns. 
+ if [ "$deleted" != "<none>" ] && [ -n "$deleted" ]; then continue; fi # skip terminating + if [ "$node" = "<none>" ] || [ -z "$node" ]; then continue; fi # skip not-yet-bound + checked=$((checked+1)) + if [ "$node" = "$TAINTED" ]; then + fail "SAFETY VIOLATION: running pod $name is bound to the tainted node $TAINTED" + fi + log "$name correctly placed on $node (not the tainted $TAINTED)" +done < <(kubectl get pods -l "$SEL" \ + -o custom-columns='N:.metadata.name,NODE:.spec.nodeName,DEL:.metadata.deletionTimestamp' \ + --no-headers) + +[ "$checked" -ge 1 ] || fail "no running ${NAME} pod found to check" + +# Informational: did PostFilter actually fire (Fluxion picked the tainted node +# first and we re-matched), or did Fluxion place on the good node directly? +POD="$(kubectl -n kube-system get pods -l app=fluence \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)" +if [ -n "$POD" ] && kubectl -n kube-system logs "$POD" 2>/dev/null \ + | grep -q "unschedulable: abandoning allocation"; then + log "observed PostFilter abandonment in scheduler log (re-match path exercised)" +else + log "note: Fluxion placed on the untainted node directly this run (PostFilter not needed)" +fi + +log "PASS: gang scheduled on an untainted node; no running pod on the tainted node" diff --git a/test/e2e/gang/03-multi-gang.sh b/test/e2e/gang/03-multi-gang.sh new file mode 100755 index 0000000..9f01ae5 --- /dev/null +++ b/test/e2e/gang/03-multi-gang.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Multi-pod gang scheduling on real nodes. Guards the two failures that the +# single-pod 01 test could NOT catch (and that shipped a minCount=1 bug): +# A) a multi-pod gang must place ALL of them (minCount must equal the gang size, not 1) +# B) under contention, a gang that cannot fully fit stays ENTIRELY pending — +# never partially placed (no stranded pods holding nodes). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; .
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +# ---- A) all-or-nothing placement of a 3-pod gang ------------------------------- +log "TEST 6A: multi-pod gang (2) places all-or-nothing" +kubectl apply -f examples/test/e2e/gang/multi-gang.yaml + +# the webhook must have created the PodGroup with minCount = 2 (the bug set it to 1) +log "checking PodGroup minCount == 2 (set by webhook from group-size)" +for i in $(seq 1 30); do + mc="$(kubectl get podgroup gang3 -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$mc" ] && break; sleep 2 +done +[ "$mc" = "2" ] || fail "PodGroup gang3 minCount=$mc, want 2 (minCount=1 bug -> partial gangs)" + +log "waiting for all 2 gang pods to be Ready" +wait_pods_ready "app=gang3" 2 180 || fail "gang3 did not place all 2 pods (gang scheduling failed)" + +count="$(kubectl get pods -l app=gang3 --field-selector=status.phase=Running --no-headers | wc -l | tr -d ' ')" +[ "$count" = "2" ] || fail "expected 2 Running gang3 pods, got $count (partial placement)" +for p in $(kubectl get pods -l app=gang3 -o name); do + pod="${p#pod/}" + sched="$(kubectl get pod "$pod" -o jsonpath='{.spec.schedulerName}')" + [ "$sched" = "fluence" ] || fail "$pod not scheduled by fluence (got: $sched)" +done +log "PASS 6A: 2-pod gang placed atomically by fluence (minCount=2)" + +kubectl delete -f examples/test/e2e/gang/multi-gang.yaml --wait=false || true +kubectl patch podgroup gang3 --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang3 --timeout=60s 2>/dev/null || true + +# ---- B) contention: the gang that can't fully fit stays ENTIRELY pending -------- +log "TEST 6B: contention — a gang that cannot fully fit must NOT partially place" +kubectl apply -f examples/test/e2e/gang/multi-gang-contention.yaml + +# wait until the cluster settles. 
Three possible outcomes: +# - one gang fully Running, other fully Pending -> contention; assert no partial +# - BOTH fully Running -> runner big enough, no contention to test (skip) +# - any partial (1 of 2 in a gang scheduled) -> the bug, fail +log "waiting for gangs to settle" +winner=""; loser=""; both="" +for i in $(seq 1 90); do + ra="$(kubectl get pods -l app=gang-a --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + rb="$(kubectl get pods -l app=gang-b --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')" + if [ "$ra" = "2" ] && [ "$rb" = "2" ]; then both=1; break; fi + if [ "$ra" = "2" ] && [ "$rb" = "0" ]; then winner=gang-a; loser=gang-b; break; fi + if [ "$rb" = "2" ] && [ "$ra" = "0" ]; then winner=gang-b; loser=gang-a; break; fi + sleep 2 +done + +if [ -n "$both" ]; then + log "SKIP 6B: cluster placed both gangs (>=4 schedulable cores) — no contention on this runner" +else + [ -n "$winner" ] || fail "no clean settle: gang-a=$ra gang-b=$rb running (possible PARTIAL placement)" + log "winner=$winner (2 running), loser=$loser (expected 0 running)" + # the loser must have ZERO pods scheduled to a node — the all-or-nothing guarantee. + # A single scheduled loser pod = partial placement = the bug. + scheduled_loser="$(kubectl get pods -l app=$loser -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' | grep -c . 
|| true)" + [ "$scheduled_loser" = "0" ] || fail "$loser has $scheduled_loser pod(s) on a node — PARTIAL placement (gang violated)" + log "PASS 6B: $loser stayed entirely pending — no partial placement under contention" +fi + +kubectl delete -f examples/test/e2e/gang/multi-gang-contention.yaml --wait=false || true +for g in gang-a gang-b; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app=gang-a --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l app=gang-b --timeout=60s 2>/dev/null || true +log "PASS: multi-gang all-or-nothing verified" diff --git a/test/e2e/gang/04-requeue-on-capacity.sh b/test/e2e/gang/04-requeue-on-capacity.sh new file mode 100755 index 0000000..f41aa71 --- /dev/null +++ b/test/e2e/gang/04-requeue-on-capacity.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +# Requeue-on-capacity + gang atomicity under contention. +# +# Two 2-pod gangs contend for a cluster that can only run one at a time. This +# guards two invariants that the GKE contention runs exposed: +# 1. ALL-OR-NOTHING: each gang places ALL its pods or NONE — never a partial +# (e.g. 1-of-2 scheduled). The winner must be a clean 2/2; the loser a clean +# 0/2 while it waits. +# 2. REQUEUE: when the winner completes and frees its nodes, the loser is +# re-attempted on its own (no manual nudge) and then ALSO places atomically +# (2/2), driven by the shortened --pod-max-in-unschedulable-pods-duration. +# +# SCOPE / LIMITATION: this is a 3-node kind cluster with small (1-core) pods. It +# verifies the INVARIANTS on a minimal contention case. It does NOT reproduce the +# GKE-scale dynamics where the bug was first seen — one-pod-per-node (~80-core) +# saturation and ~20 simultaneous mixed-size gangs draining in sequence. That +# scale behavior is validated on the real cluster, not in CI; a pass here means +# the invariants hold on the simple case, not that large-scale draining is proven. 
+set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +# running-pod count for a gang (job-name label set by the Job controller) +running() { kubectl get pods -l job-name="$1" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' '; } +# count of a gang's pods actually bound to a node (Running OR already Succeeded) +on_nodes() { kubectl get pods -l job-name="$1" -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' 2>/dev/null | grep -c . || true; } + +log "TEST 9: contended gangs stay all-or-nothing, loser requeues when capacity frees" +kubectl apply -f examples/test/e2e/gang/multi-gang-requeue.yaml + +# ---- 1. one gang wins CLEANLY (2/2); the other places NOTHING (0/2) ------------ +log "waiting for a clean 2/0 split (one whole gang runs, the other entirely waits)" +winner=""; loser="" +for i in $(seq 1 60); do + rw="$(running gang-win)"; ra="$(running gang-wait)" + if [ "$rw" = "2" ] && [ "$ra" = "0" ]; then winner=gang-win; loser=gang-wait; break; fi + if [ "$ra" = "2" ] && [ "$rw" = "0" ]; then winner=gang-wait; loser=gang-win; break; fi + # a 1/x or x/1 state that persists is a PARTIAL gang — fail fast on it + if [ "$rw" = "1" ] || [ "$ra" = "1" ]; then + sleep 6 # allow a transient mid-bind moment to resolve + rw="$(running gang-win)"; ra="$(running gang-wait)" + { [ "$rw" = "1" ] || [ "$ra" = "1" ]; } && \ + fail "PARTIAL gang: gang-win=$rw gang-wait=$ra running (all-or-nothing violated)" + fi + sleep 2 +done +[ -n "$winner" ] || fail "no clean 2/0 split (gang-win=$(running gang-win) gang-wait=$(running gang-wait))" +log " winner=$winner (2/2 running), loser=$loser" + +# loser must have ZERO pods on any node — not even one (that would be a partial) +sl="$(on_nodes "$loser")" +[ "$sl" = "0" ] || fail "$loser has $sl pod(s) bound while it should be entirely pending — PARTIAL placement" +log " $loser entirely pending (0 pods bound) — all-or-nothing holds" + +# ---- 2. 
winner completes -> loser is requeued AND places atomically ------------ +log "waiting for winner=$winner to complete and free its nodes" +kubectl wait --for=condition=complete job/$winner --timeout=120s || fail "$winner did not complete" +log " $winner completed; capacity freed" + +# The loser must now place ALL its pods (2/2), on its own, within a window above +# the 30s recheck flush but below the 5m default — proving the shortened timeout +# is in effect AND that the requeued gang is still atomic (not a partial). +log "asserting $loser requeues and places ATOMICALLY (2/2) within ~75s" +ok="" +for i in $(seq 1 38); do # ~75s + rl="$(running $loser)" + dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + # both pods accounted for (running and/or already completed) = atomic placement + [ "$((rl + dl))" = "2" ] && { ok=1; break; } + # a lone 1/2 that lingers = partial placement of the requeued gang + if [ "$((rl + dl))" = "1" ]; then + sleep 6 + rl="$(running $loser)"; dl="$(kubectl get pods -l job-name=$loser --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')" + [ "$((rl + dl))" = "1" ] && fail "$loser placed 1 of 2 pods — PARTIAL placement of the requeued gang" + fi + sleep 2 +done +[ -n "$ok" ] || fail "$loser did NOT place both pods within 75s of capacity freeing — \ +either the shortened --pod-max-in-unschedulable-pods-duration is not taking effect \ +(gang stuck) or the requeued gang did not assemble" +log "PASS 9: $loser requeued and placed atomically (2/2) after $winner freed capacity" + +kubectl delete -f examples/test/e2e/gang/multi-gang-requeue.yaml --wait=false || true +for g in gang-win gang-wait; do + kubectl patch podgroup $g --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l job-name=gang-win --timeout=60s 2>/dev/null || true +kubectl wait --for=delete pod -l job-name=gang-wait 
--timeout=60s 2>/dev/null || true diff --git a/test/e2e/lib.sh b/test/e2e/lib.sh index cad6a2e..13390c9 100644 --- a/test/e2e/lib.sh +++ b/test/e2e/lib.sh @@ -44,7 +44,7 @@ wait_fluence_ready() { show_webhook() { pod=$1 - echo "FAIL: QRMI_BACKEND mismatch" + echo "FAIL: FLUXION_BACKEND mismatch" kubectl get pod $pod -o jsonpath='{.spec.containers[0].env}'; echo kubectl get pod $pod -o jsonpath='{.metadata.annotations}'; echo kubectl -n kube-system logs deploy/fluence-webhook --tail=50 diff --git a/test/e2e/quantum/01-quantum-placement.sh b/test/e2e/quantum/01-quantum-placement.sh new file mode 100755 index 0000000..8f5c475 --- /dev/null +++ b/test/e2e/quantum/01-quantum-placement.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Quantum placement: a qpu pod is matched to a backend and the webhook injects FLUXION_BACKEND. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" +ANN="fluence.flux-framework.org/backend" + +log "TEST 2: quantum placement and backend handoff" +kubectl apply -f examples/test/e2e/quantum/quantum-pod-mock.yaml + +wait_pod_phase sampler-mock Running 120 || fail "sampler-mock did not reach Running" + +# fluence must have stamped the chosen backend annotation. Dump diagnostics (best-effort), then fail in THIS shell, not a subshell. +backend="$(kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" 2>/dev/null || true)" +[ -n "$backend" ] || { show_webhook sampler-mock || true; fail "backend annotation ($ANN) was not set by fluence"; } +log "fluence chose backend: $backend" + +# The webhook must have surfaced it as FLUXION_BACKEND inside the container. +out="$(kubectl logs sampler-mock || true)" +if ! echo "$out" | grep -q "BACKEND=${backend}"; then + # Diagnostic (CI has no interactive shell): show whether the env var is ABSENT + # (not injected -> webhook issue) or PRESENT-BUT-EMPTY (annotation not resolved + # at container start -> delivery/timing issue), and what the container actually got.
+ log "--- diagnostic: container env spec ---" + kubectl get pod sampler-mock -o jsonpath='{.spec.containers[0].env}' ; echo + log "--- diagnostic: live value via exec ---" + kubectl exec sampler-mock -- sh -c 'echo "FLUXION_BACKEND=[$FLUXION_BACKEND]"' 2>&1 || true + log "--- diagnostic: backend annotation on pod ---" + kubectl get pod sampler-mock -o jsonpath="{.metadata.annotations.${ANN//./\\.}}" ; echo + show_webhook sampler-mock + fail "FLUXION_BACKEND in container ('$out') does not match annotation ($backend)" +fi + +log "PASS: qpu pod scheduled, backend '$backend' chosen and injected as FLUXION_BACKEND" +kubectl delete -f examples/test/e2e/quantum/quantum-pod-mock.yaml --wait=false || true diff --git a/test/e2e/quantum/02-sidecar-ungate.sh b/test/e2e/quantum/02-sidecar-ungate.sh new file mode 100755 index 0000000..a4ae79f --- /dev/null +++ b/test/e2e/quantum/02-sidecar-ungate.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash +# Shared-coordination webhook test (producer/consumer, no submitter pod). +# +# When a shared quantum gang (coordination=shared, N pods all requesting QPU) is +# submitted, the webhook must: +# 1. create the fluence-sidecar RBAC in the namespace automatically +# 2. gate every CONSUMER pod with quantum.braket/ready +# 3. raise every CONSUMER pod to the fluence-quantum-classical priority class +# 4. leave the PRODUCER (completion index 0) UNGATED, as a real member (NOT a +# separate spawned pod) +# 5. inject the fluence-stage init container + the sidecar container into the +# producer (Model C staging + the real coordinator) +# +# Does NOT test the sidecar runtime (task discovery, interceptor, queue polling) +# — that needs real AWS creds (sidecars/providers/braket/test/integration.sh). +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +PRODUCER=${GROUP}-0 # completion index 0 +CONSUMER=${GROUP}-1 # completion index 1 + +log "TEST 4: shared-gang webhook — RBAC, consumer gating, priority, producer wiring" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml +sleep 3 + +log "--- webhook logs ---" +kubectl logs -n kube-system deployment/fluence-webhook --tail=50 || true +log "--- end webhook logs ---" + +# 1. RBAC created by the webhook (idempotent, per-namespace). +log "checking webhook created fluence-sidecar RBAC..." +for i in $(seq 1 30); do + kubectl get serviceaccount fluence-sidecar -n default >/dev/null 2>&1 && break + sleep 2 +done +kubectl get serviceaccount fluence-sidecar -n default || fail "no fluence-sidecar ServiceAccount" +kubectl get role fluence-sidecar -n default || fail "no fluence-sidecar Role" +kubectl get rolebinding fluence-sidecar -n default || fail "no fluence-sidecar RoleBinding" +log " RBAC present" + +# 2 + 3. The CONSUMER is gated and at the preempting priority class. +gate="$(kubectl get pod "$CONSUMER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ "$gate" = "quantum.braket/ready" ] || fail "$CONSUMER not gated (gate=$gate)" +pc="$(kubectl get pod "$CONSUMER" -o jsonpath='{.spec.priorityClassName}' 2>/dev/null || true)" +[ "$pc" = "fluence-quantum-classical" ] || fail "$CONSUMER priorityClass=$pc, want fluence-quantum-classical" +log " consumer gated + fluence-quantum-classical priority" + +# 4. The PRODUCER is NOT a separate spawned pod and is NOT gated. No -submitter. 
+if kubectl get pod "${GROUP}-submitter" -n default >/dev/null 2>&1; then + fail "found ${GROUP}-submitter pod — the obsolete separate-submitter model must not exist" +fi +pgate="$(kubectl get pod "$PRODUCER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ -z "$pgate" ] || fail "producer must NOT be gated (gate=$pgate)" +log " producer is a real member, not gated; no separate submitter pod" + +# 5. Producer has the staging init container + the sidecar container. +wait_pod_phase "$PRODUCER" Running 120 \ + || { kubectl describe pod "$PRODUCER"; fail "$PRODUCER did not reach Running"; } +initc="$(kubectl get pod "$PRODUCER" -o jsonpath='{.spec.initContainers[*].name}')" +echo "$initc" | grep -q fluence-stage || fail "fluence-stage init container not injected (init: $initc)" +conts="$(kubectl get pod "$PRODUCER" -o jsonpath='{.spec.containers[*].name}')" +echo "$conts" | grep -q fluence-sidecar || fail "fluence-sidecar container not injected (containers: $conts)" +log " producer has fluence-stage + fluence-sidecar" + +log "PASS: webhook gated the consumers, set priority, created RBAC + wired the producer" +log "NOTE: priority is set at admission (immutable post-creation)" +log "NOTE: braket sidecar runtime (SDK intercept, tag discovery, queue polling)" +log " is in sidecars/providers/braket/test/integration.sh" + +# Clean up pods + PodGroups; RBAC is namespace infra and persists. 
+kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +for g in "$GROUP" "${GROUP}-producer"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/03-gang-producer.sh b/test/e2e/quantum/03-gang-producer.sh new file mode 100644 index 0000000..fce4248 --- /dev/null +++ b/test/e2e/quantum/03-gang-producer.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Producer/consumer structure (replaces the old leader/worker and submitter-pod +# models). +# +# The structural guarantee the ungate path depends on: a shared quantum gang of +# size N is split, by completion index, into the CONSUMER gang $GROUP +# (minCount N-1, gated) and the PRODUCER's group-of-one ${GROUP}-producer +# (minCount 1, not gated). The producer is a real member of the user's workload — +# there is NO separate ${GROUP}-submitter pod, NO ${GROUP}-workers subgroup, and no +# leader among the user's pods. (The runtime ungate is covered by the braket +# integration test; here we prove the shape.) +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +PRODUCER_GROUP=${GROUP}-producer +PRODUCER=${GROUP}-0 # completion index 0 +CONSUMER=${GROUP}-1 # completion index 1 + +log "TEST 7: consumer gang(N-1, gated) + producer(1, member) structure" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# Consumer PodGroup exists with minCount N-1 = 1 (the split). +log "checking consumer group '$GROUP' minCount == 1 (N-1)" +for i in $(seq 1 30); do + gc="$(kubectl get podgroup "$GROUP" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$gc" ] && break; sleep 2 +done +[ "$gc" = "1" ] || fail "consumer group $GROUP minCount=$gc, want 1 (N-1)" + +# There must be NO ${GROUP}-workers subgroup and NO ${GROUP}-submitter pod.
+if kubectl get podgroup "${GROUP}-workers" >/dev/null 2>&1; then + fail "found ${GROUP}-workers PodGroup — the obsolete leader/worker split must not exist" +fi +if kubectl get pod "${GROUP}-submitter" >/dev/null 2>&1; then + fail "found ${GROUP}-submitter pod — the obsolete separate-submitter model must not exist" +fi +log " consumer group minCount=1, no -workers subgroup, no -submitter pod" + +# Producer PodGroup ${GROUP}-producer exists with minCount 1 (schedules alone). +log "checking producer group '$PRODUCER_GROUP' minCount == 1" +for i in $(seq 1 30); do + sc="$(kubectl get podgroup "$PRODUCER_GROUP" -o jsonpath='{.spec.schedulingPolicy.gang.minCount}' 2>/dev/null || true)" + [ -n "$sc" ] && break; sleep 2 +done +[ "$sc" = "1" ] || fail "producer group $PRODUCER_GROUP minCount=$sc, want 1" + +# Producer pod (index 0) is relinked into its own group-of-one and is NOT gated. +pl="$(kubectl get pod "$PRODUCER" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" +[ "$pl" = "$PRODUCER_GROUP" ] || fail "producer group label=$pl, want $PRODUCER_GROUP" +pgate="$(kubectl get pod "$PRODUCER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ -z "$pgate" ] || fail "producer must NOT be gated (gate=$pgate)" +log " producer in '$PRODUCER_GROUP' (minCount 1), not gated" + +# Consumer pod (index 1+) stays in $GROUP and is gated. +g="$(kubectl get pod "$CONSUMER" -o jsonpath='{.metadata.labels.fluence\.flux-framework\.org/group}' 2>/dev/null || true)" +[ "$g" = "$GROUP" ] || fail "$CONSUMER group label=$g, want $GROUP" +gate="$(kubectl get pod "$CONSUMER" -o jsonpath='{.spec.schedulingGates[0].name}' 2>/dev/null || true)" +[ "$gate" = "quantum.braket/ready" ] || fail "$CONSUMER not gated (gate=$gate)" +# The consumer's dependency points at the producer group.
+dp="$(kubectl get pod "$CONSUMER" -o jsonpath='{.metadata.annotations.fluence\.flux-framework\.org/depends-on-producer}' 2>/dev/null || true)" +[ "$dp" = "$PRODUCER_GROUP" ] || fail "consumer depends-on-producer=$dp, want $PRODUCER_GROUP" +log " consumer in '$GROUP', gated, depends on '$PRODUCER_GROUP'" + +log "PASS 7: consumer gang(N-1, gated) + producer(1, member, ungates gang), no submitter/leader/worker" +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +for g in "$GROUP" "$PRODUCER_GROUP"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/04-gang-env-contract.sh b/test/e2e/quantum/04-gang-env-contract.sh new file mode 100755 index 0000000..157f78b --- /dev/null +++ b/test/e2e/quantum/04-gang-env-contract.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# Env-contract e2e (producer/consumer): verify the webhook injects, at admission, +# the env the runtime depends on — IN-CLUSTER, on the real pod specs, with no +# Braket/AWS and WITHOUT requiring scheduling. Guards the seam that, if broken, +# makes a gang schedule then hang or double-submit. +# +# Spec layer only (these are downward-API valueFrom refs whose VALUES resolve at +# placement, but whose PRESENCE is deterministic at admission), so no scheduling, +# no qpu capacity, no logs — it cannot flake on capacity. Contract: +# consumer (role): FLUENCE_COORDINATION_ROLE=consumer, FLUENCE_QUANTUM_JOB_ID, FLUXION_BACKEND +# (NO interceptor/PYTHONPATH — a consumer never submits) +# producer (role): FLUENCE_COORDINATION_ROLE=producer + FLUENCE_GANG_GROUP on the sidecar +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . 
"${HERE%/test/e2e/*}/test/e2e/lib.sh" + +GROUP=qgang +PRODUCER=${GROUP}-0 # completion index 0 +CONSUMER=${GROUP}-1 # completion index 1 + +log "TEST 8: producer/consumer env contract — spec layer" +kubectl apply -f examples/test/e2e/quantum/quantum-gang-pods.yaml + +# does container $2 of pod $1 have an env entry named $3 ? (spec-level only) +has_env() { + kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='$2')].env[*].name}" \ + 2>/dev/null | tr ' ' '\n' | grep -qx "$3" +} +# value of env $3 in container $2 of pod $1 (empty if absent) +env_val() { + kubectl get pod "$1" -o jsonpath="{.spec.containers[?(@.name=='$2')].env[?(@.name=='$3')].value}" \ + 2>/dev/null || true +} + +log "checking the webhook wired the consumer role contract" +for i in $(seq 1 15); do has_env "$CONSUMER" app FLUENCE_COORDINATION_ROLE && break; sleep 2; done +# Present: the role (=consumer), the producer's task id, and the backend. +for v in FLUENCE_COORDINATION_ROLE FLUENCE_QUANTUM_JOB_ID FLUXION_BACKEND; do + has_env "$CONSUMER" app "$v" \ + || { kubectl get pod "$CONSUMER" -o yaml | sed -n '/containers:/,/status:/p'; \ + fail "consumer 'app' container missing env '$v'"; } + log " consumer has env: $v" +done +role="$(env_val "$CONSUMER" app FLUENCE_COORDINATION_ROLE)" +[ "$role" = "consumer" ] || fail "consumer role=$role, want consumer" +# Absent: a consumer never submits, so no interceptor staging and no faux flag. +for v in PYTHONPATH FLUENCE_FAUX_SUBMIT; do + ! has_env "$CONSUMER" app "$v" || fail "consumer must NOT carry '$v' (it does not submit)" +done +log " consumer role=consumer, no interceptor/faux" + +# The producer's sidecar must know which consumer group to ungate. 
+log "checking the producer sidecar has FLUENCE_GANG_GROUP=$GROUP" +for i in $(seq 1 30); do kubectl get pod "$PRODUCER" >/dev/null 2>&1 && break; sleep 2; done +gg="$(kubectl get pod "$PRODUCER" \ + -o jsonpath="{.spec.containers[?(@.name=='fluence-sidecar')].env[?(@.name=='FLUENCE_GANG_GROUP')].value}" \ + 2>/dev/null || true)" +[ "$gg" = "$GROUP" ] || fail "producer sidecar FLUENCE_GANG_GROUP=$gg, want $GROUP" +log " producer sidecar has FLUENCE_GANG_GROUP=$gg" + +# The producer carries role=producer and is the real submitter (no consumer id). +prole="$(env_val "$PRODUCER" app FLUENCE_COORDINATION_ROLE)" +[ "$prole" = "producer" ] || fail "producer role=$prole, want producer" +if has_env "$PRODUCER" app FLUENCE_QUANTUM_JOB_ID; then + fail "producer must NOT carry FLUENCE_QUANTUM_JOB_ID (it submits its own task)" +fi +log " producer role=producer, submits its own task" + +log "PASS 8: webhook injects the consumer(role) + producer(role) env contract at admission" + +kubectl delete -f examples/test/e2e/quantum/quantum-gang-pods.yaml --wait=false || true +for g in "$GROUP" "${GROUP}-producer"; do + kubectl patch podgroup "$g" --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true +done +kubectl wait --for=delete pod -l app="$GROUP" --timeout=60s 2>/dev/null || true diff --git a/test/e2e/quantum/setup.sh b/test/e2e/quantum/setup.sh new file mode 100644 index 0000000..57f375a --- /dev/null +++ b/test/e2e/quantum/setup.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# Quantum suite setup (run by the e2e-suite workflow before the NN-*.sh tests). +# +# Installs the qpu add-on so nodes advertise fluxion.flux-framework.org/qpu — +# without it every quantum pod stays Pending (fluence matches in its own graph, +# but the default NodeResourcesFit plugin rejects each node because the extended +# resource is not in allocatable, so the match is rolled back). The base deploy +# (deploy/fluence-test.yaml) does NOT include this; it is quantum-only. 
+# +# Also points the webhook-injected sidecar/stage image at the CI-loaded image: +# the default sidecar image (ghcr.io/.../fluence-sidecar:latest) is not loaded in +# kind, so the producer's containers could not pull. The fluence-stage init is +# fail-soft (no python in this image -> it logs and exits 0), which is fine for +# the structural assertions; the producer still schedules and runs. +set -euo pipefail +HERE="$(cd "$(dirname "$0")" && pwd)"; . "${HERE%/test/e2e/*}/test/e2e/lib.sh" +IMAGE="${IMAGE:-vanessa/fluence:test}" + +log "quantum setup: installing the qpu add-on (resources ConfigMap + device plugin)" +kubectl apply -f deploy/fluence-resources-test.yaml + +# Run the device plugin from the CI-loaded image (its manifest ships a registry +# image that kind has not pulled). Container name is 'deviceplugin'. +kubectl -n kube-system set image daemonset/fluence-deviceplugin deviceplugin="$IMAGE" +kubectl -n kube-system patch daemonset/fluence-deviceplugin --type=json \ + -p '[{"op":"replace","path":"/spec/template/spec/containers/0/imagePullPolicy","value":"IfNotPresent"}]' \ + 2>/dev/null || true + +# Injected sidecar + stage init must use a present image too (see header). +kubectl -n kube-system set env deployment/fluence-webhook FLUENCE_SIDECAR_IMAGE="$IMAGE" +kubectl -n kube-system rollout status deployment/fluence-webhook --timeout=180s + +# Scheduler re-reads the resources config now that the ConfigMap exists. +kubectl -n kube-system rollout restart deployment/fluence +kubectl -n kube-system rollout status deployment/fluence --timeout=180s + +log "waiting for the device plugin DaemonSet to be Ready" +kubectl -n kube-system rollout status daemonset/fluence-deviceplugin --timeout=180s + +# Block until at least one node advertises the qpu extended resource, so the +# tests do not race the kubelet's device registration. 
+log "waiting for nodes to advertise fluxion.flux-framework.org/qpu" +ok=0 +for i in $(seq 1 60); do + if kubectl get nodes -o jsonpath='{.items[*].status.allocatable}' 2>/dev/null \ + | grep -q 'fluxion.flux-framework.org/qpu'; then + ok=1; break + fi + sleep 3 +done +[ "$ok" = 1 ] || fail "no node advertised fluxion.flux-framework.org/qpu after the add-on (device plugin not registering)" +log "qpu advertised on at least one node" + +log "quantum setup complete: qpu add-on installed, scheduler restarted, sidecar image=$IMAGE" \ No newline at end of file