From c3ffd4c7349db561ed600de8a1d70f3a528c2dee Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Mon, 20 Apr 2026 23:53:45 +0000
Subject: [PATCH 1/6] feat(demo-bundled): add GPU-capable demo cluster image

Adds a GPU variant of the demo-bundled image so users with NVIDIA GPUs
can run `flyte start demo --image ghcr.io/flyteorg/flyte-demo:gpu-latest`
and submit tasks with `Resources(gpu=1)`.

- Dockerfile.gpu stages NVIDIA Container Toolkit binaries (v1.19.x at
  the time of writing; the apt install is unpinned, so it tracks
  NVIDIA's stable repo) and their shared libs into the rancher/k3s
  final image. Libs are copied
  into /usr/lib/<triple>/ because the nvidia-ctk OCI hook runs without
  inheriting LD_LIBRARY_PATH. A statically-linked /sbin/ldconfig is
  also staged (rancher/k3s ships none) because the toolkit's
  update-ldcache hook bind-mounts it into workload pods.
- containerd-config.toml.tmpl sets nvidia as the default containerd
  runtime. Pods requesting nvidia.com/gpu get GPUs without needing
  runtimeClassName in their spec; non-GPU pods are unaffected
  (nvidia-container-runtime is a passthrough when no GPU is requested).
- nvidia-device-plugin.yaml installs a RuntimeClass and the NVIDIA
  k8s-device-plugin DaemonSet so nvidia.com/gpu is advertised on the
  node. Auto-applied by k3s at startup.
- Makefile gains a build-gpu target producing flyte-demo:gpu-latest.
- CI gains a build-and-push step publishing gpu-latest, gpu-nightly,
  and gpu-<sha> tags to both flyte-demo and flyte-sandbox-v2.

The GPU plumbing was verified end-to-end with a layered test image on
an A10G (torch 2.11.0+cu130 reported cuda_available=True). The full
multi-stage Dockerfile.gpu has not been built locally; the CI run here
is the first end-to-end test of the production Dockerfile and may
need fixup iterations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/flyte-binary-v2.yml         |  21 +++
 docker/demo-bundled/Dockerfile.gpu            | 165 ++++++++++++++++++
 docker/demo-bundled/Makefile                  |   6 +
 .../demo-bundled/containerd-config.toml.tmpl  |  10 ++
 docker/demo-bundled/nvidia-device-plugin.yaml |  45 +++++
 5 files changed, 247 insertions(+)
 create mode 100644 docker/demo-bundled/Dockerfile.gpu
 create mode 100644 docker/demo-bundled/containerd-config.toml.tmpl
 create mode 100644 docker/demo-bundled/nvidia-device-plugin.yaml
diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index 23d2c70d9b..efb99c226b 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -167,3 +167,24 @@ jobs:
           tags: ${{ steps.image-names.outputs.tags }}
           build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
           push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+      - name: Prepare GPU Image Names
+        id: gpu-image-names
+        uses: docker/metadata-action@v3
+        with:
+          images: |
+            ghcr.io/${{ github.repository_owner }}/flyte-demo
+            ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2
+          tags: |
+            type=raw,value=gpu-latest,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
+            type=raw,value=gpu-nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
+            type=sha,format=long,prefix=gpu-
+      - name: Build and push GPU multi-arch image
+        uses: docker/build-push-action@v6
+        with:
+          context: docker/demo-bundled
+          file: docker/demo-bundled/Dockerfile.gpu
+          allow: "security.insecure"
+          platforms: linux/arm64, linux/amd64
+          tags: ${{ steps.gpu-image-names.outputs.tags }}
+          build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
+          push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
diff --git a/docker/demo-bundled/Dockerfile.gpu b/docker/demo-bundled/Dockerfile.gpu
new file mode 100644
index 0000000000..acaa4de582
--- /dev/null
+++ b/docker/demo-bundled/Dockerfile.gpu
@@ -0,0 +1,165 @@
+# syntax=docker/dockerfile:1.4-labs
+FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder
+
+ARG TARGETARCH
+ENV TARGETARCH "${TARGETARCH}"
+
+WORKDIR /build
+
+COPY images/manifest.txt images/preload ./
+RUN --security=insecure ./preload manifest.txt
+
+
+FROM --platform=${BUILDPLATFORM} golang:1.24-bullseye AS bootstrap
+
+ARG TARGETARCH
+ENV CGO_ENABLED 0
+ENV GOARCH "${TARGETARCH}"
+ENV GOOS linux
+
+WORKDIR /flyteorg/build
+COPY bootstrap/go.mod bootstrap/go.sum ./
+RUN go mod download
+COPY bootstrap/ ./
+RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \
+    go build -o dist/flyte-demo-bootstrap cmd/bootstrap/main.go && \
+    go build -o dist/embedded-postgres cmd/embedded-postgres/main.go
+
+
+# Pre-download PostgreSQL binaries from Maven for the embedded-postgres library.
+# The library expects a cached .txz file; we also extract the dynamic linker since
+# the K3s base image has no libc.
+FROM debian:bookworm-slim AS pg-cache
+
+ARG TARGETARCH
+
+RUN apt-get update && apt-get install -y --no-install-recommends curl unzip xz-utils ca-certificates && rm -rf /var/lib/apt/lists/*
+
+RUN set -ex; \
+    ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64v8" || echo "amd64"); \
+    PG_VERSION="16.9.0"; \
+    CACHE_FILE="embedded-postgres-binaries-linux-${ARCH}-${PG_VERSION}.txz"; \
+    mkdir -p /cache /glibc-libs; \
+    curl -fL "https://repo1.maven.org/maven2/io/zonky/test/postgres/embedded-postgres-binaries-linux-${ARCH}/${PG_VERSION}/embedded-postgres-binaries-linux-${ARCH}-${PG_VERSION}.jar" -o /tmp/pg.jar; \
+    unzip -p /tmp/pg.jar "*.txz" > "/cache/${CACHE_FILE}"; \
+    rm -f /tmp/pg.jar; \
+    mkdir -p /tmp/pg-tmp && tar xJf "/cache/${CACHE_FILE}" -C /tmp/pg-tmp/; \
+    for bin in /tmp/pg-tmp/bin/*; do \
+        ldd "$bin" 2>/dev/null | grep "=>" | awk '{print $3}' | while read lib; do \
+            [ -f "$lib" ] && cp -n "$lib" /glibc-libs/ 2>/dev/null || true; \
+        done; \
+    done; \
+    cp /lib/ld-linux-aarch64.so.1 /glibc-libs/ 2>/dev/null || true; \
+    cp /lib/aarch64-linux-gnu/ld-linux-aarch64.so.1 /glibc-libs/ 2>/dev/null || true; \
+    cp /lib64/ld-linux-x86-64.so.2 /glibc-libs/ 2>/dev/null || true; \
+    cp /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /glibc-libs/ 2>/dev/null || true; \
+    rm -rf /tmp/pg-tmp
+
+
+# Stage NVIDIA Container Toolkit + libnvidia-container from NVIDIA's apt repo.
+# k3s auto-registers a "nvidia" containerd runtime at startup if
+# `nvidia-container-runtime` is on PATH in the final image.
+FROM debian:bookworm-slim AS nvidia-toolkit
+
+ARG TARGETARCH
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        curl gnupg ca-certificates && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
+        | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
+    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
+        | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
+        > /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        nvidia-container-toolkit-base \
+        libnvidia-container1 \
+        libnvidia-container-tools && \
+    rm -rf /var/lib/apt/lists/*
+
+# Collect binaries, their shared-lib deps, and the dynamic linker so they run
+# inside the minimal rancher/k3s image (which has no libc of its own). Also
+# stage /sbin/ldconfig — the nvidia-container-runtime OCI hook bind-mounts it
+# into workload pods to refresh /etc/ld.so.cache, so it must exist on the node.
+RUN set -ex; \
+    mkdir -p /nvidia-staging/bin /nvidia-staging/lib /nvidia-staging/sbin; \
+    for bin in nvidia-ctk nvidia-container-runtime nvidia-container-runtime.cdi \
+               nvidia-container-runtime.legacy nvidia-container-cli; do \
+        [ -f "/usr/bin/$bin" ] && cp -a "/usr/bin/$bin" /nvidia-staging/bin/ || true; \
+    done; \
+    for bin in /nvidia-staging/bin/*; do \
+        ldd "$bin" 2>/dev/null | grep "=>" | awk '{print $3}' | while read lib; do \
+            [ -f "$lib" ] && cp -n "$lib" /nvidia-staging/lib/ 2>/dev/null || true; \
+        done; \
+    done; \
+    cp /lib64/ld-linux-x86-64.so.2 /nvidia-staging/lib/ 2>/dev/null || true; \
+    cp /lib/ld-linux-aarch64.so.1 /nvidia-staging/lib/ 2>/dev/null || true; \
+    cp /sbin/ldconfig /nvidia-staging/sbin/ldconfig
+
+
+FROM rancher/k3s:v1.34.6-k3s1
+
+ARG TARGETARCH
+
+ARG FLYTE_DEMO_VERSION
+ENV FLYTE_DEMO_VERSION "${FLYTE_DEMO_VERSION}"
+
+COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/
+COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/
+COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/
+COPY bin/ /bin/
+
+# Install bootstrap and embedded postgres
+COPY --from=bootstrap /flyteorg/build/dist/flyte-demo-bootstrap /bin/
+COPY --from=bootstrap /flyteorg/build/dist/embedded-postgres /bin/
+
+# Install pre-cached PostgreSQL binaries and glibc libraries
+COPY --from=pg-cache /cache/ /var/cache/embedded-postgres/
+COPY --from=pg-cache /glibc-libs/ /usr/lib/pg-glibc/
+
+# Install NVIDIA Container Toolkit binaries + supporting libs. The libs go
+# into a default linker search path (/usr/lib/x86_64-linux-gnu) because the
+# nvidia-ctk OCI hook is invoked by containerd without inheriting
+# LD_LIBRARY_PATH. The statically-linked ldconfig at /sbin/ldconfig is
+# required by the toolkit's update-ldcache hook.
+COPY --from=nvidia-toolkit /nvidia-staging/bin/ /usr/bin/
+COPY --from=nvidia-toolkit /nvidia-staging/lib/ /usr/lib/nvidia/
+COPY --from=nvidia-toolkit /nvidia-staging/sbin/ldconfig /sbin/ldconfig
+RUN ARCH_TRIPLE=$([ "$(uname -m)" = "aarch64" ] && echo "aarch64-linux-gnu" || echo "x86_64-linux-gnu") && \
+    mkdir -p "/usr/lib/${ARCH_TRIPLE}" && \
+    cp -a /usr/lib/nvidia/*.so* "/usr/lib/${ARCH_TRIPLE}/"  # required copy: no '|| true', a failure here must fail the build
+
+# NVIDIA device-plugin DaemonSet + RuntimeClass (auto-applied by k3s at startup).
+COPY nvidia-device-plugin.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin.yaml
+
+# k3s reads this template at startup to generate containerd's config.
+# Sets nvidia as the default runtime so GPU pods don't need runtimeClassName.
+COPY containerd-config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
+
+# Create dynamic linker symlinks and add postgres user/group
+RUN for f in /usr/lib/pg-glibc/ld-linux-aarch64*; do \
+      [ -f "$f" ] && ln -sf "$f" /lib/$(basename "$f"); \
+    done 2>/dev/null; \
+    for f in /usr/lib/pg-glibc/ld-linux-x86-64*; do \
+      [ -f "$f" ] && mkdir -p /lib64 && ln -sf "$f" /lib64/$(basename "$f"); \
+    done 2>/dev/null; \
+    echo "postgres:x:999:999:PostgreSQL:/tmp:/bin/sh" >> /etc/passwd && \
+    echo "postgres:x:999:" >> /etc/group
+
+# Expose pg-glibc + nvidia libs to the dynamic linker.
+ENV LD_LIBRARY_PATH="/usr/lib/pg-glibc:/usr/lib/nvidia"
+
+# Propagate host GPUs into containers scheduled on this node. These env vars
+# are consumed by nvidia-container-runtime.
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+
+VOLUME /var/lib/flyte/storage
+
+# Set environment variable for picking up additional CA certificates
+ENV SSL_CERT_DIR /var/lib/flyte/config/ca-certificates
+
+ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ]
+CMD [ "server", "--disable=servicelb", "--disable=metrics-server" ]
diff --git a/docker/demo-bundled/Makefile b/docker/demo-bundled/Makefile
index 40ae40005a..952390ac73 100644
--- a/docker/demo-bundled/Makefile
+++ b/docker/demo-bundled/Makefile
@@ -60,6 +60,12 @@ build: sync-crds flyte dep_update manifests
 	docker buildx build --builder flyte-demo --allow security.insecure --load \
 		--tag flyte-demo:latest .
 
+.PHONY: build-gpu
+build-gpu: sync-crds flyte dep_update manifests
+	docker buildx build --builder flyte-demo --allow security.insecure --load \
+		--file Dockerfile.gpu \
+		--tag flyte-demo:gpu-latest .
+
 # Port map
 # 6443 - k8s API server
 # 30000 - Docker Registry
diff --git a/docker/demo-bundled/containerd-config.toml.tmpl b/docker/demo-bundled/containerd-config.toml.tmpl
new file mode 100644
index 0000000000..7cda384aa0
--- /dev/null
+++ b/docker/demo-bundled/containerd-config.toml.tmpl
@@ -0,0 +1,10 @@
+{{ template "base" . }}
+
+# Override: make the NVIDIA runtime the default. k3s auto-registers a
+# `nvidia` runtime at startup when /usr/bin/nvidia-container-runtime is
+# present. By switching the default, pods requesting `nvidia.com/gpu` get
+# GPU access without needing `runtimeClassName: nvidia` in their spec.
+# nvidia-container-runtime is a passthrough when no GPU is requested, so
+# non-GPU pods are unaffected.
+[plugins.'io.containerd.cri.v1.runtime'.containerd]
+  default_runtime_name = "nvidia"
diff --git a/docker/demo-bundled/nvidia-device-plugin.yaml b/docker/demo-bundled/nvidia-device-plugin.yaml
new file mode 100644
index 0000000000..f0bbfcdb01
--- /dev/null
+++ b/docker/demo-bundled/nvidia-device-plugin.yaml
@@ -0,0 +1,45 @@
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia  # name that pods reference via `runtimeClassName: nvidia`
+handler: nvidia  # runtime handler this class maps to; NOTE(review): must match the runtime name k3s registers — confirm
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds  # must equal the pod template label below
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      tolerations:
+        - key: nvidia.com/gpu  # tolerate a nvidia.com/gpu NoSchedule taint so the plugin still lands on tainted nodes
+          operator: Exists
+          effect: NoSchedule
+      priorityClassName: system-node-critical
+      runtimeClassName: nvidia  # run the plugin itself under the RuntimeClass declared at the top of this manifest
+      containers:
+        - name: nvidia-device-plugin-ctr
+          image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
+          env:
+            - name: FAIL_ON_INIT_ERROR
+              value: "false"  # NOTE(review): presumably keeps the plugin from exiting on hosts without a GPU — confirm against plugin docs
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]  # container runs with no Linux capabilities
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins  # same path as the hostPath volume below
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins  # NOTE(review): presumably the kubelet device-plugin socket directory — confirm

From 7dee3ac0a1fff134a0e1d495aab83c10d24c89c5 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 00:22:53 +0000
Subject: [PATCH 2/6] Refactor Dockerfile.gpu to layer on the CPU image

Removes the duplicated builder/bootstrap/pg-cache stages and final-stage
setup by making Dockerfile.gpu a thin layer on top of flyte-demo:latest
(parameterized via ARG BASE_IMAGE). CI now builds the CPU image first
and passes its sha-tag in as BASE_IMAGE to the GPU build.

- Dockerfile.gpu shrinks from ~165 to ~75 lines; inherits flyte-binary,
  embedded postgres, staging manifests, and k3d entrypoint from the
  base image unchanged.
- Makefile build-gpu target now depends on build (not the full prereq
  chain) and passes BASE_IMAGE=flyte-demo:latest.
- CI gates the GPU build on push/workflow_dispatch since PR builds
  don't push the CPU image to ghcr.io (nothing to pull for BASE_IMAGE).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/flyte-binary-v2.yml |   7 +-
 docker/demo-bundled/Dockerfile.gpu    | 119 ++++----------------------
 docker/demo-bundled/Makefile          |   3 +-
 3 files changed, 25 insertions(+), 104 deletions(-)

diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index efb99c226b..c34ad5a577 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -179,6 +179,7 @@ jobs:
             type=raw,value=gpu-nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
             type=sha,format=long,prefix=gpu-
       - name: Build and push GPU multi-arch image
+        if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: docker/demo-bundled
@@ -186,5 +187,7 @@ jobs:
           allow: "security.insecure"
           platforms: linux/arm64, linux/amd64
           tags: ${{ steps.gpu-image-names.outputs.tags }}
-          build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
-          push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+          build-args: |
+            FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}
+            BASE_IMAGE=ghcr.io/${{ github.repository_owner }}/flyte-demo:sha-${{ github.sha }}
+          push: true
diff --git a/docker/demo-bundled/Dockerfile.gpu b/docker/demo-bundled/Dockerfile.gpu
index acaa4de582..eaae887653 100644
--- a/docker/demo-bundled/Dockerfile.gpu
+++ b/docker/demo-bundled/Dockerfile.gpu
@@ -1,64 +1,17 @@
 # syntax=docker/dockerfile:1.4-labs
-FROM --platform=${BUILDPLATFORM} mgoltzsche/podman:minimal AS builder
+#
+# GPU-capable demo cluster image. Layers NVIDIA Container Toolkit + the
+# k8s device-plugin on top of the CPU demo image so everything that ships
+# in the base (flyte-binary, embedded postgres, auto-apply manifests) is
+# inherited verbatim. CI builds the CPU image first and passes its tag in
+# via BASE_IMAGE.
 
-ARG TARGETARCH
-ENV TARGETARCH "${TARGETARCH}"
-
-WORKDIR /build
-
-COPY images/manifest.txt images/preload ./
-RUN --security=insecure ./preload manifest.txt
-
-
-FROM --platform=${BUILDPLATFORM} golang:1.24-bullseye AS bootstrap
-
-ARG TARGETARCH
-ENV CGO_ENABLED 0
-ENV GOARCH "${TARGETARCH}"
-ENV GOOS linux
-
-WORKDIR /flyteorg/build
-COPY bootstrap/go.mod bootstrap/go.sum ./
-RUN go mod download
-COPY bootstrap/ ./
-RUN --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/root/go/pkg/mod \
-    go build -o dist/flyte-demo-bootstrap cmd/bootstrap/main.go && \
-    go build -o dist/embedded-postgres cmd/embedded-postgres/main.go
-
-
-# Pre-download PostgreSQL binaries from Maven for the embedded-postgres library.
-# The library expects a cached .txz file; we also extract the dynamic linker since
-# the K3s base image has no libc.
-FROM debian:bookworm-slim AS pg-cache
-
-ARG TARGETARCH
-
-RUN apt-get update && apt-get install -y --no-install-recommends curl unzip xz-utils ca-certificates && rm -rf /var/lib/apt/lists/*
-
-RUN set -ex; \
-    ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64v8" || echo "amd64"); \
-    PG_VERSION="16.9.0"; \
-    CACHE_FILE="embedded-postgres-binaries-linux-${ARCH}-${PG_VERSION}.txz"; \
-    mkdir -p /cache /glibc-libs; \
-    curl -fL "https://repo1.maven.org/maven2/io/zonky/test/postgres/embedded-postgres-binaries-linux-${ARCH}/${PG_VERSION}/embedded-postgres-binaries-linux-${ARCH}-${PG_VERSION}.jar" -o /tmp/pg.jar; \
-    unzip -p /tmp/pg.jar "*.txz" > "/cache/${CACHE_FILE}"; \
-    rm -f /tmp/pg.jar; \
-    mkdir -p /tmp/pg-tmp && tar xJf "/cache/${CACHE_FILE}" -C /tmp/pg-tmp/; \
-    for bin in /tmp/pg-tmp/bin/*; do \
-        ldd "$bin" 2>/dev/null | grep "=>" | awk '{print $3}' | while read lib; do \
-            [ -f "$lib" ] && cp -n "$lib" /glibc-libs/ 2>/dev/null || true; \
-        done; \
-    done; \
-    cp /lib/ld-linux-aarch64.so.1 /glibc-libs/ 2>/dev/null || true; \
-    cp /lib/aarch64-linux-gnu/ld-linux-aarch64.so.1 /glibc-libs/ 2>/dev/null || true; \
-    cp /lib64/ld-linux-x86-64.so.2 /glibc-libs/ 2>/dev/null || true; \
-    cp /lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /glibc-libs/ 2>/dev/null || true; \
-    rm -rf /tmp/pg-tmp
+ARG BASE_IMAGE=ghcr.io/flyteorg/flyte-demo:nightly
 
 
-# Stage NVIDIA Container Toolkit + libnvidia-container from NVIDIA's apt repo.
-# k3s auto-registers a "nvidia" containerd runtime at startup if
-# `nvidia-container-runtime` is on PATH in the final image.
+# Stage NVIDIA Container Toolkit binaries + supporting libs + ldconfig.
+# k3s auto-registers a `nvidia` containerd runtime at startup if
+# /usr/bin/nvidia-container-runtime is on PATH in the final image.
 FROM debian:bookworm-slim AS nvidia-toolkit
 
 ARG TARGETARCH
@@ -80,9 +33,9 @@ RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
     rm -rf /var/lib/apt/lists/*
 
 # Collect binaries, their shared-lib deps, and the dynamic linker so they run
-# inside the minimal rancher/k3s image (which has no libc of its own). Also
-# stage /sbin/ldconfig — the nvidia-container-runtime OCI hook bind-mounts it
-# into workload pods to refresh /etc/ld.so.cache, so it must exist on the node.
+# inside the minimal rancher/k3s base (no libc of its own). Also stage
+# /sbin/ldconfig — the toolkit's update-ldcache OCI hook bind-mounts it
+# into workload pods.
 RUN set -ex; \
     mkdir -p /nvidia-staging/bin /nvidia-staging/lib /nvidia-staging/sbin; \
     for bin in nvidia-ctk nvidia-container-runtime nvidia-container-runtime.cdi \
@@ -99,31 +52,12 @@ RUN set -ex; \
     cp /sbin/ldconfig /nvidia-staging/sbin/ldconfig
 
 
-FROM rancher/k3s:v1.34.6-k3s1
-
-ARG TARGETARCH
-
-ARG FLYTE_DEMO_VERSION
-ENV FLYTE_DEMO_VERSION "${FLYTE_DEMO_VERSION}"
-
-COPY --from=builder /build/images/ /var/lib/rancher/k3s/agent/images/
-COPY images/tar/${TARGETARCH}/ /var/lib/rancher/k3s/agent/images/
-COPY manifests/ /var/lib/rancher/k3s/server/manifests-staging/
-COPY bin/ /bin/
-
-# Install bootstrap and embedded postgres
-COPY --from=bootstrap /flyteorg/build/dist/flyte-demo-bootstrap /bin/
-COPY --from=bootstrap /flyteorg/build/dist/embedded-postgres /bin/
-
-# Install pre-cached PostgreSQL binaries and glibc libraries
-COPY --from=pg-cache /cache/ /var/cache/embedded-postgres/
-COPY --from=pg-cache /glibc-libs/ /usr/lib/pg-glibc/
+FROM ${BASE_IMAGE}
 
 # Install NVIDIA Container Toolkit binaries + supporting libs. The libs go
-# into a default linker search path (/usr/lib/x86_64-linux-gnu) because the
+# into a default linker search path (/usr/lib/<arch-triple>/) because the
 # nvidia-ctk OCI hook is invoked by containerd without inheriting
-# LD_LIBRARY_PATH. The statically-linked ldconfig at /sbin/ldconfig is
-# required by the toolkit's update-ldcache hook.
+# LD_LIBRARY_PATH.
 COPY --from=nvidia-toolkit /nvidia-staging/bin/ /usr/bin/
 COPY --from=nvidia-toolkit /nvidia-staging/lib/ /usr/lib/nvidia/
 COPY --from=nvidia-toolkit /nvidia-staging/sbin/ldconfig /sbin/ldconfig
@@ -138,28 +72,11 @@ COPY nvidia-device-plugin.yaml /var/lib/rancher/k3s/server/manifests/nvidia-devi
 # Sets nvidia as the default runtime so GPU pods don't need runtimeClassName.
 COPY containerd-config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
 
-# Create dynamic linker symlinks and add postgres user/group
-RUN for f in /usr/lib/pg-glibc/ld-linux-aarch64*; do \
-      [ -f "$f" ] && ln -sf "$f" /lib/$(basename "$f"); \
-    done 2>/dev/null; \
-    for f in /usr/lib/pg-glibc/ld-linux-x86-64*; do \
-      [ -f "$f" ] && mkdir -p /lib64 && ln -sf "$f" /lib64/$(basename "$f"); \
-    done 2>/dev/null; \
-    echo "postgres:x:999:999:PostgreSQL:/tmp:/bin/sh" >> /etc/passwd && \
-    echo "postgres:x:999:" >> /etc/group
-
-# Expose pg-glibc + nvidia libs to the dynamic linker.
+# Re-declare LD_LIBRARY_PATH with /usr/lib/nvidia added. ENV replaces the base
+# image's value (/usr/lib/pg-glibc for embedded postgres), so it is repeated here.
 ENV LD_LIBRARY_PATH="/usr/lib/pg-glibc:/usr/lib/nvidia"
 
 # Propagate host GPUs into containers scheduled on this node. These env vars
 # are consumed by nvidia-container-runtime.
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-
-VOLUME /var/lib/flyte/storage
-
-# Set environment variable for picking up additional CA certificates
-ENV SSL_CERT_DIR /var/lib/flyte/config/ca-certificates
-
-ENTRYPOINT [ "/bin/k3d-entrypoint.sh" ]
-CMD [ "server", "--disable=servicelb", "--disable=metrics-server" ]
diff --git a/docker/demo-bundled/Makefile b/docker/demo-bundled/Makefile
index 952390ac73..a1cd662907 100644
--- a/docker/demo-bundled/Makefile
+++ b/docker/demo-bundled/Makefile
@@ -61,9 +61,10 @@ build: sync-crds flyte dep_update manifests
 		--tag flyte-demo:latest .
 
 .PHONY: build-gpu
-build-gpu: sync-crds flyte dep_update manifests
+build-gpu: build
 	docker buildx build --builder flyte-demo --allow security.insecure --load \
 		--file Dockerfile.gpu \
+		--build-arg BASE_IMAGE=flyte-demo:latest \
 		--tag flyte-demo:gpu-latest .
 
 # Port map

From c0c552e33fd57fe08194521e6b813e1963efd5a6 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 00:30:50 +0000
Subject: [PATCH 3/6] ci: match GPU build step's push condition to CPU step

Drops the `if:` gate and conditions `push:` on the same expression the
CPU build uses, so both steps always build and only push on v2-branch
pushes or workflow_dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/flyte-binary-v2.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index c34ad5a577..ae5a186ad0 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -179,7 +179,6 @@ jobs:
             type=raw,value=gpu-nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
             type=sha,format=long,prefix=gpu-
       - name: Build and push GPU multi-arch image
-        if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: docker/demo-bundled
@@ -190,4 +189,4 @@ jobs:
           build-args: |
             FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}
             BASE_IMAGE=ghcr.io/${{ github.repository_owner }}/flyte-demo:sha-${{ github.sha }}
-          push: true
+          push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}

From 7252dc2b9856d9efddc63f95d99567eb0506e1d2 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 07:54:27 +0000
Subject: [PATCH 4/6] ci: hand CPU image to GPU build via OCI archive

On pull_request events the CPU build step runs with push=false, so the
GPU build's FROM ghcr.io/.../flyte-demo:sha-<sha> fails to resolve
(image not found in the registry). Fix by producing an OCI archive of
the CPU image locally and passing it to the GPU build as a named build
context (build-contexts: base=oci-layout://...) with BASE_IMAGE=base.

Registry push happens in a separate step that only runs on push /
workflow_dispatch, so PR builds no longer need ghcr credentials for
the GPU step.
---
 .github/workflows/flyte-binary-v2.yml | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index ae5a186ad0..75aca89679 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -158,7 +158,24 @@ jobs:
           registry: ghcr.io
           username: "${{ secrets.FLYTE_BOT_USERNAME }}"
           password: "${{ secrets.FLYTE_BOT_PAT }}"
-      - name: Build and push multi-arch image
+      - name: Build CPU multi-arch image to OCI archive
+        # Produce an OCI archive locally so the GPU build below can use it as a
+        # named build context. This avoids the PR-gated push chicken-and-egg:
+        # on pull_request events we don't push to ghcr, so the GPU build can't
+        # resolve a ghcr-hosted FROM.
+        uses: docker/build-push-action@v6
+        with:
+          context: docker/demo-bundled
+          allow: "security.insecure"
+          platforms: linux/arm64, linux/amd64
+          build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
+          outputs: type=oci,dest=/tmp/cpu-oci.tar
+      - name: Extract CPU OCI layout for GPU build
+        run: |
+          mkdir -p /tmp/cpu-oci
+          tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
+      - name: Push CPU multi-arch image
+        if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
         uses: docker/build-push-action@v6
         with:
           context: docker/demo-bundled
@@ -166,7 +183,7 @@ jobs:
           platforms: linux/arm64, linux/amd64
           tags: ${{ steps.image-names.outputs.tags }}
           build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
-          push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+          push: true
       - name: Prepare GPU Image Names
         id: gpu-image-names
         uses: docker/metadata-action@v3
@@ -183,10 +200,13 @@ jobs:
         with:
           context: docker/demo-bundled
           file: docker/demo-bundled/Dockerfile.gpu
+          # Point Dockerfile.gpu's `FROM ${BASE_IMAGE}` at the OCI archive
+          # produced above — no registry round-trip needed.
+          build-contexts: base=oci-layout:///tmp/cpu-oci
           allow: "security.insecure"
           platforms: linux/arm64, linux/amd64
           tags: ${{ steps.gpu-image-names.outputs.tags }}
           build-args: |
             FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}
-            BASE_IMAGE=ghcr.io/${{ github.repository_owner }}/flyte-demo:sha-${{ github.sha }}
+            BASE_IMAGE=base
           push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}

From 2885f57b5ccef695890bd82295f578b9c6027795 Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@gmail.com>
Date: Tue, 21 Apr 2026 08:01:35 +0000
Subject: [PATCH 5/6] ci: cache docker layers for demo-bundled builds

Add GHA cache (type=gha) to the three docker/build-push-action steps in
build-and-push-demo-bundled-image. CPU archive and CPU push share the
demo-cpu scope so the push reuses layers from the archive build; GPU
gets its own demo-gpu scope.
---
 .github/workflows/flyte-binary-v2.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/flyte-binary-v2.yml b/.github/workflows/flyte-binary-v2.yml
index 75aca89679..fbdea50bc2 100644
--- a/.github/workflows/flyte-binary-v2.yml
+++ b/.github/workflows/flyte-binary-v2.yml
@@ -170,6 +170,8 @@ jobs:
           platforms: linux/arm64, linux/amd64
           build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
           outputs: type=oci,dest=/tmp/cpu-oci.tar
+          cache-from: type=gha,scope=demo-cpu
+          cache-to: type=gha,mode=max,scope=demo-cpu
       - name: Extract CPU OCI layout for GPU build
         run: |
           mkdir -p /tmp/cpu-oci
@@ -184,6 +186,7 @@ jobs:
           tags: ${{ steps.image-names.outputs.tags }}
           build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
           push: true
+          cache-from: type=gha,scope=demo-cpu
       - name: Prepare GPU Image Names
         id: gpu-image-names
         uses: docker/metadata-action@v3
@@ -210,3 +213,5 @@ jobs:
             FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}
             BASE_IMAGE=base
           push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+          cache-from: type=gha,scope=demo-gpu
+          cache-to: type=gha,mode=max,scope=demo-gpu

From 65d5095e37468bb7d755c89f019919eb8b357b4e Mon Sep 17 00:00:00 2001
From: Kevin Su <pingsutw@apache.org>
Date: Tue, 21 Apr 2026 01:29:51 -0700
Subject: [PATCH 6/6] Bump Dockerfile.gpu syntax to 1.7-labs for oci-layout
 build context

The oci-layout:// build-context source requires Dockerfile frontend 1.5+.
CI was failing with 'unsupported context source oci-layout for base'.

Signed-off-by: Kevin Su <pingsutw@apache.org>
---
 docker/demo-bundled/Dockerfile.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/demo-bundled/Dockerfile.gpu b/docker/demo-bundled/Dockerfile.gpu
index eaae887653..a6bba98fd7 100644
--- a/docker/demo-bundled/Dockerfile.gpu
+++ b/docker/demo-bundled/Dockerfile.gpu
@@ -1,4 +1,4 @@
-# syntax=docker/dockerfile:1.4-labs
+# syntax=docker/dockerfile:1.7-labs
 #
 # GPU-capable demo cluster image. Layers NVIDIA Container Toolkit + the
 # k8s device-plugin on top of the CPU demo image so everything that ships