50 changes: 49 additions & 1 deletion .github/workflows/flyte-binary-v2.yml
@@ -158,12 +158,60 @@ jobs:
          registry: ghcr.io
          username: "${{ secrets.FLYTE_BOT_USERNAME }}"
          password: "${{ secrets.FLYTE_BOT_PAT }}"
      - name: Build and push multi-arch image
      - name: Build CPU multi-arch image to OCI archive
        # Produce an OCI archive locally so the GPU build below can use it as a
        # named build context. This avoids the PR-gated push chicken-and-egg:
        # on pull_request events we don't push to ghcr, so the GPU build can't
        # resolve a ghcr-hosted FROM.
        uses: docker/build-push-action@v6
        with:
          context: docker/demo-bundled
          allow: "security.insecure"
          platforms: linux/arm64, linux/amd64
          build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
          outputs: type=oci,dest=/tmp/cpu-oci.tar
          cache-from: type=gha,scope=demo-cpu
          cache-to: type=gha,mode=max,scope=demo-cpu
      - name: Extract CPU OCI layout for GPU build
        run: |
          mkdir -p /tmp/cpu-oci
          tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
      - name: Push CPU multi-arch image
        if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
        uses: docker/build-push-action@v6
        with:
          context: docker/demo-bundled
          allow: "security.insecure"
          platforms: linux/arm64, linux/amd64
          tags: ${{ steps.image-names.outputs.tags }}
          build-args: "FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}"
          push: true
          cache-from: type=gha,scope=demo-cpu
      - name: Prepare GPU Image Names
        id: gpu-image-names
        uses: docker/metadata-action@v3
        with:
          images: |
            ghcr.io/${{ github.repository_owner }}/flyte-demo
            ghcr.io/${{ github.repository_owner }}/flyte-sandbox-v2
          tags: |
            type=raw,value=gpu-latest,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
            type=raw,value=gpu-nightly,enable=${{ github.event_name == 'push' && github.ref == 'refs/heads/v2' }}
            type=sha,format=long,prefix=gpu-
      - name: Build and push GPU multi-arch image
        uses: docker/build-push-action@v6
        with:
          context: docker/demo-bundled
          file: docker/demo-bundled/Dockerfile.gpu
          # Point Dockerfile.gpu's `FROM ${BASE_IMAGE}` at the OCI archive
          # produced above — no registry round-trip needed.
          build-contexts: base=oci-layout:///tmp/cpu-oci
          allow: "security.insecure"
          platforms: linux/arm64, linux/amd64
          tags: ${{ steps.gpu-image-names.outputs.tags }}
          build-args: |
            FLYTE_DEMO_VERSION=${{ env.FLYTE_DEMO_VERSION }}
            BASE_IMAGE=base
          push: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
          cache-from: type=gha,scope=demo-gpu
          cache-to: type=gha,mode=max,scope=demo-gpu
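If the OCI-layout hand-off ever needs debugging outside CI, roughly the same flow can be reproduced with plain buildx. This is a sketch, not a supported invocation: it assumes the Makefile's `flyte-demo` builder (which carries the security.insecure entitlement) already exists, and it omits the FLYTE_DEMO_VERSION build-arg, falling back to whatever the Dockerfile defaults to.

cd docker/demo-bundled
# Build the CPU image into an OCI archive and unpack it as an OCI layout.
docker buildx build --builder flyte-demo --allow security.insecure \
  --output type=oci,dest=/tmp/cpu-oci.tar .
mkdir -p /tmp/cpu-oci && tar -xf /tmp/cpu-oci.tar -C /tmp/cpu-oci
# Point the GPU build's `base` context at that layout, mirroring the workflow's
# build-contexts + BASE_IMAGE=base wiring.
docker buildx build --builder flyte-demo --allow security.insecure --load \
  --file Dockerfile.gpu \
  --build-context base=oci-layout:///tmp/cpu-oci \
  --build-arg BASE_IMAGE=base \
  --tag flyte-demo:gpu-local .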
82 changes: 82 additions & 0 deletions docker/demo-bundled/Dockerfile.gpu
@@ -0,0 +1,82 @@
# syntax=docker/dockerfile:1.7-labs
#
# GPU-capable demo cluster image. Layers NVIDIA Container Toolkit + the
# k8s device-plugin on top of the CPU demo image so everything that ships
# in the base (flyte-binary, embedded postgres, auto-apply manifests) is
# inherited verbatim. CI builds the CPU image first and passes its tag in
# via BASE_IMAGE.

ARG BASE_IMAGE=ghcr.io/flyteorg/flyte-demo:nightly


# Stage NVIDIA Container Toolkit binaries + supporting libs + ldconfig.
# k3s auto-registers a `nvidia` containerd runtime at startup if
# /usr/bin/nvidia-container-runtime is on PATH in the final image.
FROM debian:bookworm-slim AS nvidia-toolkit

ARG TARGETARCH

RUN apt-get update && apt-get install -y --no-install-recommends \
        curl gnupg ca-certificates && \
    rm -rf /var/lib/apt/lists/*

RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
        | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && \
    curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
        | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
        > /etc/apt/sources.list.d/nvidia-container-toolkit.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        nvidia-container-toolkit-base \
        libnvidia-container1 \
        libnvidia-container-tools && \
    rm -rf /var/lib/apt/lists/*

# Collect binaries, their shared-lib deps, and the dynamic linker so they run
# inside the minimal rancher/k3s base (no libc of its own). Also stage
# /sbin/ldconfig — the toolkit's update-ldcache OCI hook bind-mounts it
# into workload pods.
RUN set -ex; \
    mkdir -p /nvidia-staging/bin /nvidia-staging/lib /nvidia-staging/sbin; \
    for bin in nvidia-ctk nvidia-container-runtime nvidia-container-runtime.cdi \
               nvidia-container-runtime.legacy nvidia-container-cli; do \
        [ -f "/usr/bin/$bin" ] && cp -a "/usr/bin/$bin" /nvidia-staging/bin/ || true; \
    done; \
    for bin in /nvidia-staging/bin/*; do \
        ldd "$bin" 2>/dev/null | grep "=>" | awk '{print $3}' | while read lib; do \
            [ -f "$lib" ] && cp -n "$lib" /nvidia-staging/lib/ 2>/dev/null || true; \
        done; \
    done; \
    cp /lib64/ld-linux-x86-64.so.2 /nvidia-staging/lib/ 2>/dev/null || true; \
    cp /lib/ld-linux-aarch64.so.1 /nvidia-staging/lib/ 2>/dev/null || true; \
    cp /sbin/ldconfig /nvidia-staging/sbin/ldconfig


FROM ${BASE_IMAGE}

# Install NVIDIA Container Toolkit binaries + supporting libs. The libs go
# into a default linker search path (/usr/lib/<arch-triple>/) because the
# nvidia-ctk OCI hook is invoked by containerd without inheriting
# LD_LIBRARY_PATH.
COPY --from=nvidia-toolkit /nvidia-staging/bin/ /usr/bin/
COPY --from=nvidia-toolkit /nvidia-staging/lib/ /usr/lib/nvidia/
COPY --from=nvidia-toolkit /nvidia-staging/sbin/ldconfig /sbin/ldconfig
RUN ARCH_TRIPLE=$([ "$(uname -m)" = "aarch64" ] && echo "aarch64-linux-gnu" || echo "x86_64-linux-gnu") && \
    mkdir -p "/usr/lib/${ARCH_TRIPLE}" && \
    cp -a /usr/lib/nvidia/*.so* "/usr/lib/${ARCH_TRIPLE}/" 2>/dev/null || true

# NVIDIA device-plugin DaemonSet + RuntimeClass (auto-applied by k3s at startup).
COPY nvidia-device-plugin.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin.yaml

# k3s reads this template at startup to generate containerd's config.
# Sets nvidia as the default runtime so GPU pods don't need runtimeClassName.
COPY containerd-config.toml.tmpl /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

# Append nvidia libs to the base image's LD_LIBRARY_PATH (which already
# includes /usr/lib/pg-glibc for embedded postgres).
ENV LD_LIBRARY_PATH="/usr/lib/pg-glibc:/usr/lib/nvidia"

# Propagate host GPUs into containers scheduled on this node. These env vars
# are consumed by nvidia-container-runtime.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
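A quick, GPU-free way to check that the staged toolkit binaries resolve their copied libraries inside the final image is to run one of them directly. A rough sketch, assuming the `flyte-demo:gpu-latest` tag produced by the Makefile target below and that overriding the k3s image's entrypoint this way is acceptable for a one-off check:

docker run --rm --entrypoint /usr/bin/nvidia-ctk flyte-demo:gpu-latest --version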
7 changes: 7 additions & 0 deletions docker/demo-bundled/Makefile
@@ -60,6 +60,13 @@ build: sync-crds flyte dep_update manifests
	docker buildx build --builder flyte-demo --allow security.insecure --load \
		--tag flyte-demo:latest .

.PHONY: build-gpu
build-gpu: build
	docker buildx build --builder flyte-demo --allow security.insecure --load \
		--file Dockerfile.gpu \
		--build-arg BASE_IMAGE=flyte-demo:latest \
		--tag flyte-demo:gpu-latest .

# Port map
# 6443 - k8s API server
# 30000 - Docker Registry
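Local usage is just the new target; the `build` prerequisite means the CPU image is rebuilt first and the GPU layer goes on top of it:

make build-gpu    # produces flyte-demo:latest, then flyte-demo:gpu-latest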
10 changes: 10 additions & 0 deletions docker/demo-bundled/containerd-config.toml.tmpl
@@ -0,0 +1,10 @@
{{ template "base" . }}

# Override: make the NVIDIA runtime the default. k3s auto-registers a
# `nvidia` runtime at startup when /usr/bin/nvidia-container-runtime is
# present. By switching the default, pods requesting `nvidia.com/gpu` get
# GPU access without needing `runtimeClassName: nvidia` in their spec.
# nvidia-container-runtime is a passthrough when no GPU is requested, so
# non-GPU pods are unaffected.
[plugins.'io.containerd.cri.v1.runtime'.containerd]
default_runtime_name = "nvidia"
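To confirm the override took effect once the cluster is up: k3s renders this template into config.toml in the same directory, so the generated file should name nvidia as the default runtime. A sketch, with the container name left as a placeholder:

docker exec <demo-container> \
  grep default_runtime_name /var/lib/rancher/k3s/agent/etc/containerd/config.toml
# expected: default_runtime_name = "nvidia"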
45 changes: 45 additions & 0 deletions docker/demo-bundled/nvidia-device-plugin.yaml
@@ -0,0 +1,45 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      priorityClassName: system-node-critical
      runtimeClassName: nvidia
      containers:
      - name: nvidia-device-plugin-ctr
        image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
        env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
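With the device plugin registered and nvidia as containerd's default runtime, a GPU workload only needs a `nvidia.com/gpu` resource limit; no `runtimeClassName` is required. A minimal smoke-test sketch, where the CUDA image tag is an assumption and may need adjusting:

kubectl apply -f - <<'EOF'
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl logs pod/gpu-smoke-test   # should print the nvidia-smi table once the pod completes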