From f9dec4c375491ad938cf76502e0567991d6e795c Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Tue, 12 May 2026 09:33:33 +0200
Subject: [PATCH 1/6] Add CUDA Docker serving setup - Add CUDA 13 Dockerfile
 and Compose surface for ds4-server - Download selected GGUF weights at
 container startup - Persist weights and disk KV cache through configurable
 bind mounts - Document Docker usage, env vars, and CUDA compatibility notes -
 Move agent guidance into AGENTS.md

---
 .dockerignore         |  11 +++
 .env.example          |  55 +++++++++++++++
 .gitignore            |   1 +
 AGENT.md => AGENTS.md |   8 +++
 README.md             |   5 ++
 compose.yml           |  29 ++++++++
 docker/Dockerfile     |  48 +++++++++++++
 docker/README.md      | 161 ++++++++++++++++++++++++++++++++++++++++++
 docker/entrypoint.sh  |  54 ++++++++++++++
 9 files changed, 372 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .env.example
 rename AGENT.md => AGENTS.md (86%)
 create mode 100644 compose.yml
 create mode 100644 docker/Dockerfile
 create mode 100644 docker/README.md
 create mode 100644 docker/entrypoint.sh

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..e4af5ae2
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+.git
+gguf
+ds4flash.gguf
+ds4
+ds4-server
+ds4-bench
+ds4_test
+*.o
+*.dSYM
+misc
+TODO.md
diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..987d1111
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,55 @@
+# Common runtime settings
+
+# Model target passed to download_model.sh on container startup.
+# Supported: q2-imatrix, q4-imatrix, q2, q4, none
+DS4_MODEL=q2
+
+# Enable optional MTP speculative decoding support. Set to 1 to download and use
+# the MTP GGUF.
+DS4_ENABLE_MTP=1
+
+# Context size passed to ds4-server --ctx.
+DS4_CTX=100000
+
+# Disk KV cache budget passed to ds4-server --kv-disk-space-mb.
+DS4_KV_DISK_SPACE_MB=20480
+
+# Host port exposed by Docker Compose. The container always listens on 8000.
+DS4_PORT=8000
+
+# Host directory for persistent bind mounts. Compose uses the weights and
+# kv-cache subdirectories below this path.
+DS4_VOLUMES_HOST_DIR=./volumes
+
+# Optional Hugging Face token for model downloads.
+HF_TOKEN=
+
+
+# Less common runtime settings
+
+# MTP draft tokens passed to ds4-server --mtp-draft when DS4_ENABLE_MTP=1.
+DS4_MTP_DRAFT=2
+
+# Optional MTP margin passed to ds4-server --mtp-margin when set.
+DS4_MTP_MARGIN=
+
+# Optional CPU helper threads passed to ds4-server --threads when set.
+DS4_THREADS=
+
+# Extra ds4-server flags appended to the generated command.
+# Example: --quality --kv-cache-min-tokens 1024
+DS4_EXTRA_ARGS=
+
+
+# Build settings
+
+# NVIDIA CUDA image version. Keep this compatible with the host driver reported
+# by nvidia-smi. For hosts reporting CUDA Version: 13.0, use 13.0.x.
+CUDA_VERSION=13.0.3
+
+# Ubuntu suffix used by the NVIDIA CUDA images.
+UBUNTU_VERSION=24.04
+
+# Optional NVCC architecture passed through make CUDA_ARCH. Leave empty for the
+# default container build behavior.
+CUDA_ARCH=
diff --git a/.gitignore b/.gitignore
index 2c70e4d6..1a39c61f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 /ds4flash.gguf
 /TODO.md
 /gguf/
+/volumes/
 *.o
 *.dSYM/
 /misc/
diff --git a/AGENT.md b/AGENTS.md
similarity index 86%
rename from AGENT.md
rename to AGENTS.md
index ff9c395c..c88a4042 100644
--- a/AGENT.md
+++ b/AGENTS.md
@@ -43,6 +43,7 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`.
   tool-call mapping, disk KV cache policy.
 - `ds4_metal.m`: Objective-C Metal runtime and kernel wrappers.
 - `metal/*.metal`: compute kernels.
+- `docker/`: CUDA 13 Docker and Compose setup for serving `ds4-server` on Linux.
 - `tests/`: unit and live integration tests.
 - `misc/`: ignored notes, experiments, and old planning material.
 
@@ -51,3 +52,10 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`.
 Use `make` for build validation. Use `make test` for unit/regression tests when a
 model and Metal are available. Use live server tests only when intentionally
 testing the API surface.
+
+## Docker
+
+Docker-specific serving notes are documented in `docker/README.md`. The Docker
+setup targets Linux CUDA 13, mounts weights under `/models`, mounts disk KV cache
+under `/kv-cache`, downloads missing model weights during container startup, and
+serves `ds4-server` through Compose.
diff --git a/README.md b/README.md
index 5ea086b7..3206482f 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,11 @@ make
 select another supported GGUF from `./gguf/`. Run `./ds4 --help` and
 `./ds4-server --help` for the full flag list.
 
+For Linux CUDA container builds, see [`docker/README.md`](docker/README.md).
+The Compose setup builds `ds4-server` with CUDA 13, downloads the selected model
+into a weights volume on startup, and serves the HTTP API with a mounted disk KV
+cache volume.
+
 ## Speed
 
 These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 00000000..47f63e89
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,29 @@
+services:
+  ds4:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile
+      args:
+        CUDA_VERSION: ${CUDA_VERSION:-13.0.3}
+        UBUNTU_VERSION: ${UBUNTU_VERSION:-24.04}
+        CUDA_ARCH: ${CUDA_ARCH:-}
+    image: ds4:local
+    gpus: all
+    ports:
+      - "${DS4_PORT:-8000}:8000"
+    environment:
+      DS4_MODEL: ${DS4_MODEL:-q2}
+      DS4_ENABLE_MTP: ${DS4_ENABLE_MTP:-1}
+      DS4_CTX: ${DS4_CTX:-100000}
+      DS4_KV_DISK_SPACE_MB: ${DS4_KV_DISK_SPACE_MB:-20480}
+      DS4_PORT: 8000
+      DS4_HOST: 0.0.0.0
+      HF_TOKEN: ${HF_TOKEN:-}
+      DS4_MTP_DRAFT: ${DS4_MTP_DRAFT:-2}
+      DS4_MTP_MARGIN: ${DS4_MTP_MARGIN:-}
+      DS4_THREADS: ${DS4_THREADS:-}
+      DS4_EXTRA_ARGS: ${DS4_EXTRA_ARGS:-}
+    volumes:
+      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/weights:/models
+      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache:/kv-cache
+    restart: unless-stopped
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..65d82b37
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,48 @@
+ARG CUDA_VERSION=13.0.3
+ARG UBUNTU_VERSION=24.04
+
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        make \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+COPY . .
+
+ARG CUDA_ARCH=""
+RUN make ds4-server CUDA_ARCH="$CUDA_ARCH"
+
+FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY --from=build /src/ds4-server /app/ds4-server
+COPY download_model.sh /app/download_model.sh
+COPY docker/entrypoint.sh /usr/local/bin/ds4-entrypoint
+
+RUN chmod +x /app/download_model.sh /usr/local/bin/ds4-entrypoint \
+    && mkdir -p /models /kv-cache
+
+ENV DS4_GGUF_DIR=/models \
+    DS4_MODEL=q2-imatrix \
+    DS4_ENABLE_MTP=0 \
+    DS4_CTX=100000 \
+    DS4_KV_DISK_DIR=/kv-cache \
+    DS4_KV_DISK_SPACE_MB=8192 \
+    DS4_HOST=0.0.0.0 \
+    DS4_PORT=8000
+
+EXPOSE 8000
+VOLUME ["/models", "/kv-cache"]
+
+ENTRYPOINT ["/usr/local/bin/ds4-entrypoint"]
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 00000000..c05f1721
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,161 @@
+# Docker
+
+The Docker setup builds and serves `ds4-server` with the Linux CUDA backend. It
+does not build or use the macOS Metal backend.
+
+Warning: this Docker setup currently targets CUDA systems only. It is not a
+portable container path for Apple Silicon or other Metal-backed macOS systems,
+and it does not support the Metal backend.
+
+## Requirements
+
+- Docker with Compose v2
+- NVIDIA driver compatible with CUDA 13 containers
+- NVIDIA Container Toolkit configured for Docker GPU access
+- Enough disk space for the selected GGUF model and disk KV cache
+
+The default image uses CUDA 13:
+
+- Build stage: `nvidia/cuda:13.0.3-devel-ubuntu24.04`
+- Runtime stage: `nvidia/cuda:13.0.3-runtime-ubuntu24.04`
+
+The CUDA and Ubuntu image versions are build-time parameters. The defaults are
+`CUDA_VERSION=13.0.3` and `UBUNTU_VERSION=24.04`.
+
+## Start
+
+From the repository root:
+
+```sh
+docker compose up --build
+```
+
+On first startup the container downloads the selected model into the weights
+volume, then starts `ds4-server` on port `8000`.
+
+The server exposes the same API as the native binary, including:
+
+- `GET /v1/models`
+- `POST /v1/chat/completions`
+- `POST /v1/completions`
+- `POST /v1/messages`
+
+## Volumes
+
+Compose bind-mounts two directories under `DS4_VOLUMES_HOST_DIR`, which defaults
+to `./volumes` relative to the repository root:
+
+- `${DS4_VOLUMES_HOST_DIR:-./volumes}/weights` mounted at `/models` for GGUF weights
+- `${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache` mounted at `/kv-cache` for disk KV checkpoints
+
+The model downloader resumes partial downloads and skips files that are already
+present in `/models`.
+
+## Configuration
+
+Compose reads a root `.env` file by default for variable interpolation. These are
+the main knobs:
+
+```env
+DS4_MODEL=q2-imatrix
+DS4_ENABLE_MTP=0
+DS4_CTX=100000
+DS4_KV_DISK_SPACE_MB=8192
+DS4_MTP_DRAFT=2
+DS4_MTP_MARGIN=
+DS4_THREADS=
+DS4_EXTRA_ARGS=
+DS4_VOLUMES_HOST_DIR=./volumes
+HF_TOKEN=
+CUDA_VERSION=13.0.3
+UBUNTU_VERSION=24.04
+CUDA_ARCH=
+```
+
+`DS4_MODEL` is passed to `download_model.sh`. Supported values are:
+
+- `q2-imatrix`
+- `q4-imatrix`
+- `q2`
+- `q4`
+- `none`
+
+Use `none` only when you provide a model path yourself with `DS4_EXTRA_ARGS` or
+by overriding the container command.
+
+`DS4_ENABLE_MTP=1` downloads the optional MTP GGUF and starts the server with
+`--mtp /models/DeepSeek-V4-Flash-MTP-Q4K-Q8_0-F32.gguf`.
+
+`DS4_CTX` maps to `--ctx`.
+
+`DS4_KV_DISK_SPACE_MB` maps to `--kv-disk-space-mb`.
+
+`DS4_EXTRA_ARGS` is appended to the generated `ds4-server` command for advanced
+server flags such as `--quality` or cache policy tuning.
+
+`DS4_VOLUMES_HOST_DIR` selects the host directory used for persistent weights and
+disk KV cache bind mounts. The default is `./volumes`, with `weights` and
+`kv-cache` subdirectories below it.
+
+`CUDA_ARCH` is a build argument passed to `make`. Leave it empty for the default
+container build behavior, or set it when you need an explicit NVCC architecture.
+
+`CUDA_VERSION` and `UBUNTU_VERSION` select the NVIDIA CUDA base images used for
+both build and runtime stages. Keep `CUDA_VERSION` compatible with the host
+NVIDIA driver. A host where `nvidia-smi` reports `CUDA Version: 13.0` should use
+a CUDA `13.0.x` container, not `13.1.x`.
+
+## Examples
+
+Use the default q2 imatrix model:
+
+```sh
+docker compose up --build
+```
+
+Enable MTP and use a larger disk KV cache:
+
+```sh
+DS4_ENABLE_MTP=1 DS4_KV_DISK_SPACE_MB=32768 docker compose up --build
+```
+
+Use q4 imatrix weights:
+
+```sh
+DS4_MODEL=q4-imatrix docker compose up --build
+```
+
+Set a larger context window:
+
+```sh
+DS4_CTX=250000 docker compose up --build
+```
+
+Store weights and disk KV cache under a different host directory:
+
+```sh
+DS4_VOLUMES_HOST_DIR=/data/ds4 docker compose up --build
+```
+
+Pass extra server flags:
+
+```sh
+DS4_EXTRA_ARGS="--quality --kv-cache-min-tokens 1024" docker compose up --build
+```
+
+Build against a different compatible CUDA container version:
+
+```sh
+CUDA_VERSION=13.0.3 UBUNTU_VERSION=24.04 docker compose build
+```
+
+## Authentication
+
+Public downloads normally do not require authentication. If Hugging Face requires
+a token, set `HF_TOKEN` in the environment or in the root `.env` file.
+
+## Notes
+
+The first startup can take a long time because the q2 model is roughly 81 GB and
+the q4 model is roughly 153 GB. Keep the weights volume mounted so subsequent
+starts reuse the existing GGUF files.
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
new file mode 100644
index 00000000..73f021e6
--- /dev/null
+++ b/docker/entrypoint.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+set -eu
+
+DS4_GGUF_DIR=${DS4_GGUF_DIR:-/models}
+DS4_MODEL=${DS4_MODEL:-q2-imatrix}
+DS4_ENABLE_MTP=${DS4_ENABLE_MTP:-0}
+DS4_CTX=${DS4_CTX:-100000}
+DS4_KV_DISK_DIR=${DS4_KV_DISK_DIR:-/kv-cache}
+DS4_KV_DISK_SPACE_MB=${DS4_KV_DISK_SPACE_MB:-8192}
+DS4_HOST=${DS4_HOST:-0.0.0.0}
+DS4_PORT=${DS4_PORT:-8000}
+DS4_MTP_DRAFT=${DS4_MTP_DRAFT:-2}
+
+export DS4_GGUF_DIR
+
+mkdir -p "$DS4_GGUF_DIR" "$DS4_KV_DISK_DIR"
+
+if [ -n "$DS4_MODEL" ] && [ "$DS4_MODEL" != "none" ]; then
+    /app/download_model.sh "$DS4_MODEL"
+fi
+
+set -- \
+    --host "$DS4_HOST" \
+    --port "$DS4_PORT" \
+    --ctx "$DS4_CTX" \
+    --kv-disk-dir "$DS4_KV_DISK_DIR" \
+    --kv-disk-space-mb "$DS4_KV_DISK_SPACE_MB" \
+    "$@"
+
+case "$DS4_ENABLE_MTP" in
+    1|true|TRUE|yes|YES|on|ON)
+        /app/download_model.sh mtp
+        set -- --mtp "$DS4_GGUF_DIR/DeepSeek-V4-Flash-MTP-Q4K-Q8_0-F32.gguf" --mtp-draft "$DS4_MTP_DRAFT" "$@"
+        if [ -n "${DS4_MTP_MARGIN:-}" ]; then
+            set -- --mtp-margin "$DS4_MTP_MARGIN" "$@"
+        fi
+        ;;
+esac
+
+if [ -n "${DS4_MODEL_PATH:-}" ]; then
+    set -- --model "$DS4_MODEL_PATH" "$@"
+fi
+
+if [ -n "${DS4_THREADS:-}" ]; then
+    set -- --threads "$DS4_THREADS" "$@"
+fi
+
+if [ -n "${DS4_EXTRA_ARGS:-}" ]; then
+    # Intentionally split DS4_EXTRA_ARGS like a shell command line for advanced flags.
+    # shellcheck disable=SC2086
+    set -- "$@" $DS4_EXTRA_ARGS
+fi
+
+exec /app/ds4-server "$@"

From 7e85de58e463577df1d93aa84949b3d1eff4b7fa Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Tue, 12 May 2026 09:42:23 +0200
Subject: [PATCH 2/6] mapped default volume mount locations to match gguf/
 download path from download_model.sh

---
 .env.example     |  8 ++++++--
 .gitignore       |  1 +
 compose.yml      |  4 ++--
 docker/README.md | 22 +++++++++++++---------
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/.env.example b/.env.example
index 987d1111..508b53b0 100644
--- a/.env.example
+++ b/.env.example
@@ -17,8 +17,12 @@ DS4_KV_DISK_SPACE_MB=20480
 # Host port exposed by Docker Compose. The container always listens on 8000.
 DS4_PORT=8000
 
-# Host directory for persistent bind mounts. Compose uses the weights and
-# kv-cache subdirectories below this path.
+# Host directory for GGUF weights. The default matches download_model.sh so
+# models downloaded on the host under ./gguf are reused by the container.
+DS4_WEIGHTS_HOST_DIR=./gguf
+
+# Host directory for remaining persistent bind mounts. Compose uses the kv-cache
+# subdirectory below this path.
 DS4_VOLUMES_HOST_DIR=./volumes
 
 # Optional Hugging Face token for model downloads.
diff --git a/.gitignore b/.gitignore
index 1a39c61f..c72e27dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,8 @@
 /ds4flash.gguf
 /TODO.md
 /gguf/
 /volumes/
+/kv-cache/
 *.o
 *.dSYM/
 /misc/
diff --git a/compose.yml b/compose.yml
index 47f63e89..d0cdfe39 100644
--- a/compose.yml
+++ b/compose.yml
@@ -24,6 +24,6 @@ services:
       DS4_THREADS: ${DS4_THREADS:-}
       DS4_EXTRA_ARGS: ${DS4_EXTRA_ARGS:-}
     volumes:
-      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/weights:/models
-      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache:/kv-cache
+      - ${DS4_WEIGHTS_HOST_DIR:-./gguf}:/models
+      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache:/kv-cache
     restart: unless-stopped
diff --git a/docker/README.md b/docker/README.md
index c05f1721..cd08c3f1 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -42,14 +42,15 @@ The server exposes the same API as the native binary, including:
 
 ## Volumes
 
-Compose bind-mounts two directories under `DS4_VOLUMES_HOST_DIR`, which defaults
-to `./volumes` relative to the repository root:
+Compose bind-mounts the GGUF weights directory and disk KV cache directory:
 
-- `${DS4_VOLUMES_HOST_DIR:-./volumes}/weights` mounted at `/models` for GGUF weights
+- `${DS4_WEIGHTS_HOST_DIR:-./gguf}` mounted at `/models` for GGUF weights
 - `${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache` mounted at `/kv-cache` for disk KV checkpoints
 
 The model downloader resumes partial downloads and skips files that are already
-present in `/models`.
+present in `/models`. The default weights mount is `./gguf`, matching the native
+`download_model.sh` default, so models downloaded on the host are reused by the
+container instead of downloaded again.
 
 ## Configuration
 
@@ -65,6 +66,7 @@ DS4_MTP_DRAFT=2
 DS4_MTP_MARGIN=
 DS4_THREADS=
 DS4_EXTRA_ARGS=
+DS4_WEIGHTS_HOST_DIR=./gguf
 DS4_VOLUMES_HOST_DIR=./volumes
 HF_TOKEN=
 CUDA_VERSION=13.0.3
@@ -93,9 +95,11 @@ by overriding the container command.
 `DS4_EXTRA_ARGS` is appended to the generated `ds4-server` command for advanced
 server flags such as `--quality` or cache policy tuning.
 
-`DS4_VOLUMES_HOST_DIR` selects the host directory used for persistent weights and
-disk KV cache bind mounts. The default is `./volumes`, with `weights` and
-`kv-cache` subdirectories below it.
+`DS4_WEIGHTS_HOST_DIR` selects the host directory mounted at `/models`. The
+default is `./gguf`, matching the native model downloader.
+
+`DS4_VOLUMES_HOST_DIR` selects the host directory used for remaining persistent
+bind mounts. The disk KV cache uses the `kv-cache` subdirectory below it.
 
 `CUDA_ARCH` is a build argument passed to `make`. Leave it empty for the default
 container build behavior, or set it when you need an explicit NVCC architecture.
@@ -131,10 +135,10 @@ Set a larger context window:
 DS4_CTX=250000 docker compose up --build
 ```
 
-Store weights and disk KV cache under a different host directory:
+Store weights and disk KV cache under different host directories:
 
 ```sh
-DS4_VOLUMES_HOST_DIR=/data/ds4 docker compose up --build
+DS4_WEIGHTS_HOST_DIR=/data/ds4/gguf DS4_VOLUMES_HOST_DIR=/data/ds4 docker compose up --build
 ```
 
 Pass extra server flags:

From 61567aab982fc891764418d846a8493068675ff3 Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Tue, 12 May 2026 10:00:46 +0200
Subject: [PATCH 3/6] edit docker/README.md, deslopped it a bit

---
 docker/README.md | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/docker/README.md b/docker/README.md
index cd08c3f1..58f79043 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,20 +1,16 @@
 # Docker
 
-The Docker setup builds and serves `ds4-server` with the Linux CUDA backend. It
-does not build or use the macOS Metal backend.
-
-Warning: this Docker setup currently targets CUDA systems only. It is not a
-portable container path for Apple Silicon or other Metal-backed macOS systems,
-and it does not support the Metal backend.
+The Docker setup builds and serves `ds4-server` with the Linux CUDA backend.
+It currently does not support the ds4 Metal backend, although this should be easy enough to add.
 
 ## Requirements
 
 - Docker with Compose v2
-- NVIDIA driver compatible with CUDA 13 containers
+- NVIDIA driver compatible with CUDA 12/13 containers
 - NVIDIA Container Toolkit configured for Docker GPU access
 - Enough disk space for the selected GGUF model and disk KV cache
 
-The default image uses CUDA 13:
+The default image uses CUDA 13, which is what most DGX Spark systems are currently targeting:
 
 - Build stage: `nvidia/cuda:13.0.3-devel-ubuntu24.04`
 - Runtime stage: `nvidia/cuda:13.0.3-runtime-ubuntu24.04`
@@ -31,7 +27,7 @@ docker compose up --build
 ```
 
 On first startup the container downloads the selected model into the weights
-volume, then starts `ds4-server` on port `8000`.
+volume, then starts `ds4-server` on port `8000`. The model is selected by `DS4_MODEL` (see Configuration below).
 
 The server exposes the same API as the native binary, including:
 
@@ -58,7 +54,7 @@ Compose reads a root `.env` file by default for variable interpolation. These ar
 the main knobs:
 
 ```env
-DS4_MODEL=q2-imatrix
+DS4_MODEL=q2
 DS4_ENABLE_MTP=0
 DS4_CTX=100000
 DS4_KV_DISK_SPACE_MB=8192
@@ -74,7 +70,7 @@ UBUNTU_VERSION=24.04
 CUDA_ARCH=
 ```
 
-`DS4_MODEL` is passed to `download_model.sh`. Supported values are:
+`DS4_MODEL` is passed to `download_model.sh`. Supported values at the time of writing are:
 
 - `q2-imatrix`
 - `q4-imatrix`
@@ -111,7 +107,7 @@ a CUDA `13.0.x` container, not `13.1.x`.
 
 ## Examples
 
-Use the default q2 imatrix model:
+Use the default q2 model:
 
 ```sh
 docker compose up --build
@@ -123,10 +119,10 @@ Enable MTP and use a larger disk KV cache:
 DS4_ENABLE_MTP=1 DS4_KV_DISK_SPACE_MB=32768 docker compose up --build
 ```
 
-Use q4 imatrix weights:
+Use q4 weights for the lucky RAM owners:
 
 ```sh
-DS4_MODEL=q4-imatrix docker compose up --build
+DS4_MODEL=q4 docker compose up --build
 ```
 
 Set a larger context window:

From b7e173de2dabeb3fa57189218d76e23a2a8ab565 Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Wed, 13 May 2026 09:55:02 +0200
Subject: [PATCH 4/6] feat(docker): add DS4_BUILD_TARGET build arg for
 multi-target support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce DS4_BUILD_TARGET to select the Makefile target for the Linux
CUDA Docker build, replacing the previous CUDA_ARCH-only approach that
hardcoded `make ds4-server`.

The new variable exposes three options:
- cuda-generic (default) — make with nvcc -arch=native
- cuda-spark — omits -arch for DGX Spark / GB10
- cuda — explicit architecture override via CUDA_ARCH=sm_N

CUDA_ARCH is now only consumed when DS4_BUILD_TARGET=cuda and silently
ignored otherwise.
---
 .env.example      | 10 ++++++++--
 compose.yml       |  1 +
 docker/Dockerfile |  5 +++--
 docker/README.md  | 25 +++++++++++++++++++++++--
 4 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/.env.example b/.env.example
index 508b53b0..ccd9a894 100644
--- a/.env.example
+++ b/.env.example
@@ -54,6 +54,12 @@ CUDA_VERSION=13.0.3
 # Ubuntu suffix used by the NVIDIA CUDA images.
 UBUNTU_VERSION=24.04
 
-# Optional NVCC architecture passed through make CUDA_ARCH. Leave empty for the
-# default container build behavior.
+# Makefile build target for the Linux CUDA build.
+#   cuda-generic  nvcc -arch=native (default, for most servers)
+#   cuda-spark    omit -arch, for DGX Spark / GB10
+#   cuda          use with CUDA_ARCH=sm_N for an explicit override
+DS4_BUILD_TARGET=cuda-generic
+
+# NVCC architecture override. Only consumed when DS4_BUILD_TARGET=cuda.
+# Ignored by cuda-generic and cuda-spark.
 CUDA_ARCH=
diff --git a/compose.yml b/compose.yml
index d0cdfe39..c989d783 100644
--- a/compose.yml
+++ b/compose.yml
@@ -6,6 +6,7 @@ services:
       args:
         CUDA_VERSION: ${CUDA_VERSION:-13.0.3}
         UBUNTU_VERSION: ${UBUNTU_VERSION:-24.04}
+        DS4_BUILD_TARGET: ${DS4_BUILD_TARGET:-cuda-generic}
         CUDA_ARCH: ${CUDA_ARCH:-}
     image: ds4:local
     gpus: all
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 65d82b37..84eca649 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -13,8 +13,9 @@ RUN apt-get update \
 WORKDIR /src
 COPY . .
 
-ARG CUDA_ARCH=""
-RUN make ds4-server CUDA_ARCH="$CUDA_ARCH"
+ARG DS4_BUILD_TARGET=cuda-generic
+ARG CUDA_ARCH=
+RUN make $(DS4_BUILD_TARGET) CUDA_ARCH="$(CUDA_ARCH)"
 
 FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
 
diff --git a/docker/README.md b/docker/README.md
index 58f79043..eee1d5ed 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -67,6 +67,7 @@ DS4_VOLUMES_HOST_DIR=./volumes
 HF_TOKEN=
 CUDA_VERSION=13.0.3
 UBUNTU_VERSION=24.04
+DS4_BUILD_TARGET=cuda-generic
 CUDA_ARCH=
 ```
 
@@ -97,8 +98,16 @@ default is `./gguf`, matching the native model downloader.
 `DS4_VOLUMES_HOST_DIR` selects the host directory used for remaining persistent
 bind mounts. The disk KV cache uses the `kv-cache` subdirectory below it.
 
-`CUDA_ARCH` is a build argument passed to `make`. Leave it empty for the default
-container build behavior, or set it when you need an explicit NVCC architecture.
+`DS4_BUILD_TARGET` selects the Makefile target for the Linux CUDA build:
+
+| Value | Effect |
+|---|---|
+| `cuda-generic` (default) | `make CUDA_ARCH=native` — targets the visible GPU via `nvcc -arch=native` |
+| `cuda-spark` | `make CUDA_ARCH=` — omits `-arch` for DGX Spark / GB10 |
+| `cuda` | Use with `CUDA_ARCH=sm_N` for an explicit architecture override |
+
+`CUDA_ARCH` is passed alongside `DS4_BUILD_TARGET` and is only consumed when
+`DS4_BUILD_TARGET=cuda`. For the other targets the variable is silently ignored.
 
 `CUDA_VERSION` and `UBUNTU_VERSION` select the NVIDIA CUDA base images used for
 both build and runtime stages. Keep `CUDA_VERSION` compatible with the host
@@ -143,6 +152,18 @@ Pass extra server flags:
 DS4_EXTRA_ARGS="--quality --kv-cache-min-tokens 1024" docker compose up --build
 ```
 
+Build for DGX Spark / GB10 (omits explicit `nvcc -arch`):
+
+```sh
+DS4_BUILD_TARGET=cuda-spark docker compose up --build
+```
+
+Build with an explicit CUDA architecture override:
+
+```sh
+DS4_BUILD_TARGET=cuda CUDA_ARCH=sm_120 docker compose up --build
+```
+
 Build against a different compatible CUDA container version:
 
 ```sh

From b3d1c03f1a78a486acd7e6bd9fc7395c833017c2 Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Wed, 13 May 2026 10:01:44 +0200
Subject: [PATCH 5/6] add: .dockerignore to only package the bare minimum build
 context blob

---
 .dockerignore | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index e4af5ae2..7fb3eced 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,11 +1,19 @@
-.git
-gguf
-ds4flash.gguf
-ds4
-ds4-server
-ds4-bench
-ds4_test
-*.o
-*.dSYM
-misc
-TODO.md
+*
+!Makefile
+!ds4.c
+!ds4.h
+!ds4_gpu.h
+!ds4_cli.c
+!ds4_server.c
+!ds4_bench.c
+!ds4_cuda.cu
+!ds4_iq2_tables_cuda.inc
+!linenoise.c
+!linenoise.h
+!rax.c
+!rax.h
+!rax_malloc.h
+!download_model.sh
+!docker/
+!docker/Dockerfile
+!docker/entrypoint.sh

From 479e2ea227961df9a927c80b91c5a0024bdd4cd1 Mon Sep 17 00:00:00 2001
From: Andrea Gronchi <agronchi@tai.it>
Date: Wed, 13 May 2026 10:08:09 +0200
Subject: [PATCH 6/6] fix: botched make call

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 84eca649..caf47b0b 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -15,7 +15,7 @@ COPY . .
 
 ARG DS4_BUILD_TARGET=cuda-generic
 ARG CUDA_ARCH=
-RUN make $(DS4_BUILD_TARGET) CUDA_ARCH="$(CUDA_ARCH)"
+RUN make ${DS4_BUILD_TARGET} CUDA_ARCH="${CUDA_ARCH}"
 
 FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}