19 changes: 19 additions & 0 deletions .dockerignore
@@ -0,0 +1,19 @@
*
!Makefile
!ds4.c
!ds4.h
!ds4_gpu.h
!ds4_cli.c
!ds4_server.c
!ds4_bench.c
!ds4_cuda.cu
!ds4_iq2_tables_cuda.inc
!linenoise.c
!linenoise.h
!rax.c
!rax.h
!rax_malloc.h
!download_model.sh
!docker/
!docker/Dockerfile
!docker/entrypoint.sh
65 changes: 65 additions & 0 deletions .env.example
@@ -0,0 +1,65 @@
# Common runtime settings

# Model target passed to download_model.sh on container startup.
# Supported: q2-imatrix, q4-imatrix, q2, q4, none
DS4_MODEL=q2

# Enable optional MTP speculative decoding support. Set to 1 to download and use
# the MTP GGUF.
DS4_ENABLE_MTP=1

# Context size passed to ds4-server --ctx.
DS4_CTX=100000

# Disk KV cache budget passed to ds4-server --kv-disk-space-mb.
DS4_KV_DISK_SPACE_MB=20480

# Host port exposed by Docker Compose. The container always listens on 8000.
DS4_PORT=8000

# Host directory for GGUF weights. The default matches download_model.sh so
# models downloaded on the host under ./gguf are reused by the container.
DS4_WEIGHTS_HOST_DIR=./gguf

# Host directory for remaining persistent bind mounts. Compose uses the kv-cache
# subdirectory below this path.
DS4_VOLUMES_HOST_DIR=./volumes

# Optional Hugging Face token for model downloads.
HF_TOKEN=


# Less common runtime settings

# MTP draft tokens passed to ds4-server --mtp-draft when DS4_ENABLE_MTP=1.
DS4_MTP_DRAFT=2

# Optional MTP margin passed to ds4-server --mtp-margin when set.
DS4_MTP_MARGIN=

# Optional CPU helper threads passed to ds4-server --threads when set.
DS4_THREADS=

# Extra ds4-server flags appended to the generated command.
# Example: --quality --kv-cache-min-tokens 1024
DS4_EXTRA_ARGS=


# Build settings

# NVIDIA CUDA image version. Keep this compatible with the host driver reported
# by nvidia-smi. For hosts reporting CUDA Version: 13.0, use 13.0.x.
CUDA_VERSION=13.0.3

# Ubuntu suffix used by the NVIDIA CUDA images.
UBUNTU_VERSION=24.04

# Makefile build target for the Linux CUDA build.
# - cuda-generic: nvcc -arch=native (default, for most servers)
# - cuda-spark: omit -arch, for DGX Spark / GB10
# - cuda: use with CUDA_ARCH=sm_N for an explicit override
DS4_BUILD_TARGET=cuda-generic

# NVCC architecture override. Only consumed when DS4_BUILD_TARGET=cuda.
# Ignored by cuda-generic and cuda-spark.
CUDA_ARCH=
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@
/ds4flash.gguf
/TODO.md
/gguf/
/kv-cache/
*.o
*.dSYM/
/misc/
8 changes: 8 additions & 0 deletions AGENT.md → AGENTS.md
@@ -43,6 +43,7 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`.
tool-call mapping, disk KV cache policy.
- `ds4_metal.m`: Objective-C Metal runtime and kernel wrappers.
- `metal/*.metal`: compute kernels.
- `docker/`: CUDA 13 Docker and Compose setup for serving `ds4-server` on Linux.
- `tests/`: unit and live integration tests.
- `misc/`: ignored notes, experiments, and old planning material.

@@ -51,3 +52,10 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`.
Use `make` for build validation. Use `make test` for unit/regression tests when a
model and Metal are available. Use live server tests only when intentionally
testing the API surface.

## Docker

Docker-specific serving notes are documented in `docker/README.md`. The Docker
setup targets Linux CUDA 13, mounts weights under `/models`, mounts disk KV cache
under `/kv-cache`, downloads missing model weights during container startup, and
serves `ds4-server` through Compose.
5 changes: 5 additions & 0 deletions README.md
@@ -135,6 +135,11 @@ make cpu # CPU-only diagnostics build
select another supported GGUF from `./gguf/`. Run `./ds4 --help` and
`./ds4-server --help` for the full flag list.

For Linux CUDA container builds, see [`docker/README.md`](docker/README.md).
The Compose setup builds `ds4-server` with CUDA 13, downloads the selected model
into a weights volume on startup, and serves the HTTP API with a mounted disk KV
cache volume.

## Speed

These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy
30 changes: 30 additions & 0 deletions compose.yml
@@ -0,0 +1,30 @@
services:
  ds4:
    build:
      context: .
      dockerfile: docker/Dockerfile
      args:
        CUDA_VERSION: ${CUDA_VERSION:-13.0.3}
        UBUNTU_VERSION: ${UBUNTU_VERSION:-24.04}
        DS4_BUILD_TARGET: ${DS4_BUILD_TARGET:-cuda-generic}
        CUDA_ARCH: ${CUDA_ARCH:-}
    image: ds4:local
    gpus: all
    ports:
      - "${DS4_PORT:-8000}:8000"
    environment:
      DS4_MODEL: ${DS4_MODEL:-q2}
      DS4_ENABLE_MTP: ${DS4_ENABLE_MTP:-1}
      DS4_CTX: ${DS4_CTX:-100000}
      DS4_KV_DISK_SPACE_MB: ${DS4_KV_DISK_SPACE_MB:-20480}
      DS4_PORT: 8000
      DS4_HOST: 0.0.0.0
      HF_TOKEN: ${HF_TOKEN:-}
      DS4_MTP_DRAFT: ${DS4_MTP_DRAFT:-2}
      DS4_MTP_MARGIN: ${DS4_MTP_MARGIN:-}
      DS4_THREADS: ${DS4_THREADS:-}
      DS4_EXTRA_ARGS: ${DS4_EXTRA_ARGS:-}
    volumes:
      - ${DS4_WEIGHTS_HOST_DIR:-./gguf}:/models
      - ${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache:/kv-cache
    restart: unless-stopped
49 changes: 49 additions & 0 deletions docker/Dockerfile
@@ -0,0 +1,49 @@
ARG CUDA_VERSION=13.0.3
ARG UBUNTU_VERSION=24.04

FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build

RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
make \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /src
COPY . .

ARG DS4_BUILD_TARGET=cuda-generic
ARG CUDA_ARCH=
RUN make ${DS4_BUILD_TARGET} CUDA_ARCH="${CUDA_ARCH}"

FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

RUN apt-get update \
&& DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
ca-certificates \
curl \
&& rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY --from=build /src/ds4-server /app/ds4-server
COPY download_model.sh /app/download_model.sh
COPY docker/entrypoint.sh /usr/local/bin/ds4-entrypoint

RUN chmod +x /app/download_model.sh /usr/local/bin/ds4-entrypoint \
&& mkdir -p /models /kv-cache

ENV DS4_GGUF_DIR=/models \
DS4_MODEL=q2-imatrix \
DS4_ENABLE_MTP=0 \
DS4_CTX=100000 \
DS4_KV_DISK_DIR=/kv-cache \
DS4_KV_DISK_SPACE_MB=8192 \
DS4_HOST=0.0.0.0 \
DS4_PORT=8000

EXPOSE 8000
VOLUME ["/models", "/kv-cache"]

ENTRYPOINT ["/usr/local/bin/ds4-entrypoint"]
182 changes: 182 additions & 0 deletions docker/README.md
@@ -0,0 +1,182 @@
# Docker

The Docker setup builds and serves `ds4-server` with the Linux CUDA backend.
It does not currently support the ds4 Metal backend, although that should be easy enough to add.

## Requirements

- Docker with Compose v2
- NVIDIA driver compatible with CUDA 12/13 containers
- NVIDIA Container Toolkit configured for Docker GPU access
- Enough disk space for the selected GGUF model and disk KV cache

The default image uses CUDA 13, which is what most DGX Spark systems currently target:

- Build stage: `nvidia/cuda:13.0.3-devel-ubuntu24.04`
- Runtime stage: `nvidia/cuda:13.0.3-runtime-ubuntu24.04`

The CUDA and Ubuntu image versions are build-time parameters. The defaults are
`CUDA_VERSION=13.0.3` and `UBUNTU_VERSION=24.04`.

## Start

From the repository root:

```sh
docker compose up --build
```

On first startup the container downloads the selected model into the weights
volume, then starts `ds4-server` on port `8000`. The model is selected by
`DS4_MODEL`; see Configuration below.

The server exposes the same API as the native binary, including:

- `GET /v1/models`
- `POST /v1/chat/completions`
- `POST /v1/completions`
- `POST /v1/messages`
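
Once the container is up, these endpoints can be exercised with `curl`. A
minimal sketch (the `model` value below is a placeholder; query `GET
/v1/models` first for the actual id):

```sh
# List the served model ids.
curl -s http://localhost:8000/v1/models

# Send a minimal chat completion request. Replace "placeholder-model-id"
# with an id returned by /v1/models.
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "placeholder-model-id",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}]
      }'
```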

## Volumes

Compose bind-mounts the GGUF weights directory and disk KV cache directory:

- `${DS4_WEIGHTS_HOST_DIR:-./gguf}` mounted at `/models` for GGUF weights
- `${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache` mounted at `/kv-cache` for disk KV checkpoints

The model downloader resumes partial downloads and skips files that are already
present in `/models`. The default weights mount is `./gguf`, matching the native
`download_model.sh` default, so models downloaded on the host are reused by the
container instead of downloaded again.
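
Because the defaults line up, weights can also be fetched ahead of time on the
host and reused by the container. Assuming `download_model.sh` accepts the
model target as its argument, as the `.env.example` comments suggest:

```sh
# Download q2 weights into ./gguf on the host; the container's /models
# mount will see them and skip the download on startup.
./download_model.sh q2
```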

## Configuration

Compose reads a root `.env` file by default for variable interpolation. These are
the main knobs:

```env
DS4_MODEL=q2
DS4_ENABLE_MTP=0
DS4_CTX=100000
DS4_KV_DISK_SPACE_MB=8192
DS4_MTP_DRAFT=2
DS4_MTP_MARGIN=
DS4_THREADS=
DS4_EXTRA_ARGS=
DS4_WEIGHTS_HOST_DIR=./gguf
DS4_VOLUMES_HOST_DIR=./volumes
HF_TOKEN=
CUDA_VERSION=13.0.3
UBUNTU_VERSION=24.04
DS4_BUILD_TARGET=cuda-generic
CUDA_ARCH=
```

`DS4_MODEL` is passed to `download_model.sh`. Supported values at the time of writing are:

- `q2-imatrix`
- `q4-imatrix`
- `q2`
- `q4`
- `none`

Use `none` only when you provide a model path yourself with `DS4_EXTRA_ARGS` or
by overriding the container command.

`DS4_ENABLE_MTP=1` downloads the optional MTP GGUF and starts the server with
`--mtp /models/DeepSeek-V4-Flash-MTP-Q4K-Q8_0-F32.gguf`.

`DS4_CTX` maps to `--ctx`.

`DS4_KV_DISK_SPACE_MB` maps to `--kv-disk-space-mb`.

`DS4_EXTRA_ARGS` is appended to the generated `ds4-server` command for advanced
server flags such as `--quality` or cache policy tuning.

`DS4_WEIGHTS_HOST_DIR` selects the host directory mounted at `/models`. The
default is `./gguf`, matching the native model downloader.

`DS4_VOLUMES_HOST_DIR` selects the host directory used for remaining persistent
bind mounts. The disk KV cache uses the `kv-cache` subdirectory below it.

`DS4_BUILD_TARGET` selects the Makefile target for the Linux CUDA build:

| Value | Effect |
|---|---|
| `cuda-generic` (default) | builds with `nvcc -arch=native`, targeting the GPU visible at build time |
| `cuda-spark` | omits `-arch` entirely, for DGX Spark / GB10 |
| `cuda` | use together with `CUDA_ARCH=sm_N` for an explicit architecture override |

`CUDA_ARCH` is passed alongside `DS4_BUILD_TARGET` and is only consumed when
`DS4_BUILD_TARGET=cuda`. For the other targets the variable is silently ignored.
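
Inside the image, the Dockerfile runs `make ${DS4_BUILD_TARGET}
CUDA_ARCH="${CUDA_ARCH}"`, so the three settings correspond roughly to these
host-side invocations (`sm_120` is an example value, not a recommendation):

```sh
make cuda-generic              # nvcc -arch=native
make cuda-spark                # no -arch flag at all
make cuda CUDA_ARCH=sm_120     # explicit architecture override
```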

`CUDA_VERSION` and `UBUNTU_VERSION` select the NVIDIA CUDA base images used for
both build and runtime stages. Keep `CUDA_VERSION` compatible with the host
NVIDIA driver. A host where `nvidia-smi` reports `CUDA Version: 13.0` should use
a CUDA `13.0.x` container, not `13.1.x`.
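
To check what the host driver supports before picking `CUDA_VERSION`, inspect
the `nvidia-smi` header:

```sh
# The header reports the maximum CUDA version the driver supports,
# e.g. "CUDA Version: 13.0" means use a 13.0.x container image.
nvidia-smi | grep "CUDA Version"
```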

## Examples

Use the default q2 model:

```sh
docker compose up --build
```

Enable MTP and use a larger disk KV cache:

```sh
DS4_ENABLE_MTP=1 DS4_KV_DISK_SPACE_MB=32768 docker compose up --build
```

Use q4 weights for the lucky RAM owners:

```sh
DS4_MODEL=q4 docker compose up --build
```

Set a larger context window:

```sh
DS4_CTX=250000 docker compose up --build
```

Store weights and disk KV cache under different host directories:

```sh
DS4_WEIGHTS_HOST_DIR=/data/ds4/gguf DS4_VOLUMES_HOST_DIR=/data/ds4 docker compose up --build
```

Pass extra server flags:

```sh
DS4_EXTRA_ARGS="--quality --kv-cache-min-tokens 1024" docker compose up --build
```

Build for DGX Spark / GB10 (omits explicit `nvcc -arch`):

```sh
DS4_BUILD_TARGET=cuda-spark docker compose up --build
```

Build with an explicit CUDA architecture override:

```sh
DS4_BUILD_TARGET=cuda CUDA_ARCH=sm_120 docker compose up --build
```

Build against a different compatible CUDA container version:

```sh
CUDA_VERSION=13.0.3 UBUNTU_VERSION=24.04 docker compose build
```

## Authentication

Public downloads normally do not require authentication. If Hugging Face requires
a token, set `HF_TOKEN` in the environment or in the root `.env` file.

## Notes

The first startup can take a long time because the q2 model is roughly 81 GB and
the q4 model is roughly 153 GB. Keep the weights volume mounted so subsequent
starts reuse the existing GGUF files.