diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..7fb3eced --- /dev/null +++ b/.dockerignore @@ -0,0 +1,19 @@ +* +!Makefile +!ds4.c +!ds4.h +!ds4_gpu.h +!ds4_cli.c +!ds4_server.c +!ds4_bench.c +!ds4_cuda.cu +!ds4_iq2_tables_cuda.inc +!linenoise.c +!linenoise.h +!rax.c +!rax.h +!rax_malloc.h +!download_model.sh +!docker/ +!docker/Dockerfile +!docker/entrypoint.sh diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..ccd9a894 --- /dev/null +++ b/.env.example @@ -0,0 +1,65 @@ +# Common runtime settings + +# Model target passed to download_model.sh on container startup. +# Supported: q2-imatrix, q4-imatrix, q2, q4, none +DS4_MODEL=q2 + +# Enable optional MTP speculative decoding support. Set to 1 to download and use +# the MTP GGUF. +DS4_ENABLE_MTP=1 + +# Context size passed to ds4-server --ctx. +DS4_CTX=100000 + +# Disk KV cache budget passed to ds4-server --kv-disk-space-mb. +DS4_KV_DISK_SPACE_MB=20480 + +# Host port exposed by Docker Compose. The container always listens on 8000. +DS4_PORT=8000 + +# Host directory for GGUF weights. The default matches download_model.sh so +# models downloaded on the host under ./gguf are reused by the container. +DS4_WEIGHTS_HOST_DIR=./gguf + +# Host directory for remaining persistent bind mounts. Compose uses the kv-cache +# subdirectory below this path. +DS4_VOLUMES_HOST_DIR=./volumes + +# Optional Hugging Face token for model downloads. +HF_TOKEN= + + +# Less common runtime settings + +# MTP draft tokens passed to ds4-server --mtp-draft when DS4_ENABLE_MTP=1. +DS4_MTP_DRAFT=2 + +# Optional MTP margin passed to ds4-server --mtp-margin when set. +DS4_MTP_MARGIN= + +# Optional CPU helper threads passed to ds4-server --threads when set. +DS4_THREADS= + +# Extra ds4-server flags appended to the generated command. +# Example: --quality --kv-cache-min-tokens 1024 +DS4_EXTRA_ARGS= + + +# Build settings + +# NVIDIA CUDA image version. 
Keep this compatible with the host driver reported +# by nvidia-smi. For hosts reporting CUDA Version: 13.0, use 13.0.x. +CUDA_VERSION=13.0.3 + +# Ubuntu suffix used by the NVIDIA CUDA images. +UBUNTU_VERSION=24.04 + +# Makefile build target for the Linux CUDA build. +# cuda-generic nvcc -arch=native (default, for most servers) +# cuda-spark omit -arch, for DGX Spark / GB10 +# cuda use with CUDA_ARCH=sm_N for an explicit override +DS4_BUILD_TARGET=cuda-generic + +# NVCC architecture override. Only consumed when DS4_BUILD_TARGET=cuda. +# Ignored by cuda-generic and cuda-spark. +CUDA_ARCH= diff --git a/.gitignore b/.gitignore index 2c70e4d6..c72e27dd 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ /ds4flash.gguf /TODO.md /gguf/ +/kv-cache/ *.o *.dSYM/ /misc/ diff --git a/AGENT.md b/AGENTS.md similarity index 86% rename from AGENT.md rename to AGENTS.md index ff9c395c..c88a4042 100644 --- a/AGENT.md +++ b/AGENTS.md @@ -43,6 +43,7 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`. tool-call mapping, disk KV cache policy. - `ds4_metal.m`: Objective-C Metal runtime and kernel wrappers. - `metal/*.metal`: compute kernels. +- `docker/`: CUDA 13 Docker and Compose setup for serving `ds4-server` on Linux. - `tests/`: unit and live integration tests. - `misc/`: ignored notes, experiments, and old planning material. @@ -51,3 +52,10 @@ Objective-C only where Metal requires it and Metal kernels under `metal/`. Use `make` for build validation. Use `make test` for unit/regression tests when a model and Metal are available. Use live server tests only when intentionally testing the API surface. + +## Docker + +Docker-specific serving notes are documented in `docker/README.md`. The Docker +setup targets Linux CUDA 13, mounts weights under `/models`, mounts disk KV cache +under `/kv-cache`, downloads missing model weights during container startup, and +serves `ds4-server` through Compose. 
diff --git a/README.md b/README.md index 4b7c69ec..7143cab4 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,11 @@ make cpu # CPU-only diagnostics build select another supported GGUF from `./gguf/`. Run `./ds4 --help` and `./ds4-server --help` for the full flag list. +For Linux CUDA container builds, see [`docker/README.md`](docker/README.md). +The Compose setup builds `ds4-server` with CUDA 13, downloads the selected model +into a weights volume on startup, and serves the HTTP API with a mounted disk KV +cache volume. + ## Speed These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy diff --git a/compose.yml b/compose.yml new file mode 100644 index 00000000..c989d783 --- /dev/null +++ b/compose.yml @@ -0,0 +1,30 @@ +services: + ds4: + build: + context: . + dockerfile: docker/Dockerfile + args: + CUDA_VERSION: ${CUDA_VERSION:-13.0.3} + UBUNTU_VERSION: ${UBUNTU_VERSION:-24.04} + DS4_BUILD_TARGET: ${DS4_BUILD_TARGET:-cuda-generic} + CUDA_ARCH: ${CUDA_ARCH:-} + image: ds4:local + gpus: all + ports: + - "${DS4_PORT:-8000}:8000" + environment: + DS4_MODEL: ${DS4_MODEL:-q2} + DS4_ENABLE_MTP: ${DS4_ENABLE_MTP:-1} + DS4_CTX: ${DS4_CTX:-100000} + DS4_KV_DISK_SPACE_MB: ${DS4_KV_DISK_SPACE_MB:-20480} + DS4_PORT: 8000 + DS4_HOST: 0.0.0.0 + HF_TOKEN: ${HF_TOKEN:-} + DS4_MTP_DRAFT: ${DS4_MTP_DRAFT:-2} + DS4_MTP_MARGIN: ${DS4_MTP_MARGIN:-} + DS4_THREADS: ${DS4_THREADS:-} + DS4_EXTRA_ARGS: ${DS4_EXTRA_ARGS:-} + volumes: + - ${DS4_WEIGHTS_HOST_DIR:-./gguf}:/models + - ${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache:/kv-cache + restart: unless-stopped diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 00000000..caf47b0b --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,49 @@ +ARG CUDA_VERSION=13.0.3 +ARG UBUNTU_VERSION=24.04 + +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ 
+ make \ && rm -rf /var/lib/apt/lists/* + +WORKDIR /src +COPY . . + +ARG DS4_BUILD_TARGET=cuda-generic +ARG CUDA_ARCH= +RUN make ${DS4_BUILD_TARGET} CUDA_ARCH="${CUDA_ARCH}" + +FROM nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY --from=build /src/ds4-server /app/ds4-server +COPY download_model.sh /app/download_model.sh +COPY docker/entrypoint.sh /usr/local/bin/ds4-entrypoint + +RUN chmod +x /app/download_model.sh /usr/local/bin/ds4-entrypoint \ + && mkdir -p /models /kv-cache + +ENV DS4_GGUF_DIR=/models \ + DS4_MODEL=q2-imatrix \ + DS4_ENABLE_MTP=0 \ + DS4_CTX=100000 \ + DS4_KV_DISK_DIR=/kv-cache \ + DS4_KV_DISK_SPACE_MB=8192 \ + DS4_HOST=0.0.0.0 \ + DS4_PORT=8000 + +EXPOSE 8000 +VOLUME ["/models", "/kv-cache"] + +ENTRYPOINT ["/usr/local/bin/ds4-entrypoint"] diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000..eee1d5ed --- /dev/null +++ b/docker/README.md @@ -0,0 +1,182 @@ +# Docker + +The Docker setup builds and serves `ds4-server` with the Linux CUDA backend. +It currently does not support the ds4 Metal backend, although this should be easy enough to add. + +## Requirements + +- Docker with Compose v2 +- NVIDIA driver compatible with CUDA 12/13 containers +- NVIDIA Container Toolkit configured for Docker GPU access +- Enough disk space for the selected GGUF model and disk KV cache + +The default image uses CUDA 13, which is what most DGX Spark systems are currently targeting: + +- Build stage: `nvidia/cuda:13.0.3-devel-ubuntu24.04` +- Runtime stage: `nvidia/cuda:13.0.3-runtime-ubuntu24.04` + +The CUDA and Ubuntu image versions are build-time parameters. The defaults are +`CUDA_VERSION=13.0.3` and `UBUNTU_VERSION=24.04`. 
+ +## Start + +From the repository root: + +```sh +docker compose up --build +``` + +On first startup the container downloads the selected model into the weights +volume, then starts `ds4-server` on port `8000`. Which model? See `DS4_MODEL` later. + +The server exposes the same API as the native binary, including: + +- `GET /v1/models` +- `POST /v1/chat/completions` +- `POST /v1/completions` +- `POST /v1/messages` + +## Volumes + +Compose bind-mounts the GGUF weights directory and disk KV cache directory: + +- `${DS4_WEIGHTS_HOST_DIR:-./gguf}` mounted at `/models` for GGUF weights +- `${DS4_VOLUMES_HOST_DIR:-./volumes}/kv-cache` mounted at `/kv-cache` for disk KV checkpoints + +The model downloader resumes partial downloads and skips files that are already +present in `/models`. The default weights mount is `./gguf`, matching the native +`download_model.sh` default, so models downloaded on the host are reused by the +container instead of downloaded again. + +## Configuration + +Compose reads a root `.env` file by default for variable interpolation. These are +the main knobs: + +```env +DS4_MODEL=q2 +DS4_ENABLE_MTP=0 +DS4_CTX=100000 +DS4_KV_DISK_SPACE_MB=8192 +DS4_MTP_DRAFT=2 +DS4_MTP_MARGIN= +DS4_THREADS= +DS4_EXTRA_ARGS= +DS4_WEIGHTS_HOST_DIR=./gguf +DS4_VOLUMES_HOST_DIR=./volumes +HF_TOKEN= +CUDA_VERSION=13.0.3 +UBUNTU_VERSION=24.04 +DS4_BUILD_TARGET=cuda-generic +CUDA_ARCH= +``` + +`DS4_MODEL` is passed to `download_model.sh`. Supported values at the time of writing are: + +- `q2-imatrix` +- `q4-imatrix` +- `q2` +- `q4` +- `none` + +Use `none` only when you provide a model path yourself with `DS4_EXTRA_ARGS` or +by overriding the container command. + +`DS4_ENABLE_MTP=1` downloads the optional MTP GGUF and starts the server with +`--mtp /models/DeepSeek-V4-Flash-MTP-Q4K-Q8_0-F32.gguf`. + +`DS4_CTX` maps to `--ctx`. + +`DS4_KV_DISK_SPACE_MB` maps to `--kv-disk-space-mb`. 
+ +`DS4_EXTRA_ARGS` is appended to the generated `ds4-server` command for advanced +server flags such as `--quality` or cache policy tuning. + +`DS4_WEIGHTS_HOST_DIR` selects the host directory mounted at `/models`. The +default is `./gguf`, matching the native model downloader. + +`DS4_VOLUMES_HOST_DIR` selects the host directory used for remaining persistent +bind mounts. The disk KV cache uses the `kv-cache` subdirectory below it. + +`DS4_BUILD_TARGET` selects the Makefile target for the Linux CUDA build: + +| Value | Effect | +|---|---| +| `cuda-generic` (default) | `make CUDA_ARCH=native` — targets the visible GPU via `nvcc -arch=native` | +| `cuda-spark` | `make CUDA_ARCH=` — omits `-arch` for DGX Spark / GB10 | +| `cuda` | Use with `CUDA_ARCH=sm_N` for an explicit architecture override | + +`CUDA_ARCH` is passed alongside `DS4_BUILD_TARGET` and is only consumed when +`DS4_BUILD_TARGET=cuda`. For the other targets the variable is silently ignored. + +`CUDA_VERSION` and `UBUNTU_VERSION` select the NVIDIA CUDA base images used for +both build and runtime stages. Keep `CUDA_VERSION` compatible with the host +NVIDIA driver. A host where `nvidia-smi` reports `CUDA Version: 13.0` should use +a CUDA `13.0.x` container, not `13.1.x`. 
+ +## Examples + +Use the default q2 model: + +```sh +docker compose up --build +``` + +Enable MTP and use a larger disk KV cache: + +```sh +DS4_ENABLE_MTP=1 DS4_KV_DISK_SPACE_MB=32768 docker compose up --build +``` + +Use q4 weights for the lucky RAM owners: + +```sh +DS4_MODEL=q4 docker compose up --build +``` + +Set a larger context window: + +```sh +DS4_CTX=250000 docker compose up --build +``` + +Store weights and disk KV cache under different host directories: + +```sh +DS4_WEIGHTS_HOST_DIR=/data/ds4/gguf DS4_VOLUMES_HOST_DIR=/data/ds4 docker compose up --build +``` + +Pass extra server flags: + +```sh +DS4_EXTRA_ARGS="--quality --kv-cache-min-tokens 1024" docker compose up --build +``` + +Build for DGX Spark / GB10 (omits explicit `nvcc -arch`): + +```sh +DS4_BUILD_TARGET=cuda-spark docker compose up --build +``` + +Build with an explicit CUDA architecture override: + +```sh +DS4_BUILD_TARGET=cuda CUDA_ARCH=sm_120 docker compose up --build +``` + +Build against a different compatible CUDA container version: + +```sh +CUDA_VERSION=13.0.3 UBUNTU_VERSION=24.04 docker compose build +``` + +## Authentication + +Public downloads normally do not require authentication. If Hugging Face requires +a token, set `HF_TOKEN` in the environment or in the root `.env` file. + +## Notes + +The first startup can take a long time because the q2 model is roughly 81 GB and +the q4 model is roughly 153 GB. Keep the weights volume mounted so subsequent +starts reuse the existing GGUF files. 
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 00000000..73f021e6 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,54 @@ +#!/bin/sh +set -eu + +DS4_GGUF_DIR=${DS4_GGUF_DIR:-/models} +DS4_MODEL=${DS4_MODEL:-q2-imatrix} +DS4_ENABLE_MTP=${DS4_ENABLE_MTP:-0} +DS4_CTX=${DS4_CTX:-100000} +DS4_KV_DISK_DIR=${DS4_KV_DISK_DIR:-/kv-cache} +DS4_KV_DISK_SPACE_MB=${DS4_KV_DISK_SPACE_MB:-8192} +DS4_HOST=${DS4_HOST:-0.0.0.0} +DS4_PORT=${DS4_PORT:-8000} +DS4_MTP_DRAFT=${DS4_MTP_DRAFT:-2} + +export DS4_GGUF_DIR + +mkdir -p "$DS4_GGUF_DIR" "$DS4_KV_DISK_DIR" + +if [ -n "$DS4_MODEL" ] && [ "$DS4_MODEL" != "none" ]; then + /app/download_model.sh "$DS4_MODEL" +fi + +set -- \ + --host "$DS4_HOST" \ + --port "$DS4_PORT" \ + --ctx "$DS4_CTX" \ + --kv-disk-dir "$DS4_KV_DISK_DIR" \ + --kv-disk-space-mb "$DS4_KV_DISK_SPACE_MB" \ + "$@" + +case "$DS4_ENABLE_MTP" in + 1|true|TRUE|yes|YES|on|ON) + /app/download_model.sh mtp + set -- --mtp "$DS4_GGUF_DIR/DeepSeek-V4-Flash-MTP-Q4K-Q8_0-F32.gguf" --mtp-draft "$DS4_MTP_DRAFT" "$@" + if [ -n "${DS4_MTP_MARGIN:-}" ]; then + set -- --mtp-margin "$DS4_MTP_MARGIN" "$@" + fi + ;; +esac + +if [ -n "${DS4_MODEL_PATH:-}" ]; then + set -- --model "$DS4_MODEL_PATH" "$@" +fi + +if [ -n "${DS4_THREADS:-}" ]; then + set -- --threads "$DS4_THREADS" "$@" +fi + +if [ -n "${DS4_EXTRA_ARGS:-}" ]; then + # Intentionally split DS4_EXTRA_ARGS like a shell command line for advanced flags. + # shellcheck disable=SC2086 + set -- "$@" $DS4_EXTRA_ARGS +fi + +exec /app/ds4-server "$@"