Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 88 additions & 46 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
CC ?= cc
UNAME_S := $(shell uname -s)

# Default backend selection
ifeq ($(UNAME_S),Darwin)
BACKEND ?= metal
else
# On Linux, try to detect ROCm or CUDA if BACKEND is not set.
# Default to 'cpu' if neither is found.
ifeq ($(BACKEND),)
ifneq ($(wildcard /opt/rocm/bin/hipcc),)
BACKEND = rocm
else ifneq ($(shell which nvcc 2>/dev/null),)
BACKEND = cuda
else
BACKEND = cpu
endif
endif
endif

ifeq ($(UNAME_S),Darwin)
NATIVE_CPU_FLAG ?= -mcpu=native
else
Expand All @@ -9,59 +26,77 @@ endif

CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc

LDLIBS ?= -lm -pthread
METAL_SRCS := $(wildcard metal/*.metal)

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_metal.o
CORE_OBJS = ds4.o
CPU_CORE_OBJS = ds4_cpu.o
else
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
CUDA_HOME ?= /usr/local/cuda
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDA_ARCH ?= native
ifneq ($(strip $(CUDA_ARCH)),)
NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CORE_OBJS = ds4.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o
METAL_LDLIBS := $(LDLIBS)

# Backend specific settings
ifeq ($(BACKEND),metal)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS += ds4_metal.o
LDLIBS_BIN = $(METAL_LDLIBS)
CC_BIN = $(CC)
endif

.PHONY: all clean test cpu cuda-regression
ifeq ($(BACKEND),cuda)
CUDA_HOME ?= /usr/local/cuda
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDA_ARCH ?= native
ifneq ($(strip $(CUDA_ARCH)),)
NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CORE_OBJS += ds4_cuda.o
CFLAGS += -DDS4_HAVE_CUDA
LDLIBS_BIN = $(CUDA_LDLIBS)
CC_BIN = $(NVCC)
CC_BIN_FLAGS = $(NVCCFLAGS)
REGRESSION_TEST = tests/cuda_long_context_smoke
endif

all: ds4 ds4-server ds4-bench
ifeq ($(BACKEND),rocm)
ROCM_HOME ?= /opt/rocm
HIPCC ?= $(ROCM_HOME)/bin/hipcc
HIP_ARCH ?= native
ifneq ($(strip $(HIP_ARCH)),)
HIP_ARCH_FLAGS := --offload-arch=$(HIP_ARCH)
endif
HIPCCFLAGS ?= -O3 -ffast-math $(HIP_ARCH_FLAGS) $(NATIVE_CPU_FLAG) -pthread -Wno-unused-result
HIP_LDLIBS ?= -lm -pthread -L$(ROCM_HOME)/lib -lhipblas -lamdhip64
CORE_OBJS += ds4_hip.o
CFLAGS += -DDS4_HAVE_ROCM
LDLIBS_BIN = $(HIP_LDLIBS)
CC_BIN = $(HIPCC)
CC_BIN_FLAGS = $(HIPCCFLAGS)
REGRESSION_TEST = tests/rocm_long_context_smoke
endif

ifeq ($(UNAME_S),Darwin)
ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_cli.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
ifeq ($(BACKEND),cpu)
CFLAGS += -DDS4_NO_GPU
CORE_OBJS = $(CPU_CORE_OBJS)
LDLIBS_BIN = $(LDLIBS)
CC_BIN = $(CC)
endif

ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_server.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
ifeq ($(UNAME_S),Linux)
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
endif

ds4-bench: ds4_bench.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_bench.o $(CORE_OBJS) $(METAL_LDLIBS)
.PHONY: all clean test cpu cuda-regression rocm-regression

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
all: ds4 ds4-server ds4-bench

cuda-regression:
@echo "cuda-regression requires a CUDA build"
else
ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

ds4-bench: ds4_bench.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
Expand All @@ -70,7 +105,9 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE

cuda-regression: tests/cuda_long_context_smoke
./tests/cuda_long_context_smoke
endif

rocm-regression: tests/rocm_long_context_smoke
./tests/rocm_long_context_smoke

ds4.o: ds4.c ds4.h ds4_gpu.h
$(CC) $(CFLAGS) -c -o $@ ds4.c
Expand All @@ -87,9 +124,6 @@ ds4_bench.o: ds4_bench.c ds4.h
ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h rax.h
$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c

tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c

rax.o: rax.c rax.h rax_malloc.h
$(CC) $(CFLAGS) -c -o $@ rax.c

Expand All @@ -114,18 +148,26 @@ ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu

ds4_hip.o: ds4_hip.cpp ds4_gpu.h ds4_iq2_tables_hip.inc
$(HIPCC) $(HIPCCFLAGS) -c -o $@ ds4_hip.cpp

tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c

tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

tests/rocm_long_context_smoke.o: tests/rocm_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/rocm_long_context_smoke.c

tests/rocm_long_context_smoke: tests/rocm_long_context_smoke.o ds4_hip.o
$(HIPCC) $(HIPCCFLAGS) -o $@ $^ $(HIP_LDLIBS)

ds4_test: ds4_test.o rax.o $(CORE_OBJS)
ifeq ($(UNAME_S),Darwin)
$(CC) $(CFLAGS) -o $@ ds4_test.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
endif
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

test: ds4_test
./ds4_test

clean:
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/rocm_long_context_smoke tests/rocm_long_context_smoke.o
47 changes: 47 additions & 0 deletions PR_DESCRIPTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
### **Pull Request Description: Add AMD ROCm/HIP Support and Strix Halo Optimizations**

#### **Overview**
This PR introduces a complete AMD ROCm/HIP backend to DwarfStar 4, optimized specifically for hardware with unified memory architectures like the **AMD Strix Halo (gfx1151)**. It migrates the project from its original CUDA dependency to a portable HIP implementation while maintaining functional parity and performance.

#### **Key Changes**
1. **ROCm/HIP Backend Migration**:
* Ported `ds4_cuda.cu` to `ds4_hip.cpp` and transitioned all symbol dependencies from CUDA/cuBLAS to HIP/hipBLAS.
* Updated the `Makefile` to detect and support the ROCm stack using `hipcc`.
2. **Strix Halo / HSA Optimizations**:
* **Zero-Copy Memory Access**: Configured the engine to use HSA direct access (Zero-Copy) by default on AMD hardware. This avoids duplicating 83+ GiB of model weights in system RAM, significantly reducing memory overhead.
* **Vectorized Kernels**: Optimized F16 and F32 GEMV kernels using vectorized loads and warp-shuffle reductions for improved decoding throughput.
* **Hardware Intrinsics**: Replaced scalar loops with AMD-specific hardware dot-product intrinsics (`v_dot4_i32_i8`).
3. **Unified Tooling**:
* Added **`build.sh`**: A one-click script for ROCm compilation.
* Added **`rocm_start_server.sh`**: A unified script that handles stale process cleanup, system cache flushing, and optimized server launch.
4. **Verification**:
* Successfully validated with the `rocm-regression` long-context smoke test.
* End-to-end testing performed using DeepSeek-V4-Flash Q2-imatrix weights.

#### **Performance Benchmarks (AMD Strix Halo / Radeon Graphics)**
* **Decoding Speed**: **8.09 – 13.24 tokens/sec** (Non-MTP, Zero-Copy mode).
* **Prefill Latency**: **~4.45s** for short prompts (post-warmup).
* **Startup**: ~16s weight warmup for 83.60 GiB mapping.

#### **How to Test**
1. **Build**: `./build.sh`
2. **Start**: `./rocm_start_server.sh`
3. **Verify**:
```bash
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "ds4flash",
"messages": [{"role": "user", "content": "Hello, how are you?"}],
"max_tokens": 50
}'
```

---

### **Summary of Work Done**
* **Full Backend Port**: Replaced all CUDA/cuBLAS APIs with HIP/hipBLAS equivalents.
* **Environmental Cleanup**: Renamed all CUDA-specific environment variables to `DS4_HIP_*` (e.g., `DS4_HIP_PREFILL_CHUNK`).
* **Driver Compatibility**: Added robust `hipHostRegister` fallbacks for diverse ROCm driver environments.
* **Unified Startup Flow**: Fused cleanup and server launch into a single, reliable maintenance script.
* **Documentation Integrity**: Updated `README.md` with dedicated ROCm onboarding instructions.
110 changes: 99 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ That said, a few important things about this project:
* This software is developed with **strong assistance from GPT 5.5** and with humans leading the ideas, testing, and debugging. We say this openly because it shaped how the project was built. If you are not happy with AI-developed code, this software is not for you. The acknowledgement below is equally important: this would not exist without `llama.cpp` and GGML, largely written by hand.
* This implementation is based on the idea that compressed KV caches like the one of DeepSeek v4 and the fast SSD disks of modern MacBooks should change our idea that KV cache belongs to RAM. **The KV cache is actually a first-class disk citizen**.
* Our vision is that local inference should be a set of three things working well together, out of the box: A) inference engine with HTTP API + B) GGUF specially crafted to run well under a given engine and given assumptions + C) testing and validation with coding-agent implementations. This inference engine only runs with the GGUF files provided. It gets tested against officially obtained logits at different context sizes. This project exists because we wanted to make one local model feel finished end to end, not just runnable. However, this is just alpha-quality code, so we are probably not there yet.
* The optimized graph path targets **Metal on macOS** and **CUDA on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
* The optimized graph path targets **Metal on macOS**, **CUDA on Linux**, and **ROCm/HIP on Linux (AMD)**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA, ROCm, or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.

## Acknowledgements to llama.cpp and GGML

Expand Down Expand Up @@ -97,13 +97,52 @@ slight speedup, not a meaningful generation-speed win.
Then build:

```sh
make
make # Defaults to CUDA on Linux or Metal on macOS
./build.sh # Recommended for AMD ROCm/HIP builds
```

`./ds4flash.gguf` is the default model path used by both binaries. Pass `-m` to
select another supported GGUF from `./gguf/`. Run `./ds4 --help` and
`./ds4-server --help` for the full flag list.

## AMD ROCm / HIP Support (Linux)

For AMD GPUs (like the Strix Halo / Radeon Graphics), DwarfStar 4 supports the ROCm backend via HIP.

### Building

Use the provided build script to compile with ROCm support:

```sh
./build.sh
```

This script performs a clean build using `BACKEND=rocm` and the `hipcc` compiler.

### Starting the Server

The project includes a unified startup script that cleans up stale processes, flushes system memory caches, and launches the server with optimized ROCm flags:

```sh
./rocm_start_server.sh
```

This script is specifically tuned for hardware like the **AMD Strix Halo**, unsetting `DS4_HIP_COPY_MODEL` to enable **Zero-Copy HSA access**, which allows the GPU to read model weights directly from system RAM without duplication.

### Testing

Once the server is listening, you can verify it with a `curl` request:

```sh
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "ds4flash",
"messages": [{"role": "user", "content": "Hello, how are you?"}],
"max_tokens": 50
}'
```

## Speed

These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy
Expand Down Expand Up @@ -607,34 +646,83 @@ the kv cache files include the verbatim prompt cached.

## Backends

The default graph backend is Metal on macOS and CUDA on Linux CUDA builds:
The default graph backend is Metal on macOS and CUDA/ROCm on Linux:

```sh
./ds4 -p "Hello" --metal
./ds4 -p "Hello" --cuda
./ds4 -p "Hello" --metal # macOS
./ds4 -p "Hello" --cuda # Linux (NVIDIA CUDA or AMD ROCm/HIP)
```

CUDA builds default to `CUDA_ARCH=native`, so `nvcc` targets the visible GPU.
Set `CUDA_ARCH` explicitly when cross-building or when you need a known target:
### Building for ROCm (AMD GPU, Linux)

The Linux build automatically uses ROCm/HIP when `/opt/rocm` is present —
no separate target is needed:

```sh
make # detects ROCm automatically; builds ds4, ds4-server, ds4-bench
```

**Prerequisites:** ROCm 7.x (`/opt/rocm/bin/hipcc` must exist). Check your GPU architecture with:

```sh
/opt/rocm/bin/hipcc --version
rocminfo | grep "Name:.*gfx"
```

The Makefile picks up the GPU architecture automatically with `HIP_ARCH=native`. For specific AMD architectures (like **Strix Halo** or **RDNA3**), you can override it:

```sh
make HIP_ARCH=gfx1151 # AMD Strix Halo / Radeon 8060S
make HIP_ARCH=gfx1100 # RX 7900 XTX
make HIP_ARCH=gfx1030 # RX 6800/6900 (RDNA2)
```

**Performance Tuning for APUs (Strix Halo):**
The ROCm backend is optimized for the unified memory architecture of the Strix Halo:
- **Memory Advisories**: The model uses `hipMemAdviseSetCoarseGrain` to allow the GPU to cache system-mapped weights effectively, drastically improving TPS on APUs.
- **Hardware Dot-Products**: Q2 quantization uses native RDNA3/3.5 `v_dot4_i32_i8` instructions for peak math throughput.
- **Coalesced Access**: GEMV kernels are tuned for RDNA3 wavefront sizes to saturate the 180+ GB/s memory bus.

**First-run kernel compilation:** On the first inference after a rebuild, ROCm may JIT-compile GPU kernels via COMGR. This can take a few minutes. Subsequent runs load from cache and start immediately.

**Strix Halo / APU notes (gfx1151):**
- **Always clean system cache before start**: On APUs with shared memory, the Linux PageCache can fragment the unified address space. It is highly recommended to flush caches before starting the server to ensure the 83GB model can be mapped/copied efficiently.
```sh
sudo sync; echo 3 | sudo tee /proc/sys/vm/drop_caches
```
- ROCm sees only GTT (system RAM) — the BIOS UMA carveout is not exposed for
compute. Use a small UMA carveout (e.g. 512 MB in BIOS) and rely on GTT for
the model and KV cache.
- The 84 GB IQ2 model maps as `cached coarse-grained` over GTT, which gives
the GPU direct access without explicit copies.
- Use `--backend cuda` (the flag name is unchanged; it maps to HIP internally).

```sh
./ds4-server --backend cuda --ctx 32768 \
--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192
```

### Building for NVIDIA CUDA (Linux)

Same `make` command — the Makefile uses `nvcc` when `/opt/rocm` is absent.
Set `CUDA_ARCH` if needed:

```sh
make CUDA_ARCH=sm_120
make CUDA_ARCH= # old nvcc default target behavior
```

There is also a CPU reference/debug path:
### CPU reference build

```sh
./ds4 -p "Hello" --cpu
make cpu
./ds4
./ds4 -p "Hello"
```

Do not treat the CPU path as the production target. The CLI and `ds4-server`
support the CPU backend for reference/debug use and share the same KV session
and snapshot format as Metal and CUDA, but normal inference should use Metal or
CUDA.
CUDA/ROCm.

## Steering

Expand Down
Loading