Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 88 additions & 46 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
CC ?= cc
UNAME_S := $(shell uname -s)

# Default backend selection
ifeq ($(UNAME_S),Darwin)
BACKEND ?= metal
else
# On Linux, try to detect ROCm or CUDA if BACKEND is not set.
# Default to 'cpu' if neither is found.
ifeq ($(BACKEND),)
ifneq ($(wildcard /opt/rocm/bin/hipcc),)
BACKEND = rocm
else ifneq ($(shell which nvcc 2>/dev/null),)
BACKEND = cuda
else
BACKEND = cpu
endif
endif
endif

ifeq ($(UNAME_S),Darwin)
NATIVE_CPU_FLAG ?= -mcpu=native
else
Expand All @@ -9,59 +26,77 @@ endif

CFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc

LDLIBS ?= -lm -pthread
METAL_SRCS := $(wildcard metal/*.metal)

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_metal.o
CORE_OBJS = ds4.o
CPU_CORE_OBJS = ds4_cpu.o
else
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
CUDA_HOME ?= /usr/local/cuda
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDA_ARCH ?= native
ifneq ($(strip $(CUDA_ARCH)),)
NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CORE_OBJS = ds4.o ds4_cuda.o
CPU_CORE_OBJS = ds4_cpu.o
METAL_LDLIBS := $(LDLIBS)

# Backend specific settings
ifeq ($(BACKEND),metal)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS += ds4_metal.o
LDLIBS_BIN = $(METAL_LDLIBS)
CC_BIN = $(CC)
endif

.PHONY: all clean test cpu cuda-regression
ifeq ($(BACKEND),cuda)
CUDA_HOME ?= /usr/local/cuda
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDA_ARCH ?= native
ifneq ($(strip $(CUDA_ARCH)),)
NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
endif
NVCCFLAGS ?= -O3 --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
CORE_OBJS += ds4_cuda.o
CFLAGS += -DDS4_HAVE_CUDA
LDLIBS_BIN = $(CUDA_LDLIBS)
CC_BIN = $(NVCC)
CC_BIN_FLAGS = $(NVCCFLAGS)
REGRESSION_TEST = tests/cuda_long_context_smoke
endif

all: ds4 ds4-server ds4-bench
ifeq ($(BACKEND),rocm)
ROCM_HOME ?= /opt/rocm
HIPCC ?= $(ROCM_HOME)/bin/hipcc
HIP_ARCH ?= native
ifneq ($(strip $(HIP_ARCH)),)
HIP_ARCH_FLAGS := --offload-arch=$(HIP_ARCH)
endif
HIPCCFLAGS ?= -O3 -ffast-math $(HIP_ARCH_FLAGS) $(NATIVE_CPU_FLAG) -pthread -Wno-unused-result
HIP_LDLIBS ?= -lm -pthread -L$(ROCM_HOME)/lib -lhipblas -lamdhip64
CORE_OBJS += ds4_hip.o
CFLAGS += -DDS4_HAVE_ROCM
LDLIBS_BIN = $(HIP_LDLIBS)
CC_BIN = $(HIPCC)
CC_BIN_FLAGS = $(HIPCCFLAGS)
REGRESSION_TEST = tests/rocm_long_context_smoke
endif

ifeq ($(UNAME_S),Darwin)
ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_cli.o linenoise.o $(CORE_OBJS) $(METAL_LDLIBS)
ifeq ($(BACKEND),cpu)
CFLAGS += -DDS4_NO_GPU
CORE_OBJS = $(CPU_CORE_OBJS)
LDLIBS_BIN = $(LDLIBS)
CC_BIN = $(CC)
endif

ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_server.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
ifeq ($(UNAME_S),Linux)
CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
endif

ds4-bench: ds4_bench.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_bench.o $(CORE_OBJS) $(METAL_LDLIBS)
.PHONY: all clean test cpu cuda-regression rocm-regression

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-server ds4_server_cpu.o rax.o $(CPU_CORE_OBJS) $(LDLIBS)
$(CC) $(CFLAGS) -o ds4-bench ds4_bench_cpu.o $(CPU_CORE_OBJS) $(LDLIBS)
all: ds4 ds4-server ds4-bench

cuda-regression:
@echo "cuda-regression requires a CUDA build"
else
ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

ds4-bench: ds4_bench.o $(CORE_OBJS)
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE_OBJS)
$(CC) $(CFLAGS) -o ds4 ds4_cli_cpu.o linenoise.o $(CPU_CORE_OBJS) $(LDLIBS)
Expand All @@ -70,7 +105,9 @@ cpu: ds4_cli_cpu.o ds4_server_cpu.o ds4_bench_cpu.o linenoise.o rax.o $(CPU_CORE

cuda-regression: tests/cuda_long_context_smoke
./tests/cuda_long_context_smoke
endif

rocm-regression: tests/rocm_long_context_smoke
./tests/rocm_long_context_smoke

ds4.o: ds4.c ds4.h ds4_gpu.h
$(CC) $(CFLAGS) -c -o $@ ds4.c
Expand All @@ -87,9 +124,6 @@ ds4_bench.o: ds4_bench.c ds4.h
ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h rax.h
$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c

tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c

rax.o: rax.c rax.h rax_malloc.h
$(CC) $(CFLAGS) -c -o $@ rax.c

Expand All @@ -114,18 +148,26 @@ ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc
$(NVCC) $(NVCCFLAGS) -c -o $@ ds4_cuda.cu

ds4_hip.o: ds4_hip.cpp ds4_gpu.h ds4_iq2_tables_hip.inc
$(HIPCC) $(HIPCCFLAGS) -c -o $@ ds4_hip.cpp

tests/cuda_long_context_smoke.o: tests/cuda_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/cuda_long_context_smoke.c

tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
$(NVCC) $(NVCCFLAGS) -o $@ $^ $(CUDA_LDLIBS)

tests/rocm_long_context_smoke.o: tests/rocm_long_context_smoke.c ds4_gpu.h
$(CC) $(CFLAGS) -I. -c -o $@ tests/rocm_long_context_smoke.c

tests/rocm_long_context_smoke: tests/rocm_long_context_smoke.o ds4_hip.o
$(HIPCC) $(HIPCCFLAGS) -o $@ $^ $(HIP_LDLIBS)

ds4_test: ds4_test.o rax.o $(CORE_OBJS)
ifeq ($(UNAME_S),Darwin)
$(CC) $(CFLAGS) -o $@ ds4_test.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)
else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
endif
$(CC_BIN) $(CC_BIN_FLAGS) $(CFLAGS) -o $@ $^ $(LDLIBS_BIN)

test: ds4_test
./ds4_test

clean:
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o tests/rocm_long_context_smoke tests/rocm_long_context_smoke.o
47 changes: 47 additions & 0 deletions PR_DESCRIPTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
### **Pull Request Description: Add AMD ROCm/HIP Support and Strix Halo Optimizations**

#### **Overview**
This PR introduces a complete AMD ROCm/HIP backend to DwarfStar 4, optimized specifically for hardware with unified memory architectures like the **AMD Strix Halo (gfx1151)**. It migrates the project from its original CUDA dependency to a portable HIP implementation while maintaining functional parity and performance.

#### **Key Changes**
1. **ROCm/HIP Backend Migration**:
* Ported `ds4_cuda.cu` to `ds4_hip.cpp` and transitioned all symbol dependencies from CUDA/cuBLAS to HIP/hipBLAS.
* Updated the `Makefile` to detect and support the ROCm stack using `hipcc`.
2. **Strix Halo / HSA Optimizations**:
* **Zero-Copy Memory Access**: Configured the engine to use HSA direct access (Zero-Copy) by default on AMD hardware. This avoids duplicating 83+ GiB of model weights in system RAM, significantly reducing memory overhead.
* **Vectorized Kernels**: Optimized F16 and F32 GEMV kernels using vectorized loads and warp-shuffle reductions for improved decoding throughput.
* **Hardware Intrinsics**: Replaced scalar loops with AMD-specific hardware dot-product intrinsics (`v_dot4_i32_i8`).
3. **Unified Tooling**:
* Added **`build.sh`**: A one-click script for ROCm compilation.
* Added **`rocm_start_server.sh`**: A unified script that handles stale process cleanup, system cache flushing, and optimized server launch.
4. **Verification**:
* Successfully validated with the `rocm-regression` long-context smoke test.
* End-to-end testing performed using DeepSeek-V4-Flash Q2-imatrix weights.

#### **Performance Benchmarks (AMD Strix Halo / Radeon Graphics)**
* **Decoding Speed**: **8.09 – 13.24 tokens/sec** (Non-MTP, Zero-Copy mode).
* **Prefill Latency**: **~4.45s** for short prompts (post-warmup).
* **Startup**: ~16s weight warmup for 83.60 GiB mapping.

#### **How to Test**
1. **Build**: `./build.sh`
2. **Start**: `./rocm_start_server.sh`
3. **Verify**:
```bash
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "ds4flash",
"messages": [{"role": "user", "content": "Hello, how are you?"}],
"max_tokens": 50
}'
```

---

### **Summary of Work Done**
* **Full Backend Port**: Replaced all CUDA/cuBLAS APIs with HIP/hipBLAS equivalents.
* **Environmental Cleanup**: Renamed all CUDA-specific environment variables to `DS4_HIP_*` (e.g., `DS4_HIP_PREFILL_CHUNK`).
* **Driver Compatibility**: Added robust `hipHostRegister` fallbacks for diverse ROCm driver environments.
* **Unified Startup Flow**: Fused cleanup and server launch into a single, reliable maintenance script.
* **Documentation Integrity**: Updated `README.md` with dedicated ROCm onboarding instructions.
110 changes: 99 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ That said, a few important things about this project:
* This software is developed with **strong assistance from GPT 5.5** and with humans leading the ideas, testing, and debugging. We say this openly because it shaped how the project was built. If you are not happy with AI-developed code, this software is not for you. The acknowledgement below is equally important: this would not exist without `llama.cpp` and GGML, largely written by hand.
* This implementation is based on the idea that compressed KV caches like the one of DeepSeek v4 and the fast SSD disks of modern MacBooks should change our idea that KV cache belongs to RAM. **The KV cache is actually a first-class disk citizen**.
* Our vision is that local inference should be a set of three things working well together, out of the box: A) inference engine with HTTP API + B) GGUF specially crafted to run well under a given engine and given assumptions + C) testing and validation with coding-agent implementations. This inference engine only runs with the GGUF files provided. It gets tested against officially obtained logits at different context sizes. This project exists because we wanted to make one local model feel finished end to end, not just runnable. However, this is just alpha-quality code, so we are probably not there yet.
* The optimized graph path targets **Metal on macOS** and **CUDA on Linux**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.
* The optimized graph path targets **Metal on macOS**, **CUDA on Linux**, and **ROCm/HIP on Linux (AMD)**. The CPU path is only for correctness checks and model/tokenizer diagnostics. For CPU-only Linux builds, use `make cpu`; it builds the normal `./ds4` and `./ds4-server` binaries without CUDA, ROCm, or Metal. On macOS, **warning: current macOS versions have a bug in the virtual memory implementation that will crash the kernel** if you try to run the CPU code. Remember? Software sucks. It was not possible to fix the CPU inference to avoid crashing, since each time you have to restart the computer, which is not funny. Help us, if you have the guts.

## Acknowledgements to llama.cpp and GGML

Expand Down Expand Up @@ -97,13 +97,52 @@ slight speedup, not a meaningful generation-speed win.
Then build:

```sh
make
make # Defaults to CUDA on Linux or Metal on macOS
./build.sh # Recommended for AMD ROCm/HIP builds
```

`./ds4flash.gguf` is the default model path used by both binaries. Pass `-m` to
select another supported GGUF from `./gguf/`. Run `./ds4 --help` and
`./ds4-server --help` for the full flag list.

## AMD ROCm / HIP Support (Linux)

For AMD GPUs (like the Strix Halo / Radeon Graphics), DwarfStar 4 supports the ROCm backend via HIP.

### Building

Use the provided build script to compile with ROCm support:

```sh
./build.sh
```

This script performs a clean build using `BACKEND=rocm` and the `hipcc` compiler.

### Starting the Server

The project includes a unified startup script that cleans up stale processes, flushes system memory caches, and launches the server with optimized ROCm flags:

```sh
./rocm_start_server.sh
```

This script is specifically tuned for hardware like the **AMD Strix Halo**, unsetting `DS4_HIP_COPY_MODEL` to enable **Zero-Copy HSA access**, which allows the GPU to read model weights directly from system RAM without duplication.

### Testing

Once the server is listening, you can verify it with a `curl` request:

```sh
curl -X POST http://127.0.0.1:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "ds4flash",
"messages": [{"role": "user", "content": "Hello, how are you?"}],
"max_tokens": 50
}'
```

## Speed

These are single-run Metal CLI numbers with `--ctx 32768`, `--nothink`, greedy
Expand Down Expand Up @@ -607,34 +646,83 @@ the kv cache files include the verbatim prompt cached.

## Backends

The default graph backend is Metal on macOS and CUDA on Linux CUDA builds:
The default graph backend is Metal on macOS and CUDA/ROCm on Linux:

```sh
./ds4 -p "Hello" --metal
./ds4 -p "Hello" --cuda
./ds4 -p "Hello" --metal # macOS
./ds4 -p "Hello" --cuda # Linux (NVIDIA CUDA or AMD ROCm/HIP)
```

CUDA builds default to `CUDA_ARCH=native`, so `nvcc` targets the visible GPU.
Set `CUDA_ARCH` explicitly when cross-building or when you need a known target:
### Building for ROCm (AMD GPU, Linux)

The Linux build automatically uses ROCm/HIP when `/opt/rocm` is present —
no separate target is needed:

```sh
make # detects ROCm automatically; builds ds4, ds4-server, ds4-bench
```

**Prerequisites:** ROCm 7.x (`/opt/rocm/bin/hipcc` must exist). Check your GPU architecture with:

```sh
/opt/rocm/bin/hipcc --version
rocminfo | grep "Name:.*gfx"
```

The Makefile picks up the GPU architecture automatically with `HIP_ARCH=native`. For specific AMD architectures (like **Strix Halo** or **RDNA3**), you can override it:

```sh
make HIP_ARCH=gfx1151 # AMD Strix Halo / Radeon 8060S
make HIP_ARCH=gfx1100 # RX 7900 XTX
make HIP_ARCH=gfx1030 # RX 6800/6900 (RDNA2)
```

**Performance Tuning for APUs (Strix Halo):**
The ROCm backend is optimized for the unified memory architecture of the Strix Halo:
- **Memory Advisories**: The model uses `hipMemAdviseSetCoarseGrain` to allow the GPU to cache system-mapped weights effectively, drastically improving TPS on APUs.
- **Hardware Dot-Products**: Q2 quantization uses native RDNA3/3.5 `v_dot4_i32_i8` instructions for peak math throughput.
- **Coalesced Access**: GEMV kernels are tuned for RDNA3 wavefront sizes to saturate the 180+ GB/s memory bus.

**First-run kernel compilation:** On the first inference after a rebuild, ROCm may JIT-compile GPU kernels via COMGR. This can take a few minutes. Subsequent runs load from cache and start immediately.

**Strix Halo / APU notes (gfx1151):**
- **Always clean system cache before start**: On APUs with shared memory, the Linux PageCache can fragment the unified address space. It is highly recommended to flush caches before starting the server to ensure the 83GB model can be mapped/copied efficiently.
```sh
sudo sync; echo 3 | sudo tee /proc/sys/vm/drop_caches
```
- ROCm sees only GTT (system RAM) — the BIOS UMA carveout is not exposed for
compute. Use a small UMA carveout (e.g. 512 MB in BIOS) and rely on GTT for
the model and KV cache.
- The 84 GB IQ2 model maps as `cached coarse-grained` over GTT, which gives
the GPU direct access without explicit copies.
- Use `--backend cuda` (the flag name is unchanged; it maps to HIP internally).

```sh
./ds4-server --backend cuda --ctx 32768 \
--kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192
```

### Building for NVIDIA CUDA (Linux)

Same `make` command — the Makefile uses `nvcc` when `/opt/rocm` is absent.
Set `CUDA_ARCH` if needed:

```sh
make CUDA_ARCH=sm_120
make CUDA_ARCH= # old nvcc default target behavior
```

There is also a CPU reference/debug path:
### CPU reference build

```sh
./ds4 -p "Hello" --cpu
make cpu
./ds4
./ds4 -p "Hello"
```

Do not treat the CPU path as the production target. The CLI and `ds4-server`
support the CPU backend for reference/debug use and share the same KV session
and snapshot format as Metal and CUDA, but normal inference should use Metal or
CUDA.
CUDA/ROCm.

## Steering

Expand Down
Loading