symforce-org · jeffdaily · Jun 16, 2026 · Jun 23, 2026
diff --git a/cmake/rerun_if_needed.py b/cmake/rerun_if_needed.py
@@ -84,7 +84,7 @@ def get_path_git_hash(path_to_check: str, repo_root: T.Optional[str] = None) ->
     Changes to untracked files won't affect the hash.
     """
     # Get relative path from possibly absolute path
-    relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd())
+    relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd()).replace("\\", "/")
 
     tree_hash, diff_index = _get_hashes(relative_path, cwd=repo_root)
 

diff --git a/symforce/caspar/README.md b/symforce/caspar/README.md
@@ -110,6 +110,17 @@ The following access patterns are currently supported:
 
 
 
+### AMD GPUs (ROCm/HIP)
+
+Caspar also runs on AMD GPUs through ROCm/HIP. The same generated kernels and runtime compile against the HIP toolchain via a small `cuda_to_hip.h` compatibility header that maps the CUDA spellings Caspar emits (`cudaMalloc`, `__syncthreads`, cooperative groups, CUB primitives, and friends) onto their HIP equivalents, so the symbolic kernel definitions are unchanged.
+
+To build a generated library for AMD GPUs, enable the HIP path when compiling:
+
+```python
+compile_caspar_library(caslib, output_dir, use_hip=True, hip_arch="gfx90a")
+```
+
+Set the target AMD GPU via the `hip_arch` argument, for example `gfx90a` for CDNA2 (MI200) or `gfx1100` for RDNA3. The ROCm build needs a HIP-enabled compiler (`hipcc`/`amdclang++`) and the `hip` and `hipcub` packages from a ROCm installation. When `use_hip` is off the build is unchanged and continues to use CUDA.
 
 ### Etymology
 Caspar, an acronym for **C**UDA **A**ccelerator for **S**ymbolic **P**rogramming with **A**daptive **R**eordering, is named after the Danish–Norwegian mathematician [Caspar Wessel](https://en.wikipedia.org/wiki/Caspar_Wessel). Wessel was the first to describe the geometrical interpretation of complex numbers as points in the complex plane and as vectors. However, since his thesis was written in Danish, it initially received little recognition. When his work was rediscovered later, the mathematician Sophus Lie, known for his discovery of Lie algebra, wrote the following in the newspaper:

diff --git a/symforce/caspar/code_generation/library.py b/symforce/caspar/code_generation/library.py
@@ -180,6 +180,8 @@ def compile(
         debug: bool = False,
         cuda_arch: str | None = None,
         jobs: int | None = None,
+        use_hip: bool = False,
+        hip_arch: str | None = None,
     ) -> None:
         build_dir = out_dir / "build"
         config_cmd: list[str | Path] = [
@@ -189,8 +191,15 @@ def compile(
             "-B",
             build_dir,
             f"-DCMAKE_BUILD_TYPE={'Debug' if debug else 'Release'}",
-            f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}",
         ]
+        if use_hip:
+            config_cmd.append("-DUSE_HIP=ON")
+            if hip_arch:
+                config_cmd.append(f"-DCMAKE_HIP_ARCHITECTURES={hip_arch}")
+        else:
+            config_cmd.append(
+                f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}"
+            )
         subprocess.run(config_cmd, check=True)
         build_cmd: list[str | Path] = ["cmake", "--build", build_dir, "--parallel"]
         if jobs:

diff --git a/symforce/caspar/source/runtime/cuda_to_hip.h b/symforce/caspar/source/runtime/cuda_to_hip.h
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------------
+ * SymForce - Copyright 2025, Skydio, Inc.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc.
+ * This source code is under the Apache 2.0 license found in the LICENSE file.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ * ---------------------------------------------------------------------------- */
+
+#pragma once
+
+// CUDA-to-HIP compatibility header for ROCm/HIP builds.
+// Provides the CUDA API spellings the project uses via aliases to HIP equivalents.
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include <hipcub/hipcub.hpp>
+
+// CUDA runtime API -> HIP runtime API
+#define cudaMalloc              hipMalloc
+#define cudaFree                hipFree
+#define cudaMemset              hipMemset
+#define cudaMemcpy              hipMemcpy
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost  hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice  hipMemcpyHostToDevice
+#define cudaError_t             hipError_t
+#define cudaSuccess             hipSuccess
+#define cudaStream_t            hipStream_t
+#define cudaDeviceSynchronize   hipDeviceSynchronize
+#define cudaGetLastError        hipGetLastError
+#define cudaGetErrorString      hipGetErrorString
+#define cudaSetDevice           hipSetDevice
+#define cudaGetDevice           hipGetDevice
+#define cudaPointerGetAttributes hipPointerGetAttributes
+#define cudaPointerAttributes   hipPointerAttribute_t
+
+// HIP shared-memory atomics are block-scoped by definition (no inter-block visibility),
+// so atomicAdd_block is equivalent to atomicAdd on shared memory.
+#define atomicAdd_block atomicAdd
+
+// CUB -> hipCUB namespace
+namespace cub = hipcub;
+
+// Cooperative groups: HIP has cg basics but lacks cg::reduce, cg::labeled_partition, memcpy_async.
+// Provide manual implementations where needed.
+
+namespace caspar_hip {
+
+// Butterfly reduction within a cooperative group (thread_block_tile or coalesced_group).
+// Replaces cg::reduce(group, val, cg::plus<T>()).
+template <typename GroupT, typename T>
+__device__ __forceinline__ T reduce_sum(GroupT group, T val) {
+  for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
+    val += group.shfl_xor(val, offset);
+  }
+  return val;
+}
+
+// Butterfly reduction for max (cg::greater<T>).
+template <typename GroupT, typename T>
+__device__ __forceinline__ T reduce_max(GroupT group, T val) {
+  for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
+    T other = group.shfl_xor(val, offset);
+    val = (val > other) ? val : other;
+  }
+  return val;
+}
+
+// Match_any: return a mask of lanes in the tile that have the same label.
+// HIP CG has match_any() for coalesced groups.
+template <typename GroupT, typename LabelT>
+__device__ __forceinline__ unsigned long long match_any_mask(GroupT group, LabelT label) {
+  // HIP cooperative_groups::coalesced_group has match_any
+  return group.match_any(label);
+}
+
+// Labeled partition emulation: reduce values within lanes sharing the same label,
+// and have exactly one lane per unique label perform the atomic.
+// Returns the reduced value and sets is_leader=true for exactly one lane per label.
+template <typename GroupT, typename T, typename LabelT>
+__device__ __forceinline__ T labeled_reduce_sum(GroupT group, T val, LabelT label, bool& is_leader) {
+  // Get mask of lanes with same label
+  unsigned long long same_label_mask = group.match_any(label);
+
+  // Find my position within the matching lanes
+  unsigned int my_lane = group.thread_rank();
+  unsigned long long lower_mask = (1ULL << my_lane) - 1;
+  unsigned int rank_in_label = __popcll(same_label_mask & lower_mask);
+
+  // Leader is the lowest-numbered lane in the group
+  is_leader = (rank_in_label == 0);
+
+  // Count total lanes with this label
+  unsigned int label_size = __popcll(same_label_mask);
+
+  // Butterfly reduction over the masked lanes
+  // For each reduction step, exchange with lane at offset if both are in same_label_mask
+  T result = val;
+  for (unsigned int offset = 1; offset < group.size(); offset <<= 1) {
+    unsigned int partner_lane = my_lane ^ offset;
+    bool partner_has_same_label = (same_label_mask >> partner_lane) & 1ULL;
+    T partner_val = group.shfl_xor(result, offset);
+    if (partner_has_same_label && partner_lane < group.size()) {
+      result += partner_val;
+    }
+  }
+
+  return result;
+}
+
+}  // namespace caspar_hip
+
+// Macros to replace cg::reduce calls (used in device code)
+#define CG_REDUCE_SUM(group, val) caspar_hip::reduce_sum(group, val)
+#define CG_LABELED_REDUCE_SUM(group, val, label, is_leader) \
+    caspar_hip::labeled_reduce_sum(group, val, label, is_leader)
+
+#else  // CUDA path
+
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/details/partitioning.h>
+#include <cooperative_groups/reduce.h>
+#include <cooperative_groups/memcpy_async.h>
+#include <cub/cub.cuh>
+
+namespace cg = cooperative_groups;
+
+// On CUDA, cg::reduce is available
+#define CG_REDUCE_SUM(group, val) cg::reduce(group, val, cg::plus<decltype(val)>())
+
+#endif
diff --git a/symforce/caspar/source/runtime/memops.cuh b/symforce/caspar/source/runtime/memops.cuh
@@ -7,11 +7,7 @@
 
 #include <stdio.h>
 
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
-#include <cooperative_groups/reduce.h>
-#include <cuda_runtime.h>
-
+#include "cuda_to_hip.h"
 #include "shared_indices.h"
 
 namespace cg = cooperative_groups;
@@ -260,6 +256,32 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
   if (idx.argsort != 0xffff) {  // 0xffff indicates the thread is not used.
     unique = indices[indices[idx.argsort].target].unique;
   }
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP lacks cg::labeled_partition; use per-lane atomicAdd fallback.
+  // The butterfly approach doesn't work for non-contiguous label groups (lanes at
+  // arbitrary positions share a label, but XOR only pairs specific distances).
+  // Per-lane atomicAdd is simpler and correct (shared-memory atomics are fast).
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    const SharedIndex idx_inner = indices[threadIdx.x];
+    StorageT val = StorageT(0);
+    if (idx_inner.argsort != 0xffff) {
+      // Read value BEFORE zeroing (order matters!)
+      val = inout_shared[idx_inner.argsort * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (idx_inner.argsort != 0xffff) {
+      // Each lane with valid data does its own atomicAdd to the target location
+      atomicAdd_block(&inout_shared[indices[idx_inner.argsort].target * dim_target + i], val);
+    }
+    __syncthreads();
+  }
+#else
   const cg::coalesced_group group = cg::labeled_partition(cg::coalesced_threads(), unique);
 
 #pragma unroll
@@ -279,6 +301,7 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
     }
     __syncthreads();
   }
+#endif
 
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
   for (int i = 0; i < dim_target; i++) {
@@ -303,6 +326,29 @@ template <uint dim_target, typename StorageT>
 __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT* const inout_shared,
                                               const bool valid) {
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
+  constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    StorageT val = StorageT(0);
+    if (valid) {
+      val = inout_shared[threadIdx.x * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (valid) {
+      atomicAdd_block(&inout_shared[i], val);
+    }
+    __syncthreads();
+  }
+  for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
+    output[blockIdx.x * dim_aligned + i] = inout_shared[i];
+  }
+#else
   const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
 
@@ -326,20 +372,36 @@ __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT*
   for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
     output[blockIdx.x * dim_aligned + i] = inout_shared[i];
   }
+#endif
 }
 template <typename StorageT>
 __forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* const inout_shared,
                                          const uint offset, const bool valid, StorageT data) {
   auto group = cg::tiled_partition<32>(cg::this_thread_block());
 
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP lacks cg::reduce; use butterfly reduction within the tile
+  StorageT tot = valid ? data : StorageT(0);
+  for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
+    tot += group.shfl_xor(tot, o);
+  }
+#else
   StorageT tot = cg::reduce(group, valid ? data : 0.0f, cg::plus<StorageT>());
+#endif
   if (group.thread_rank() == 0) {
     inout_shared[group.meta_group_rank()] = tot;
   }
   __syncthreads();
   if (group.meta_group_rank() == 0) {
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    tot = inout_shared[group.thread_rank()];
+    for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
+      tot += group.shfl_xor(tot, o);
+    }
+#else
     tot = cg::reduce(group, inout_shared[group.thread_rank()], cg::plus<StorageT>());
+#endif
     if (group.thread_rank() == 0) {
       shared_tmp[offset] = tot;
     }
@@ -364,6 +426,29 @@ template <uint dim_target, typename StorageT>
 __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
                                                  StorageT* const inout_shared, const bool valid) {
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
+  constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    StorageT val = StorageT(0);
+    if (valid) {
+      val = inout_shared[threadIdx.x * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (valid) {
+      atomicAdd_block(&inout_shared[i], val);
+    }
+    __syncthreads();
+  }
+  for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
+    output[blockIdx.x * dim_aligned + i] += inout_shared[i];
+  }
+#else
   const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
 
@@ -387,6 +472,7 @@ __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
   for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
     output[blockIdx.x * dim_aligned + i] += inout_shared[i];
   }
+#endif
 }
 
 // READ SHARED

diff --git a/symforce/caspar/source/runtime/pybind_array_tools.cc b/symforce/caspar/source/runtime/pybind_array_tools.cc
@@ -174,13 +174,26 @@ int GetDeviceId(const py::object& obj) {
     auto interface = obj.attr("__cuda_array_interface__").cast<py::dict>();
     auto data = interface["data"].cast<py::tuple>();
     void* ptr = reinterpret_cast<void*>(data[0].cast<size_t>());
+    // This is a host translation unit, so it uses local cuda*->hip* aliases here
+    // rather than the device-side cuda_to_hip.h (which pulls in hipcub and
+    // __device__ helpers intended for the .cu sources).
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    hipPointerAttribute_t attrs;
+    hipError_t err = hipPointerGetAttributes(&attrs, ptr);
+    if (err != hipSuccess) {
+      hipGetLastError();
+      return -1;
+    }
+    return attrs.device;
+#else
     cudaPointerAttributes attrs;
     cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
     if (err != cudaSuccess) {
       cudaGetLastError();
       return -1;
     }
     return attrs.device;
+#endif
   } catch (...) {
     return -1;  // Fallback if interface or attributes aren't available
   }

diff --git a/symforce/caspar/source/runtime/pybind_array_tools.h b/symforce/caspar/source/runtime/pybind_array_tools.h
@@ -14,7 +14,11 @@
 
 #pragma once
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <pybind11/pybind11.h>
 
 namespace py = pybind11;