diff --git a/cmake/rerun_if_needed.py b/cmake/rerun_if_needed.py
index 925bb1f2..b8f47e57 100755
--- a/cmake/rerun_if_needed.py
+++ b/cmake/rerun_if_needed.py
@@ -84,7 +84,7 @@ def get_path_git_hash(path_to_check: str, repo_root: T.Optional[str] = None) ->
     Changes to untracked files won't affect the hash.
     """
     # Get relative path from possibly absolute path
-    relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd())
+    relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd()).replace("\\", "/")
 
     tree_hash, diff_index = _get_hashes(relative_path, cwd=repo_root)
 
diff --git a/symforce/caspar/README.md b/symforce/caspar/README.md
index bdc3dfd0..58c50f2d 100644
--- a/symforce/caspar/README.md
+++ b/symforce/caspar/README.md
@@ -110,6 +110,17 @@ The following access patterns are currently supported:
 
 
 
+### AMD GPUs (ROCm/HIP)
+
+Caspar also runs on AMD GPUs through ROCm/HIP. The same generated kernels and runtime compile against the HIP toolchain via a small `cuda_to_hip.h` compatibility header that maps the CUDA spellings Caspar emits (`cudaMalloc`, `__syncthreads`, cooperative groups, CUB primitives, and friends) onto their HIP equivalents, so the symbolic kernel definitions are unchanged.
+
+To build a generated library for AMD GPUs, enable the HIP path when compiling:
+
+```python
+compile_caspar_library(caslib, output_dir, use_hip=True, hip_arch="gfx90a")
+```
+
+Set the target AMD GPU via the `hip_arch` argument, for example `gfx90a` for CDNA2 (MI200) or `gfx1100` for RDNA3. The ROCm build needs a HIP-enabled compiler (`hipcc`/`amdclang++`) and the `hip` and `hipcub` packages from a ROCm installation. When `use_hip` is off the build is unchanged and continues to use CUDA.
 
 ### Etymology
 Caspar, an acronym for **C**UDA **A**ccelerator for **S**ymbolic **P**rogramming with **A**daptive **R**eordering, is named after the Danish–Norwegian mathematician [Caspar Wessel](https://en.wikipedia.org/wiki/Caspar_Wessel). Wessel was the first to describe the geometrical interpretation of complex numbers as points in the complex plane and as vectors. However, since his thesis was written in Danish, it initially received little recognition. When his work was rediscovered later, the mathematician Sophus Lie, known for his discovery of Lie algebra, wrote the following in the newspaper:
diff --git a/symforce/caspar/code_generation/library.py b/symforce/caspar/code_generation/library.py
index a1e452ff..a6da1bd4 100644
--- a/symforce/caspar/code_generation/library.py
+++ b/symforce/caspar/code_generation/library.py
@@ -180,6 +180,8 @@ def compile(
         debug: bool = False,
         cuda_arch: str | None = None,
         jobs: int | None = None,
+        use_hip: bool = False,
+        hip_arch: str | None = None,
     ) -> None:
         build_dir = out_dir / "build"
         config_cmd: list[str | Path] = [
@@ -189,8 +191,15 @@ def compile(
             "-B",
             build_dir,
             f"-DCMAKE_BUILD_TYPE={'Debug' if debug else 'Release'}",
-            f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}",
         ]
+        if use_hip:
+            config_cmd.append("-DUSE_HIP=ON")
+            if hip_arch:
+                config_cmd.append(f"-DCMAKE_HIP_ARCHITECTURES={hip_arch}")
+        else:
+            config_cmd.append(
+                f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}"
+            )
         subprocess.run(config_cmd, check=True)
         build_cmd: list[str | Path] = ["cmake", "--build", build_dir, "--parallel"]
         if jobs:
diff --git a/symforce/caspar/source/runtime/cuda_to_hip.h b/symforce/caspar/source/runtime/cuda_to_hip.h
new file mode 100644
index 00000000..5fa7148f
--- /dev/null
+++ b/symforce/caspar/source/runtime/cuda_to_hip.h
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------------
+ * SymForce - Copyright 2025, Skydio, Inc.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc.
+ * This source code is under the Apache 2.0 license found in the LICENSE file.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ * ---------------------------------------------------------------------------- */
+
+#pragma once
+
+// CUDA-to-HIP compatibility header for ROCm/HIP builds.
+// Provides the CUDA API spellings the project uses via aliases to HIP equivalents.
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+#include <hip/hip_cooperative_groups.h>
+#include <hipcub/hipcub.hpp>
+
+// CUDA runtime API -> HIP runtime API
+#define cudaMalloc              hipMalloc
+#define cudaFree                hipFree
+#define cudaMemset              hipMemset
+#define cudaMemcpy              hipMemcpy
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost  hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice  hipMemcpyHostToDevice
+#define cudaError_t             hipError_t
+#define cudaSuccess             hipSuccess
+#define cudaStream_t            hipStream_t
+#define cudaDeviceSynchronize   hipDeviceSynchronize
+#define cudaGetLastError        hipGetLastError
+#define cudaGetErrorString      hipGetErrorString
+#define cudaSetDevice           hipSetDevice
+#define cudaGetDevice           hipGetDevice
+#define cudaPointerGetAttributes hipPointerGetAttributes
+#define cudaPointerAttributes   hipPointerAttribute_t
+
+// HIP shared-memory atomics are block-scoped by definition (no inter-block visibility),
+// so atomicAdd_block is equivalent to atomicAdd on shared memory.
+#define atomicAdd_block atomicAdd
+
+// CUB -> hipCUB namespace
+namespace cub = hipcub;
+
+// Cooperative groups: HIP has cg basics but lacks cg::reduce, cg::labeled_partition, memcpy_async.
+// Provide manual implementations where needed.
+
+namespace caspar_hip {
+
+// Butterfly reduction within a cooperative group (thread_block_tile or coalesced_group).
+// Replaces cg::reduce(group, val, cg::plus<T>()).
+template <typename GroupT, typename T>
+__device__ __forceinline__ T reduce_sum(GroupT group, T val) {
+  for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
+    val += group.shfl_xor(val, offset);
+  }
+  return val;
+}
+
+// Butterfly reduction for max (cg::greater<T>).
+template <typename GroupT, typename T>
+__device__ __forceinline__ T reduce_max(GroupT group, T val) {
+  for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
+    T other = group.shfl_xor(val, offset);
+    val = (val > other) ? val : other;
+  }
+  return val;
+}
+
+// Match_any: return a mask of lanes in the tile that have the same label.
+// HIP CG has match_any() for coalesced groups.
+template <typename GroupT, typename LabelT>
+__device__ __forceinline__ unsigned long long match_any_mask(GroupT group, LabelT label) {
+  // HIP cooperative_groups::coalesced_group has match_any
+  return group.match_any(label);
+}
+
+// Labeled partition emulation: reduce values within lanes sharing the same label,
+// and have exactly one lane per unique label perform the atomic.
+// Returns the reduced value and sets is_leader=true for exactly one lane per label.
+template <typename GroupT, typename T, typename LabelT>
+__device__ __forceinline__ T labeled_reduce_sum(GroupT group, T val, LabelT label, bool& is_leader) {
+  // Get mask of lanes with same label
+  unsigned long long same_label_mask = group.match_any(label);
+
+  // Find my position within the matching lanes
+  unsigned int my_lane = group.thread_rank();
+  unsigned long long lower_mask = (1ULL << my_lane) - 1;
+  unsigned int rank_in_label = __popcll(same_label_mask & lower_mask);
+
+  // Leader is the lowest-numbered lane in the group
+  is_leader = (rank_in_label == 0);
+
+  // Count total lanes with this label
+  unsigned int label_size = __popcll(same_label_mask);
+
+  // Butterfly reduction over the masked lanes
+  // For each reduction step, exchange with lane at offset if both are in same_label_mask
+  T result = val;
+  for (unsigned int offset = 1; offset < group.size(); offset <<= 1) {
+    unsigned int partner_lane = my_lane ^ offset;
+    bool partner_has_same_label = (same_label_mask >> partner_lane) & 1ULL;
+    T partner_val = group.shfl_xor(result, offset);
+    if (partner_has_same_label && partner_lane < group.size()) {
+      result += partner_val;
+    }
+  }
+
+  return result;
+}
+
+}  // namespace caspar_hip
+
+// Macros to replace cg::reduce calls (used in device code)
+#define CG_REDUCE_SUM(group, val) caspar_hip::reduce_sum(group, val)
+#define CG_LABELED_REDUCE_SUM(group, val, label, is_leader) \
+    caspar_hip::labeled_reduce_sum(group, val, label, is_leader)
+
+#else  // CUDA path
+
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/details/partitioning.h>
+#include <cooperative_groups/reduce.h>
+#include <cooperative_groups/memcpy_async.h>
+#include <cub/cub.cuh>
+
+namespace cg = cooperative_groups;
+
+// On CUDA, cg::reduce is available
+#define CG_REDUCE_SUM(group, val) cg::reduce(group, val, cg::plus<decltype(val)>())
+
+#endif
diff --git a/symforce/caspar/source/runtime/memops.cuh b/symforce/caspar/source/runtime/memops.cuh
index 49dc5299..dcdd31f6 100644
--- a/symforce/caspar/source/runtime/memops.cuh
+++ b/symforce/caspar/source/runtime/memops.cuh
@@ -7,11 +7,7 @@
 
 #include <stdio.h>
 
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
-#include <cooperative_groups/reduce.h>
-#include <cuda_runtime.h>
-
+#include "cuda_to_hip.h"
 #include "shared_indices.h"
 
 namespace cg = cooperative_groups;
@@ -260,6 +256,32 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
   if (idx.argsort != 0xffff) {  // 0xffff indicates the thread is not used.
     unique = indices[indices[idx.argsort].target].unique;
   }
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP lacks cg::labeled_partition; use per-lane atomicAdd fallback.
+  // The butterfly approach doesn't work for non-contiguous label groups (lanes at
+  // arbitrary positions share a label, but XOR only pairs specific distances).
+  // Per-lane atomicAdd is simpler and correct (shared-memory atomics are fast).
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    const SharedIndex idx_inner = indices[threadIdx.x];
+    StorageT val = StorageT(0);
+    if (idx_inner.argsort != 0xffff) {
+      // Read value BEFORE zeroing (order matters!)
+      val = inout_shared[idx_inner.argsort * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (idx_inner.argsort != 0xffff) {
+      // Each lane with valid data does its own atomicAdd to the target location
+      atomicAdd_block(&inout_shared[indices[idx_inner.argsort].target * dim_target + i], val);
+    }
+    __syncthreads();
+  }
+#else
   const cg::coalesced_group group = cg::labeled_partition(cg::coalesced_threads(), unique);
 
 #pragma unroll
@@ -279,6 +301,7 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
     }
     __syncthreads();
   }
+#endif
 
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
   for (int i = 0; i < dim_target; i++) {
@@ -303,6 +326,29 @@ template <uint dim_target, typename StorageT>
 __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT* const inout_shared,
                                               const bool valid) {
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
+  constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    StorageT val = StorageT(0);
+    if (valid) {
+      val = inout_shared[threadIdx.x * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (valid) {
+      atomicAdd_block(&inout_shared[i], val);
+    }
+    __syncthreads();
+  }
+  for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
+    output[blockIdx.x * dim_aligned + i] = inout_shared[i];
+  }
+#else
   const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
 
@@ -326,6 +372,7 @@ __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT*
   for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
     output[blockIdx.x * dim_aligned + i] = inout_shared[i];
   }
+#endif
 }
 template <typename StorageT>
 __forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* const inout_shared,
@@ -333,13 +380,28 @@ __forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* c
   auto group = cg::tiled_partition<32>(cg::this_thread_block());
 
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP lacks cg::reduce; use butterfly reduction within the tile
+  StorageT tot = valid ? data : StorageT(0);
+  for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
+    tot += group.shfl_xor(tot, o);
+  }
+#else
   StorageT tot = cg::reduce(group, valid ? data : 0.0f, cg::plus<StorageT>());
+#endif
   if (group.thread_rank() == 0) {
     inout_shared[group.meta_group_rank()] = tot;
   }
   __syncthreads();
   if (group.meta_group_rank() == 0) {
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    tot = inout_shared[group.thread_rank()];
+    for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
+      tot += group.shfl_xor(tot, o);
+    }
+#else
     tot = cg::reduce(group, inout_shared[group.thread_rank()], cg::plus<StorageT>());
+#endif
     if (group.thread_rank() == 0) {
       shared_tmp[offset] = tot;
     }
@@ -364,6 +426,29 @@ template <uint dim_target, typename StorageT>
 __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
                                                  StorageT* const inout_shared, const bool valid) {
   __syncthreads();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
+  constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
+
+#pragma unroll
+  for (int i = 0; i < dim_target; i++) {
+    StorageT val = StorageT(0);
+    if (valid) {
+      val = inout_shared[threadIdx.x * dim_target + i];
+    }
+    __syncthreads();
+    inout_shared[threadIdx.x * dim_target + i] = 0.0f;
+    __syncthreads();
+
+    if (valid) {
+      atomicAdd_block(&inout_shared[i], val);
+    }
+    __syncthreads();
+  }
+  for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
+    output[blockIdx.x * dim_aligned + i] += inout_shared[i];
+  }
+#else
   const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
   constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
 
@@ -387,6 +472,7 @@ __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
   for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
     output[blockIdx.x * dim_aligned + i] += inout_shared[i];
   }
+#endif
 }
 
 // READ SHARED
diff --git a/symforce/caspar/source/runtime/pybind_array_tools.cc b/symforce/caspar/source/runtime/pybind_array_tools.cc
index 9e43f198..7f057905 100644
--- a/symforce/caspar/source/runtime/pybind_array_tools.cc
+++ b/symforce/caspar/source/runtime/pybind_array_tools.cc
@@ -174,6 +174,18 @@ int GetDeviceId(const py::object& obj) {
     auto interface = obj.attr("__cuda_array_interface__").cast<py::dict>();
     auto data = interface["data"].cast<py::tuple>();
     void* ptr = reinterpret_cast<void*>(data[0].cast<size_t>());
+    // This is a host translation unit, so it uses local cuda*->hip* aliases here
+    // rather than the device-side cuda_to_hip.h (which pulls in hipcub and
+    // __device__ helpers intended for the .cu sources).
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    hipPointerAttribute_t attrs;
+    hipError_t err = hipPointerGetAttributes(&attrs, ptr);
+    if (err != hipSuccess) {
+      hipGetLastError();
+      return -1;
+    }
+    return attrs.device;
+#else
     cudaPointerAttributes attrs;
     cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
     if (err != cudaSuccess) {
@@ -181,6 +193,7 @@ int GetDeviceId(const py::object& obj) {
       return -1;
     }
     return attrs.device;
+#endif
   } catch (...) {
     return -1;  // Fallback if interface or attributes aren't available
   }
diff --git a/symforce/caspar/source/runtime/pybind_array_tools.h b/symforce/caspar/source/runtime/pybind_array_tools.h
index 96445288..1e938ed6 100644
--- a/symforce/caspar/source/runtime/pybind_array_tools.h
+++ b/symforce/caspar/source/runtime/pybind_array_tools.h
@@ -14,7 +14,11 @@
 
 #pragma once
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <pybind11/pybind11.h>
 
 namespace py = pybind11;
diff --git a/symforce/caspar/source/runtime/shared_indices.cu b/symforce/caspar/source/runtime/shared_indices.cu
index f3d20c61..ecc4baab 100644
--- a/symforce/caspar/source/runtime/shared_indices.cu
+++ b/symforce/caspar/source/runtime/shared_indices.cu
@@ -3,10 +3,7 @@
  * This source code is under the Apache 2.0 license found in the LICENSE file.
  * ---------------------------------------------------------------------------- */
 
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
-#include <cooperative_groups/reduce.h>
-
+#include "cuda_to_hip.h"
 #include "shared_indices.h"
 
 namespace cg = cooperative_groups;
@@ -111,16 +108,31 @@ __forceinline__ __device__ uint GetOrd(const uint* values, const uint length, co
 __global__ void SharedIndicesKernel(const uint* const __restrict__ indices,
                                     SharedIndex* const __restrict__ shared_indices_out,
                                     const uint size) {
-  const auto block = cg::this_thread_block();
+  auto block = cg::this_thread_block();
 
   const auto gtrank = cg::this_grid().thread_rank();
   const auto btrank = block.thread_rank();
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP's group_dim() is non-const; use blockDim directly
+  const int start = blockIdx.x * blockDim.x;
+#else
   const int start = block.group_index().x * block.group_dim().x;
+#endif
   const int num = min(1024, size - start) * sizeof(uint);
 
   __shared__ uint indices_loc[1024];
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+  // HIP lacks cg::memcpy_async; use synchronous block-strided copy
+  const uint* src = indices + start;
+  const uint num_elements = num / sizeof(uint);
+  for (uint i = btrank; i < num_elements; i += block.size()) {
+    indices_loc[i] = src[i];
+  }
+  __syncthreads();
+#else
   cg::memcpy_async(block, indices_loc, indices + start, num);
   cg::wait(block);
+#endif
 
   const uint val = gtrank < size ? indices_loc[btrank] : 0xFFFFFFFF;
 
diff --git a/symforce/caspar/source/runtime/shared_indices.h b/symforce/caspar/source/runtime/shared_indices.h
index 8b2db65e..e40fed15 100644
--- a/symforce/caspar/source/runtime/shared_indices.h
+++ b/symforce/caspar/source/runtime/shared_indices.h
@@ -7,7 +7,11 @@
 
 #include <cstdint>
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace caspar {
 
diff --git a/symforce/caspar/source/runtime/solver_tools.cu b/symforce/caspar/source/runtime/solver_tools.cu
index 5954fb04..ca19dc0f 100644
--- a/symforce/caspar/source/runtime/solver_tools.cu
+++ b/symforce/caspar/source/runtime/solver_tools.cu
@@ -3,9 +3,7 @@
  * This source code is under the Apache 2.0 license found in the LICENSE file.
  * ---------------------------------------------------------------------------- */
 
-#include <cub/cub.cuh>
-#include <cuda_runtime.h>
-
+#include "cuda_to_hip.h"
 #include "solver_tools.h"
 
 namespace caspar {
diff --git a/symforce/caspar/source/runtime/sort_indices.cu b/symforce/caspar/source/runtime/sort_indices.cu
index 0e1356ec..ad14e711 100644
--- a/symforce/caspar/source/runtime/sort_indices.cu
+++ b/symforce/caspar/source/runtime/sort_indices.cu
@@ -3,8 +3,7 @@
  * This source code is under the Apache 2.0 license found in the LICENSE file.
  * ---------------------------------------------------------------------------- */
 
-#include <cub/cub.cuh>
-
+#include "cuda_to_hip.h"
 #include "sort_indices.h"
 
 namespace {
diff --git a/symforce/caspar/source/runtime/sort_indices.h b/symforce/caspar/source/runtime/sort_indices.h
index 37800303..311790d8 100644
--- a/symforce/caspar/source/runtime/sort_indices.h
+++ b/symforce/caspar/source/runtime/sort_indices.h
@@ -5,7 +5,11 @@
 
 #pragma once
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 
 namespace caspar {
 
diff --git a/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja b/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja
index c7b1c6eb..bec1bc7c 100644
--- a/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja
+++ b/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja
@@ -4,18 +4,32 @@
  # ---------------------------------------------------------------------------- #}
 cmake_minimum_required(VERSION 3.18)
 
-if(NOT DEFINED CMAKE_CUDA_COMPILER AND DEFINED ENV{CUDACXX})
-  set(CMAKE_CUDA_COMPILER "$ENV{CUDACXX}" CACHE FILEPATH "CUDA compiler")
-endif()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-project(caspar_library LANGUAGES CXX CUDA)
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
 
-# SASS for Turing/Ampere/Ada (sm_75–89) + PTX fallback for forward JIT-compatibility. sm_75 minimum required by cooperative_groups::reduce.
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 75-virtual)
-endif()
+if(USE_HIP)
+  # project(... LANGUAGES ... HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise
+  # auto-detects the host GPU and errors on a no-GPU build host.
+  project(caspar_library LANGUAGES CXX HIP)
 
-find_package(CUDAToolkit REQUIRED)
+  find_package(hip REQUIRED)
+  find_package(hipcub REQUIRED)
+else()
+  if(NOT DEFINED CMAKE_CUDA_COMPILER AND DEFINED ENV{CUDACXX})
+    set(CMAKE_CUDA_COMPILER "$ENV{CUDACXX}" CACHE FILEPATH "CUDA compiler")
+  endif()
+
+  project(caspar_library LANGUAGES CXX CUDA)
+
+  # SASS for Turing/Ampere/Ada (sm_75-89) + PTX fallback for forward JIT-compatibility. sm_75 minimum required by cooperative_groups::reduce.
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 75-virtual)
+  endif()
+
+  find_package(CUDAToolkit REQUIRED)
+endif()
 
 {% if python_bindings %}
 option(CASPAR_BUILD_PYTHON_BINDINGS "Build Python bindings via pybind11" ON)
@@ -45,10 +59,31 @@ list(FILTER CUDA_HEADERS EXCLUDE REGEX "pybind.*")
 
 add_library({{caslib.name}}_core STATIC ${CUDA_SOURCES} ${CUDA_HEADERS})
 target_include_directories({{caslib.name}}_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries({{caslib.name}}_core PUBLIC CUDA::cudart)
-target_compile_options({{caslib.name}}_core PRIVATE
-  $<$<COMPILE_LANGUAGE:CUDA>:--use_fast_math>
-)
+
+if(USE_HIP)
+  # Mark .cu files as HIP language
+  set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)
+  target_compile_definitions({{caslib.name}}_core PUBLIC USE_HIP)
+  # Link hip targets PRIVATE to avoid propagating HIP compile options to pure-CXX consumers.
+  # The static library has the HIP code linked in; pybind consumers only need the includes.
+  target_link_libraries({{caslib.name}}_core PRIVATE hip::host hip::hipcub)
+  # Expose ROCm include directory for consumers that need hipError_t etc
+  target_include_directories({{caslib.name}}_core PUBLIC ${hip_INCLUDE_DIRS})
+  # Use -ffp-contract=fast -fno-math-errno instead of -ffast-math.
+  # clang's -ffast-math enables -fassociative-math which can NaN backward passes.
+  target_compile_options({{caslib.name}}_core PRIVATE
+    $<$<COMPILE_LANGUAGE:HIP>:-ffp-contract=fast -fno-math-errno>
+  )
+  set_target_properties({{caslib.name}}_core PROPERTIES
+    HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+  )
+else()
+  target_link_libraries({{caslib.name}}_core PUBLIC CUDA::cudart)
+  target_compile_options({{caslib.name}}_core PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:--use_fast_math>
+  )
+endif()
+
 set_target_properties({{caslib.name}}_core PROPERTIES
   POSITION_INDEPENDENT_CODE ON
 )
@@ -60,6 +95,12 @@ if(CASPAR_BUILD_PYTHON_BINDINGS)
   pybind11_add_module({{caslib.name}} ${CPP_SOURCES} ${CPP_HEADERS})
   target_link_libraries({{caslib.name}} PRIVATE {{caslib.name}}_core)
 
+  if(USE_HIP)
+    # Compile pybind sources as HIP (so HIP headers work) but they have no device code.
+    # The HIP compiler handles them as host-only code.
+    set_source_files_properties(${CPP_SOURCES} PROPERTIES LANGUAGE HIP)
+  endif()
+
   set_target_properties({{caslib.name}} PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
     RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
diff --git a/symforce/caspar/source/templates/caspar_mappings.cu.jinja b/symforce/caspar/source/templates/caspar_mappings.cu.jinja
index eb398676..ac314bf0 100644
--- a/symforce/caspar/source/templates/caspar_mappings.cu.jinja
+++ b/symforce/caspar/source/templates/caspar_mappings.cu.jinja
@@ -4,13 +4,10 @@
  # ---------------------------------------------------------------------------- #}
 #include <stdio.h>
 
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
+#include "cuda_to_hip.h"
 
 #include "caspar_mappings.h"
 
-namespace cg = cooperative_groups;
-
 // We use shared memory to improve the memory access.
 // A smaller block size of 32 allows for larger nodetypes.
 constexpr int block_size = 32;
diff --git a/symforce/caspar/source/templates/caspar_mappings.h.jinja b/symforce/caspar/source/templates/caspar_mappings.h.jinja
index af3795ad..53684205 100644
--- a/symforce/caspar/source/templates/caspar_mappings.h.jinja
+++ b/symforce/caspar/source/templates/caspar_mappings.h.jinja
@@ -4,7 +4,7 @@
  # ---------------------------------------------------------------------------- #}
 #pragma once
 
-#include <cuda_runtime.h>
+#include "cuda_to_hip.h"
 
 namespace caspar {
 {% for nodetype in caslib.exposed_types%}
diff --git a/symforce/caspar/source/templates/kernel.cu.jinja b/symforce/caspar/source/templates/kernel.cu.jinja
index fa30d92d..e18addd0 100644
--- a/symforce/caspar/source/templates/kernel.cu.jinja
+++ b/symforce/caspar/source/templates/kernel.cu.jinja
@@ -2,17 +2,11 @@
  # SymForce - Copyright 2025, Skydio, Inc.
  # This source code is under the Apache 2.0 license found in the LICENSE file.
  # ---------------------------------------------------------------------------- #}
-#include <cooperative_groups.h>
-#include <cooperative_groups/details/partitioning.h>
-#include <cooperative_groups/memcpy_async.h>
-#include <cooperative_groups/reduce.h>
-#include <cuda_runtime.h>
+#include "cuda_to_hip.h"
 
 #include "kernel_{{kernel.name}}.h"
 #include "memops.cuh"
 
-namespace cg = cooperative_groups;
-
 namespace caspar {
 
 __global__ void
diff --git a/symforce/caspar/source/templates/kernel.h.jinja b/symforce/caspar/source/templates/kernel.h.jinja
index 476a50e4..8c189f4c 100644
--- a/symforce/caspar/source/templates/kernel.h.jinja
+++ b/symforce/caspar/source/templates/kernel.h.jinja
@@ -4,7 +4,7 @@
  # ---------------------------------------------------------------------------- #}
 #pragma once
 
-#include <cuda_runtime.h>
+#include "cuda_to_hip.h"
 
 #include "shared_indices.h"
 
diff --git a/symforce/caspar/source/templates/solver.h.jinja b/symforce/caspar/source/templates/solver.h.jinja
index 855a9789..c9a35c7a 100644
--- a/symforce/caspar/source/templates/solver.h.jinja
+++ b/symforce/caspar/source/templates/solver.h.jinja
@@ -6,7 +6,7 @@
 
 #include <cstdint>
 
-#include <cuda_runtime.h>
+#include "cuda_to_hip.h"
 
 #include <vector>