Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/rerun_if_needed.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def get_path_git_hash(path_to_check: str, repo_root: T.Optional[str] = None) ->
Changes to untracked files won't affect the hash.
"""
# Get relative path from possibly absolute path
relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd())
relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd()).replace("\\", "/")

tree_hash, diff_index = _get_hashes(relative_path, cwd=repo_root)

Expand Down
11 changes: 11 additions & 0 deletions symforce/caspar/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,17 @@ The following access patterns are currently supported:



### AMD GPUs (ROCm/HIP)

Caspar also runs on AMD GPUs through ROCm/HIP. The same generated kernels and runtime compile against the HIP toolchain via a small `cuda_to_hip.h` compatibility header that maps the CUDA spellings Caspar emits (`cudaMalloc`, `__syncthreads`, cooperative groups, CUB primitives, and friends) onto their HIP equivalents, so the symbolic kernel definitions are unchanged.

To build a generated library for AMD GPUs, enable the HIP path when compiling:

```python
compile_caspar_library(caslib, output_dir, use_hip=True, hip_arch="gfx90a")
```

Set the target AMD GPU via the `hip_arch` argument, for example `gfx90a` for CDNA2 (MI200) or `gfx1100` for RDNA3. The ROCm build needs a HIP-enabled compiler (`hipcc`/`amdclang++`) and the `hip` and `hipcub` packages from a ROCm installation. When `use_hip` is off the build is unchanged and continues to use CUDA.

### Etymology
Caspar, an acronym for **C**UDA **A**ccelerator for **S**ymbolic **P**rogramming with **A**daptive **R**eordering, is named after the Danish–Norwegian mathematician [Caspar Wessel](https://en.wikipedia.org/wiki/Caspar_Wessel). Wessel was the first to describe the geometrical interpretation of complex numbers as points in the complex plane and as vectors. However, since his thesis was written in Danish, it initially received little recognition. When his work was rediscovered later, the mathematician Sophus Lie, known for his discovery of Lie algebra, wrote the following in the newspaper:
Expand Down
11 changes: 10 additions & 1 deletion symforce/caspar/code_generation/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ def compile(
debug: bool = False,
cuda_arch: str | None = None,
jobs: int | None = None,
use_hip: bool = False,
hip_arch: str | None = None,
) -> None:
build_dir = out_dir / "build"
config_cmd: list[str | Path] = [
Expand All @@ -189,8 +191,15 @@ def compile(
"-B",
build_dir,
f"-DCMAKE_BUILD_TYPE={'Debug' if debug else 'Release'}",
f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}",
]
if use_hip:
config_cmd.append("-DUSE_HIP=ON")
if hip_arch:
config_cmd.append(f"-DCMAKE_HIP_ARCHITECTURES={hip_arch}")
else:
config_cmd.append(
f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}"
)
subprocess.run(config_cmd, check=True)
build_cmd: list[str | Path] = ["cmake", "--build", build_dir, "--parallel"]
if jobs:
Expand Down
134 changes: 134 additions & 0 deletions symforce/caspar/source/runtime/cuda_to_hip.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/* ----------------------------------------------------------------------------
* SymForce - Copyright 2025, Skydio, Inc.
* Copyright (c) 2026 Advanced Micro Devices, Inc.
* This source code is under the Apache 2.0 license found in the LICENSE file.
*
* Author: Jeff Daily <jeff.daily@amd.com>
* ---------------------------------------------------------------------------- */

#pragma once

// CUDA-to-HIP compatibility header for ROCm/HIP builds.
// Provides the CUDA API spellings the project uses via aliases to HIP equivalents.

#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)

#include <hip/hip_runtime.h>
#include <hip/hip_cooperative_groups.h>
#include <hipcub/hipcub.hpp>

// CUDA runtime API -> HIP runtime API
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemset hipMemset
#define cudaMemcpy hipMemcpy
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaStream_t hipStream_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaGetLastError hipGetLastError
#define cudaGetErrorString hipGetErrorString
#define cudaSetDevice hipSetDevice
#define cudaGetDevice hipGetDevice
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaPointerAttributes hipPointerAttribute_t

// HIP shared-memory atomics are block-scoped by definition (no inter-block visibility),
// so atomicAdd_block is equivalent to atomicAdd on shared memory.
#define atomicAdd_block atomicAdd

// CUB -> hipCUB namespace
namespace cub = hipcub;

// Cooperative groups: HIP has cg basics but lacks cg::reduce, cg::labeled_partition, memcpy_async.
// Provide manual implementations where needed.

namespace caspar_hip {

// Butterfly reduction within a cooperative group (thread_block_tile or coalesced_group).
// Replaces cg::reduce(group, val, cg::plus<T>()).
template <typename GroupT, typename T>
__device__ __forceinline__ T reduce_sum(GroupT group, T val) {
for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
val += group.shfl_xor(val, offset);
}
return val;
}

// Butterfly reduction for max (cg::greater<T>).
template <typename GroupT, typename T>
__device__ __forceinline__ T reduce_max(GroupT group, T val) {
for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) {
T other = group.shfl_xor(val, offset);
val = (val > other) ? val : other;
}
return val;
}

// Match_any: return a mask of lanes in the tile that have the same label.
// HIP CG has match_any() for coalesced groups.
template <typename GroupT, typename LabelT>
__device__ __forceinline__ unsigned long long match_any_mask(GroupT group, LabelT label) {
// HIP cooperative_groups::coalesced_group has match_any
return group.match_any(label);
}

// Labeled partition emulation: reduce values within lanes sharing the same label,
// and have exactly one lane per unique label perform the atomic.
// Returns the reduced value and sets is_leader=true for exactly one lane per label.
template <typename GroupT, typename T, typename LabelT>
__device__ __forceinline__ T labeled_reduce_sum(GroupT group, T val, LabelT label, bool& is_leader) {
// Get mask of lanes with same label
unsigned long long same_label_mask = group.match_any(label);

// Find my position within the matching lanes
unsigned int my_lane = group.thread_rank();
unsigned long long lower_mask = (1ULL << my_lane) - 1;
unsigned int rank_in_label = __popcll(same_label_mask & lower_mask);

// Leader is the lowest-numbered lane in the group
is_leader = (rank_in_label == 0);

// Count total lanes with this label
unsigned int label_size = __popcll(same_label_mask);

// Butterfly reduction over the masked lanes
// For each reduction step, exchange with lane at offset if both are in same_label_mask
T result = val;
for (unsigned int offset = 1; offset < group.size(); offset <<= 1) {
unsigned int partner_lane = my_lane ^ offset;
bool partner_has_same_label = (same_label_mask >> partner_lane) & 1ULL;
T partner_val = group.shfl_xor(result, offset);
if (partner_has_same_label && partner_lane < group.size()) {
result += partner_val;
}
}

return result;
}

} // namespace caspar_hip

// Macros to replace cg::reduce calls (used in device code)
#define CG_REDUCE_SUM(group, val) caspar_hip::reduce_sum(group, val)
#define CG_LABELED_REDUCE_SUM(group, val, label, is_leader) \
caspar_hip::labeled_reduce_sum(group, val, label, is_leader)

#else // CUDA path

#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <cooperative_groups/details/partitioning.h>
#include <cooperative_groups/reduce.h>
#include <cooperative_groups/memcpy_async.h>
#include <cub/cub.cuh>

namespace cg = cooperative_groups;

// On CUDA, cg::reduce is available
#define CG_REDUCE_SUM(group, val) cg::reduce(group, val, cg::plus<decltype(val)>())

#endif
96 changes: 91 additions & 5 deletions symforce/caspar/source/runtime/memops.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@

#include <stdio.h>

#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
#include <cooperative_groups/reduce.h>
#include <cuda_runtime.h>

#include "cuda_to_hip.h"
#include "shared_indices.h"

namespace cg = cooperative_groups;
Expand Down Expand Up @@ -260,6 +256,32 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
if (idx.argsort != 0xffff) { // 0xffff indicates the thread is not used.
unique = indices[indices[idx.argsort].target].unique;
}

#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
// HIP lacks cg::labeled_partition; use per-lane atomicAdd fallback.
// The butterfly approach doesn't work for non-contiguous label groups (lanes at
// arbitrary positions share a label, but XOR only pairs specific distances).
// Per-lane atomicAdd is simpler and correct (shared-memory atomics are fast).

#pragma unroll
for (int i = 0; i < dim_target; i++) {
const SharedIndex idx_inner = indices[threadIdx.x];
StorageT val = StorageT(0);
if (idx_inner.argsort != 0xffff) {
// Read value BEFORE zeroing (order matters!)
val = inout_shared[idx_inner.argsort * dim_target + i];
}
__syncthreads();
inout_shared[threadIdx.x * dim_target + i] = 0.0f;
__syncthreads();

if (idx_inner.argsort != 0xffff) {
// Each lane with valid data does its own atomicAdd to the target location
atomicAdd_block(&inout_shared[indices[idx_inner.argsort].target * dim_target + i], val);
}
__syncthreads();
}
#else
const cg::coalesced_group group = cg::labeled_partition(cg::coalesced_threads(), unique);

#pragma unroll
Expand All @@ -279,6 +301,7 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin
}
__syncthreads();
}
#endif

constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;
for (int i = 0; i < dim_target; i++) {
Expand All @@ -303,6 +326,29 @@ template <uint dim_target, typename StorageT>
__forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT* const inout_shared,
const bool valid) {
__syncthreads();
#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
// HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;

#pragma unroll
for (int i = 0; i < dim_target; i++) {
StorageT val = StorageT(0);
if (valid) {
val = inout_shared[threadIdx.x * dim_target + i];
}
__syncthreads();
inout_shared[threadIdx.x * dim_target + i] = 0.0f;
__syncthreads();

if (valid) {
atomicAdd_block(&inout_shared[i], val);
}
__syncthreads();
}
for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
output[blockIdx.x * dim_aligned + i] = inout_shared[i];
}
#else
const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;

Expand All @@ -326,20 +372,36 @@ __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT*
for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
output[blockIdx.x * dim_aligned + i] = inout_shared[i];
}
#endif
}
template <typename StorageT>
__forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* const inout_shared,
const uint offset, const bool valid, StorageT data) {
auto group = cg::tiled_partition<32>(cg::this_thread_block());

__syncthreads();
#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
// HIP lacks cg::reduce; use butterfly reduction within the tile
StorageT tot = valid ? data : StorageT(0);
for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
tot += group.shfl_xor(tot, o);
}
#else
StorageT tot = cg::reduce(group, valid ? data : 0.0f, cg::plus<StorageT>());
#endif
if (group.thread_rank() == 0) {
inout_shared[group.meta_group_rank()] = tot;
}
__syncthreads();
if (group.meta_group_rank() == 0) {
#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
tot = inout_shared[group.thread_rank()];
for (unsigned int o = group.size() / 2; o > 0; o >>= 1) {
tot += group.shfl_xor(tot, o);
}
#else
tot = cg::reduce(group, inout_shared[group.thread_rank()], cg::plus<StorageT>());
#endif
if (group.thread_rank() == 0) {
shared_tmp[offset] = tot;
}
Expand All @@ -364,6 +426,29 @@ template <uint dim_target, typename StorageT>
__forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
StorageT* const inout_shared, const bool valid) {
__syncthreads();
#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
// HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly.
constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;

#pragma unroll
for (int i = 0; i < dim_target; i++) {
StorageT val = StorageT(0);
if (valid) {
val = inout_shared[threadIdx.x * dim_target + i];
}
__syncthreads();
inout_shared[threadIdx.x * dim_target + i] = 0.0f;
__syncthreads();

if (valid) {
atomicAdd_block(&inout_shared[i], val);
}
__syncthreads();
}
for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
output[blockIdx.x * dim_aligned + i] += inout_shared[i];
}
#else
const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid);
constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target;

Expand All @@ -387,6 +472,7 @@ __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output,
for (int i = threadIdx.x; i < dim_target; i += blockDim.x) {
output[blockIdx.x * dim_aligned + i] += inout_shared[i];
}
#endif
}

// READ SHARED
Expand Down
13 changes: 13 additions & 0 deletions symforce/caspar/source/runtime/pybind_array_tools.cc
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,26 @@ int GetDeviceId(const py::object& obj) {
auto interface = obj.attr("__cuda_array_interface__").cast<py::dict>();
auto data = interface["data"].cast<py::tuple>();
void* ptr = reinterpret_cast<void*>(data[0].cast<size_t>());
// This is a host translation unit, so it uses local cuda*->hip* aliases here
// rather than the device-side cuda_to_hip.h (which pulls in hipcub and
// __device__ helpers intended for the .cu sources).
#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
hipPointerAttribute_t attrs;
hipError_t err = hipPointerGetAttributes(&attrs, ptr);
if (err != hipSuccess) {
hipGetLastError();
return -1;
}
return attrs.device;
#else
cudaPointerAttributes attrs;
cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
if (err != cudaSuccess) {
cudaGetLastError();
return -1;
}
return attrs.device;
#endif
} catch (...) {
return -1; // Fallback if interface or attributes aren't available
}
Expand Down
4 changes: 4 additions & 0 deletions symforce/caspar/source/runtime/pybind_array_tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@

#pragma once

#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <pybind11/pybind11.h>

namespace py = pybind11;
Expand Down
Loading