diff --git a/cmake/rerun_if_needed.py b/cmake/rerun_if_needed.py index 925bb1f2..b8f47e57 100755 --- a/cmake/rerun_if_needed.py +++ b/cmake/rerun_if_needed.py @@ -84,7 +84,7 @@ def get_path_git_hash(path_to_check: str, repo_root: T.Optional[str] = None) -> Changes to untracked files won't affect the hash. """ # Get relative path from possibly absolute path - relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd()) + relative_path = os.path.relpath(path_to_check, repo_root or os.getcwd()).replace("\\", "/") tree_hash, diff_index = _get_hashes(relative_path, cwd=repo_root) diff --git a/symforce/caspar/README.md b/symforce/caspar/README.md index bdc3dfd0..58c50f2d 100644 --- a/symforce/caspar/README.md +++ b/symforce/caspar/README.md @@ -110,6 +110,17 @@ The following access patterns are currently supported: +### AMD GPUs (ROCm/HIP) + +Caspar also runs on AMD GPUs through ROCm/HIP. The same generated kernels and runtime compile against the HIP toolchain via a small `cuda_to_hip.h` compatibility header that maps the CUDA spellings Caspar emits (`cudaMalloc`, `__syncthreads`, cooperative groups, CUB primitives, and friends) onto their HIP equivalents, so the symbolic kernel definitions are unchanged. + +To build a generated library for AMD GPUs, enable the HIP path when compiling: + +```python +compile_caspar_library(caslib, output_dir, use_hip=True, hip_arch="gfx90a") +``` + +Set the target AMD GPU via the `hip_arch` argument, for example `gfx90a` for CDNA2 (MI200) or `gfx1100` for RDNA3. The ROCm build needs a HIP-enabled compiler (`hipcc`/`amdclang++`) and the `hip` and `hipcub` packages from a ROCm installation. When `use_hip` is off the build is unchanged and continues to use CUDA. ### Etymology Caspar, an acronym for **C**UDA **A**ccelerator for **S**ymbolic **P**rogramming with **A**daptive **R**eordering, is named after the Danish–Norwegian mathematician [Caspar Wessel](https://en.wikipedia.org/wiki/Caspar_Wessel). Wessel was the first to describe the geometrical interpretation of complex numbers as points in the complex plane and as vectors. However, since his thesis was written in Danish, it initially received little recognition. When his work was rediscovered later, the mathematician Sophus Lie, known for his discovery of Lie algebra, wrote the following in the newspaper: diff --git a/symforce/caspar/code_generation/library.py b/symforce/caspar/code_generation/library.py index a1e452ff..a6da1bd4 100644 --- a/symforce/caspar/code_generation/library.py +++ b/symforce/caspar/code_generation/library.py @@ -180,6 +180,8 @@ def compile( debug: bool = False, cuda_arch: str | None = None, jobs: int | None = None, + use_hip: bool = False, + hip_arch: str | None = None, ) -> None: build_dir = out_dir / "build" config_cmd: list[str | Path] = [ @@ -189,8 +191,15 @@ def compile( "-B", build_dir, f"-DCMAKE_BUILD_TYPE={'Debug' if debug else 'Release'}", - f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}", ] + if use_hip: + config_cmd.append("-DUSE_HIP=ON") + if hip_arch: + config_cmd.append(f"-DCMAKE_HIP_ARCHITECTURES={hip_arch}") + else: + config_cmd.append( + f"-DCMAKE_CUDA_ARCHITECTURES={cuda_arch if cuda_arch is not None else _real_cuda_arch_string()}" + ) subprocess.run(config_cmd, check=True) build_cmd: list[str | Path] = ["cmake", "--build", build_dir, "--parallel"] if jobs: diff --git a/symforce/caspar/source/runtime/cuda_to_hip.h b/symforce/caspar/source/runtime/cuda_to_hip.h new file mode 100644 index 00000000..5fa7148f --- /dev/null +++ b/symforce/caspar/source/runtime/cuda_to_hip.h @@ -0,0 +1,134 @@ +/* ---------------------------------------------------------------------------- + * SymForce - Copyright 2025, Skydio, Inc. + * Copyright (c) 2026 Advanced Micro Devices, Inc. + * This source code is under the Apache 2.0 license found in the LICENSE file. + * + * Author: Jeff Daily + * ---------------------------------------------------------------------------- */ + +#pragma once + +// CUDA-to-HIP compatibility header for ROCm/HIP builds. +// Provides the CUDA API spellings the project uses via aliases to HIP equivalents. + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +#include +#include +#include + +// CUDA runtime API -> HIP runtime API +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMemset hipMemset +#define cudaMemcpy hipMemcpy +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaStream_t hipStream_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaSetDevice hipSetDevice +#define cudaGetDevice hipGetDevice +#define cudaPointerGetAttributes hipPointerGetAttributes +#define cudaPointerAttributes hipPointerAttribute_t + +// HIP shared-memory atomics are block-scoped by definition (no inter-block visibility), +// so atomicAdd_block is equivalent to atomicAdd on shared memory. +#define atomicAdd_block atomicAdd + +// CUB -> hipCUB namespace +namespace cub = hipcub; + +// Cooperative groups: HIP has cg basics but lacks cg::reduce, cg::labeled_partition, memcpy_async. +// Provide manual implementations where needed. + +namespace caspar_hip { + +// Butterfly reduction within a cooperative group (thread_block_tile or coalesced_group). +// Replaces cg::reduce(group, val, cg::plus()). +template +__device__ __forceinline__ T reduce_sum(GroupT group, T val) { + for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) { + val += group.shfl_xor(val, offset); + } + return val; +} + +// Butterfly reduction for max (cg::greater). +template +__device__ __forceinline__ T reduce_max(GroupT group, T val) { + for (unsigned int offset = group.size() / 2; offset > 0; offset >>= 1) { + T other = group.shfl_xor(val, offset); + val = (val > other) ? val : other; + } + return val; +} + +// Match_any: return a mask of lanes in the tile that have the same label. +// HIP CG has match_any() for coalesced groups. +template +__device__ __forceinline__ unsigned long long match_any_mask(GroupT group, LabelT label) { + // HIP cooperative_groups::coalesced_group has match_any + return group.match_any(label); +} + +// Labeled partition emulation: reduce values within lanes sharing the same label, +// and have exactly one lane per unique label perform the atomic. +// Returns the reduced value and sets is_leader=true for exactly one lane per label. +template +__device__ __forceinline__ T labeled_reduce_sum(GroupT group, T val, LabelT label, bool& is_leader) { + // Get mask of lanes with same label + unsigned long long same_label_mask = group.match_any(label); + + // Find my position within the matching lanes + unsigned int my_lane = group.thread_rank(); + unsigned long long lower_mask = (1ULL << my_lane) - 1; + unsigned int rank_in_label = __popcll(same_label_mask & lower_mask); + + // Leader is the lowest-numbered lane in the group + is_leader = (rank_in_label == 0); + + // Count total lanes with this label + unsigned int label_size = __popcll(same_label_mask); + + // Butterfly reduction over the masked lanes + // For each reduction step, exchange with lane at offset if both are in same_label_mask + T result = val; + for (unsigned int offset = 1; offset < group.size(); offset <<= 1) { + unsigned int partner_lane = my_lane ^ offset; + bool partner_has_same_label = (same_label_mask >> partner_lane) & 1ULL; + T partner_val = group.shfl_xor(result, offset); + if (partner_has_same_label && partner_lane < group.size()) { + result += partner_val; + } + } + + return result; +} + +} // namespace caspar_hip + +// Macros to replace cg::reduce calls (used in device code) +#define CG_REDUCE_SUM(group, val) caspar_hip::reduce_sum(group, val) +#define CG_LABELED_REDUCE_SUM(group, val, label, is_leader) \ + caspar_hip::labeled_reduce_sum(group, val, label, is_leader) + +#else // CUDA path + +#include +#include +#include +#include +#include +#include + +namespace cg = cooperative_groups; + +// On CUDA, cg::reduce is available +#define CG_REDUCE_SUM(group, val) cg::reduce(group, val, cg::plus()) + +#endif diff --git a/symforce/caspar/source/runtime/memops.cuh b/symforce/caspar/source/runtime/memops.cuh index 49dc5299..dcdd31f6 100644 --- a/symforce/caspar/source/runtime/memops.cuh +++ b/symforce/caspar/source/runtime/memops.cuh @@ -7,11 +7,7 @@ #include -#include -#include -#include -#include - +#include "cuda_to_hip.h" #include "shared_indices.h" namespace cg = cooperative_groups; @@ -260,6 +256,32 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin if (idx.argsort != 0xffff) { // 0xffff indicates the thread is not used. unique = indices[indices[idx.argsort].target].unique; } + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP lacks cg::labeled_partition; use per-lane atomicAdd fallback. + // The butterfly approach doesn't work for non-contiguous label groups (lanes at + // arbitrary positions share a label, but XOR only pairs specific distances). + // Per-lane atomicAdd is simpler and correct (shared-memory atomics are fast). + +#pragma unroll + for (int i = 0; i < dim_target; i++) { + const SharedIndex idx_inner = indices[threadIdx.x]; + StorageT val = StorageT(0); + if (idx_inner.argsort != 0xffff) { + // Read value BEFORE zeroing (order matters!) + val = inout_shared[idx_inner.argsort * dim_target + i]; + } + __syncthreads(); + inout_shared[threadIdx.x * dim_target + i] = 0.0f; + __syncthreads(); + + if (idx_inner.argsort != 0xffff) { + // Each lane with valid data does its own atomicAdd to the target location + atomicAdd_block(&inout_shared[indices[idx_inner.argsort].target * dim_target + i], val); + } + __syncthreads(); + } +#else const cg::coalesced_group group = cg::labeled_partition(cg::coalesced_threads(), unique); #pragma unroll @@ -279,6 +301,7 @@ __forceinline__ __device__ void FlushSumShared(StorageT* const output, const uin } __syncthreads(); } +#endif constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target; for (int i = 0; i < dim_target; i++) { @@ -303,6 +326,29 @@ template __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT* const inout_shared, const bool valid) { __syncthreads(); +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly. + constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target; + +#pragma unroll + for (int i = 0; i < dim_target; i++) { + StorageT val = StorageT(0); + if (valid) { + val = inout_shared[threadIdx.x * dim_target + i]; + } + __syncthreads(); + inout_shared[threadIdx.x * dim_target + i] = 0.0f; + __syncthreads(); + + if (valid) { + atomicAdd_block(&inout_shared[i], val); + } + __syncthreads(); + } + for (int i = threadIdx.x; i < dim_target; i += blockDim.x) { + output[blockIdx.x * dim_aligned + i] = inout_shared[i]; + } +#else const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid); constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target; @@ -326,6 +372,7 @@ __forceinline__ __device__ void FlushSumBlock(StorageT* const output, StorageT* for (int i = threadIdx.x; i < dim_target; i += blockDim.x) { output[blockIdx.x * dim_aligned + i] = inout_shared[i]; } +#endif } template __forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* const inout_shared, @@ -333,13 +380,28 @@ __forceinline__ __device__ void SumStore(StorageT* const shared_tmp, StorageT* c auto group = cg::tiled_partition<32>(cg::this_thread_block()); __syncthreads(); +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP lacks cg::reduce; use butterfly reduction within the tile + StorageT tot = valid ? data : StorageT(0); + for (unsigned int o = group.size() / 2; o > 0; o >>= 1) { + tot += group.shfl_xor(tot, o); + } +#else StorageT tot = cg::reduce(group, valid ? data : 0.0f, cg::plus()); +#endif if (group.thread_rank() == 0) { inout_shared[group.meta_group_rank()] = tot; } __syncthreads(); if (group.meta_group_rank() == 0) { +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + tot = inout_shared[group.thread_rank()]; + for (unsigned int o = group.size() / 2; o > 0; o >>= 1) { + tot += group.shfl_xor(tot, o); + } +#else tot = cg::reduce(group, inout_shared[group.thread_rank()], cg::plus()); +#endif if (group.thread_rank() == 0) { shared_tmp[offset] = tot; } @@ -364,6 +426,29 @@ template __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output, StorageT* const inout_shared, const bool valid) { __syncthreads(); +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP coalesced_group lacks shfl_xor; use shared-memory atomics directly. + constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target; + +#pragma unroll + for (int i = 0; i < dim_target; i++) { + StorageT val = StorageT(0); + if (valid) { + val = inout_shared[threadIdx.x * dim_target + i]; + } + __syncthreads(); + inout_shared[threadIdx.x * dim_target + i] = 0.0f; + __syncthreads(); + + if (valid) { + atomicAdd_block(&inout_shared[i], val); + } + __syncthreads(); + } + for (int i = threadIdx.x; i < dim_target; i += blockDim.x) { + output[blockIdx.x * dim_aligned + i] += inout_shared[i]; + } +#else const cg::coalesced_group group = cg::binary_partition(cg::coalesced_threads(), valid); constexpr uint dim_aligned = dim_target == 3 ? 4 : dim_target; @@ -387,6 +472,7 @@ __forceinline__ __device__ void FlushSumBlockAdd(StorageT* const output, for (int i = threadIdx.x; i < dim_target; i += blockDim.x) { output[blockIdx.x * dim_aligned + i] += inout_shared[i]; } +#endif } // READ SHARED diff --git a/symforce/caspar/source/runtime/pybind_array_tools.cc b/symforce/caspar/source/runtime/pybind_array_tools.cc index 9e43f198..7f057905 100644 --- a/symforce/caspar/source/runtime/pybind_array_tools.cc +++ b/symforce/caspar/source/runtime/pybind_array_tools.cc @@ -174,6 +174,18 @@ int GetDeviceId(const py::object& obj) { auto interface = obj.attr("__cuda_array_interface__").cast(); auto data = interface["data"].cast(); void* ptr = reinterpret_cast(data[0].cast()); + // This is a host translation unit, so it uses local cuda*->hip* aliases here + // rather than the device-side cuda_to_hip.h (which pulls in hipcub and + // __device__ helpers intended for the .cu sources). +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + hipPointerAttribute_t attrs; + hipError_t err = hipPointerGetAttributes(&attrs, ptr); + if (err != hipSuccess) { + hipGetLastError(); + return -1; + } + return attrs.device; +#else cudaPointerAttributes attrs; cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); if (err != cudaSuccess) { @@ -181,6 +193,7 @@ int GetDeviceId(const py::object& obj) { return -1; } return attrs.device; +#endif } catch (...) { return -1; // Fallback if interface or attributes aren't available } diff --git a/symforce/caspar/source/runtime/pybind_array_tools.h b/symforce/caspar/source/runtime/pybind_array_tools.h index 96445288..1e938ed6 100644 --- a/symforce/caspar/source/runtime/pybind_array_tools.h +++ b/symforce/caspar/source/runtime/pybind_array_tools.h @@ -14,7 +14,11 @@ #pragma once +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#include +#else #include +#endif #include namespace py = pybind11; diff --git a/symforce/caspar/source/runtime/shared_indices.cu b/symforce/caspar/source/runtime/shared_indices.cu index f3d20c61..ecc4baab 100644 --- a/symforce/caspar/source/runtime/shared_indices.cu +++ b/symforce/caspar/source/runtime/shared_indices.cu @@ -3,10 +3,7 @@ * This source code is under the Apache 2.0 license found in the LICENSE file. * ---------------------------------------------------------------------------- */ -#include -#include -#include - +#include "cuda_to_hip.h" #include "shared_indices.h" namespace cg = cooperative_groups; @@ -111,16 +108,31 @@ __forceinline__ __device__ uint GetOrd(const uint* values, const uint length, co __global__ void SharedIndicesKernel(const uint* const __restrict__ indices, SharedIndex* const __restrict__ shared_indices_out, const uint size) { - const auto block = cg::this_thread_block(); + auto block = cg::this_thread_block(); const auto gtrank = cg::this_grid().thread_rank(); const auto btrank = block.thread_rank(); +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP's group_dim() is non-const; use blockDim directly + const int start = blockIdx.x * blockDim.x; +#else const int start = block.group_index().x * block.group_dim().x; +#endif const int num = min(1024, size - start) * sizeof(uint); __shared__ uint indices_loc[1024]; +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // HIP lacks cg::memcpy_async; use synchronous block-strided copy + const uint* src = indices + start; + const uint num_elements = num / sizeof(uint); + for (uint i = btrank; i < num_elements; i += block.size()) { + indices_loc[i] = src[i]; + } + __syncthreads(); +#else cg::memcpy_async(block, indices_loc, indices + start, num); cg::wait(block); +#endif const uint val = gtrank < size ? indices_loc[btrank] : 0xFFFFFFFF; diff --git a/symforce/caspar/source/runtime/shared_indices.h b/symforce/caspar/source/runtime/shared_indices.h index 8b2db65e..e40fed15 100644 --- a/symforce/caspar/source/runtime/shared_indices.h +++ b/symforce/caspar/source/runtime/shared_indices.h @@ -7,7 +7,11 @@ #include +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#include +#else #include +#endif namespace caspar { diff --git a/symforce/caspar/source/runtime/solver_tools.cu b/symforce/caspar/source/runtime/solver_tools.cu index 5954fb04..ca19dc0f 100644 --- a/symforce/caspar/source/runtime/solver_tools.cu +++ b/symforce/caspar/source/runtime/solver_tools.cu @@ -3,9 +3,7 @@ * This source code is under the Apache 2.0 license found in the LICENSE file. * ---------------------------------------------------------------------------- */ -#include -#include - +#include "cuda_to_hip.h" #include "solver_tools.h" namespace caspar { diff --git a/symforce/caspar/source/runtime/sort_indices.cu b/symforce/caspar/source/runtime/sort_indices.cu index 0e1356ec..ad14e711 100644 --- a/symforce/caspar/source/runtime/sort_indices.cu +++ b/symforce/caspar/source/runtime/sort_indices.cu @@ -3,8 +3,7 @@ * This source code is under the Apache 2.0 license found in the LICENSE file. * ---------------------------------------------------------------------------- */ -#include - +#include "cuda_to_hip.h" #include "sort_indices.h" namespace { diff --git a/symforce/caspar/source/runtime/sort_indices.h b/symforce/caspar/source/runtime/sort_indices.h index 37800303..311790d8 100644 --- a/symforce/caspar/source/runtime/sort_indices.h +++ b/symforce/caspar/source/runtime/sort_indices.h @@ -5,7 +5,11 @@ #pragma once +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#include +#else #include +#endif namespace caspar { diff --git a/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja b/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja index c7b1c6eb..bec1bc7c 100644 --- a/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja +++ b/symforce/caspar/source/templates/buildfiles/CMakeLists.txt.jinja @@ -4,18 +4,32 @@ # ---------------------------------------------------------------------------- #} cmake_minimum_required(VERSION 3.18) -if(NOT DEFINED CMAKE_CUDA_COMPILER AND DEFINED ENV{CUDACXX}) - set(CMAKE_CUDA_COMPILER "$ENV{CUDACXX}" CACHE FILEPATH "CUDA compiler") -endif() +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) -project(caspar_library LANGUAGES CXX CUDA) +option(USE_HIP "Build with HIP for AMD GPUs" OFF) -# SASS for Turing/Ampere/Ada (sm_75–89) + PTX fallback for forward JIT-compatibility. sm_75 minimum required by cooperative_groups::reduce. -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 75-virtual) -endif() +if(USE_HIP) + # project(... LANGUAGES ... HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise + # auto-detects the host GPU and errors on a no-GPU build host. + project(caspar_library LANGUAGES CXX HIP) -find_package(CUDAToolkit REQUIRED) + find_package(hip REQUIRED) + find_package(hipcub REQUIRED) +else() + if(NOT DEFINED CMAKE_CUDA_COMPILER AND DEFINED ENV{CUDACXX}) + set(CMAKE_CUDA_COMPILER "$ENV{CUDACXX}" CACHE FILEPATH "CUDA compiler") + endif() + + project(caspar_library LANGUAGES CXX CUDA) + + # SASS for Turing/Ampere/Ada (sm_75-89) + PTX fallback for forward JIT-compatibility. sm_75 minimum required by cooperative_groups::reduce. + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 75-virtual) + endif() + + find_package(CUDAToolkit REQUIRED) +endif() {% if python_bindings %} option(CASPAR_BUILD_PYTHON_BINDINGS "Build Python bindings via pybind11" ON) @@ -45,10 +59,31 @@ list(FILTER CUDA_HEADERS EXCLUDE REGEX "pybind.*") add_library({{caslib.name}}_core STATIC ${CUDA_SOURCES} ${CUDA_HEADERS}) target_include_directories({{caslib.name}}_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_link_libraries({{caslib.name}}_core PUBLIC CUDA::cudart) -target_compile_options({{caslib.name}}_core PRIVATE - $<$:--use_fast_math> -) + +if(USE_HIP) + # Mark .cu files as HIP language + set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP) + target_compile_definitions({{caslib.name}}_core PUBLIC USE_HIP) + # Link hip targets PRIVATE to avoid propagating HIP compile options to pure-CXX consumers. + # The static library has the HIP code linked in; pybind consumers only need the includes. + target_link_libraries({{caslib.name}}_core PRIVATE hip::host hip::hipcub) + # Expose ROCm include directory for consumers that need hipError_t etc + target_include_directories({{caslib.name}}_core PUBLIC ${hip_INCLUDE_DIRS}) + # Use -ffp-contract=fast -fno-math-errno instead of -ffast-math. + # clang's -ffast-math enables -fassociative-math which can NaN backward passes. + target_compile_options({{caslib.name}}_core PRIVATE + $<$:-ffp-contract=fast -fno-math-errno> + ) + set_target_properties({{caslib.name}}_core PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + ) +else() + target_link_libraries({{caslib.name}}_core PUBLIC CUDA::cudart) + target_compile_options({{caslib.name}}_core PRIVATE + $<$:--use_fast_math> + ) +endif() + set_target_properties({{caslib.name}}_core PROPERTIES POSITION_INDEPENDENT_CODE ON ) @@ -60,6 +95,12 @@ if(CASPAR_BUILD_PYTHON_BINDINGS) pybind11_add_module({{caslib.name}} ${CPP_SOURCES} ${CPP_HEADERS}) target_link_libraries({{caslib.name}} PRIVATE {{caslib.name}}_core) + if(USE_HIP) + # Compile pybind sources as HIP (so HIP headers work) but they have no device code. + # The HIP compiler handles them as host-only code. + set_source_files_properties(${CPP_SOURCES} PROPERTIES LANGUAGE HIP) + endif() + set_target_properties({{caslib.name}} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/symforce/caspar/source/templates/caspar_mappings.cu.jinja b/symforce/caspar/source/templates/caspar_mappings.cu.jinja index eb398676..ac314bf0 100644 --- a/symforce/caspar/source/templates/caspar_mappings.cu.jinja +++ b/symforce/caspar/source/templates/caspar_mappings.cu.jinja @@ -4,13 +4,10 @@ # ---------------------------------------------------------------------------- #} #include -#include -#include +#include "cuda_to_hip.h" #include "caspar_mappings.h" -namespace cg = cooperative_groups; - // We use shared memory to improve the memory access. // A smaller block size of 32 allows for larger nodetypes. constexpr int block_size = 32; diff --git a/symforce/caspar/source/templates/caspar_mappings.h.jinja b/symforce/caspar/source/templates/caspar_mappings.h.jinja index af3795ad..53684205 100644 --- a/symforce/caspar/source/templates/caspar_mappings.h.jinja +++ b/symforce/caspar/source/templates/caspar_mappings.h.jinja @@ -4,7 +4,7 @@ # ---------------------------------------------------------------------------- #} #pragma once -#include +#include "cuda_to_hip.h" namespace caspar { {% for nodetype in caslib.exposed_types%} diff --git a/symforce/caspar/source/templates/kernel.cu.jinja b/symforce/caspar/source/templates/kernel.cu.jinja index fa30d92d..e18addd0 100644 --- a/symforce/caspar/source/templates/kernel.cu.jinja +++ b/symforce/caspar/source/templates/kernel.cu.jinja @@ -2,17 +2,11 @@ # SymForce - Copyright 2025, Skydio, Inc. # This source code is under the Apache 2.0 license found in the LICENSE file. # ---------------------------------------------------------------------------- #} -#include -#include -#include -#include -#include +#include "cuda_to_hip.h" #include "kernel_{{kernel.name}}.h" #include "memops.cuh" -namespace cg = cooperative_groups; - namespace caspar { __global__ void diff --git a/symforce/caspar/source/templates/kernel.h.jinja b/symforce/caspar/source/templates/kernel.h.jinja index 476a50e4..8c189f4c 100644 --- a/symforce/caspar/source/templates/kernel.h.jinja +++ b/symforce/caspar/source/templates/kernel.h.jinja @@ -4,7 +4,7 @@ # ---------------------------------------------------------------------------- #} #pragma once -#include +#include "cuda_to_hip.h" #include "shared_indices.h" diff --git a/symforce/caspar/source/templates/solver.h.jinja b/symforce/caspar/source/templates/solver.h.jinja index 855a9789..c9a35c7a 100644 --- a/symforce/caspar/source/templates/solver.h.jinja +++ b/symforce/caspar/source/templates/solver.h.jinja @@ -6,7 +6,7 @@ #include -#include +#include "cuda_to_hip.h" #include