Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 95 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,101 @@ if(NOT CMAKE_CUDA_HOST_COMPILER)
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
endif()

enable_language(CUDA)
# Build the GPU kernels for AMD via HIP/ROCm instead of CUDA. Additive: the
# NVIDIA path below is unchanged when USE_HIP is OFF.
option(USE_HIP "Build the GPU code with HIP for AMD GPUs (ROCm)" OFF)

if(USE_HIP)
# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects
# the host GPU(s) via rocm_agent_enumerator and errors if none is found (a
# no-GPU build host must then set the arch explicitly).
enable_language(HIP)

# Host .cpp files also use the CUDA runtime/library spellings (cudaStream_t,
# cudaMalloc, ...); steer every C/C++/HIP TU through the ROCm compat layer.
add_compile_definitions(USE_HIP __HIP_PLATFORM_AMD__)
set(CVCUDA_HIP_COMPAT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/hip")
include_directories(BEFORE "${CVCUDA_HIP_COMPAT_DIR}")
# Force-include the compat header on every C++/HIP TU so the cuda*->hip*
# aliases and the 64-bit warp mask precede any include, regardless of file
# order. C TUs do not use the CUDA runtime, so they are left alone.
set(_cvcuda_force_include "-include ${CVCUDA_HIP_COMPAT_DIR}/CvCudaHipCompat.h")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_cvcuda_force_include}")
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${_cvcuda_force_include}")

# clang (HIP) defaults to -ffp-contract=fast, which forms FMAs across separate
# statements/expressions; nvcc only contracts within a single expression
# (--fmad=true). That extra contraction makes HIP float results drift by ~1 ULP
# from the CUDA build and the CPU gold references (e.g. the bicubic weight chain
# in InterpolationWrap), failing the bit-exact gtests. Pin contraction to the
# CUDA/host semantics so float math is reproducible across the two toolchains.
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -ffp-contract=on")

find_package(hip REQUIRED)
find_package(hipcub REQUIRED)
find_package(hiprand REQUIRED)
find_package(hipblas REQUIRED)
find_package(hipsolver REQUIRED)

# The compat header is force-included on every C++/HIP TU and pulls in the
# HIP runtime, so the ROCm headers must be on the include path globally (not
# only on targets that link hip::host).
get_target_property(_hip_inc hip::host INTERFACE_INCLUDE_DIRECTORIES)
if(_hip_inc)
include_directories(SYSTEM ${_hip_inc})
else()
include_directories(SYSTEM "/opt/rocm/include")
endif()

# Map the CUDA::* link targets the build uses onto their HIP equivalents so
# the per-target target_link_libraries() lines stay untouched.
foreach(_cudalib cudart_static cublas_static cublasLt_static cusolver_static cuda_driver)
if(NOT TARGET CUDA::${_cudalib})
add_library(CUDA::${_cudalib} INTERFACE IMPORTED)
target_link_libraries(CUDA::${_cudalib} INTERFACE hip::host)
endif()
endforeach()
target_link_libraries(CUDA::cusolver_static INTERFACE roc::hipsolver)
target_link_libraries(CUDA::cublas_static INTERFACE roc::hipblas)

# Retag every .cu source LANGUAGE HIP without editing each per-target
# CMakeLists. add_library/add_executable are overridden at top scope to
# forward to the real command then flip the target's .cu files to HIP and
# set HIP_ARCHITECTURES. The NVIDIA build never defines
# these macros, so it is a pure passthrough.
macro(_cvcuda_hipify_target _tgt)
get_target_property(_t_type ${_tgt} TYPE)
if(NOT _t_type STREQUAL "INTERFACE_LIBRARY")
get_target_property(_t_srcs ${_tgt} SOURCES)
if(_t_srcs)
foreach(_s ${_t_srcs})
if(_s MATCHES "\\.cu$")
set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP)
endif()
endforeach()
endif()
set_target_properties(${_tgt} PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
endif()
endmacro()

macro(add_library _tgt)
_add_library(${_tgt} ${ARGN})
set(_args "${ARGN}")
if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args AND NOT "INTERFACE" IN_LIST _args)
_cvcuda_hipify_target(${_tgt})
endif()
endmacro()

macro(add_executable _tgt)
_add_executable(${_tgt} ${ARGN})
set(_args "${ARGN}")
if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args)
_cvcuda_hipify_target(${_tgt})
endif()
endmacro()
else()
enable_language(CUDA)
endif()

# Used when creating special builds
set(PROJECT_VERSION_SUFFIX "")
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ We provide pre-built Python wheels on pypi.org for a variety of Python versions

See [Installation](https://cvcuda.github.io/CV-CUDA/installation.html) for complete installation instructions including building from source, installing Debian packages, and tar archives.

CV-CUDA can also be built for AMD GPUs by compiling its GPU code with HIP for ROCm instead of CUDA. This path is additive and off by default; the NVIDIA build is unchanged. See [Building for AMD GPUs (ROCm)](https://cvcuda.github.io/CV-CUDA/installation.html#building-for-amd-gpus-rocm) for details.

### Compatibility

|CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source and API compatiblity)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages|
Expand Down
15 changes: 12 additions & 3 deletions cmake/ConfigBuildTree.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,15 @@ include(CMakeDependentOption)

option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE})
option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF)
cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF)
# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via
# Git LFS); it is a deployment-portability feature unrelated to GPU correctness,
# so default it OFF on the ROCm build.
cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF)

# Needed to get cuda version
find_package(CUDAToolkit REQUIRED)
if(NOT USE_HIP)
find_package(CUDAToolkit REQUIRED)
endif()

# Are we inside a git repo and it has submodules enabled?
if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules)
Expand All @@ -85,7 +90,11 @@ else()
"CV-CUDA only supports Linux platform.")
endif()

set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
if(USE_HIP)
set(CVCUDA_BUILD_SUFFIX "hip-${CVCUDA_SYSTEM_NAME}")
else()
set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
endif()

function(setup_dso target version)
string(REGEX MATCHALL "[0-9]+" version_list "${version}")
Expand Down
6 changes: 6 additions & 0 deletions cmake/ConfigCUDA.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# The whole CUDA toolkit configuration is skipped on the HIP/ROCm build; the
# top-level CMakeLists handles enable_language(HIP) and the HIP packages.
if(NOT USE_HIP)

string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION})
list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR)
list(GET CUDA_VERSION_LIST 1 CUDA_VERSION_MINOR)
Expand Down Expand Up @@ -100,3 +104,5 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES)
# which is the old architecture supported by nvcc. We don't want that.
set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures to build for" FORCE)
endif()

endif() # NOT USE_HIP
8 changes: 7 additions & 1 deletion cmake/ConfigCompiler.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,13 @@ endif()
include(CheckIPOSupported)
check_ipo_supported(RESULT LTO_SUPPORTED)

set(LTO_ENABLED ON)
# LTO does not finalize under the HIP link step (device objects stay slim
# bitcode), which breaks linking; keep it off on ROCm.
if(USE_HIP)
set(LTO_ENABLED OFF)
else()
set(LTO_ENABLED ON)
endif()

if(ENABLE_SANITIZER)
set(COMPILER_SANITIZER_FLAGS
Expand Down
215 changes: 215 additions & 0 deletions cmake/hip/CvCudaHipCompat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Author: Jeff Daily <jeff.daily@amd.com>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// ROCm/HIP compatibility layer for CV-CUDA. Force-included on every HIP
// translation unit (CMAKE_HIP_FLAGS -include) so the aliases below are in
// scope before any CV-CUDA or CUDA-toolkit header is parsed. The NVIDIA build
// never sees this file: the cmake/hip shim dir and this header are only on the
// HIP include path. Only the actual CUDA-runtime/library symbols CV-CUDA uses
// are aliased here; CV-CUDA's own cuda<Op>Submit/cuda<Op>Create public API
// names are deliberately left untouched.

#ifndef CVCUDA_HIP_COMPAT_H
#define CVCUDA_HIP_COMPAT_H

#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)

#if defined(__cplusplus)
// libc host declarations must win over HIP's device overloads of memcpy/memset:
// pull them in before the HIP runtime so host TUs keep the standard host
// prototypes.
#include <cstdlib>
#include <cstring>

#if defined(__HIPCC__)
// .cu translation units (compiled by hipcc) need the full device runtime.
#include <hip/hip_runtime.h>
#else
// Plain g++ host TUs only need the runtime API (types + host-callable entry
// points); the device runtime header is heavier and pulls device builtins.
#include <hip/hip_runtime_api.h>
#endif
#endif

// CUDA_VERSION: several headers (Compat.hpp, Metaprogramming.hpp, StreamId.cpp)
// branch on it. CV-CUDA wants the pre-13.0 compound-type aliases, so report a
// version below 13000. Do NOT define __CUDA_ARCH__ on HIP: the SaturateCast PTX
// table and the NVCV SIMD-video-intrinsic paths are gated on __CUDA_ARCH__ and
// must stay inert, falling through to their portable C++/per-element bodies.
#ifndef CUDA_VERSION
#define CUDA_VERSION 12020
#endif

// ---- runtime: error/status -------------------------------------------------
#define cudaError_t hipError_t
#define cudaError hipError_t
#define cudaSuccess hipSuccess
#define cudaErrorNotReady hipErrorNotReady
#define cudaErrorInvalidValue hipErrorInvalidValue
#define cudaErrorMemoryAllocation hipErrorOutOfMemory
#define cudaErrorCudartUnloading hipErrorDeinitialized
#define cudaErrorTextureFetchFailed hipErrorInvalidTexture
#define cudaGetLastError hipGetLastError
#define cudaPeekAtLastError hipPeekAtLastError
#define cudaGetErrorString hipGetErrorString
#define cudaGetErrorName hipGetErrorName
#define cudaGetVersion hipRuntimeGetVersion

// ---- runtime: device -------------------------------------------------------
#define cudaGetDevice hipGetDevice
#define cudaSetDevice hipSetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaDeviceProp hipDeviceProp_t
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaDevAttrTextureAlignment hipDeviceAttributeTextureAlignment
#define cudaDevAttrTexturePitchAlignment hipDeviceAttributeTexturePitchAlignment

// NVCV derives a tensor/image row-pitch alignment from the texture *pitch*
// alignment device attribute. On NVIDIA that attribute is 32 bytes, so a tightly
// packed image (e.g. a 640-byte uchar row) keeps a 640-byte row stride. AMD
// reports 256 there, which would pad that row to 768 and silently change the
// in-memory layout every NVCV consumer (and the whole-buffer test comparisons)
// assumes. No CV-CUDA tensor is bound to a HW texture object, so the larger
// HW pitch is unnecessary here; clamp the queried pitch alignment to the NVIDIA
// value to keep the byte layout identical to the CUDA build. Other attribute
// queries pass through unchanged.
#if defined(__cplusplus)
__host__ inline hipError_t cvcuda_hipDeviceGetAttribute(int *value, hipDeviceAttribute_t attr, int device)
{
hipError_t err = hipDeviceGetAttribute(value, attr, device);
if (err == hipSuccess && attr == hipDeviceAttributeTexturePitchAlignment && value && *value > 32)
{
*value = 32;
}
return err;
}
#define cudaDeviceGetAttribute cvcuda_hipDeviceGetAttribute
#else
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#endif

// ---- runtime: memory -------------------------------------------------------
#define cudaMalloc hipMalloc
#define cudaMallocManaged hipMallocManaged
#define cudaFree hipFree
#define cudaMallocHost hipHostMalloc
#define cudaHostAlloc hipHostMalloc
#define cudaFreeHost hipHostFree
#define cudaHostFree hipHostFree
#define cudaHostAllocMapped hipHostMallocMapped
#define cudaHostAllocWriteCombined hipHostMallocWriteCombined
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemset2D hipMemset2D
#define cudaMemset2DAsync hipMemset2DAsync
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpy2D hipMemcpy2D
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyHostToHost hipMemcpyHostToHost
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaPointerAttributes hipPointerAttribute_t
#define cudaPointerGetAttributes hipPointerGetAttributes
#define cudaMemoryTypeHost hipMemoryTypeHost
#define cudaMemoryTypeDevice hipMemoryTypeDevice
#define cudaMemoryTypeManaged hipMemoryTypeManaged
#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered

// ---- runtime: stream / event ----------------------------------------------
#define cudaStream_t hipStream_t
#define cudaStreamDefault hipStreamDefault
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamGetId hipStreamGetId
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamCreateWithPriority hipStreamCreateWithPriority
#define cudaDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange
#define cudaEvent_t hipEvent_t
#define cudaEventDefault hipEventDefault
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventCreate hipEventCreate
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventRecord hipEventRecord
#define cudaEventQuery hipEventQuery
#define cudaEventSynchronize hipEventSynchronize
#define cudaEventElapsedTime hipEventElapsedTime
#define cudaEventDestroy hipEventDestroy

// ---- full-wavefront mask ---------------------------------------------------
// __shfl*_sync on ROCm static_asserts a 64-bit mask regardless of wave width.
// The width argument (kept explicit at every call site that needs it) controls
// the subgroup; the mask just marks participants.
#define NVCV_WARP_FULL_MASK 0xffffffffffffffffULL

// ---- built-in index types --------------------------------------------------
// On CUDA blockIdx/blockDim/threadIdx are uint3/dim3, so the kernels' common
// idiom `blockIdx * blockDim + threadIdx` resolves through the cuda:: compound
// operators. On HIP these are distinct __hip_builtin_*_t structs (with only a
// dim3 conversion), so neither HIP's nor CV-CUDA's vector operators deduce them.
// Provide the exact whole-vector forms the kernels use, lowering to uint3. These
// take only the builtin index types, which carry no NVCV TypeTraits, so they do
// not compete with the cuda:: compound operators.
// Defined only under hipcc (where the __hip_builtin_*_t types exist and where
// __global__ bodies are parsed); plain g++ host TUs that include only the HIP
// runtime API do not see these builtin types, so the operators must not appear
// there. hipcc parses these in both its host and device passes.
#if defined(__cplusplus) && defined(__HIPCC__)
__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockIdx_t &a,
const __hip_builtin_blockDim_t &b)
{
return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
}
__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockDim_t &a,
const __hip_builtin_blockIdx_t &b)
{
return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
}
__host__ __device__ __forceinline__ uint3 operator+(const uint3 &a, const __hip_builtin_threadIdx_t &b)
{
return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
}
__host__ __device__ __forceinline__ uint3 operator+(const __hip_builtin_threadIdx_t &a, const uint3 &b)
{
return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
}
#endif

// std::declval is __host__-only (libstdc++); clang rejects it in the unevaluated
// decltype of a __device__-only function (where nvcc is lenient). Provide a
// __host__ __device__ equivalent for those few device-side decltype sites.
#if defined(__cplusplus)
namespace nvcv { namespace cuda { namespace compat {
template<typename T>
__host__ __device__ T &&declval() noexcept;
}}} // namespace nvcv::cuda::compat
#endif

#endif // __HIP_PLATFORM_AMD__ || USE_HIP

#endif // CVCUDA_HIP_COMPAT_H
2 changes: 2 additions & 0 deletions cmake/hip/channel_descriptor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#pragma once
#include "CvCudaHipCompat.h"
Loading