diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ad41245f..5335afb3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,101 @@ if(NOT CMAKE_CUDA_HOST_COMPILER) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") endif() -enable_language(CUDA) +# Build the GPU kernels for AMD via HIP/ROCm instead of CUDA. Additive: the +# NVIDIA path below is unchanged when USE_HIP is OFF. +option(USE_HIP "Build the GPU code with HIP for AMD GPUs (ROCm)" OFF) + +if(USE_HIP) + # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects + # the host GPU(s) via rocm_agent_enumerator and errors if none is found (a + # no-GPU build host must then set the arch explicitly). + enable_language(HIP) + + # Host .cpp files also use the CUDA runtime/library spellings (cudaStream_t, + # cudaMalloc, ...); steer every C/C++/HIP TU through the ROCm compat layer. + add_compile_definitions(USE_HIP __HIP_PLATFORM_AMD__) + set(CVCUDA_HIP_COMPAT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/hip") + include_directories(BEFORE "${CVCUDA_HIP_COMPAT_DIR}") + # Force-include the compat header on every C++/HIP TU so the cuda*->hip* + # aliases and the 64-bit warp mask precede any include, regardless of file + # order. C TUs do not use the CUDA runtime, so they are left alone. + set(_cvcuda_force_include "-include ${CVCUDA_HIP_COMPAT_DIR}/CvCudaHipCompat.h") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_cvcuda_force_include}") + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${_cvcuda_force_include}") + + # clang (HIP) defaults to -ffp-contract=fast, which forms FMAs across separate + # statements/expressions; nvcc only contracts within a single expression + # (--fmad=true). That extra contraction makes HIP float results drift by ~1 ULP + # from the CUDA build and the CPU gold references (e.g. the bicubic weight chain + # in InterpolationWrap), failing the bit-exact gtests. Pin contraction to the + # CUDA/host semantics so float math is reproducible across the two toolchains. + set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -ffp-contract=on") + + find_package(hip REQUIRED) + find_package(hipcub REQUIRED) + find_package(hiprand REQUIRED) + find_package(hipblas REQUIRED) + find_package(hipsolver REQUIRED) + + # The compat header is force-included on every C++/HIP TU and pulls in the + # HIP runtime, so the ROCm headers must be on the include path globally (not + # only on targets that link hip::host). + get_target_property(_hip_inc hip::host INTERFACE_INCLUDE_DIRECTORIES) + if(_hip_inc) + include_directories(SYSTEM ${_hip_inc}) + else() + include_directories(SYSTEM "/opt/rocm/include") + endif() + + # Map the CUDA::* link targets the build uses onto their HIP equivalents so + # the per-target target_link_libraries() lines stay untouched. + foreach(_cudalib cudart_static cublas_static cublasLt_static cusolver_static cuda_driver) + if(NOT TARGET CUDA::${_cudalib}) + add_library(CUDA::${_cudalib} INTERFACE IMPORTED) + target_link_libraries(CUDA::${_cudalib} INTERFACE hip::host) + endif() + endforeach() + target_link_libraries(CUDA::cusolver_static INTERFACE roc::hipsolver) + target_link_libraries(CUDA::cublas_static INTERFACE roc::hipblas) + + # Retag every .cu source LANGUAGE HIP without editing each per-target + # CMakeLists. add_library/add_executable are overridden at top scope to + # forward to the real command then flip the target's .cu files to HIP and + # set HIP_ARCHITECTURES. The NVIDIA build never defines + # these macros, so it is a pure passthrough. + macro(_cvcuda_hipify_target _tgt) + get_target_property(_t_type ${_tgt} TYPE) + if(NOT _t_type STREQUAL "INTERFACE_LIBRARY") + get_target_property(_t_srcs ${_tgt} SOURCES) + if(_t_srcs) + foreach(_s ${_t_srcs}) + if(_s MATCHES "\\.cu$") + set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP) + endif() + endforeach() + endif() + set_target_properties(${_tgt} PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") + endif() + endmacro() + + macro(add_library _tgt) + _add_library(${_tgt} ${ARGN}) + set(_args "${ARGN}") + if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args AND NOT "INTERFACE" IN_LIST _args) + _cvcuda_hipify_target(${_tgt}) + endif() + endmacro() + + macro(add_executable _tgt) + _add_executable(${_tgt} ${ARGN}) + set(_args "${ARGN}") + if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args) + _cvcuda_hipify_target(${_tgt}) + endif() + endmacro() +else() + enable_language(CUDA) +endif() # Used when creating special builds set(PROJECT_VERSION_SUFFIX "") diff --git a/README.md b/README.md index 92a90d107..43e6edf1f 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ We provide pre-built Python wheels on pypi.org for a variety of Python versions See [Installation](https://cvcuda.github.io/CV-CUDA/installation.html) for complete installation instructions including building from source, installing Debian packages, and tar archives. +CV-CUDA can also be built for AMD GPUs by compiling its GPU code with HIP for ROCm instead of CUDA. This path is additive and off by default; the NVIDIA build is unchanged. See [Building for AMD GPUs (ROCm)](https://cvcuda.github.io/CV-CUDA/installation.html#building-for-amd-gpus-rocm) for details. + ### Compatibility |CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source and API compatiblity)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages| diff --git a/cmake/ConfigBuildTree.cmake b/cmake/ConfigBuildTree.cmake index 7b306ae45..dddd767ac 100644 --- a/cmake/ConfigBuildTree.cmake +++ b/cmake/ConfigBuildTree.cmake @@ -66,10 +66,15 @@ include(CMakeDependentOption) option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE}) option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF) -cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF) +# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via +# Git LFS); it is a deployment-portability feature unrelated to GPU correctness, +# so default it OFF on the ROCm build. +cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF) # Needed to get cuda version -find_package(CUDAToolkit REQUIRED) +if(NOT USE_HIP) + find_package(CUDAToolkit REQUIRED) +endif() # Are we inside a git repo and it has submodules enabled? if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules) @@ -85,7 +90,11 @@ else() "CV-CUDA only supports Linux platform.") endif() -set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}") +if(USE_HIP) + set(CVCUDA_BUILD_SUFFIX "hip-${CVCUDA_SYSTEM_NAME}") +else() + set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}") +endif() function(setup_dso target version) string(REGEX MATCHALL "[0-9]+" version_list "${version}") diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake index 0ec51d304..a42efc1fe 100644 --- a/cmake/ConfigCUDA.cmake +++ b/cmake/ConfigCUDA.cmake @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +# The whole CUDA toolkit configuration is skipped on the HIP/ROCm build; the +# top-level CMakeLists handles enable_language(HIP) and the HIP packages. +if(NOT USE_HIP) + string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION}) list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR) list(GET CUDA_VERSION_LIST 1 CUDA_VERSION_MINOR) @@ -100,3 +104,5 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES) # which is the old architecture supported by nvcc. We don't want that. set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures to build for" FORCE) endif() + +endif() # NOT USE_HIP diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake index c0984eef1..5224d6d19 100644 --- a/cmake/ConfigCompiler.cmake +++ b/cmake/ConfigCompiler.cmake @@ -46,7 +46,13 @@ endif() include(CheckIPOSupported) check_ipo_supported(RESULT LTO_SUPPORTED) -set(LTO_ENABLED ON) +# LTO does not finalize under the HIP link step (device objects stay slim +# bitcode), which breaks linking; keep it off on ROCm. +if(USE_HIP) + set(LTO_ENABLED OFF) +else() + set(LTO_ENABLED ON) +endif() if(ENABLE_SANITIZER) set(COMPILER_SANITIZER_FLAGS diff --git a/cmake/hip/CvCudaHipCompat.h b/cmake/hip/CvCudaHipCompat.h new file mode 100644 index 000000000..69692070a --- /dev/null +++ b/cmake/hip/CvCudaHipCompat.h @@ -0,0 +1,215 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// ROCm/HIP compatibility layer for CV-CUDA. Force-included on every HIP +// translation unit (CMAKE_HIP_FLAGS -include) so the aliases below are in +// scope before any CV-CUDA or CUDA-toolkit header is parsed. The NVIDIA build +// never sees this file: the cmake/hip shim dir and this header are only on the +// HIP include path. Only the actual CUDA-runtime/library symbols CV-CUDA uses +// are aliased here; CV-CUDA's own cudaSubmit/cudaCreate public API +// names are deliberately left untouched. + +#ifndef CVCUDA_HIP_COMPAT_H +#define CVCUDA_HIP_COMPAT_H + +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + +#if defined(__cplusplus) +// libc host declarations must win over HIP's device overloads of memcpy/memset: +// pull them in before the HIP runtime so host TUs keep the standard host +// prototypes. +#include +#include + +#if defined(__HIPCC__) +// .cu translation units (compiled by hipcc) need the full device runtime. +#include +#else +// Plain g++ host TUs only need the runtime API (types + host-callable entry +// points); the device runtime header is heavier and pulls device builtins. +#include +#endif +#endif + +// CUDA_VERSION: several headers (Compat.hpp, Metaprogramming.hpp, StreamId.cpp) +// branch on it. CV-CUDA wants the pre-13.0 compound-type aliases, so report a +// version below 13000. Do NOT define __CUDA_ARCH__ on HIP: the SaturateCast PTX +// table and the NVCV SIMD-video-intrinsic paths are gated on __CUDA_ARCH__ and +// must stay inert, falling through to their portable C++/per-element bodies. +#ifndef CUDA_VERSION +#define CUDA_VERSION 12020 +#endif + +// ---- runtime: error/status ------------------------------------------------- +#define cudaError_t hipError_t +#define cudaError hipError_t +#define cudaSuccess hipSuccess +#define cudaErrorNotReady hipErrorNotReady +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaErrorMemoryAllocation hipErrorOutOfMemory +#define cudaErrorCudartUnloading hipErrorDeinitialized +#define cudaErrorTextureFetchFailed hipErrorInvalidTexture +#define cudaGetLastError hipGetLastError +#define cudaPeekAtLastError hipPeekAtLastError +#define cudaGetErrorString hipGetErrorString +#define cudaGetErrorName hipGetErrorName +#define cudaGetVersion hipRuntimeGetVersion + +// ---- runtime: device ------------------------------------------------------- +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceProp hipDeviceProp_t +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDevAttrTextureAlignment hipDeviceAttributeTextureAlignment +#define cudaDevAttrTexturePitchAlignment hipDeviceAttributeTexturePitchAlignment + +// NVCV derives a tensor/image row-pitch alignment from the texture *pitch* +// alignment device attribute. On NVIDIA that attribute is 32 bytes, so a tightly +// packed image (e.g. a 640-byte uchar row) keeps a 640-byte row stride. AMD +// reports 256 there, which would pad that row to 768 and silently change the +// in-memory layout every NVCV consumer (and the whole-buffer test comparisons) +// assumes. No CV-CUDA tensor is bound to a HW texture object, so the larger +// HW pitch is unnecessary here; clamp the queried pitch alignment to the NVIDIA +// value to keep the byte layout identical to the CUDA build. Other attribute +// queries pass through unchanged. +#if defined(__cplusplus) +__host__ inline hipError_t cvcuda_hipDeviceGetAttribute(int *value, hipDeviceAttribute_t attr, int device) +{ + hipError_t err = hipDeviceGetAttribute(value, attr, device); + if (err == hipSuccess && attr == hipDeviceAttributeTexturePitchAlignment && value && *value > 32) + { + *value = 32; + } + return err; +} +#define cudaDeviceGetAttribute cvcuda_hipDeviceGetAttribute +#else +#define cudaDeviceGetAttribute hipDeviceGetAttribute +#endif + +// ---- runtime: memory ------------------------------------------------------- +#define cudaMalloc hipMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaFree hipFree +#define cudaMallocHost hipHostMalloc +#define cudaHostAlloc hipHostMalloc +#define cudaFreeHost hipHostFree +#define cudaHostFree hipHostFree +#define cudaHostAllocMapped hipHostMallocMapped +#define cudaHostAllocWriteCombined hipHostMallocWriteCombined +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemset2D hipMemset2D +#define cudaMemset2DAsync hipMemset2DAsync +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpy2D hipMemcpy2D +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToHost hipMemcpyHostToHost +#define cudaMemcpyDefault hipMemcpyDefault +#define cudaPointerAttributes hipPointerAttribute_t +#define cudaPointerGetAttributes hipPointerGetAttributes +#define cudaMemoryTypeHost hipMemoryTypeHost +#define cudaMemoryTypeDevice hipMemoryTypeDevice +#define cudaMemoryTypeManaged hipMemoryTypeManaged +#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered + +// ---- runtime: stream / event ---------------------------------------------- +#define cudaStream_t hipStream_t +#define cudaStreamDefault hipStreamDefault +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamGetId hipStreamGetId +#define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamCreateWithPriority hipStreamCreateWithPriority +#define cudaDeviceGetStreamPriorityRange hipDeviceGetStreamPriorityRange +#define cudaEvent_t hipEvent_t +#define cudaEventDefault hipEventDefault +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventCreate hipEventCreate +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventRecord hipEventRecord +#define cudaEventQuery hipEventQuery +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaEventDestroy hipEventDestroy + +// ---- full-wavefront mask --------------------------------------------------- +// __shfl*_sync on ROCm static_asserts a 64-bit mask regardless of wave width. +// The width argument (kept explicit at every call site that needs it) controls +// the subgroup; the mask just marks participants. +#define NVCV_WARP_FULL_MASK 0xffffffffffffffffULL + +// ---- built-in index types -------------------------------------------------- +// On CUDA blockIdx/blockDim/threadIdx are uint3/dim3, so the kernels' common +// idiom `blockIdx * blockDim + threadIdx` resolves through the cuda:: compound +// operators. On HIP these are distinct __hip_builtin_*_t structs (with only a +// dim3 conversion), so neither HIP's nor CV-CUDA's vector operators deduce them. +// Provide the exact whole-vector forms the kernels use, lowering to uint3. These +// take only the builtin index types, which carry no NVCV TypeTraits, so they do +// not compete with the cuda:: compound operators. +// Defined only under hipcc (where the __hip_builtin_*_t types exist and where +// __global__ bodies are parsed); plain g++ host TUs that include only the HIP +// runtime API do not see these builtin types, so the operators must not appear +// there. hipcc parses these in both its host and device passes. +#if defined(__cplusplus) && defined(__HIPCC__) +__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockIdx_t &a, + const __hip_builtin_blockDim_t &b) +{ + return uint3{a.x * b.x, a.y * b.y, a.z * b.z}; +} +__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockDim_t &a, + const __hip_builtin_blockIdx_t &b) +{ + return uint3{a.x * b.x, a.y * b.y, a.z * b.z}; +} +__host__ __device__ __forceinline__ uint3 operator+(const uint3 &a, const __hip_builtin_threadIdx_t &b) +{ + return uint3{a.x + b.x, a.y + b.y, a.z + b.z}; +} +__host__ __device__ __forceinline__ uint3 operator+(const __hip_builtin_threadIdx_t &a, const uint3 &b) +{ + return uint3{a.x + b.x, a.y + b.y, a.z + b.z}; +} +#endif + +// std::declval is __host__-only (libstdc++); clang rejects it in the unevaluated +// decltype of a __device__-only function (where nvcc is lenient). Provide a +// __host__ __device__ equivalent for those few device-side decltype sites. +#if defined(__cplusplus) +namespace nvcv { namespace cuda { namespace compat { +template +__host__ __device__ T &&declval() noexcept; +}}} // namespace nvcv::cuda::compat +#endif + +#endif // __HIP_PLATFORM_AMD__ || USE_HIP + +#endif // CVCUDA_HIP_COMPAT_H diff --git a/cmake/hip/channel_descriptor.h b/cmake/hip/channel_descriptor.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/channel_descriptor.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/cub/cub.cuh b/cmake/hip/cub/cub.cuh new file mode 100644 index 000000000..ae13ba38b --- /dev/null +++ b/cmake/hip/cub/cub.cuh @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + */ + +#pragma once +#include "../CvCudaHipCompat.h" +#include + +// CV-CUDA spells the namespace cub::; hipCUB lives in hipcub::. Aliasing the +// namespace keeps every cub::BlockReduce/BlockScan/BlockRadixSort/DeviceReduce +// and cub::BLOCK_* enum call site unchanged. +namespace hipcub +{ +} +namespace cub = hipcub; diff --git a/cmake/hip/cublas_v2.h b/cmake/hip/cublas_v2.h new file mode 100644 index 000000000..3bb9c7caf --- /dev/null +++ b/cmake/hip/cublas_v2.h @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + */ + +#pragma once +#include "CvCudaHipCompat.h" +#include + +// OpFindHomography only uses the status type and the fill-mode enum from cuBLAS. +#define cublasStatus_t hipblasStatus_t +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER +#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER diff --git a/cmake/hip/cuda.h b/cmake/hip/cuda.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/cuda.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/cuda_fp16.h b/cmake/hip/cuda_fp16.h new file mode 100644 index 000000000..50a827d30 --- /dev/null +++ b/cmake/hip/cuda_fp16.h @@ -0,0 +1,3 @@ +#pragma once +#include "CvCudaHipCompat.h" +#include diff --git a/cmake/hip/cuda_runtime.h b/cmake/hip/cuda_runtime.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/cuda_runtime.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/cuda_runtime_api.h b/cmake/hip/cuda_runtime_api.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/cuda_runtime_api.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/curand_kernel.h b/cmake/hip/curand_kernel.h new file mode 100644 index 000000000..76ec52e2a --- /dev/null +++ b/cmake/hip/curand_kernel.h @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + */ + +#pragma once +#include "CvCudaHipCompat.h" +#include // rocRAND's mtgp32 header calls printf without including it +#include + +#define curandState hiprandState +#define curandState_t hiprandState_t +#define curand_init hiprand_init +#define curand_normal hiprand_normal diff --git a/cmake/hip/cusolverDn.h b/cmake/hip/cusolverDn.h new file mode 100644 index 000000000..173cf6e59 --- /dev/null +++ b/cmake/hip/cusolverDn.h @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + */ + +#pragma once +#include "CvCudaHipCompat.h" +#include + +// OpFindHomography solves a batched symmetric eigenproblem (syevjBatched). The +// hipSOLVER dense Dn API matches cuSOLVER argument-for-argument, so each call +// site ports by name alone. +#define cusolverStatus_t hipsolverStatus_t +#define CUSOLVER_STATUS_SUCCESS HIPSOLVER_STATUS_SUCCESS +#define cusolverDnHandle_t hipsolverDnHandle_t +#define cusolverEigMode_t hipsolverEigMode_t +#define CUSOLVER_EIG_MODE_VECTOR HIPSOLVER_EIG_MODE_VECTOR +#define CUSOLVER_EIG_MODE_NOVECTOR HIPSOLVER_EIG_MODE_NOVECTOR +#define syevjInfo_t hipsolverSyevjInfo_t + +#define cusolverDnCreate hipsolverDnCreate +#define cusolverDnDestroy hipsolverDnDestroy +#define cusolverDnSetStream hipsolverDnSetStream +#define cusolverDnCreateSyevjInfo hipsolverDnCreateSyevjInfo +#define cusolverDnDestroySyevjInfo hipsolverDnDestroySyevjInfo +#define cusolverDnXsyevjSetTolerance hipsolverDnXsyevjSetTolerance +#define cusolverDnXsyevjSetMaxSweeps hipsolverDnXsyevjSetMaxSweeps +#define cusolverDnXsyevjSetSortEig hipsolverDnXsyevjSetSortEig +#define cusolverDnSsyevjBatched_bufferSize hipsolverDnSsyevjBatched_bufferSize +#define cusolverDnSsyevjBatched hipsolverDnSsyevjBatched +#define cusolverDnDsyevjBatched_bufferSize hipsolverDnDsyevjBatched_bufferSize +#define cusolverDnDsyevjBatched hipsolverDnDsyevjBatched diff --git a/cmake/hip/device_launch_parameters.h b/cmake/hip/device_launch_parameters.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/device_launch_parameters.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/driver_types.h b/cmake/hip/driver_types.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/driver_types.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/cmake/hip/library_types.h b/cmake/hip/library_types.h new file mode 100644 index 000000000..08c872744 --- /dev/null +++ b/cmake/hip/library_types.h @@ -0,0 +1,18 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Author: Jeff Daily + */ + +#pragma once +#include "CvCudaHipCompat.h" +#include + +// CUDA's library_types.h provides cudaDataType / CUDA_R_*; map to HIP's. +#define cudaDataType hipDataType +#define CUDA_R_16F HIP_R_16F +#define CUDA_R_32F HIP_R_32F +#define CUDA_R_64F HIP_R_64F +#define CUDA_C_32F HIP_C_32F +#define CUDA_C_64F HIP_C_64F diff --git a/cmake/hip/vector_types.h b/cmake/hip/vector_types.h new file mode 100644 index 000000000..862568004 --- /dev/null +++ b/cmake/hip/vector_types.h @@ -0,0 +1,2 @@ +#pragma once +#include "CvCudaHipCompat.h" diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index a037ee324..cd830e12f 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -281,6 +281,8 @@ The central ``ci/build.sh`` script is used to build the project, Python bindings - ``-DDOC_PYTHON_VERSION='3.11'``: Override Python version for documentation build (default: system Python) - ``-DENABLE_SANITIZER=1|0``: Enable/disable address sanitizer (default: disabled) - ``-DCMAKE_CUDA_COMPILER=/path/to/nvcc``: Override CUDA compiler (default: /usr/local/cuda/bin/nvcc) +- ``-DUSE_HIP=1|0``: Build the GPU code with HIP for AMD GPUs (ROCm) instead of CUDA (default: disabled). See :ref:`Building for AMD GPUs (ROCm) ` below. +- ``-DCMAKE_HIP_ARCHITECTURES='gfx90a'``: AMD GPU architecture(s) to build for when ``USE_HIP=1`` (defaults to ``gfx90a`` when unset). Set to your target, e.g. ``gfx1100`` for RDNA3 desktop GPUs. All boolean options accept both numeric (``0``/``1``) and CMake boolean values (``ON``/``OFF``, ``YES``/``NO``, ``TRUE``/``FALSE``). @@ -310,6 +312,32 @@ All boolean options accept both numeric (``0``/``1``) and CMake boolean values ( # Build with specific CUDA 13 version ci/build.sh -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13/bin/nvcc + # Build for AMD GPUs with HIP/ROCm + ci/build.sh release build-rel -DUSE_HIP=1 -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm + +.. _build-rocm: + +Building for AMD GPUs (ROCm) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CV-CUDA can target AMD GPUs by building its GPU code with HIP instead of CUDA. The HIP path is additive: with ``USE_HIP=0`` (the default) the NVIDIA build is unchanged. + +Prerequisites: + +- A `ROCm `_ installation (7.x recommended), which provides ``hipcc`` and the HIP runtime along with the ROCm math libraries used by some operators (hipCUB, hipBLAS, hipSOLVER, rocRAND). +- CMake and Ninja as for the CUDA build (the CUDA toolkit is not required). + +Build with HIP enabled, selecting the target AMD architecture: + +.. code-block:: shell + + ci/build.sh release build-rel -DUSE_HIP=1 -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm + +If ROCm is not on your ``PATH``, pass ``-DCMAKE_PREFIX_PATH=/opt/rocm`` so CMake finds the hip* packages (``find_package(hip)`` and friends). + +When ``CMAKE_HIP_ARCHITECTURES`` is left unset it defaults to ``gfx90a``; set it to the architecture of your GPU (for example ``gfx1100`` for RDNA3 desktop cards). No source or CMake edits are needed to retarget. The build outputs the same library and test layout as the CUDA build, and the test suites run unchanged on the AMD GPU. + +This support has been validated on the CDNA2 ``gfx90a`` (MI200 series) and RDNA3 ``gfx1100`` architectures on Linux. 1. Run Tests ~~~~~~~~~~~~ diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt index 64c6c3d93..659b5b4e8 100644 --- a/src/cvcuda/CMakeLists.txt +++ b/src/cvcuda/CMakeLists.txt @@ -93,6 +93,12 @@ else() list(APPEND CV_CUDA_LIB_FILES ${CV_CUDA_OP_FILES}) endif() +# cuOSD-dependent operators (OSD, bounding box, box blur) are scoped out of the +# ROCm build (cuOSD is a prebuilt CUDA-only static lib with no source). +if(USE_HIP) + list(FILTER CV_CUDA_LIB_FILES EXCLUDE REGEX "OpOSD|OpBndBox|OpBoxBlur") +endif() + add_library(cvcuda SHARED ${CV_CUDA_LIB_FILES} ) diff --git a/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp b/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp index e44c5e76c..e1c7b7aa9 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp @@ -225,7 +225,9 @@ class InterpolationVarShapeWrap c.x = GetIndexForInterpolation(c.x + .5f); c.y = GetIndexForInterpolation(c.y + .5f); - return doGetValue(c); + // clang/HIP two-phase lookup: doGetValue is inherited from a dependent + // base, so it must be qualified. + return this->doGetValue(c); } }; diff --git a/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp b/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp index 4506cf101..1b7dd161d 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp @@ -112,9 +112,16 @@ NVCV_CUDA_UNARY_OPERATOR(~, nvcv::cuda::detail::IsIntegralCompound) #undef NVCV_CUDA_UNARY_OPERATOR -#define NVCV_CUDA_BINARY_OPERATOR(OPERATOR, REQUIREMENT) \ - template>> \ - inline __host__ __device__ auto operator OPERATOR(T a, U b) \ +// The element-wise body is shared between the generic operator and, on HIP, a +// more-specialized compound-compound overload (see below). HIP's vector types +// (HIP_vector_type) ship their own operator+(vec, U) and operator+(U, +// vec); for a mixed pair like float3 + uchar3 both bind and they are +// equally specialized, so the call is ambiguous (a HIP header limitation). +// CV-CUDA's generic operator(T, U) is less specialized than HIP's and loses +// partial ordering, so on HIP we also emit operator(vec, vec), +// which is more specialized than HIP's pair and wins, while same-type vec+vec +// still resolves to HIP's own (correct) operator and vec+scalar is unchanged. +#define NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR) \ { \ using RT = nvcv::cuda::MakeType< \ decltype(std::declval>() OPERATOR std::declval>()), \ @@ -152,7 +159,103 @@ NVCV_CUDA_UNARY_OPERATOR(~, nvcv::cuda::detail::IsIntegralCompound) else if constexpr (nvcv::cuda::NumElements == 4) \ return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z, a.w OPERATOR b.w}; \ } \ + } + +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +// HIP_vector_type also ships operator(vec, U) / operator(U, vec) with a +// templated scalar U; for a float/double scalar these keep the vector's integer +// element type (truncating) and win partial ordering over CV-CUDA's promoting +// operator(T, U). Emit vec<>+float and vec<>+double overloads (concrete scalar +// types, hence more specialized than HIP's templated U) so CV-CUDA's promoting +// body is selected. The compound-compound overload covers the mixed-vector case. +// Direct element-wise bodies: the shared BODY macro syntax-checks `b.x` in +// untaken if-constexpr branches, which is a hard error when the scalar arg has a +// concrete (non-dependent) type, so the scalar overloads spell the loop out. +#define NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, SCALAR) \ + template>>> \ + inline __host__ __device__ auto operator OPERATOR(HIP_vector_type a, SCALAR b) \ + { \ + using RT = nvcv::cuda::MakeType() OPERATOR std::declval()), N>; \ + if constexpr (N == 1) \ + return RT{a.x OPERATOR b}; \ + else if constexpr (N == 2) \ + return RT{a.x OPERATOR b, a.y OPERATOR b}; \ + else if constexpr (N == 3) \ + return RT{a.x OPERATOR b, a.y OPERATOR b, a.z OPERATOR b}; \ + else \ + return RT{a.x OPERATOR b, a.y OPERATOR b, a.z OPERATOR b, a.w OPERATOR b}; \ + } \ + template>>> \ + inline __host__ __device__ auto operator OPERATOR(SCALAR a, HIP_vector_type b) \ + { \ + using RT = nvcv::cuda::MakeType() OPERATOR std::declval()), N>; \ + if constexpr (N == 1) \ + return RT{a OPERATOR b.x}; \ + else if constexpr (N == 2) \ + return RT{a OPERATOR b.x, a OPERATOR b.y}; \ + else if constexpr (N == 3) \ + return RT{a OPERATOR b.x, a OPERATOR b.y, a OPERATOR b.z}; \ + else \ + return RT{a OPERATOR b.x, a OPERATOR b.y, a OPERATOR b.z, a OPERATOR b.w}; \ + } +// HIP_vector_type ships its own operator OP for every pair (vec,vec), +// (const vec&, U), (U, const vec&) with U unconstrained and the vector by const +// reference. Two cases need CV-CUDA's promoting semantics instead of HIP's: +// * a mixed-element vector pair (e.g. float3 OP uchar3): HIP's (vec,vec) needs +// both element types identical, so both HIP's (const vec&, U) and (U, const +// vec&) are viable and tie, and either truncates to one operand's element +// type. A both-operands-concrete overload is more specialized than either +// HIP form by signature alone, so it wins without a constraint; the SFINAE +// enable_if(!same element type) keeps same-type vec OP vec on HIP's own +// operator. This is the only case the operator kernels exercise, so it is +// emitted unconditionally (C++17-valid). +// * a vector OP dim3 (dim3 is an NVCV compound but not a HIP_vector_type): +// HIP's (const vec&, U=dim3) is chosen and make_vector_type(dim3) is +// ill-formed. dim3 is the only NVCV compound that is not a HIP_vector_type +// (and is always 3 components), so a forward overload with a CONCRETE dim3 +// operand (more specialized than HIP's templated U) wins unambiguously -- +// for the integral-only operators (%,&,|,^,<<,>>) HIP's own operator is +// itself enable_if-constrained, so a requires-clause could not break the +// tie, but a concrete parameter type still does. The 3-element result body +// is spelled out (the shared body's if-constexpr branches reference .w, +// which is a hard error on the concrete 3-element dim3 even when discarded). +// The mirror covers dim3 OP vec; both are plain C++17. +#define NVCV_CUDA_BINARY_OPERATOR_HIP_DIM3(OPERATOR) \ + template>>> \ + inline __host__ __device__ auto operator OPERATOR(const HIP_vector_type &a, dim3 b) \ + { \ + using RT = nvcv::cuda::MakeType() OPERATOR std::declval()), 3>;\ + return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z}; \ + } \ + template>>> \ + inline __host__ __device__ auto operator OPERATOR(dim3 a, const HIP_vector_type &b) \ + { \ + using RT = nvcv::cuda::MakeType() OPERATOR std::declval()), 3>;\ + return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z}; \ + } +#define NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR) \ + template>> \ + inline __host__ __device__ auto operator OPERATOR(const HIP_vector_type &a, \ + const HIP_vector_type &b) \ + { \ + using T = HIP_vector_type; \ + using U = HIP_vector_type; \ + NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR) \ } \ + NVCV_CUDA_BINARY_OPERATOR_HIP_DIM3(OPERATOR) \ + NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, float) \ + NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, double) +#else +#define NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR) +#endif + +#define NVCV_CUDA_BINARY_OPERATOR(OPERATOR, REQUIREMENT) \ + template>> \ + inline __host__ __device__ auto operator OPERATOR(T a, U b) NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR) \ + NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR) \ template>> \ inline __host__ __device__ T &operator OPERATOR##=(T &a, U b) \ { \ @@ -196,6 +299,55 @@ inline __host__ __device__ bool operator!=(T a, U b) return !(a == b); } +// HIP_vector_type ships operator==/!= for (vec,vec), (const vec&, U), (U, const +// vec&). A mixed pair like int3 == long3 makes both HIP (const vec&, U) and (U, +// const vec&) viable (tie), and vec == dim3 is ill-formed inside +// make_vector_type(dim3). Same structure and rationale as the arithmetic fix: a +// both-operands-concrete overload (more specialized than either HIP form) for +// the mixed-element vector pair (SFINAE-excluding same element type so HIP's own +// operator keeps that), plus a concrete-dim3 forward and mirror. All C++17. +// Bodies are spelled out (a concrete operand trips the if-constexpr probe of the +// shared arithmetic body). +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#define NVCV_CUDA_HIP_COMPARE_BODY(RET_ON_DIFF, RET_DEFAULT) \ + { \ + bool eq = true; \ + if constexpr (N >= 1) \ + eq = eq && (a.x == b.x); \ + if constexpr (N >= 2) \ + eq = eq && (a.y == b.y); \ + if constexpr (N >= 3) \ + eq = eq && (a.z == b.z); \ + if constexpr (N == 4) \ + eq = eq && (a.w == b.w); \ + return eq ? RET_DEFAULT : RET_ON_DIFF; \ + } +#define NVCV_CUDA_HIP_COMPARE(OPERATOR, RET_ON_DIFF, RET_DEFAULT) \ + template>> \ + inline __host__ __device__ bool operator OPERATOR(const HIP_vector_type &a, \ + const HIP_vector_type &b) \ + NVCV_CUDA_HIP_COMPARE_BODY(RET_ON_DIFF, RET_DEFAULT) \ + template>>> \ + inline __host__ __device__ bool operator OPERATOR(const HIP_vector_type &a, dim3 b) \ + { \ + bool eq = (a.x == b.x) && (a.y == b.y) && (a.z == b.z); \ + return eq ? RET_DEFAULT : RET_ON_DIFF; \ + } \ + template>>> \ + inline __host__ __device__ bool operator OPERATOR(dim3 a, const HIP_vector_type &b) \ + { \ + bool eq = (a.x == b.x) && (a.y == b.y) && (a.z == b.z); \ + return eq ? RET_DEFAULT : RET_ON_DIFF; \ + } + +NVCV_CUDA_HIP_COMPARE(==, false, true) +NVCV_CUDA_HIP_COMPARE(!=, true, false) + +#undef NVCV_CUDA_HIP_COMPARE +#undef NVCV_CUDA_HIP_COMPARE_BODY +#endif + namespace nvcv::cuda { template>> diff --git a/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp b/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp index e64481777..9b40636be 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp @@ -26,7 +26,7 @@ namespace nvcv::cuda::detail { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) template __device__ __forceinline__ T DeviceRoundImpl(U u) @@ -277,6 +277,14 @@ __device__ __forceinline__ T DeviceRoundImpl(U u) template __device__ __forceinline__ U DeviceMinImpl(U a, U b) { +#if defined(__HIP_DEVICE_COMPILE__) + // HIP's global namespace has no typed umin/ullmin/llmin; the ternary lowers to + // the same integer-min instruction the CUDA builtins emit. The comparison is + // spelled exactly like the host std::min fallback (b < a ? b : a) so the device + // result is identical to the host reference for every input, including the + // NaN / signed-zero cases the morphology tests exercise with raw-byte floats. + return b < a ? b : a; +#else if constexpr (std::is_same_v) { return ::umin(a, b); @@ -293,11 +301,17 @@ __device__ __forceinline__ U DeviceMinImpl(U a, U b) { return ::min(a, b); } +#endif } template __device__ __forceinline__ U DeviceMaxImpl(U a, U b) { +#if defined(__HIP_DEVICE_COMPILE__) + // Spelled exactly like the host std::max fallback (a < b ? b : a) so the device + // max matches the host reference exactly on NaN / signed-zero inputs. + return a < b ? b : a; +#else if constexpr (std::is_same_v) { return ::umax(a, b); @@ -314,6 +328,7 @@ __device__ __forceinline__ U DeviceMaxImpl(U a, U b) { return ::max(a, b); } +#endif } template @@ -357,6 +372,21 @@ __device__ __forceinline__ U DeviceExpImpl(U u) template __device__ __forceinline__ U DeviceSqrtImpl(U u) { +#if defined(__HIP_DEVICE_COMPILE__) + // gfx90a's single-precision __fsqrt_rn is not always correctly rounded (it can + // be 1 ULP off, e.g. sqrt(93606.0f)), whereas CUDA's sqrt.rn.f32 and the host + // std::sqrt are correctly rounded. Route 32-bit sqrt through the correctly + // rounded f64 sqrt and round once to keep results identical to the CUDA + // build and the CPU references (CDNA has fast f64 sqrt). + if constexpr (std::is_same_v) + { + return __dsqrt_rn(u); + } + else + { + return static_cast(__dsqrt_rn(static_cast(u))); + } +#else if constexpr (std::is_same_v) { return __fsqrt_rn(u); @@ -373,6 +403,7 @@ __device__ __forceinline__ U DeviceSqrtImpl(U u) { return static_cast(__dsqrt_rn(static_cast(u))); } +#endif } template @@ -417,7 +448,7 @@ inline __host__ U RoundEvenImpl(U u) template inline __host__ __device__ T RoundImpl(U u) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceRoundImpl(u); #else // In host we use C++ to do round depending on round mode by selecting at compile time the correct function: @@ -444,7 +475,7 @@ inline __host__ __device__ T RoundImpl(U u) template inline __host__ __device__ U MinImpl(U a, U b) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceMinImpl(a, b); #else return std::min(a, b); @@ -454,7 +485,7 @@ inline __host__ __device__ U MinImpl(U a, U b) template inline __host__ __device__ U MaxImpl(U a, U b) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceMaxImpl(a, b); #else return std::max(a, b); @@ -464,7 +495,7 @@ inline __host__ __device__ U MaxImpl(U a, U b) template inline __host__ __device__ U PowImpl(U x, S y) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DevicePowImpl(x, y); #else return std::pow(x, y); @@ -474,7 +505,7 @@ inline __host__ __device__ U PowImpl(U x, S y) template inline __host__ __device__ U ExpImpl(U u) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceExpImpl(u); #else return std::exp(u); @@ -484,7 +515,7 @@ inline __host__ __device__ U ExpImpl(U u) template inline __host__ __device__ U SqrtImpl(U u) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceSqrtImpl(u); #else return std::sqrt(u); @@ -500,7 +531,7 @@ inline __host__ __device__ U AbsImpl(U u) } else { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return DeviceAbsImpl(u); #else return std::abs(u); diff --git a/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp b/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp index fc0a9f719..528b79c4b 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp @@ -93,7 +93,15 @@ NVCV_CUDA_TYPE_TRAITS(double, double, 0, 1, DBL_MIN, DBL_MAX); NVCV_CUDA_TYPE_TRAITS_1_TO_3(COMPOUND_TYPE, BASE_TYPE, MIN_VAL, MAX_VAL); \ NVCV_CUDA_TYPE_TRAITS(COMPOUND_TYPE##4, BASE_TYPE, 4, 4, MIN_VAL, MAX_VAL) +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +// HIP's char1..4 are HIP_vector_type, whose members are plain `char` +// (not `signed char` as in CUDA's vector types), so the base_type must be +// `char` for GetElement's reference return to bind. `char` is signed on the +// supported targets, so the numeric range is unchanged. +NVCV_CUDA_TYPE_TRAITS_1_TO_4(char, char, SCHAR_MIN, SCHAR_MAX); +#else NVCV_CUDA_TYPE_TRAITS_1_TO_4(char, signed char, SCHAR_MIN, SCHAR_MAX); +#endif NVCV_CUDA_TYPE_TRAITS_1_TO_4(uchar, unsigned char, 0, UCHAR_MAX); NVCV_CUDA_TYPE_TRAITS_1_TO_4(short, short, SHRT_MIN, SHRT_MAX); NVCV_CUDA_TYPE_TRAITS_1_TO_4(ushort, unsigned short, 0, USHRT_MAX); diff --git a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp index c413a4685..609334cde 100644 --- a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp +++ b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp @@ -219,8 +219,12 @@ class Vector return v; } - // On-purpose public data to allow POD-class direct initialization. -#ifdef __CUDA_ARCH__ + // On-purpose public data to allow POD-class direct initialization. The + // default member initializer is omitted on HIP in BOTH compiler passes (not + // just the device pass): Vector is used as a __shared__ variable, and a + // __shared__ object's type must have no initializer, which clang enforces in + // the host pass too. +#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) T m_data[N]; #else T m_data[N] = {}; @@ -466,7 +470,7 @@ namespace detail { template constexpr __host__ __device__ void swap(T &a, T &b) { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) T c = a; a = b; b = c; diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt index deac31c27..c93345216 100644 --- a/src/cvcuda/priv/CMakeLists.txt +++ b/src/cvcuda/priv/CMakeLists.txt @@ -91,6 +91,12 @@ else() list(APPEND CV_CUDA_PRIV_FILES ${CV_CUDA_PRIV_OP_FILES}) endif() +# cuOSD is a prebuilt CUDA-only static lib with no source, so the operators that +# depend on it (OSD, bounding box, box blur) are scoped out of the ROCm build. +if(USE_HIP) + list(FILTER CV_CUDA_PRIV_FILES EXCLUDE REGEX "OpOSD|OpBndBox|OpBoxBlur") +endif() + add_library(cvcuda_priv STATIC ${CV_CUDA_PRIV_FILES} ) diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu index 6313b45a5..b049446c2 100644 --- a/src/cvcuda/priv/OpBrightnessContrast.cu +++ b/src/cvcuda/priv/OpBrightnessContrast.cu @@ -120,7 +120,13 @@ inline __device__ void DoBrightnessContrast(SrcWrapper src, DstWrapper dst, cons { using SrcT = std::remove_const_t; using DstT = typename DstWrapper::ValueType; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // std::declval is __host__-only; use the __host__ __device__ shim in this + // device-side decltype (clang rejects host fns in __device__ unevaluated ctx). + using IntermediateT = decltype(nvcv::cuda::compat::declval() * nvcv::cuda::compat::declval()); +#else using IntermediateT = decltype(std::declval() * std::declval()); +#endif using SrcBT = cuda::BaseType; using DstBT = cuda::BaseType; using BI = cuda::BaseType; diff --git a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu index cfae84add..6fe40804f 100644 --- a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu +++ b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu @@ -71,7 +71,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, Dst { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift); } } @@ -81,7 +81,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap srcWrap, Dst { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift); } } @@ -99,7 +99,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } @@ -109,7 +109,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC srcWrap, { float base = get_base_value(baseWrap, c, base_channels); float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags); - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast( (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift); } } @@ -122,14 +122,14 @@ __device__ void set_data(DstWrapper dstWrap, int2 dst_idx, int batchidx, int ch, { for (int c = 0; c < ch; c++) { - dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast(val); + dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast(val); } } else { for (int c = 0; c < ch; c++) { - dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast(val); + dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast(val); } } } diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu index abd413dbc..3829c041e 100644 --- a/src/cvcuda/priv/OpFindHomography.cu +++ b/src/cvcuda/priv/OpFindHomography.cu @@ -384,7 +384,11 @@ __device__ void reducef(float *data, cuda::math::Vector &warpSums, fl int tid = threadIdx.x; int idx = threadIdx.x + blockIdx.x * blockDim.x; float val = 0.0f; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + unsigned long long mask = NVCV_WARP_FULL_MASK; +#else unsigned mask = 0xFFFFFFFFU; +#endif int lane = threadIdx.x % warpSize; int warpID = threadIdx.x / warpSize; while (idx < numPoints) @@ -417,7 +421,11 @@ __device__ void reducef2(float2 *data, cuda::math::Vector &warpSums, int tid = threadIdx.x; int idx = threadIdx.x + blockIdx.x * blockDim.x; float2 val = {0.0f, 0.0f}; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + unsigned long long mask = NVCV_WARP_FULL_MASK; +#else unsigned mask = 0xFFFFFFFFU; +#endif int lane = threadIdx.x % warpSize; int warpID = threadIdx.x / warpSize; while (idx < numPoints) @@ -462,7 +470,11 @@ __device__ void reduceLtL(float2 *src, float2 *dst, cuda::math::Vector __f if constexpr (kSupportsLdg) { +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // HIP's __ldg has no overload for every vector element type (e.g. + // ushort4); a plain load is correct (the read-only cache hint is + // advisory and maps to a normal load on CDNA anyway). + return *GetWrapPtr(wrap, idxs...); +#else return __ldg(GetWrapPtr(wrap, idxs...)); +#endif } else if constexpr (!kSupportsLdg) { diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu index 726550512..12f273281 100644 --- a/src/cvcuda/priv/OpLabel.cu +++ b/src/cvcuda/priv/OpLabel.cu @@ -66,6 +66,18 @@ namespace cuda = nvcv::cuda; namespace util = nvcv::util; +// Connected-component labelling shifts the per-row left neighbour with a +// delta-1 warp shuffle. The block is (BW=32, BH=4[, BD=2]) so each image row is +// one 32-lane subgroup; the shuffles below pass an explicit width of 32 so they +// stay within a row on a 64-lane CDNA wavefront (two rows per wavefront) exactly +// as they do within a 32-lane NVIDIA warp. The mask only marks participants and +// must be 64-bit on ROCm. +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK +#else +#define NVCV_SHFL_MASK 0xffffffff +#endif + namespace { constexpr int REGION_NOT_MARKED = 0; @@ -135,7 +147,12 @@ __global__ void BlockLabel2D(DstWrap dst, SrcWrap src, ArgWrap minThresh, Ar using DT = typename DstWrap::ValueType; __shared__ DT labels[BW * BH]; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // threadIdx is __hip_builtin_threadIdx_t (no NVCV TypeTraits); convert to uint3. + int2 tc = cuda::StaticCast(cuda::DropCast<2>(uint3{threadIdx.x, threadIdx.y, threadIdx.z})); +#else int2 tc = cuda::StaticCast(cuda::DropCast<2>(threadIdx)); +#endif int3 gc{(int)(blockIdx.x * BW) + tc.x, (int)(blockIdx.y * BH) + tc.y, (int)blockIdx.z}; bool nym1x, nyxm1, nym1xm1; @@ -167,8 +184,8 @@ __global__ void BlockLabel2D(DstWrap dst, SrcWrap src, ArgWrap minThresh, Ar pym1x = (tc.y > 0) ? (pym1x > maxThreshold ? 0 : 1) : 0; } - ST pyxm1 = __shfl_up_sync(__activemask(), pyx, 1); - ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1); + ST pyxm1 = __shfl_up_sync(__activemask(), pyx, 1, 32); + ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1, 32); nym1x = (tc.y > 0) ? (pyx == pym1x) : false; nyxm1 = (tc.x > 0) ? (pyx == pyxm1) : false; @@ -252,8 +269,8 @@ __global__ void YLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap minThres pym1x = pym1x > maxThreshold ? 0 : 1; } - ST pyxm1 = __shfl_up_sync(0xffffffff, pyx, 1); - ST pym1xm1 = __shfl_up_sync(0xffffffff, pym1x, 1); + ST pyxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pyx, 1, 32); + ST pym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pym1x, 1, 32); if ((pyx == pym1x) && ((threadIdx.x == 0) || (pyx != pyxm1) || (pyx != pym1xm1))) { @@ -305,8 +322,8 @@ __global__ void XLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap minThres bool thread_y = (gc.y % blockDim.y) == 0; - ST pym1x = __shfl_up_sync(0xffffffff, pyx, 1); - ST pym1xm1 = __shfl_up_sync(0xffffffff, pyxm1, 1); + ST pym1x = __shfl_up_sync(NVCV_SHFL_MASK, pyx, 1, 32); + ST pym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pyxm1, 1, 32); if ((pyx == pyxm1) && (thread_y || (pyx != pym1x) || (pyx != pym1xm1))) { @@ -679,7 +696,11 @@ __global__ void BlockLabel3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, Ar __shared__ DT labels[BW * BH * BD]; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + int3 tc = cuda::StaticCast(uint3{threadIdx.x, threadIdx.y, threadIdx.z}); +#else int3 tc = cuda::StaticCast(threadIdx); +#endif int4 gc{(int)blockIdx.x * BW + tc.x, (int)blockIdx.y * BH + tc.y, (int)blockIdx.z * BD + tc.z, 0}; bool nzm1yx, nzym1x, nzyxm1, nzym1xm1, nzm1yxm1, nzm1ym1x; @@ -722,9 +743,9 @@ __global__ void BlockLabel3D(DstWrap dst, SrcWrap src, ArgWrap minThresh, Ar pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x > maxThreshold ? 0 : 1) : 0; } - ST pzyxm1 = __shfl_up_sync(__activemask(), pzyx, 1); - ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1); - ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1); + ST pzyxm1 = __shfl_up_sync(__activemask(), pzyx, 1, 32); + ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1, 32); + ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1, 32); nzm1yx = (tc.z > 0) && (pzyx == pzm1yx); nzym1x = (tc.y > 0) && (pzyx == pzym1x); @@ -827,8 +848,8 @@ __global__ void ZLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThres pzm1yx = pzm1yx > maxThreshold ? 0 : 1; } - ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); - ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzm1yx, 1); + ST pzyxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32); + ST pzm1yxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzm1yx, 1, 32); if (pzyx == pzm1yx) { @@ -910,8 +931,8 @@ __global__ void YLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThres pzym1x = pzym1x > maxThreshold ? 0 : 1; } - ST pzyxm1 = __shfl_up_sync(0xffffffff, pzyx, 1); - ST pzym1xm1 = __shfl_up_sync(0xffffffff, pzym1x, 1); + ST pzyxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32); + ST pzym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzym1x, 1, 32); if (pzyx == pzym1x) { @@ -993,8 +1014,8 @@ __global__ void XLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap minThres pzyxm1 = pzyxm1 > maxThreshold ? 0 : 1; } - ST pzm1yx = __shfl_up_sync(0xffffffff, pzyx, 1); - ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzyxm1, 1); + ST pzm1yx = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32); + ST pzm1yxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzyxm1, 1, 32); if (pzyx == pzyxm1) { diff --git a/src/cvcuda/priv/OpMinMaxLoc.cu b/src/cvcuda/priv/OpMinMaxLoc.cu index 85bc71cd2..5f56d8ea9 100644 --- a/src/cvcuda/priv/OpMinMaxLoc.cu +++ b/src/cvcuda/priv/OpMinMaxLoc.cu @@ -112,12 +112,19 @@ struct OpMin using OutType = OutputType; using BaseOutType = cuda::BaseType; - static constexpr OutType init = {cuda::TypeTraits::max}; + // A function rather than a static constexpr member: OutType is a vector type + // whose ctor is non-constexpr under HIP (HIP_vector_type), so a static const + // initializer cannot be emitted on the device (clang). Returning it by value + // keeps the same codegen after inlining on both backends. + __host__ __device__ static OutType init() + { + return OutType{cuda::TypeTraits::max}; + } template __device__ inline static void initFill(OutWrapper out, int z) { - get<0>(out)[z] = init; + get<0>(out)[z] = init(); } template @@ -141,12 +148,15 @@ struct OpMax using OutType = OutputType; using BaseOutType = cuda::BaseType; - static constexpr OutType init = {cuda::Lowest}; + __host__ __device__ static OutType init() + { + return OutType{cuda::Lowest}; + } template __device__ inline static void initFill(OutWrapper out, int z) { - get<0>(out)[z] = init; + get<0>(out)[z] = init(); } template @@ -170,13 +180,23 @@ struct OpMinMax using BaseOutType = cuda::BaseType>; using OutType = cuda::MakeType; - static constexpr OutType init = {cuda::TypeTraits::max, cuda::Lowest}; + __host__ __device__ static OutType init() + { + return OutType{cuda::TypeTraits::max, cuda::Lowest}; + } template __device__ inline static void initFill(OutWrapper out, int z) { - get<0>(out)[z] = {init.x}; - get<1>(out)[z] = {init.y}; + OutType v = init(); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // HIP_vector_type has no braced-init-list operator=; set the lane member. + get<0>(out)[z].x = v.x; + get<1>(out)[z].x = v.y; +#else + get<0>(out)[z] = {v.x}; + get<1>(out)[z] = {v.y}; +#endif } template @@ -349,7 +369,7 @@ __global__ __launch_bounds__(BW *BH) void FindMinMax(InWrapper in, int2 size, Ou int x = (blockIdx.x * blockDim.x + threadIdx.x) * TW; int y = blockIdx.y * BH * TH + threadIdx.y; - auto threadRet = OP::init; + auto threadRet = OP::init(); #pragma unroll for (int i = 0; i < TH; ++i) diff --git a/src/cvcuda/priv/OpPairwiseMatcher.cu b/src/cvcuda/priv/OpPairwiseMatcher.cu index 9e6af5f6a..b4098ba5e 100644 --- a/src/cvcuda/priv/OpPairwiseMatcher.cu +++ b/src/cvcuda/priv/OpPairwiseMatcher.cu @@ -100,17 +100,44 @@ public: #pragma unroll for (int i = 0; i < kNumElem && i < util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)); ++i) { +#if defined(__HIP__) + data.words[i] + = *reinterpret_cast(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T)))); +#else data[i] = *reinterpret_cast(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T)))); +#endif } } inline __device__ T &operator[](int i) const { +#if defined(__HIP__) + // The cache is filled as RT (uint32) words in load() but read back as type + // T here. Punning through a union (instead of reinterpret_cast on a private + // RT[] array) keeps the access well defined so the clang/HIP device + // optimizer does not treat the T reads as non-aliasing with the RT stores + // and elide them -- which on HIP left every computed distance unset. nvcc + // does not elide the reinterpret_cast read, and a union with a const-qualified + // variant member would delete this class's default constructor under nvcc, so + // the CUDA path keeps the original array spelling unchanged. + return const_cast(data.elems[i]); +#else return reinterpret_cast(&data[0])[i]; +#endif } private: +#if defined(__HIP__) + union Cache + { + RT words[kNumElem]; + T elems[kNumElem * (int)(sizeof(RT) / sizeof(T))]; + }; + + Cache data; +#else RT data[kNumElem]; +#endif }; // Is compatible checks if a {numDim}-dimensional point fits in the corresponding Point T class (above) @@ -249,6 +276,11 @@ inline __device__ void SortKeyValue(float &sortedDist, int &sortedIdx, const Poi sortedIdx = values[0]; } } + + // SortKeyValue is called twice in the crossCheck path and the compiler aliases + // the two function-local CUB TempStorage allocations. CUB requires a + // __syncthreads() before that shared storage is reused by the next collective. + __syncthreads(); } // Write a match of (set1Idx, set2Idx) with (distance) found at matchIdx inside output matches and distances diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt index 4a265c8ea..56fe44fde 100644 --- a/src/cvcuda/priv/legacy/CMakeLists.txt +++ b/src/cvcuda/priv/legacy/CMakeLists.txt @@ -104,6 +104,12 @@ else() list(APPEND CV_CUDA_PRIV_LEGACY_FILES ${CV_CUDA_PRIV_LEGACY_OP_FILES}) endif() +# cuOSD-backed kernels (osd, box_blur, the text backend) are scoped out of the +# ROCm build (cuOSD is a prebuilt CUDA-only static lib with no source). +if(USE_HIP) + list(FILTER CV_CUDA_PRIV_LEGACY_FILES EXCLUDE REGEX "osd\\.cu|box_blur\\.cu|textbackend/") +endif() + add_library(cvcuda_legacy STATIC ${CV_CUDA_PRIV_LEGACY_FILES} ) diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h index fd9ad2515..32cd43b32 100644 --- a/src/cvcuda/priv/legacy/CvCudaLegacy.h +++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h @@ -18,7 +18,11 @@ #ifndef CV_CUDA_LEGACY_H #define CV_CUDA_LEGACY_H +// CvCudaOSD.hpp pulls in cuOSD, a prebuilt CUDA-only static lib with no source. +// The OSD/BndBox/BoxBlur operators it backs are scoped out of the ROCm build. +#if !defined(USE_HIP) #include "CvCudaOSD.hpp" +#endif #include #include @@ -2043,6 +2047,7 @@ class JointBilateralFilterVarShape : public CudaBaseOp NVCVBorderType borderMode, cudaStream_t stream); }; +#if !defined(USE_HIP) class OSD : public CudaBaseOp { public: @@ -2111,6 +2116,7 @@ class BoxBlur : public CudaBaseOp private: nvcv::cuda::osd::cuOSDContext_t m_context; }; +#endif // !USE_HIP class CvtColor : public CudaBaseOp { diff --git a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu index c8193ccc5..1c9a7bd1e 100644 --- a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu +++ b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu @@ -261,6 +261,12 @@ ErrorCode HistogramEqVarShape::infer(const nvcv::ImageBatchVarShapeDataStridedCu cuda::ImageBatchVarShapeWrapNHWC src(inData, channels); auto histo = nvcv::cuda::Tensor2DWrap(m_histoArray, (int)(256 * channels * sizeof(int))); + // hist_kernel accumulates into m_histoArray with atomicAdd, so it must start + // at zero. The tensor HistogramEq path memsets it (see HistogramEq::infer); + // this varshape path relied on freshly cudaMalloc'd memory reading back as + // zero, which does not hold for recycled hipMalloc allocations. + checkCudaErrors(cudaMemsetAsync(m_histoArray, 0, m_sizeOfHisto, stream)); + { //compute the histogram for each image in the batch into m_histoArray int bsX = 32; //1024 ( 4 ch of 256 bins) diff --git a/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh b/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh index 197170549..1936f6d56 100644 --- a/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh +++ b/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh @@ -28,7 +28,11 @@ namespace nvcv::legacy::cuda_op { +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#define FINAL_MASK 0xffffffffffffffffULL +#else #define FINAL_MASK 0xffffffff +#endif template __inline__ __device__ T warpReduceSum(T val) diff --git a/src/cvcuda/priv/legacy/resize_var_shape.cu b/src/cvcuda/priv/legacy/resize_var_shape.cu index e7342bb16..01456d938 100644 --- a/src/cvcuda/priv/legacy/resize_var_shape.cu +++ b/src/cvcuda/priv/legacy/resize_var_shape.cu @@ -277,7 +277,11 @@ __global__ void resize_area_ocv_align(const cuda::ImageBatchVarShapeWrap(fsy2); using work_type = cuda::ConvertBaseTypeTo; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + work_type out = {}; // HIP_vector_type's single-arg ctor is explicit; value-init zeroes all lanes +#else work_type out = {0}; +#endif int3 srcCoord = {0, 0, batch_idx}; @@ -312,7 +316,11 @@ __global__ void resize_area_ocv_align(const cuda::ImageBatchVarShapeWrap; +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + work_type out = {}; // HIP_vector_type's single-arg ctor is explicit; value-init zeroes all lanes +#else work_type out = {0}; +#endif int3 srcCoord = {0, 0, batch_idx}; diff --git a/src/cvcuda/priv/legacy/threshold.cu b/src/cvcuda/priv/legacy/threshold.cu index c002d0121..b5918e3a7 100644 --- a/src/cvcuda/priv/legacy/threshold.cu +++ b/src/cvcuda/priv/legacy/threshold.cu @@ -31,6 +31,16 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::cuda; +// Otsu's per-warp scans partition the 256-thread block as warp = localid / 32, +// so the butterfly/scan shuffles below pass an explicit width of 32 to stay +// within a 32-lane subgroup on a 64-lane CDNA wavefront. The mask only marks +// participants and must be 64-bit on ROCm. +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK +#else +#define NVCV_SHFL_MASK 0xffffffff +#endif + template __global__ void Binary_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap _thresh, Tensor1DWrap _maxval, int height, int width, int channel) @@ -493,6 +503,17 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i if (localid < 64) reduce[localid] = reduce[localid] + reduce[localid + 64]; __syncthreads(); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // The 64-lane CDNA wavefront gives no warp-synchronous lockstep guarantee + // across the unsynchronized tail below, so keep the __syncthreads tree down + // to one element (same add order, block-wide barrier each step). + for (int s = 32; s > 0; s >>= 1) + { + if (localid < s) + reduce[localid] = reduce[localid] + reduce[localid + s]; + __syncthreads(); + } +#else if (localid < 32) { reduce[localid] = reduce[localid] + reduce[localid + 32]; @@ -503,6 +524,7 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i reduce[localid] = reduce[localid] + reduce[localid + 1]; } __syncthreads(); +#endif mu = reduce[0] * scale; __syncthreads(); @@ -513,11 +535,11 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i int lane = localid % 32, warp = localid / 32; // sum of q1 in warp double temp = q1; - temp += __shfl_xor_sync(0xffffffff, temp, 1); - temp += __shfl_xor_sync(0xffffffff, temp, 2); - temp += __shfl_xor_sync(0xffffffff, temp, 4); - temp += __shfl_xor_sync(0xffffffff, temp, 8); - temp += __shfl_xor_sync(0xffffffff, temp, 16); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32); if (lane == 0) reduce[warp] = temp; __syncthreads(); @@ -536,19 +558,19 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i } __syncthreads(); // prefix scan in warp - temp = __shfl_up_sync(0xffffffff, q1, 1); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 1, 32); if (lane >= 1) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 2); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 2, 32); if (lane >= 2) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 4); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 4, 32); if (lane >= 4) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 8); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 8, 32); if (lane >= 8) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 16); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 16, 32); if (lane >= 16) q1 += temp; q1 += reduce[warp]; @@ -560,11 +582,11 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i double one = localid * hist[localid] * scale; // sum of q1 in warp temp = one; - temp += __shfl_xor_sync(0xffffffff, temp, 1); - temp += __shfl_xor_sync(0xffffffff, temp, 2); - temp += __shfl_xor_sync(0xffffffff, temp, 4); - temp += __shfl_xor_sync(0xffffffff, temp, 8); - temp += __shfl_xor_sync(0xffffffff, temp, 16); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32); if (lane == 0) reduce[warp] = temp; __syncthreads(); @@ -583,19 +605,19 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i } __syncthreads(); // prefix scan in warp - temp = __shfl_up_sync(0xffffffff, one, 1); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 1, 32); if (lane >= 1) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 2); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 2, 32); if (lane >= 2) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 4); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 4, 32); if (lane >= 4) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 8); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 8, 32); if (lane >= 8) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 16); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 16, 32); if (lane >= 16) one += temp; one += reduce[warp]; @@ -635,6 +657,20 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i } __syncthreads(); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + for (int s = 32; s > 0; s >>= 1) + { + if (localid < s && reduce[localid + s] >= reduce[localid]) + { + if (reduce[localid + s] == reduce[localid]) + idx[localid] = min(idx[localid], idx[localid + s]); + else + idx[localid] = idx[localid + s]; + reduce[localid] = reduce[localid + s]; + } + __syncthreads(); + } +#else if (localid < 32) { if (reduce[localid + 32] >= reduce[localid]) @@ -687,6 +723,7 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap thresh, i } } __syncthreads(); +#endif // write to gpu memory if (localid == 0) diff --git a/src/cvcuda/priv/legacy/threshold_var_shape.cu b/src/cvcuda/priv/legacy/threshold_var_shape.cu index e0c865101..551d70681 100644 --- a/src/cvcuda/priv/legacy/threshold_var_shape.cu +++ b/src/cvcuda/priv/legacy/threshold_var_shape.cu @@ -31,6 +31,15 @@ using namespace nvcv::legacy::cuda_op; using namespace nvcv::cuda; +// See threshold.cu: Otsu's per-warp scans use warp = localid / 32, so the +// shuffles pass an explicit width of 32 to stay within a 32-lane subgroup on a +// 64-lane CDNA wavefront. The mask must be 64-bit on ROCm. +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK +#else +#define NVCV_SHFL_MASK 0xffffffff +#endif + template> __global__ void Binary_overflow(ImageBatchVarShapeWrapNHWC src, ImageBatchVarShapeWrapNHWC dst, Tensor1DWrap _thresh, Tensor1DWrap _maxval, @@ -699,6 +708,16 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap if (localid < 64) reduce[localid] = reduce[localid] + reduce[localid + 64]; __syncthreads(); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // No warp-synchronous lockstep guarantee on a 64-lane CDNA wavefront; keep + // the __syncthreads tree down to one element (same add order each step). + for (int s = 32; s > 0; s >>= 1) + { + if (localid < s) + reduce[localid] = reduce[localid] + reduce[localid + s]; + __syncthreads(); + } +#else if (localid < 32) { reduce[localid] = reduce[localid] + reduce[localid + 32]; @@ -709,6 +728,7 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap reduce[localid] = reduce[localid] + reduce[localid + 1]; } __syncthreads(); +#endif mu = reduce[0] * scale; __syncthreads(); @@ -719,11 +739,11 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap int lane = localid % 32, warp = localid / 32; // sum of q1 in warp double temp = q1; - temp += __shfl_xor_sync(0xffffffff, temp, 1); - temp += __shfl_xor_sync(0xffffffff, temp, 2); - temp += __shfl_xor_sync(0xffffffff, temp, 4); - temp += __shfl_xor_sync(0xffffffff, temp, 8); - temp += __shfl_xor_sync(0xffffffff, temp, 16); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32); if (lane == 0) reduce[warp] = temp; __syncthreads(); @@ -742,19 +762,19 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap } __syncthreads(); // prefix scan in warp - temp = __shfl_up_sync(0xffffffff, q1, 1); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 1, 32); if (lane >= 1) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 2); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 2, 32); if (lane >= 2) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 4); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 4, 32); if (lane >= 4) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 8); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 8, 32); if (lane >= 8) q1 += temp; - temp = __shfl_up_sync(0xffffffff, q1, 16); + temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 16, 32); if (lane >= 16) q1 += temp; q1 += reduce[warp]; @@ -766,11 +786,11 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap double one = localid * hist[localid] * scale; // sum of q1 in warp temp = one; - temp += __shfl_xor_sync(0xffffffff, temp, 1); - temp += __shfl_xor_sync(0xffffffff, temp, 2); - temp += __shfl_xor_sync(0xffffffff, temp, 4); - temp += __shfl_xor_sync(0xffffffff, temp, 8); - temp += __shfl_xor_sync(0xffffffff, temp, 16); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32); + temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32); if (lane == 0) reduce[warp] = temp; __syncthreads(); @@ -789,19 +809,19 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap } __syncthreads(); // prefix scan in warp - temp = __shfl_up_sync(0xffffffff, one, 1); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 1, 32); if (lane >= 1) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 2); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 2, 32); if (lane >= 2) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 4); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 4, 32); if (lane >= 4) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 8); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 8, 32); if (lane >= 8) one += temp; - temp = __shfl_up_sync(0xffffffff, one, 16); + temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 16, 32); if (lane >= 16) one += temp; one += reduce[warp]; @@ -841,6 +861,20 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap } __syncthreads(); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + for (int s = 32; s > 0; s >>= 1) + { + if (localid < s && reduce[localid + s] >= reduce[localid]) + { + if (reduce[localid + s] == reduce[localid]) + idx[localid] = min(idx[localid], idx[localid + s]); + else + idx[localid] = idx[localid + s]; + reduce[localid] = reduce[localid + s]; + } + __syncthreads(); + } +#else if (localid < 32) { if (reduce[localid + 32] >= reduce[localid]) @@ -893,6 +927,7 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap } } __syncthreads(); +#endif // write to gpu memory if (localid == 0) diff --git a/src/cvcuda/util/CMakeLists.txt b/src/cvcuda/util/CMakeLists.txt index 346ab73b4..c5e953ee0 100644 --- a/src/cvcuda/util/CMakeLists.txt +++ b/src/cvcuda/util/CMakeLists.txt @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -find_package(CUDAToolkit REQUIRED) +if(NOT USE_HIP) + find_package(CUDAToolkit REQUIRED) +endif() add_library(cvcuda_util STATIC Event.cpp diff --git a/src/cvcuda/util/StreamId.cpp b/src/cvcuda/util/StreamId.cpp index baff3f54c..b0a736820 100644 --- a/src/cvcuda/util/StreamId.cpp +++ b/src/cvcuda/util/StreamId.cpp @@ -21,6 +21,32 @@ #include #include +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + +// ROCm exposes hipStreamGetId directly, which is all the stream-identity cache +// needs. Fall back to the stream-handle pointer value (a stable per-stream key) +// if the runtime cannot supply an id. +namespace nvcv::util { + +bool IsCudaStreamIdHintUnambiguous() +{ + return true; +} + +uint64_t GetCudaStreamIdHint(hipStream_t stream) +{ + unsigned long long id = 0; + hipError_t err = hipStreamGetId(stream, &id); + if (err == hipSuccess) + return id; + (void)hipGetLastError(); + return reinterpret_cast(stream); +} + +} // namespace nvcv::util + +#else + using cuStreamGetId_t = CUresult(CUstream, unsigned long long *); #if CUDA_VERSION >= 12000 @@ -148,3 +174,5 @@ uint64_t GetCudaStreamIdHint(CUstream stream) } } // namespace nvcv::util + +#endif // __HIP_PLATFORM_AMD__ || USE_HIP diff --git a/src/nvcv/cmake/ConfigBuildTree.cmake b/src/nvcv/cmake/ConfigBuildTree.cmake index 31b5fe98b..770cf4956 100644 --- a/src/nvcv/cmake/ConfigBuildTree.cmake +++ b/src/nvcv/cmake/ConfigBuildTree.cmake @@ -76,7 +76,10 @@ include(CMakeDependentOption) option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE}) option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF) -cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF) +# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via +# Git LFS); it is a deployment-portability feature unrelated to GPU correctness, +# so default it OFF on the ROCm build. +cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF) cmake_dependent_option(ENABLE_QNX "Enable QNX support" OFF "ARCH_AARCH64" OFF) if(ENABLE_QNX) @@ -86,7 +89,9 @@ else() endif() # Needed to get cuda version -find_package(CUDAToolkit REQUIRED) +if(NOT USE_HIP) + find_package(CUDAToolkit REQUIRED) +endif() # Are we inside a git repo and it has submodules enabled? if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules) @@ -104,7 +109,11 @@ else() "CV-CUDA only supports Linux and QNX platforms.") endif() -set(NVCV_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${NVCV_SYSTEM_NAME}") +if(USE_HIP) + set(NVCV_BUILD_SUFFIX "hip-${NVCV_SYSTEM_NAME}") +else() + set(NVCV_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${NVCV_SYSTEM_NAME}") +endif() function(setup_dso target version) string(REGEX MATCHALL "[0-9]+" version_list "${version}") diff --git a/src/nvcv/src/include/nvcv/detail/CudaFwd.h b/src/nvcv/src/include/nvcv/detail/CudaFwd.h index 79dac8dee..6aa543961 100644 --- a/src/nvcv/src/include/nvcv/detail/CudaFwd.h +++ b/src/nvcv/src/include/nvcv/detail/CudaFwd.h @@ -13,7 +13,13 @@ #ifndef NVCV_DETAIL_FWD_CUDA_H #define NVCV_DETAIL_FWD_CUDA_H +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) +#include +typedef hipStream_t CUstream; +typedef hipArray_t cudaArray_t; +#else typedef struct CUstream_st *CUstream; typedef struct cudaArray *cudaArray_t; +#endif #endif // NVCV_DETAIL_FWD_CUDA_H diff --git a/src/nvcv/src/priv/CMakeLists.txt b/src/nvcv/src/priv/CMakeLists.txt index 7e298d9af..e46dd7798 100644 --- a/src/nvcv/src/priv/CMakeLists.txt +++ b/src/nvcv/src/priv/CMakeLists.txt @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -find_package(CUDAToolkit REQUIRED) +if(NOT USE_HIP) + find_package(CUDAToolkit REQUIRED) +endif() add_library(nvcv_types_priv STATIC Context.cpp diff --git a/src/nvcv/src/priv/DefaultAllocator.cpp b/src/nvcv/src/priv/DefaultAllocator.cpp index df9a2c921..c382cef8d 100644 --- a/src/nvcv/src/priv/DefaultAllocator.cpp +++ b/src/nvcv/src/priv/DefaultAllocator.cpp @@ -65,6 +65,17 @@ void *DefaultAllocator::doAllocCudaMem(int64_t size, int32_t align) void *ptr = nullptr; NVCV_CHECK_THROW(::cudaMalloc(&ptr, size)); +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + // hipMalloc returns recycled device memory with stale contents, whereas + // freshly cudaMalloc'd memory on the supported NVIDIA setups reads back as + // zero. Several gtests fill a tensor's valid region and then compare the + // whole strided buffer (including the row-stride padding the operator never + // writes) against a zero-initialized CPU reference, so they assume device + // padding is zero. Zero new device allocations to keep that contract; the + // operators themselves write every valid pixel and are unaffected. + NVCV_CHECK_THROW(::cudaMemset(ptr, 0, size)); +#endif + // TODO: can we do better than this? if (reinterpret_cast(ptr) % align != 0) { diff --git a/src/nvcv/util/CMakeLists.txt b/src/nvcv/util/CMakeLists.txt index aa72802b9..113cff968 100644 --- a/src/nvcv/util/CMakeLists.txt +++ b/src/nvcv/util/CMakeLists.txt @@ -13,7 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -find_package(CUDAToolkit REQUIRED) +if(NOT USE_HIP) + find_package(CUDAToolkit REQUIRED) +endif() # nvcv_util_sanitizer --------------------------------- add_library(nvcv_util_sanitizer STATIC diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt index 134a1502f..dc9d26f79 100644 --- a/tests/cvcuda/system/CMakeLists.txt +++ b/tests/cvcuda/system/CMakeLists.txt @@ -111,29 +111,35 @@ set(CVCUDA_TEST_SOURCES_CUOSD OsdUtils.cu ) -# Build smoke tests unconditionally - they work on all compilers -add_executable(cvcuda_test_system_smoke ${CVCUDA_SMOKE_TEST_SOURCES}) - -target_link_libraries(cvcuda_test_system_smoke - PUBLIC - cvcuda - nvcv_test_common_system - # Note: Does NOT link against cuosd -) +# The smoke tests exercise the OSD/BndBox/BoxBlur operators, which depend on the +# prebuilt CUDA-only cuOSD lib and are scoped out of the ROCm build. +if(NOT USE_HIP) + # Build smoke tests unconditionally - they work on all compilers + add_executable(cvcuda_test_system_smoke ${CVCUDA_SMOKE_TEST_SOURCES}) + + target_link_libraries(cvcuda_test_system_smoke + PUBLIC + cvcuda + nvcv_test_common_system + # Note: Does NOT link against cuosd + ) -nvcv_add_test(cvcuda_test_system_smoke cvcuda cpp) + nvcv_add_test(cvcuda_test_system_smoke cvcuda cpp) +endif() # Include C++20-requiring tests and libcuosd tests only if compiler supports them properly if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)) list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CPP20}) - list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CUOSD}) + if(NOT USE_HIP) + list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CUOSD}) + endif() endif() # Build cvcuda_test_system (works with GCC-10+ but with reduced test coverage for GCC-10) add_executable(cvcuda_test_system ${CVCUDA_TEST_SOURCES}) # Link against libcuosd only for GCC-11+ (needed for OSD tests) -if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)) +if(NOT USE_HIP AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)) target_link_libraries(cvcuda_test_system PUBLIC cvcuda diff --git a/tests/cvcuda/system/TestOpMinMaxLoc.cpp b/tests/cvcuda/system/TestOpMinMaxLoc.cpp index 98fbbaa04..c5d6a8e91 100644 --- a/tests/cvcuda/system/TestOpMinMaxLoc.cpp +++ b/tests/cvcuda/system/TestOpMinMaxLoc.cpp @@ -124,8 +124,8 @@ inline void FindMinMax(InContainerType &in, InStridesType &inStrides, InShapeTyp } } - test::ValueAt(minVal, valStrides, {z}).x = min; - test::ValueAt(maxVal, valStrides, {z}).x = max; + test::ValueAt(minVal, valStrides, int1{z}).x = min; + test::ValueAt(maxVal, valStrides, int1{z}).x = max; int nMin{0}, nMax{0}; @@ -161,8 +161,8 @@ inline void FindMinMax(InContainerType &in, InStridesType &inStrides, InShapeTyp } } - test::ValueAt(numMin, numStrides, {z}).x = nMin; - test::ValueAt(numMax, numStrides, {z}).x = nMax; + test::ValueAt(numMin, numStrides, int1{z}).x = nMin; + test::ValueAt(numMax, numStrides, int1{z}).x = nMax; } } @@ -181,8 +181,8 @@ inline void LocSort(std::vector> &minLocTest, std::vector(numMinVec, numStrides, {z}).x; - int nMax = test::ValueAt(numMaxVec, numStrides, {z}).x; + int nMin = test::ValueAt(numMinVec, numStrides, int1{z}).x; + int nMax = test::ValueAt(numMaxVec, numStrides, int1{z}).x; for (int i = 0; i < nMin && i < capacity; i++) { @@ -225,9 +225,9 @@ inline void GoldMinMaxLoc(const nvcv::Tensor &minVal, const nvcv::Tensor &minLoc int capacity = minLocData->shape(1); int numSamples = minValData->shape(0); - long1 valStrides = {minValData->stride(0)}; + long1 valStrides{minValData->stride(0)}; long2 locStrides = {minLocData->stride(0), minLocData->stride(1)}; - long1 numStrides = {numMinData->stride(0)}; + long1 numStrides{numMinData->stride(0)}; size_t valBufSize = numSamples * valStrides.x; size_t locBufSize = numSamples * locStrides.x; size_t numBufSize = numSamples * numStrides.x; diff --git a/tests/cvcuda/system/TestOpSIFT.cpp b/tests/cvcuda/system/TestOpSIFT.cpp index 032c0d1c2..90f9f2062 100644 --- a/tests/cvcuda/system/TestOpSIFT.cpp +++ b/tests/cvcuda/system/TestOpSIFT.cpp @@ -809,7 +809,7 @@ inline void GoldSIFT(SIFTResults &outResults, const nvcv::Tensor &featCoords, co long2 featMetadataShape = {featMetadataData->shape(0), featMetadataData->shape(1)}; long3 featDescriptorsShape = {featDescriptorsData->shape(0), featDescriptorsData->shape(1), featDescriptorsData->shape(2)}; - long1 numFeaturesShape = {numFeaturesData->shape(0)}; + long1 numFeaturesShape{numFeaturesData->shape(0)}; ASSERT_TRUE((featCoordsShape == long2{srcShape.x, capacity} && featMetadataShape == long2{srcShape.x, capacity} && featDescriptorsShape == long3{srcShape.x, capacity, 128} && numFeaturesShape == long1{srcShape.x})); @@ -819,7 +819,7 @@ inline void GoldSIFT(SIFTResults &outResults, const nvcv::Tensor &featCoords, co long3 featDescriptorsStrides3 = {featDescriptorsData->stride(0), featDescriptorsData->stride(1), featDescriptorsData->stride(2)}; long2 featDescriptorsStrides = {featDescriptorsStrides3.x, featDescriptorsStrides3.y}; - long1 numFeaturesStrides = {numFeaturesData->stride(0)}; + long1 numFeaturesStrides{numFeaturesData->stride(0)}; ASSERT_TRUE(featCoordsStrides.y == sizeof(float4) && featMetadataStrides.y == sizeof(float3) && featDescriptorsStrides3.z == sizeof(uint8_t) && featDescriptorsStrides.y == 128 * sizeof(uint8_t) diff --git a/tests/nvcv_types/CMakeLists.txt b/tests/nvcv_types/CMakeLists.txt index fab8cd3b8..7149796be 100644 --- a/tests/nvcv_types/CMakeLists.txt +++ b/tests/nvcv_types/CMakeLists.txt @@ -53,8 +53,11 @@ if(BUILD_TESTS_CPP) ) # Skip standalone tests with GCC-10 due to ABI incompatibility with system libgtest - # These tests use find_package(GTest) which requires system GTest, causing ABI issues with GCC-10 - if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)) + # These tests use find_package(GTest) which requires system GTest, causing ABI issues with GCC-10. + # Also skip on ROCm: the standalone consumer project re-runs cmake on a copied nvcv via + # ExternalProject without the top-level USE_HIP/enable_language(HIP) setup, so it would try to + # find the CUDA toolkit. It is a build-packaging check, not a GPU operator test. + if(NOT USE_HIP AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0)) # Test NVCV can be built as a shared library via the standalone test ExternalProject_Add( nvcv_standalone diff --git a/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu b/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu index 4e32fdf47..7411d9579 100644 --- a/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu @@ -30,7 +30,9 @@ namespace cuda = nvcv::cuda; template __global__ void Copy(DstWrapper dst, SrcWrapper src) { - int1 coord = cuda::StaticCast(cuda::DropCast<1>(threadIdx)); + // threadIdx is a builtin struct without NVCV TypeTraits on HIP; uint3 + // brace-init binds the NVCV helpers and is unchanged on CUDA. + int1 coord = cuda::StaticCast(cuda::DropCast<1>(uint3{threadIdx.x, threadIdx.y, threadIdx.z})); dst[coord] = src[coord]; } diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu index fe5f2dcef..5e93ceb16 100644 --- a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu @@ -107,7 +107,7 @@ __global__ void SetReferenceKernel(TensorBatchWrapT wrap) coords[d] = tmp_i % shape[d]; tmp_i /= shape[d]; } - SetValue::Set(wrap, sample, coords, cuda::SetAll(index % 255)); + SetValue::Set(wrap, sample, coords, cuda::SetAll(index % 255)); } } diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu b/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu index afff2b95d..60ffa1b05 100644 --- a/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu +++ b/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu @@ -31,14 +31,16 @@ template __global__ void Copy(DstWrapper dst, SrcWrapper src) { using DimType = cuda::MakeType; - DimType coord = cuda::StaticCast(cuda::DropCast(threadIdx)); + // threadIdx is a builtin struct without NVCV TypeTraits on HIP; uint3 + // brace-init binds the NVCV helpers and is unchanged on CUDA. + DimType coord = cuda::StaticCast(cuda::DropCast(uint3{threadIdx.x, threadIdx.y, threadIdx.z})); dst[coord] = src[coord]; } template __global__ void Copy(cuda::Tensor4DWrap dst, cuda::Tensor4DWrap src, int lastDimSize) { - int3 c3 = cuda::StaticCast(threadIdx); + int3 c3 = cuda::StaticCast(uint3{threadIdx.x, threadIdx.y, threadIdx.z}); for (int k = 0; k < lastDimSize; k++) { int4 c4{k, c3.x, c3.y, c3.z}; diff --git a/tests/nvcv_types/cudatools_system/TestMathOps.cpp b/tests/nvcv_types/cudatools_system/TestMathOps.cpp index 360ecc996..811c00efe 100644 --- a/tests/nvcv_types/cudatools_system/TestMathOps.cpp +++ b/tests/nvcv_types/cudatools_system/TestMathOps.cpp @@ -141,7 +141,9 @@ NVCV_TYPED_TEST_SUITE( ttype::Types, ttype::Value>, ttype::Types, ttype::Value>, ttype::Types, ttype::Value>, - ttype::Types, ttype::Value> + // HIP_vector_type has no partial-init constructor (CUDA aggregate zero-fills + // the unspecified component); spell the 4th element so both backends match. + ttype::Types, ttype::Value> >); // clang-format on @@ -162,7 +164,7 @@ NVCV_TYPED_TEST_SUITE( ttype::Types, ttype::Value>, ttype::Types, ttype::Value>, ttype::Types, ttype::Value>, - ttype::Types, ttype::Value> + ttype::Types, ttype::Value> >); // clang-format on diff --git a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp index 96320483e..433434883 100644 --- a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp +++ b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp @@ -328,7 +328,10 @@ class TypeTraitsGetElementTest : public TypeTraitsBaseTest } else if constexpr (NumElements == 1) { - pix = {1}; + // A 1-element vector type's scalar constructor is explicit on HIP + // (HIP_vector_type), so copy-list-init from {1} is rejected; SetAll + // is the portable NVCV builder and is unchanged on CUDA. + pix = cuda::SetAll(1); } } };