CVCUDA · jeffdaily · Jun 18, 2026 · Jun 19, 2026 · Jun 23, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -33,7 +33,101 @@ if(NOT CMAKE_CUDA_HOST_COMPILER)
     set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
 endif()
 
-enable_language(CUDA)
+# Build the GPU kernels for AMD via HIP/ROCm instead of CUDA. Additive: the
+# NVIDIA path below is unchanged when USE_HIP is OFF.
+option(USE_HIP "Build the GPU code with HIP for AMD GPUs (ROCm)" OFF)
+
+if(USE_HIP)
+    # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects
+    # the host GPU(s) via rocm_agent_enumerator and errors if none is found (a
+    # no-GPU build host must then set the arch explicitly).
+    enable_language(HIP)
+
+    # Host .cpp files also use the CUDA runtime/library spellings (cudaStream_t,
+    # cudaMalloc, ...); steer every C/C++/HIP TU through the ROCm compat layer.
+    add_compile_definitions(USE_HIP __HIP_PLATFORM_AMD__)
+    set(CVCUDA_HIP_COMPAT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/hip")
+    include_directories(BEFORE "${CVCUDA_HIP_COMPAT_DIR}")
+    # Force-include the compat header on every C++/HIP TU so the cuda*->hip*
+    # aliases and the 64-bit warp mask precede any include, regardless of file
+    # order. C TUs do not use the CUDA runtime, so they are left alone.
+    set(_cvcuda_force_include "-include ${CVCUDA_HIP_COMPAT_DIR}/CvCudaHipCompat.h")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_cvcuda_force_include}")
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${_cvcuda_force_include}")
+
+    # clang (HIP) defaults to -ffp-contract=fast, which forms FMAs across separate
+    # statements/expressions; nvcc only contracts within a single expression
+    # (--fmad=true). That extra contraction makes HIP float results drift by ~1 ULP
+    # from the CUDA build and the CPU gold references (e.g. the bicubic weight chain
+    # in InterpolationWrap), failing the bit-exact gtests. Pin contraction to the
+    # CUDA/host semantics so float math is reproducible across the two toolchains.
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -ffp-contract=on")
+
+    find_package(hip REQUIRED)
+    find_package(hipcub REQUIRED)
+    find_package(hiprand REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(hipsolver REQUIRED)
+
+    # The compat header is force-included on every C++/HIP TU and pulls in the
+    # HIP runtime, so the ROCm headers must be on the include path globally (not
+    # only on targets that link hip::host).
+    get_target_property(_hip_inc hip::host INTERFACE_INCLUDE_DIRECTORIES)
+    if(_hip_inc)
+        include_directories(SYSTEM ${_hip_inc})
+    else()
+        include_directories(SYSTEM "/opt/rocm/include")
+    endif()
+
+    # Map the CUDA::* link targets the build uses onto their HIP equivalents so
+    # the per-target target_link_libraries() lines stay untouched.
+    foreach(_cudalib cudart_static cublas_static cublasLt_static cusolver_static cuda_driver)
+        if(NOT TARGET CUDA::${_cudalib})
+            add_library(CUDA::${_cudalib} INTERFACE IMPORTED)
+            target_link_libraries(CUDA::${_cudalib} INTERFACE hip::host)
+        endif()
+    endforeach()
+    target_link_libraries(CUDA::cusolver_static INTERFACE roc::hipsolver)
+    target_link_libraries(CUDA::cublas_static   INTERFACE roc::hipblas)
+
+    # Retag every .cu source LANGUAGE HIP without editing each per-target
+    # CMakeLists. add_library/add_executable are overridden at top scope to
+    # forward to the real command then flip the target's .cu files to HIP and
+    # set HIP_ARCHITECTURES. The NVIDIA build never defines
+    # these macros, so it is a pure passthrough.
+    macro(_cvcuda_hipify_target _tgt)
+        get_target_property(_t_type ${_tgt} TYPE)
+        if(NOT _t_type STREQUAL "INTERFACE_LIBRARY")
+            get_target_property(_t_srcs ${_tgt} SOURCES)
+            if(_t_srcs)
+                foreach(_s ${_t_srcs})
+                    if(_s MATCHES "\\.cu$")
+                        set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP)
+                    endif()
+                endforeach()
+            endif()
+            set_target_properties(${_tgt} PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+        endif()
+    endmacro()
+
+    macro(add_library _tgt)
+        _add_library(${_tgt} ${ARGN})
+        set(_args "${ARGN}")
+        if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args AND NOT "INTERFACE" IN_LIST _args)
+            _cvcuda_hipify_target(${_tgt})
+        endif()
+    endmacro()
+
+    macro(add_executable _tgt)
+        _add_executable(${_tgt} ${ARGN})
+        set(_args "${ARGN}")
+        if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args)
+            _cvcuda_hipify_target(${_tgt})
+        endif()
+    endmacro()
+else()
+    enable_language(CUDA)
+endif()
 
 # Used when creating special builds
 set(PROJECT_VERSION_SUFFIX "")

diff --git a/README.md b/README.md
@@ -65,6 +65,8 @@ We provide pre-built Python wheels on pypi.org for a variety of Python versions
 
 See [Installation](https://cvcuda.github.io/CV-CUDA/installation.html) for complete installation instructions including building from source, installing Debian packages, and tar archives.
 
+CV-CUDA can also be built for AMD GPUs by compiling its GPU code with HIP for ROCm instead of CUDA. This path is additive and off by default; the NVIDIA build is unchanged. See [Building for AMD GPUs (ROCm)](https://cvcuda.github.io/CV-CUDA/installation.html#building-for-amd-gpus-rocm) for details.
+
 ### Compatibility
 
 |CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source and API compatiblity)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages|

diff --git a/cmake/ConfigBuildTree.cmake b/cmake/ConfigBuildTree.cmake
@@ -66,10 +66,15 @@ include(CMakeDependentOption)
 
 option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE})
 option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF)
-cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF)
+# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via
+# Git LFS); it is a deployment-portability feature unrelated to GPU correctness,
+# so default it OFF on the ROCm build.
+cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF)
 
 # Needed to get cuda version
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 # Are we inside a git repo and it has submodules enabled?
 if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules)
@@ -85,7 +90,11 @@ else()
                         "CV-CUDA only supports Linux platform.")
 endif()
 
-set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
+if(USE_HIP)
+    set(CVCUDA_BUILD_SUFFIX "hip-${CVCUDA_SYSTEM_NAME}")
+else()
+    set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
+endif()
 
 function(setup_dso target version)
     string(REGEX MATCHALL "[0-9]+" version_list "${version}")

diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake
@@ -13,6 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# The whole CUDA toolkit configuration is skipped on the HIP/ROCm build; the
+# top-level CMakeLists handles enable_language(HIP) and the HIP packages.
+if(NOT USE_HIP)
+
 string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION})
 list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR)
 list(GET CUDA_VERSION_LIST 1 CUDA_VERSION_MINOR)
@@ -100,3 +104,5 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES)
     # which is the old architecture supported by nvcc. We don't want that.
     set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures to build for" FORCE)
 endif()
+
+endif() # NOT USE_HIP
diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake
@@ -46,7 +46,13 @@ endif()
 include(CheckIPOSupported)
 check_ipo_supported(RESULT LTO_SUPPORTED)
 
-set(LTO_ENABLED ON)
+# LTO does not finalize under the HIP link step (device objects stay slim
+# bitcode), which breaks linking; keep it off on ROCm.
+if(USE_HIP)
+    set(LTO_ENABLED OFF)
+else()
+    set(LTO_ENABLED ON)
+endif()
 
 if(ENABLE_SANITIZER)
     set(COMPILER_SANITIZER_FLAGS

diff --git a/cmake/hip/CvCudaHipCompat.h b/cmake/hip/CvCudaHipCompat.h
@@ -0,0 +1,215 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ROCm/HIP compatibility layer for CV-CUDA. Force-included on every HIP
+// translation unit (CMAKE_HIP_FLAGS -include) so the aliases below are in
+// scope before any CV-CUDA or CUDA-toolkit header is parsed. The NVIDIA build
+// never sees this file: the cmake/hip shim dir and this header are only on the
+// HIP include path. Only the actual CUDA-runtime/library symbols CV-CUDA uses
+// are aliased here; CV-CUDA's own cuda<Op>Submit/cuda<Op>Create public API
+// names are deliberately left untouched.
+
+#ifndef CVCUDA_HIP_COMPAT_H
+#define CVCUDA_HIP_COMPAT_H
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+
+#if defined(__cplusplus)
+// libc host declarations must win over HIP's device overloads of memcpy/memset:
+// pull them in before the HIP runtime so host TUs keep the standard host
+// prototypes.
+#include <cstdlib>
+#include <cstring>
+
+#if defined(__HIPCC__)
+// .cu translation units (compiled by hipcc) need the full device runtime.
+#include <hip/hip_runtime.h>
+#else
+// Plain g++ host TUs only need the runtime API (types + host-callable entry
+// points); the device runtime header is heavier and pulls device builtins.
+#include <hip/hip_runtime_api.h>
+#endif
+#endif
+
+// CUDA_VERSION: several headers (Compat.hpp, Metaprogramming.hpp, StreamId.cpp)
+// branch on it. CV-CUDA wants the pre-13.0 compound-type aliases, so report a
+// version below 13000. Do NOT define __CUDA_ARCH__ on HIP: the SaturateCast PTX
+// table and the NVCV SIMD-video-intrinsic paths are gated on __CUDA_ARCH__ and
+// must stay inert, falling through to their portable C++/per-element bodies.
+#ifndef CUDA_VERSION
+#define CUDA_VERSION 12020
+#endif
+
+// ---- runtime: error/status -------------------------------------------------
+#define cudaError_t                hipError_t
+#define cudaError                  hipError_t
+#define cudaSuccess                hipSuccess
+#define cudaErrorNotReady          hipErrorNotReady
+#define cudaErrorInvalidValue      hipErrorInvalidValue
+#define cudaErrorMemoryAllocation  hipErrorOutOfMemory
+#define cudaErrorCudartUnloading   hipErrorDeinitialized
+#define cudaErrorTextureFetchFailed hipErrorInvalidTexture
+#define cudaGetLastError           hipGetLastError
+#define cudaPeekAtLastError        hipPeekAtLastError
+#define cudaGetErrorString         hipGetErrorString
+#define cudaGetErrorName           hipGetErrorName
+#define cudaGetVersion             hipRuntimeGetVersion
+
+// ---- runtime: device -------------------------------------------------------
+#define cudaGetDevice              hipGetDevice
+#define cudaSetDevice              hipSetDevice
+#define cudaGetDeviceCount         hipGetDeviceCount
+#define cudaDeviceSynchronize      hipDeviceSynchronize
+#define cudaDeviceProp             hipDeviceProp_t
+#define cudaGetDeviceProperties    hipGetDeviceProperties
+#define cudaDevAttrTextureAlignment      hipDeviceAttributeTextureAlignment
+#define cudaDevAttrTexturePitchAlignment hipDeviceAttributeTexturePitchAlignment
+
+// NVCV derives a tensor/image row-pitch alignment from the texture *pitch*
+// alignment device attribute. On NVIDIA that attribute is 32 bytes, so a tightly
+// packed image (e.g. a 640-byte uchar row) keeps a 640-byte row stride. AMD
+// reports 256 there, which would pad that row to 768 and silently change the
+// in-memory layout every NVCV consumer (and the whole-buffer test comparisons)
+// assumes. No CV-CUDA tensor is bound to a HW texture object, so the larger
+// HW pitch is unnecessary here; clamp the queried pitch alignment to the NVIDIA
+// value to keep the byte layout identical to the CUDA build. Other attribute
+// queries pass through unchanged.
+#if defined(__cplusplus)
+__host__ inline hipError_t cvcuda_hipDeviceGetAttribute(int *value, hipDeviceAttribute_t attr, int device)
+{
+    hipError_t err = hipDeviceGetAttribute(value, attr, device);
+    if (err == hipSuccess && attr == hipDeviceAttributeTexturePitchAlignment && value && *value > 32)
+    {
+        *value = 32;
+    }
+    return err;
+}
+#define cudaDeviceGetAttribute cvcuda_hipDeviceGetAttribute
+#else
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#endif
+
+// ---- runtime: memory -------------------------------------------------------
+#define cudaMalloc                 hipMalloc
+#define cudaMallocManaged          hipMallocManaged
+#define cudaFree                   hipFree
+#define cudaMallocHost             hipHostMalloc
+#define cudaHostAlloc              hipHostMalloc
+#define cudaFreeHost               hipHostFree
+#define cudaHostFree               hipHostFree
+#define cudaHostAllocMapped        hipHostMallocMapped
+#define cudaHostAllocWriteCombined hipHostMallocWriteCombined
+#define cudaMemset                 hipMemset
+#define cudaMemsetAsync            hipMemsetAsync
+#define cudaMemset2D               hipMemset2D
+#define cudaMemset2DAsync          hipMemset2DAsync
+#define cudaMemcpy                 hipMemcpy
+#define cudaMemcpyAsync            hipMemcpyAsync
+#define cudaMemcpy2D               hipMemcpy2D
+#define cudaMemcpy2DAsync          hipMemcpy2DAsync
+#define cudaMemcpyKind             hipMemcpyKind
+#define cudaMemcpyHostToDevice     hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost     hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice   hipMemcpyDeviceToDevice
+#define cudaMemcpyHostToHost       hipMemcpyHostToHost
+#define cudaMemcpyDefault          hipMemcpyDefault
+#define cudaPointerAttributes      hipPointerAttribute_t
+#define cudaPointerGetAttributes   hipPointerGetAttributes
+#define cudaMemoryTypeHost         hipMemoryTypeHost
+#define cudaMemoryTypeDevice       hipMemoryTypeDevice
+#define cudaMemoryTypeManaged      hipMemoryTypeManaged
+#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered
+
+// ---- runtime: stream / event ----------------------------------------------
+#define cudaStream_t                       hipStream_t
+#define cudaStreamDefault                  hipStreamDefault
+#define cudaStreamPerThread                hipStreamPerThread
+#define cudaStreamNonBlocking              hipStreamNonBlocking
+#define cudaStreamSynchronize              hipStreamSynchronize
+#define cudaStreamWaitEvent                hipStreamWaitEvent
+#define cudaStreamDestroy                  hipStreamDestroy
+#define cudaStreamGetId                    hipStreamGetId
+#define cudaStreamCreate                   hipStreamCreate
+#define cudaStreamCreateWithFlags          hipStreamCreateWithFlags
+#define cudaStreamCreateWithPriority       hipStreamCreateWithPriority
+#define cudaDeviceGetStreamPriorityRange   hipDeviceGetStreamPriorityRange
+#define cudaEvent_t                hipEvent_t
+#define cudaEventDefault           hipEventDefault
+#define cudaEventDisableTiming     hipEventDisableTiming
+#define cudaEventCreate            hipEventCreate
+#define cudaEventCreateWithFlags   hipEventCreateWithFlags
+#define cudaEventRecord            hipEventRecord
+#define cudaEventQuery             hipEventQuery
+#define cudaEventSynchronize       hipEventSynchronize
+#define cudaEventElapsedTime       hipEventElapsedTime
+#define cudaEventDestroy           hipEventDestroy
+
+// ---- full-wavefront mask ---------------------------------------------------
+// __shfl*_sync on ROCm static_asserts a 64-bit mask regardless of wave width.
+// The width argument (kept explicit at every call site that needs it) controls
+// the subgroup; the mask just marks participants.
+#define NVCV_WARP_FULL_MASK 0xffffffffffffffffULL
+
+// ---- built-in index types --------------------------------------------------
+// On CUDA blockIdx/blockDim/threadIdx are uint3/dim3, so the kernels' common
+// idiom `blockIdx * blockDim + threadIdx` resolves through the cuda:: compound
+// operators. On HIP these are distinct __hip_builtin_*_t structs (with only a
+// dim3 conversion), so neither HIP's nor CV-CUDA's vector operators deduce them.
+// Provide the exact whole-vector forms the kernels use, lowering to uint3. These
+// take only the builtin index types, which carry no NVCV TypeTraits, so they do
+// not compete with the cuda:: compound operators.
+// Defined only under hipcc (where the __hip_builtin_*_t types exist and where
+// __global__ bodies are parsed); plain g++ host TUs that include only the HIP
+// runtime API do not see these builtin types, so the operators must not appear
+// there. hipcc parses these in both its host and device passes.
+#if defined(__cplusplus) && defined(__HIPCC__)
+__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockIdx_t &a,
+                                                    const __hip_builtin_blockDim_t &b)
+{
+    return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockDim_t &a,
+                                                    const __hip_builtin_blockIdx_t &b)
+{
+    return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator+(const uint3 &a, const __hip_builtin_threadIdx_t &b)
+{
+    return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator+(const __hip_builtin_threadIdx_t &a, const uint3 &b)
+{
+    return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
+}
+#endif
+
+// std::declval is __host__-only (libstdc++); clang rejects it in the unevaluated
+// decltype of a __device__-only function (where nvcc is lenient). Provide a
+// __host__ __device__ equivalent for those few device-side decltype sites.
+#if defined(__cplusplus)
+namespace nvcv { namespace cuda { namespace compat {
+template<typename T>
+__host__ __device__ T &&declval() noexcept;
+}}} // namespace nvcv::cuda::compat
+#endif
+
+#endif // __HIP_PLATFORM_AMD__ || USE_HIP
+
+#endif // CVCUDA_HIP_COMPAT_H
diff --git a/cmake/hip/channel_descriptor.h b/cmake/hip/channel_descriptor.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"