diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ad41245f..5335afb3c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,101 @@ if(NOT CMAKE_CUDA_HOST_COMPILER)
     set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
 endif()
 
-enable_language(CUDA)
+# Build the GPU kernels for AMD via HIP/ROCm instead of CUDA. Additive: the
+# NVIDIA path below is unchanged when USE_HIP is OFF.
+option(USE_HIP "Build the GPU code with HIP for AMD GPUs (ROCm)" OFF)
+
+if(USE_HIP)
+    # enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects
+    # the host GPU(s) via rocm_agent_enumerator and errors if none is found (a
+    # no-GPU build host must then set the arch explicitly).
+    enable_language(HIP)
+
+    # Host .cpp files also use the CUDA runtime/library spellings (cudaStream_t,
+    # cudaMalloc, ...); steer every C/C++/HIP TU through the ROCm compat layer.
+    add_compile_definitions(USE_HIP __HIP_PLATFORM_AMD__)
+    set(CVCUDA_HIP_COMPAT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/hip")
+    include_directories(BEFORE "${CVCUDA_HIP_COMPAT_DIR}")
+    # Force-include the compat header on every C++/HIP TU so the cuda*->hip*
+    # aliases and the 64-bit warp mask precede any include, regardless of file
+    # order. C TUs do not use the CUDA runtime, so they are left alone.
+    set(_cvcuda_force_include "-include ${CVCUDA_HIP_COMPAT_DIR}/CvCudaHipCompat.h")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_cvcuda_force_include}")
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} ${_cvcuda_force_include}")
+
+    # clang (HIP) defaults to -ffp-contract=fast, which forms FMAs across separate
+    # statements/expressions; nvcc only contracts within a single expression
+    # (--fmad=true). That extra contraction makes HIP float results drift by ~1 ULP
+    # from the CUDA build and the CPU gold references (e.g. the bicubic weight chain
+    # in InterpolationWrap), failing the bit-exact gtests. Pin contraction to the
+    # CUDA/host semantics so float math is reproducible across the two toolchains.
+    set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -ffp-contract=on")
+
+    find_package(hip REQUIRED)
+    find_package(hipcub REQUIRED)
+    find_package(hiprand REQUIRED)
+    find_package(hipblas REQUIRED)
+    find_package(hipsolver REQUIRED)
+
+    # The compat header is force-included on every C++/HIP TU and pulls in the
+    # HIP runtime, so the ROCm headers must be on the include path globally (not
+    # only on targets that link hip::host).
+    get_target_property(_hip_inc hip::host INTERFACE_INCLUDE_DIRECTORIES)
+    if(_hip_inc)
+        include_directories(SYSTEM ${_hip_inc})
+    else()
+        include_directories(SYSTEM "/opt/rocm/include")
+    endif()
+
+    # Map the CUDA::* link targets the build uses onto their HIP equivalents so
+    # the per-target target_link_libraries() lines stay untouched.
+    foreach(_cudalib cudart_static cublas_static cublasLt_static cusolver_static cuda_driver)
+        if(NOT TARGET CUDA::${_cudalib})
+            add_library(CUDA::${_cudalib} INTERFACE IMPORTED)
+            target_link_libraries(CUDA::${_cudalib} INTERFACE hip::host)
+        endif()
+    endforeach()
+    target_link_libraries(CUDA::cusolver_static INTERFACE roc::hipsolver)
+    target_link_libraries(CUDA::cublas_static   INTERFACE roc::hipblas)
+
+    # Retag every .cu source LANGUAGE HIP without editing each per-target
+    # CMakeLists. add_library/add_executable are overridden at top scope to
+    # forward to the real command then flip the target's .cu files to HIP and
+    # set HIP_ARCHITECTURES. The NVIDIA build never defines
+    # these macros, so it is a pure passthrough.
+    macro(_cvcuda_hipify_target _tgt)
+        get_target_property(_t_type ${_tgt} TYPE)
+        if(NOT _t_type STREQUAL "INTERFACE_LIBRARY")
+            get_target_property(_t_srcs ${_tgt} SOURCES)
+            if(_t_srcs)
+                foreach(_s ${_t_srcs})
+                    if(_s MATCHES "\\.cu$")
+                        set_source_files_properties(${_s} PROPERTIES LANGUAGE HIP)
+                    endif()
+                endforeach()
+            endif()
+            set_target_properties(${_tgt} PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+        endif()
+    endmacro()
+
+    macro(add_library _tgt)
+        _add_library(${_tgt} ${ARGN})
+        set(_args "${ARGN}")
+        if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args AND NOT "INTERFACE" IN_LIST _args)
+            _cvcuda_hipify_target(${_tgt})
+        endif()
+    endmacro()
+
+    macro(add_executable _tgt)
+        _add_executable(${_tgt} ${ARGN})
+        set(_args "${ARGN}")
+        if(NOT "ALIAS" IN_LIST _args AND NOT "IMPORTED" IN_LIST _args)
+            _cvcuda_hipify_target(${_tgt})
+        endif()
+    endmacro()
+else()
+    enable_language(CUDA)
+endif()
 
 # Used when creating special builds
 set(PROJECT_VERSION_SUFFIX "")
diff --git a/README.md b/README.md
index 92a90d107..43e6edf1f 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,8 @@ We provide pre-built Python wheels on pypi.org for a variety of Python versions
 
 See [Installation](https://cvcuda.github.io/CV-CUDA/installation.html) for complete installation instructions including building from source, installing Debian packages, and tar archives.
 
+CV-CUDA can also be built for AMD GPUs by compiling its GPU code with HIP for ROCm instead of CUDA. This path is additive and off by default; the NVIDIA build is unchanged. See [Building for AMD GPUs (ROCm)](https://cvcuda.github.io/CV-CUDA/installation.html#building-for-amd-gpus-rocm) for details.
+
 ### Compatibility
 
 |CV-CUDA Build|Platform|CUDA Version|CUDA Compute Capability|Hardware Architectures|Nvidia Driver|Python Versions|Supported Compilers (build from source and API compatiblity)|API compatibility with prebuilt binaries|OS/Linux distributions tested with prebuilt packages|
diff --git a/cmake/ConfigBuildTree.cmake b/cmake/ConfigBuildTree.cmake
index 7b306ae45..dddd767ac 100644
--- a/cmake/ConfigBuildTree.cmake
+++ b/cmake/ConfigBuildTree.cmake
@@ -66,10 +66,15 @@ include(CMakeDependentOption)
 
 option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE})
 option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF)
-cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF)
+# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via
+# Git LFS); it is a deployment-portability feature unrelated to GPU correctness,
+# so default it OFF on the ROCm build.
+cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF)
 
 # Needed to get cuda version
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 # Are we inside a git repo and it has submodules enabled?
 if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules)
@@ -85,7 +90,11 @@ else()
                         "CV-CUDA only supports Linux platform.")
 endif()
 
-set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
+if(USE_HIP)
+    set(CVCUDA_BUILD_SUFFIX "hip-${CVCUDA_SYSTEM_NAME}")
+else()
+    set(CVCUDA_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${CVCUDA_SYSTEM_NAME}")
+endif()
 
 function(setup_dso target version)
     string(REGEX MATCHALL "[0-9]+" version_list "${version}")
diff --git a/cmake/ConfigCUDA.cmake b/cmake/ConfigCUDA.cmake
index 0ec51d304..a42efc1fe 100644
--- a/cmake/ConfigCUDA.cmake
+++ b/cmake/ConfigCUDA.cmake
@@ -13,6 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# The whole CUDA toolkit configuration is skipped on the HIP/ROCm build; the
+# top-level CMakeLists handles enable_language(HIP) and the HIP packages.
+if(NOT USE_HIP)
+
 string(REPLACE "." ";" CUDA_VERSION_LIST ${CMAKE_CUDA_COMPILER_VERSION})
 list(GET CUDA_VERSION_LIST 0 CUDA_VERSION_MAJOR)
 list(GET CUDA_VERSION_LIST 1 CUDA_VERSION_MINOR)
@@ -100,3 +104,5 @@ if(NOT USE_CMAKE_CUDA_ARCHITECTURES)
     # which is the old architecture supported by nvcc. We don't want that.
     set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures to build for" FORCE)
 endif()
+
+endif() # NOT USE_HIP
diff --git a/cmake/ConfigCompiler.cmake b/cmake/ConfigCompiler.cmake
index c0984eef1..5224d6d19 100644
--- a/cmake/ConfigCompiler.cmake
+++ b/cmake/ConfigCompiler.cmake
@@ -46,7 +46,13 @@ endif()
 include(CheckIPOSupported)
 check_ipo_supported(RESULT LTO_SUPPORTED)
 
-set(LTO_ENABLED ON)
+# LTO does not finalize under the HIP link step (device objects stay slim
+# bitcode), which breaks linking; keep it off on ROCm.
+if(USE_HIP)
+    set(LTO_ENABLED OFF)
+else()
+    set(LTO_ENABLED ON)
+endif()
 
 if(ENABLE_SANITIZER)
     set(COMPILER_SANITIZER_FLAGS
diff --git a/cmake/hip/CvCudaHipCompat.h b/cmake/hip/CvCudaHipCompat.h
new file mode 100644
index 000000000..69692070a
--- /dev/null
+++ b/cmake/hip/CvCudaHipCompat.h
@@ -0,0 +1,215 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ROCm/HIP compatibility layer for CV-CUDA. Force-included on every HIP
+// translation unit (CMAKE_HIP_FLAGS -include) so the aliases below are in
+// scope before any CV-CUDA or CUDA-toolkit header is parsed. The NVIDIA build
+// never sees this file: the cmake/hip shim dir and this header are only on the
+// HIP include path. Only the actual CUDA-runtime/library symbols CV-CUDA uses
+// are aliased here; CV-CUDA's own cuda<Op>Submit/cuda<Op>Create public API
+// names are deliberately left untouched.
+
+#ifndef CVCUDA_HIP_COMPAT_H
+#define CVCUDA_HIP_COMPAT_H
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+
+#if defined(__cplusplus)
+// libc host declarations must win over HIP's device overloads of memcpy/memset:
+// pull them in before the HIP runtime so host TUs keep the standard host
+// prototypes.
+#include <cstdlib>
+#include <cstring>
+
+#if defined(__HIPCC__)
+// .cu translation units (compiled by hipcc) need the full device runtime.
+#include <hip/hip_runtime.h>
+#else
+// Plain g++ host TUs only need the runtime API (types + host-callable entry
+// points); the device runtime header is heavier and pulls device builtins.
+#include <hip/hip_runtime_api.h>
+#endif
+#endif
+
+// CUDA_VERSION: several headers (Compat.hpp, Metaprogramming.hpp, StreamId.cpp)
+// branch on it. CV-CUDA wants the pre-13.0 compound-type aliases, so report a
+// version below 13000. Do NOT define __CUDA_ARCH__ on HIP: the SaturateCast PTX
+// table and the NVCV SIMD-video-intrinsic paths are gated on __CUDA_ARCH__ and
+// must stay inert, falling through to their portable C++/per-element bodies.
+#ifndef CUDA_VERSION
+#define CUDA_VERSION 12020
+#endif
+
+// ---- runtime: error/status -------------------------------------------------
+#define cudaError_t                hipError_t
+#define cudaError                  hipError_t
+#define cudaSuccess                hipSuccess
+#define cudaErrorNotReady          hipErrorNotReady
+#define cudaErrorInvalidValue      hipErrorInvalidValue
+#define cudaErrorMemoryAllocation  hipErrorOutOfMemory
+#define cudaErrorCudartUnloading   hipErrorDeinitialized
+#define cudaErrorTextureFetchFailed hipErrorInvalidTexture
+#define cudaGetLastError           hipGetLastError
+#define cudaPeekAtLastError        hipPeekAtLastError
+#define cudaGetErrorString         hipGetErrorString
+#define cudaGetErrorName           hipGetErrorName
+#define cudaGetVersion             hipRuntimeGetVersion
+
+// ---- runtime: device -------------------------------------------------------
+#define cudaGetDevice              hipGetDevice
+#define cudaSetDevice              hipSetDevice
+#define cudaGetDeviceCount         hipGetDeviceCount
+#define cudaDeviceSynchronize      hipDeviceSynchronize
+#define cudaDeviceProp             hipDeviceProp_t
+#define cudaGetDeviceProperties    hipGetDeviceProperties
+#define cudaDevAttrTextureAlignment      hipDeviceAttributeTextureAlignment
+#define cudaDevAttrTexturePitchAlignment hipDeviceAttributeTexturePitchAlignment
+
+// NVCV derives a tensor/image row-pitch alignment from the texture *pitch*
+// alignment device attribute. On NVIDIA that attribute is 32 bytes, so a tightly
+// packed image (e.g. a 640-byte uchar row) keeps a 640-byte row stride. AMD
+// reports 256 there, which would pad that row to 768 and silently change the
+// in-memory layout every NVCV consumer (and the whole-buffer test comparisons)
+// assumes. No CV-CUDA tensor is bound to a HW texture object, so the larger
+// HW pitch is unnecessary here; clamp the queried pitch alignment to the NVIDIA
+// value to keep the byte layout identical to the CUDA build. Other attribute
+// queries pass through unchanged.
+#if defined(__cplusplus)
+__host__ inline hipError_t cvcuda_hipDeviceGetAttribute(int *value, hipDeviceAttribute_t attr, int device)
+{
+    hipError_t err = hipDeviceGetAttribute(value, attr, device);
+    if (err == hipSuccess && attr == hipDeviceAttributeTexturePitchAlignment && value && *value > 32)
+    {
+        *value = 32;
+    }
+    return err;
+}
+#define cudaDeviceGetAttribute cvcuda_hipDeviceGetAttribute
+#else
+#define cudaDeviceGetAttribute hipDeviceGetAttribute
+#endif
+
+// ---- runtime: memory -------------------------------------------------------
+#define cudaMalloc                 hipMalloc
+#define cudaMallocManaged          hipMallocManaged
+#define cudaFree                   hipFree
+#define cudaMallocHost             hipHostMalloc
+#define cudaHostAlloc              hipHostMalloc
+#define cudaFreeHost               hipHostFree
+#define cudaHostFree               hipHostFree
+#define cudaHostAllocMapped        hipHostMallocMapped
+#define cudaHostAllocWriteCombined hipHostMallocWriteCombined
+#define cudaMemset                 hipMemset
+#define cudaMemsetAsync            hipMemsetAsync
+#define cudaMemset2D               hipMemset2D
+#define cudaMemset2DAsync          hipMemset2DAsync
+#define cudaMemcpy                 hipMemcpy
+#define cudaMemcpyAsync            hipMemcpyAsync
+#define cudaMemcpy2D               hipMemcpy2D
+#define cudaMemcpy2DAsync          hipMemcpy2DAsync
+#define cudaMemcpyKind             hipMemcpyKind
+#define cudaMemcpyHostToDevice     hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost     hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice   hipMemcpyDeviceToDevice
+#define cudaMemcpyHostToHost       hipMemcpyHostToHost
+#define cudaMemcpyDefault          hipMemcpyDefault
+#define cudaPointerAttributes      hipPointerAttribute_t
+#define cudaPointerGetAttributes   hipPointerGetAttributes
+#define cudaMemoryTypeHost         hipMemoryTypeHost
+#define cudaMemoryTypeDevice       hipMemoryTypeDevice
+#define cudaMemoryTypeManaged      hipMemoryTypeManaged
+#define cudaMemoryTypeUnregistered hipMemoryTypeUnregistered
+
+// ---- runtime: stream / event ----------------------------------------------
+#define cudaStream_t                       hipStream_t
+#define cudaStreamDefault                  hipStreamDefault
+#define cudaStreamPerThread                hipStreamPerThread
+#define cudaStreamNonBlocking              hipStreamNonBlocking
+#define cudaStreamSynchronize              hipStreamSynchronize
+#define cudaStreamWaitEvent                hipStreamWaitEvent
+#define cudaStreamDestroy                  hipStreamDestroy
+#define cudaStreamGetId                    hipStreamGetId
+#define cudaStreamCreate                   hipStreamCreate
+#define cudaStreamCreateWithFlags          hipStreamCreateWithFlags
+#define cudaStreamCreateWithPriority       hipStreamCreateWithPriority
+#define cudaDeviceGetStreamPriorityRange   hipDeviceGetStreamPriorityRange
+#define cudaEvent_t                hipEvent_t
+#define cudaEventDefault           hipEventDefault
+#define cudaEventDisableTiming     hipEventDisableTiming
+#define cudaEventCreate            hipEventCreate
+#define cudaEventCreateWithFlags   hipEventCreateWithFlags
+#define cudaEventRecord            hipEventRecord
+#define cudaEventQuery             hipEventQuery
+#define cudaEventSynchronize       hipEventSynchronize
+#define cudaEventElapsedTime       hipEventElapsedTime
+#define cudaEventDestroy           hipEventDestroy
+
+// ---- full-wavefront mask ---------------------------------------------------
+// __shfl*_sync on ROCm static_asserts a 64-bit mask regardless of wave width.
+// The width argument (kept explicit at every call site that needs it) controls
+// the subgroup; the mask just marks participants.
+#define NVCV_WARP_FULL_MASK 0xffffffffffffffffULL
+
+// ---- built-in index types --------------------------------------------------
+// On CUDA blockIdx/blockDim/threadIdx are uint3/dim3, so the kernels' common
+// idiom `blockIdx * blockDim + threadIdx` resolves through the cuda:: compound
+// operators. On HIP these are distinct __hip_builtin_*_t structs (with only a
+// dim3 conversion), so neither HIP's nor CV-CUDA's vector operators deduce them.
+// Provide the exact whole-vector forms the kernels use, lowering to uint3. These
+// take only the builtin index types, which carry no NVCV TypeTraits, so they do
+// not compete with the cuda:: compound operators.
+// Defined only under hipcc (where the __hip_builtin_*_t types exist and where
+// __global__ bodies are parsed); plain g++ host TUs that include only the HIP
+// runtime API do not see these builtin types, so the operators must not appear
+// there. hipcc parses these in both its host and device passes.
+#if defined(__cplusplus) && defined(__HIPCC__)
+__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockIdx_t &a,
+                                                    const __hip_builtin_blockDim_t &b)
+{
+    return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator*(const __hip_builtin_blockDim_t &a,
+                                                    const __hip_builtin_blockIdx_t &b)
+{
+    return uint3{a.x * b.x, a.y * b.y, a.z * b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator+(const uint3 &a, const __hip_builtin_threadIdx_t &b)
+{
+    return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
+}
+__host__ __device__ __forceinline__ uint3 operator+(const __hip_builtin_threadIdx_t &a, const uint3 &b)
+{
+    return uint3{a.x + b.x, a.y + b.y, a.z + b.z};
+}
+#endif
+
+// std::declval is __host__-only (libstdc++); clang rejects it in the unevaluated
+// decltype of a __device__-only function (where nvcc is lenient). Provide a
+// __host__ __device__ equivalent for those few device-side decltype sites.
+#if defined(__cplusplus)
+namespace nvcv { namespace cuda { namespace compat {
+template<typename T>
+__host__ __device__ T &&declval() noexcept;
+}}} // namespace nvcv::cuda::compat
+#endif
+
+#endif // __HIP_PLATFORM_AMD__ || USE_HIP
+
+#endif // CVCUDA_HIP_COMPAT_H
diff --git a/cmake/hip/channel_descriptor.h b/cmake/hip/channel_descriptor.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/channel_descriptor.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/cub/cub.cuh b/cmake/hip/cub/cub.cuh
new file mode 100644
index 000000000..ae13ba38b
--- /dev/null
+++ b/cmake/hip/cub/cub.cuh
@@ -0,0 +1,18 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ */
+
+#pragma once
+#include "../CvCudaHipCompat.h"
+#include <hipcub/hipcub.hpp>
+
+// CV-CUDA spells the namespace cub::; hipCUB lives in hipcub::. Aliasing the
+// namespace keeps every cub::BlockReduce/BlockScan/BlockRadixSort/DeviceReduce
+// and cub::BLOCK_* enum call site unchanged.
+namespace hipcub
+{
+}
+namespace cub = hipcub;
diff --git a/cmake/hip/cublas_v2.h b/cmake/hip/cublas_v2.h
new file mode 100644
index 000000000..3bb9c7caf
--- /dev/null
+++ b/cmake/hip/cublas_v2.h
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ */
+
+#pragma once
+#include "CvCudaHipCompat.h"
+#include <hipblas/hipblas.h>
+
+// OpFindHomography only uses the status type and the fill-mode enum from cuBLAS.
+#define cublasStatus_t        hipblasStatus_t
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_FILL_MODE_LOWER HIPBLAS_FILL_MODE_LOWER
+#define CUBLAS_FILL_MODE_UPPER HIPBLAS_FILL_MODE_UPPER
diff --git a/cmake/hip/cuda.h b/cmake/hip/cuda.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/cuda.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/cuda_fp16.h b/cmake/hip/cuda_fp16.h
new file mode 100644
index 000000000..50a827d30
--- /dev/null
+++ b/cmake/hip/cuda_fp16.h
@@ -0,0 +1,3 @@
+#pragma once
+#include "CvCudaHipCompat.h"
+#include <hip/hip_fp16.h>
diff --git a/cmake/hip/cuda_runtime.h b/cmake/hip/cuda_runtime.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/cuda_runtime.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/cuda_runtime_api.h b/cmake/hip/cuda_runtime_api.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/cuda_runtime_api.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/curand_kernel.h b/cmake/hip/curand_kernel.h
new file mode 100644
index 000000000..76ec52e2a
--- /dev/null
+++ b/cmake/hip/curand_kernel.h
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ */
+
+#pragma once
+#include "CvCudaHipCompat.h"
+#include <cstdio> // rocRAND's mtgp32 header calls printf without including it
+#include <hiprand/hiprand_kernel.h>
+
+#define curandState   hiprandState
+#define curandState_t hiprandState_t
+#define curand_init   hiprand_init
+#define curand_normal hiprand_normal
diff --git a/cmake/hip/cusolverDn.h b/cmake/hip/cusolverDn.h
new file mode 100644
index 000000000..173cf6e59
--- /dev/null
+++ b/cmake/hip/cusolverDn.h
@@ -0,0 +1,34 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ */
+
+#pragma once
+#include "CvCudaHipCompat.h"
+#include <hipsolver/hipsolver.h>
+
+// OpFindHomography solves a batched symmetric eigenproblem (syevjBatched). The
+// hipSOLVER dense Dn API matches cuSOLVER argument-for-argument, so each call
+// site ports by name alone.
+#define cusolverStatus_t              hipsolverStatus_t
+#define CUSOLVER_STATUS_SUCCESS       HIPSOLVER_STATUS_SUCCESS
+#define cusolverDnHandle_t            hipsolverDnHandle_t
+#define cusolverEigMode_t             hipsolverEigMode_t
+#define CUSOLVER_EIG_MODE_VECTOR      HIPSOLVER_EIG_MODE_VECTOR
+#define CUSOLVER_EIG_MODE_NOVECTOR    HIPSOLVER_EIG_MODE_NOVECTOR
+#define syevjInfo_t                   hipsolverSyevjInfo_t
+
+#define cusolverDnCreate              hipsolverDnCreate
+#define cusolverDnDestroy             hipsolverDnDestroy
+#define cusolverDnSetStream           hipsolverDnSetStream
+#define cusolverDnCreateSyevjInfo     hipsolverDnCreateSyevjInfo
+#define cusolverDnDestroySyevjInfo    hipsolverDnDestroySyevjInfo
+#define cusolverDnXsyevjSetTolerance  hipsolverDnXsyevjSetTolerance
+#define cusolverDnXsyevjSetMaxSweeps  hipsolverDnXsyevjSetMaxSweeps
+#define cusolverDnXsyevjSetSortEig    hipsolverDnXsyevjSetSortEig
+#define cusolverDnSsyevjBatched_bufferSize hipsolverDnSsyevjBatched_bufferSize
+#define cusolverDnSsyevjBatched       hipsolverDnSsyevjBatched
+#define cusolverDnDsyevjBatched_bufferSize hipsolverDnDsyevjBatched_bufferSize
+#define cusolverDnDsyevjBatched       hipsolverDnDsyevjBatched
diff --git a/cmake/hip/device_launch_parameters.h b/cmake/hip/device_launch_parameters.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/device_launch_parameters.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/driver_types.h b/cmake/hip/driver_types.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/driver_types.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/cmake/hip/library_types.h b/cmake/hip/library_types.h
new file mode 100644
index 000000000..08c872744
--- /dev/null
+++ b/cmake/hip/library_types.h
@@ -0,0 +1,18 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ */
+
+#pragma once
+#include "CvCudaHipCompat.h"
+#include <hip/library_types.h>
+
+// CUDA's library_types.h provides cudaDataType / CUDA_R_*; map to HIP's.
+#define cudaDataType hipDataType
+#define CUDA_R_16F   HIP_R_16F
+#define CUDA_R_32F   HIP_R_32F
+#define CUDA_R_64F   HIP_R_64F
+#define CUDA_C_32F   HIP_C_32F
+#define CUDA_C_64F   HIP_C_64F
diff --git a/cmake/hip/vector_types.h b/cmake/hip/vector_types.h
new file mode 100644
index 000000000..862568004
--- /dev/null
+++ b/cmake/hip/vector_types.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "CvCudaHipCompat.h"
diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst
index a037ee324..cd830e12f 100644
--- a/docs/sphinx/installation.rst
+++ b/docs/sphinx/installation.rst
@@ -281,6 +281,8 @@ The central ``ci/build.sh`` script is used to build the project, Python bindings
 - ``-DDOC_PYTHON_VERSION='3.11'``: Override Python version for documentation build (default: system Python)
 - ``-DENABLE_SANITIZER=1|0``: Enable/disable address sanitizer (default: disabled)
 - ``-DCMAKE_CUDA_COMPILER=/path/to/nvcc``: Override CUDA compiler (default: /usr/local/cuda/bin/nvcc)
+- ``-DUSE_HIP=1|0``: Build the GPU code with HIP for AMD GPUs (ROCm) instead of CUDA (default: disabled). See :ref:`Building for AMD GPUs (ROCm) <build-rocm>` below.
+- ``-DCMAKE_HIP_ARCHITECTURES='gfx90a'``: AMD GPU architecture(s) to build for when ``USE_HIP=1`` (defaults to ``gfx90a`` when unset). Set to your target, e.g. ``gfx1100`` for RDNA3 desktop GPUs.
 
 All boolean options accept both numeric (``0``/``1``) and CMake boolean values (``ON``/``OFF``, ``YES``/``NO``, ``TRUE``/``FALSE``).
 
@@ -310,6 +312,32 @@ All boolean options accept both numeric (``0``/``1``) and CMake boolean values (
     # Build with specific CUDA 13 version
     ci/build.sh -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13/bin/nvcc
 
+    # Build for AMD GPUs with HIP/ROCm
+    ci/build.sh release build-rel -DUSE_HIP=1 -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm
+
+.. _build-rocm:
+
+Building for AMD GPUs (ROCm)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+CV-CUDA can target AMD GPUs by building its GPU code with HIP instead of CUDA. The HIP path is additive: with ``USE_HIP=0`` (the default) the NVIDIA build is unchanged.
+
+Prerequisites:
+
+- A `ROCm <https://rocm.docs.amd.com/>`_ installation (7.x recommended), which provides ``hipcc`` and the HIP runtime along with the ROCm math libraries used by some operators (hipCUB, hipBLAS, hipSOLVER, rocRAND).
+- CMake and Ninja as for the CUDA build (the CUDA toolkit is not required).
+
+Build with HIP enabled, selecting the target AMD architecture:
+
+.. code-block:: shell
+
+    ci/build.sh release build-rel -DUSE_HIP=1 -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm
+
+If ROCm is not on your ``PATH``, pass ``-DCMAKE_PREFIX_PATH=/opt/rocm`` so CMake finds the hip* packages (``find_package(hip)`` and friends).
+
+When ``CMAKE_HIP_ARCHITECTURES`` is left unset it defaults to ``gfx90a``; set it to the architecture of your GPU (for example ``gfx1100`` for RDNA3 desktop cards). No source or CMake edits are needed to retarget. The build outputs the same library and test layout as the CUDA build, and the test suites run unchanged on the AMD GPU.
+
+This support has been validated on the CDNA2 ``gfx90a`` (MI200 series) and RDNA3 ``gfx1100`` architectures on Linux.
 
 1. Run Tests
 ~~~~~~~~~~~~
diff --git a/src/cvcuda/CMakeLists.txt b/src/cvcuda/CMakeLists.txt
index 64c6c3d93..659b5b4e8 100644
--- a/src/cvcuda/CMakeLists.txt
+++ b/src/cvcuda/CMakeLists.txt
@@ -93,6 +93,12 @@ else()
     list(APPEND CV_CUDA_LIB_FILES ${CV_CUDA_OP_FILES})
 endif()
 
+# cuOSD-dependent operators (OSD, bounding box, box blur) are scoped out of the
+# ROCm build (cuOSD is a prebuilt CUDA-only static lib with no source).
+if(USE_HIP)
+    list(FILTER CV_CUDA_LIB_FILES EXCLUDE REGEX "OpOSD|OpBndBox|OpBoxBlur")
+endif()
+
 add_library(cvcuda SHARED
     ${CV_CUDA_LIB_FILES}
 )
diff --git a/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp b/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp
index e44c5e76c..e1c7b7aa9 100644
--- a/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp
+++ b/src/cvcuda/include/cvcuda/cuda_tools/InterpolationVarShapeWrap.hpp
@@ -225,7 +225,9 @@ class InterpolationVarShapeWrap<T, B, NVCV_INTERP_NEAREST>
         c.x = GetIndexForInterpolation<kInterpolationType>(c.x + .5f);
         c.y = GetIndexForInterpolation<kInterpolationType>(c.y + .5f);
 
-        return doGetValue(c);
+        // clang/HIP two-phase lookup: doGetValue is inherited from a dependent
+        // base, so it must be qualified.
+        return this->doGetValue(c);
     }
 };
 
diff --git a/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp b/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp
index 4506cf101..1b7dd161d 100644
--- a/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp
+++ b/src/cvcuda/include/cvcuda/cuda_tools/MathOps.hpp
@@ -112,9 +112,16 @@ NVCV_CUDA_UNARY_OPERATOR(~, nvcv::cuda::detail::IsIntegralCompound)
 
 #undef NVCV_CUDA_UNARY_OPERATOR
 
-#define NVCV_CUDA_BINARY_OPERATOR(OPERATOR, REQUIREMENT)                                                        \
-    template<typename T, typename U, class = nvcv::cuda::Require<REQUIREMENT<T, U>>>                            \
-    inline __host__ __device__ auto operator OPERATOR(T a, U b)                                                 \
+// The element-wise body is shared between the generic operator and, on HIP, a
+// more-specialized compound-compound overload (see below). HIP's vector types
+// (HIP_vector_type) ship their own operator+(vec<T,n>, U) and operator+(U,
+// vec<T,n>); for a mixed pair like float3 + uchar3 both bind and they are
+// equally specialized, so the call is ambiguous (a HIP header limitation).
+// CV-CUDA's generic operator(T, U) is less specialized than HIP's and loses
+// partial ordering, so on HIP we also emit operator(vec<T1,n>, vec<T2,n>),
+// which is more specialized than HIP's pair and wins, while same-type vec+vec
+// still resolves to HIP's own (correct) operator and vec+scalar is unchanged.
+#define NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR)                                                                \
     {                                                                                                           \
         using RT = nvcv::cuda::MakeType<                                                                        \
             decltype(std::declval<nvcv::cuda::BaseType<T>>() OPERATOR std::declval<nvcv::cuda::BaseType<U>>()), \
@@ -152,7 +159,103 @@ NVCV_CUDA_UNARY_OPERATOR(~, nvcv::cuda::detail::IsIntegralCompound)
             else if constexpr (nvcv::cuda::NumElements<RT> == 4)                                                \
                 return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z, a.w OPERATOR b.w};              \
         }                                                                                                       \
+    }
+
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+// HIP_vector_type also ships operator(vec<T,n>, U) / operator(U, vec<T,n>) with a
+// templated scalar U; for a float/double scalar these keep the vector's integer
+// element type (truncating) and win partial ordering over CV-CUDA's promoting
+// operator(T, U). Emit vec<>+float and vec<>+double overloads (concrete scalar
+// types, hence more specialized than HIP's templated U) so CV-CUDA's promoting
+// body is selected. The compound-compound overload covers the mixed-vector case.
+// Direct element-wise bodies: the shared BODY macro syntax-checks `b.x` in
+// untaken if-constexpr branches, which is a hard error when the scalar arg has a
+// concrete (non-dependent) type, so the scalar overloads spell the loop out.
+#define NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, SCALAR)                                                  \
+    template<typename T1, unsigned int N,                                                                      \
+             class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T1, N>>>>                       \
+    inline __host__ __device__ auto operator OPERATOR(HIP_vector_type<T1, N> a, SCALAR b)                       \
+    {                                                                                                           \
+        using RT = nvcv::cuda::MakeType<decltype(std::declval<T1>() OPERATOR std::declval<SCALAR>()), N>;       \
+        if constexpr (N == 1)                                                                                  \
+            return RT{a.x OPERATOR b};                                                                          \
+        else if constexpr (N == 2)                                                                             \
+            return RT{a.x OPERATOR b, a.y OPERATOR b};                                                          \
+        else if constexpr (N == 3)                                                                             \
+            return RT{a.x OPERATOR b, a.y OPERATOR b, a.z OPERATOR b};                                          \
+        else                                                                                                   \
+            return RT{a.x OPERATOR b, a.y OPERATOR b, a.z OPERATOR b, a.w OPERATOR b};                          \
+    }                                                                                                           \
+    template<typename T2, unsigned int N,                                                                      \
+             class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T2, N>>>>                       \
+    inline __host__ __device__ auto operator OPERATOR(SCALAR a, HIP_vector_type<T2, N> b)                       \
+    {                                                                                                           \
+        using RT = nvcv::cuda::MakeType<decltype(std::declval<SCALAR>() OPERATOR std::declval<T2>()), N>;       \
+        if constexpr (N == 1)                                                                                  \
+            return RT{a OPERATOR b.x};                                                                          \
+        else if constexpr (N == 2)                                                                             \
+            return RT{a OPERATOR b.x, a OPERATOR b.y};                                                          \
+        else if constexpr (N == 3)                                                                             \
+            return RT{a OPERATOR b.x, a OPERATOR b.y, a OPERATOR b.z};                                          \
+        else                                                                                                   \
+            return RT{a OPERATOR b.x, a OPERATOR b.y, a OPERATOR b.z, a OPERATOR b.w};                          \
+    }
+// HIP_vector_type ships its own operator OP for every pair (vec,vec),
+// (const vec&, U), (U, const vec&) with U unconstrained and the vector by const
+// reference. Two cases need CV-CUDA's promoting semantics instead of HIP's:
+//   * a mixed-element vector pair (e.g. float3 OP uchar3): HIP's (vec,vec) needs
+//     both element types identical, so both HIP's (const vec&, U) and (U, const
+//     vec&) are viable and tie, and either truncates to one operand's element
+//     type. A both-operands-concrete overload is more specialized than either
+//     HIP form by signature alone, so it wins without a constraint; the SFINAE
+//     enable_if(!same element type) keeps same-type vec OP vec on HIP's own
+//     operator. This is the only case the operator kernels exercise, so it is
+//     emitted unconditionally (C++17-valid).
+//   * a vector OP dim3 (dim3 is an NVCV compound but not a HIP_vector_type):
+//     HIP's (const vec&, U=dim3) is chosen and make_vector_type<T,n>(dim3) is
+//     ill-formed. dim3 is the only NVCV compound that is not a HIP_vector_type
+//     (and is always 3 components), so a forward overload with a CONCRETE dim3
+//     operand (more specialized than HIP's templated U) wins unambiguously --
+//     for the integral-only operators (%,&,|,^,<<,>>) HIP's own operator is
+//     itself enable_if-constrained, so a requires-clause could not break the
+//     tie, but a concrete parameter type still does. The 3-element result body
+//     is spelled out (the shared body's if-constexpr branches reference .w,
+//     which is a hard error on the concrete 3-element dim3 even when discarded).
+//     The mirror covers dim3 OP vec; both are plain C++17.
+#define NVCV_CUDA_BINARY_OPERATOR_HIP_DIM3(OPERATOR)                                                            \
+    template<typename T1, class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T1, 3>>>>          \
+    inline __host__ __device__ auto operator OPERATOR(const HIP_vector_type<T1, 3> &a, dim3 b)                  \
+    {                                                                                                           \
+        using RT = nvcv::cuda::MakeType<decltype(std::declval<T1>() OPERATOR std::declval<unsigned int>()), 3>;\
+        return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z};                                        \
+    }                                                                                                           \
+    template<typename T2, class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T2, 3>>>>          \
+    inline __host__ __device__ auto operator OPERATOR(dim3 a, const HIP_vector_type<T2, 3> &b)                  \
+    {                                                                                                           \
+        using RT = nvcv::cuda::MakeType<decltype(std::declval<unsigned int>() OPERATOR std::declval<T2>()), 3>;\
+        return RT{a.x OPERATOR b.x, a.y OPERATOR b.y, a.z OPERATOR b.z};                                        \
+    }
+#define NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR)                                                                 \
+    template<typename T1, typename T2, unsigned int N,                                                          \
+             class = nvcv::cuda::Require<!std::is_same_v<T1, T2>>>                                              \
+    inline __host__ __device__ auto operator OPERATOR(const HIP_vector_type<T1, N> &a,                          \
+                                                      const HIP_vector_type<T2, N> &b)                          \
+    {                                                                                                           \
+        using T = HIP_vector_type<T1, N>;                                                                       \
+        using U = HIP_vector_type<T2, N>;                                                                       \
+        NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR)                                                                \
     }                                                                                                           \
+    NVCV_CUDA_BINARY_OPERATOR_HIP_DIM3(OPERATOR)                                                                \
+    NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, float)                                                       \
+    NVCV_CUDA_BINARY_OPERATOR_HIP_SCALAR(OPERATOR, double)
+#else
+#define NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR)
+#endif
+
+#define NVCV_CUDA_BINARY_OPERATOR(OPERATOR, REQUIREMENT)                                                        \
+    template<typename T, typename U, class = nvcv::cuda::Require<REQUIREMENT<T, U>>>                            \
+    inline __host__ __device__ auto operator OPERATOR(T a, U b) NVCV_CUDA_BINARY_OPERATOR_BODY(OPERATOR)        \
+    NVCV_CUDA_BINARY_OPERATOR_HIP(OPERATOR)                                                                     \
     template<typename T, typename U, class = nvcv::cuda::Require<nvcv::cuda::IsCompound<T>>>                    \
     inline __host__ __device__ T &operator OPERATOR##=(T &a, U b)                                               \
     {                                                                                                           \
@@ -196,6 +299,55 @@ inline __host__ __device__ bool operator!=(T a, U b)
     return !(a == b);
 }
 
+// HIP_vector_type ships operator==/!= for (vec,vec), (const vec&, U), (U, const
+// vec&). A mixed pair like int3 == long3 makes both HIP (const vec&, U) and (U,
+// const vec&) viable (tie), and vec == dim3 is ill-formed inside
+// make_vector_type(dim3). Same structure and rationale as the arithmetic fix: a
+// both-operands-concrete overload (more specialized than either HIP form) for
+// the mixed-element vector pair (SFINAE-excluding same element type so HIP's own
+// operator keeps that), plus a concrete-dim3 forward and mirror. All C++17.
+// Bodies are spelled out (a concrete operand trips the if-constexpr probe of the
+// shared arithmetic body).
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#define NVCV_CUDA_HIP_COMPARE_BODY(RET_ON_DIFF, RET_DEFAULT)                                                    \
+    {                                                                                                           \
+        bool eq = true;                                                                                         \
+        if constexpr (N >= 1)                                                                                  \
+            eq = eq && (a.x == b.x);                                                                            \
+        if constexpr (N >= 2)                                                                                  \
+            eq = eq && (a.y == b.y);                                                                            \
+        if constexpr (N >= 3)                                                                                  \
+            eq = eq && (a.z == b.z);                                                                            \
+        if constexpr (N == 4)                                                                                  \
+            eq = eq && (a.w == b.w);                                                                            \
+        return eq ? RET_DEFAULT : RET_ON_DIFF;                                                                  \
+    }
+#define NVCV_CUDA_HIP_COMPARE(OPERATOR, RET_ON_DIFF, RET_DEFAULT)                                               \
+    template<typename T1, typename T2, unsigned int N,                                                          \
+             class = nvcv::cuda::Require<!std::is_same_v<T1, T2>>>                                              \
+    inline __host__ __device__ bool operator OPERATOR(const HIP_vector_type<T1, N> &a,                          \
+                                                      const HIP_vector_type<T2, N> &b)                          \
+        NVCV_CUDA_HIP_COMPARE_BODY(RET_ON_DIFF, RET_DEFAULT)                                                    \
+    template<typename T1, class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T1, 3>>>>          \
+    inline __host__ __device__ bool operator OPERATOR(const HIP_vector_type<T1, 3> &a, dim3 b)                  \
+    {                                                                                                           \
+        bool eq = (a.x == b.x) && (a.y == b.y) && (a.z == b.z);                                                 \
+        return eq ? RET_DEFAULT : RET_ON_DIFF;                                                                  \
+    }                                                                                                           \
+    template<typename T2, class = nvcv::cuda::Require<nvcv::cuda::IsCompound<HIP_vector_type<T2, 3>>>>          \
+    inline __host__ __device__ bool operator OPERATOR(dim3 a, const HIP_vector_type<T2, 3> &b)                  \
+    {                                                                                                           \
+        bool eq = (a.x == b.x) && (a.y == b.y) && (a.z == b.z);                                                 \
+        return eq ? RET_DEFAULT : RET_ON_DIFF;                                                                  \
+    }
+
+NVCV_CUDA_HIP_COMPARE(==, false, true)
+NVCV_CUDA_HIP_COMPARE(!=, true, false)
+
+#undef NVCV_CUDA_HIP_COMPARE
+#undef NVCV_CUDA_HIP_COMPARE_BODY
+#endif
+
 namespace nvcv::cuda {
 
 template<typename T, typename U, class = nvcv::cuda::Require<nvcv::cuda::detail::IsSameCompound<T, U>>>
diff --git a/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp b/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp
index e64481777..9b40636be 100644
--- a/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp
+++ b/src/cvcuda/include/cvcuda/cuda_tools/detail/MathWrappersImpl.hpp
@@ -26,7 +26,7 @@
 
 namespace nvcv::cuda::detail {
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 
 template<typename T, typename U, int RM = FE_TONEAREST>
 __device__ __forceinline__ T DeviceRoundImpl(U u)
@@ -277,6 +277,14 @@ __device__ __forceinline__ T DeviceRoundImpl(U u)
 template<typename U>
 __device__ __forceinline__ U DeviceMinImpl(U a, U b)
 {
+#if defined(__HIP_DEVICE_COMPILE__)
+    // HIP's global namespace has no typed umin/ullmin/llmin; the ternary lowers to
+    // the same integer-min instruction the CUDA builtins emit. The comparison is
+    // spelled exactly like the host std::min fallback (b < a ? b : a) so the device
+    // result is identical to the host reference for every input, including the
+    // NaN / signed-zero cases the morphology tests exercise with raw-byte floats.
+    return b < a ? b : a;
+#else
     if constexpr (std::is_same_v<U, unsigned int>)
     {
         return ::umin(a, b);
@@ -293,11 +301,17 @@ __device__ __forceinline__ U DeviceMinImpl(U a, U b)
     {
         return ::min(a, b);
     }
+#endif
 }
 
 template<typename U>
 __device__ __forceinline__ U DeviceMaxImpl(U a, U b)
 {
+#if defined(__HIP_DEVICE_COMPILE__)
+    // Spelled exactly like the host std::max fallback (a < b ? b : a) so the device
+    // max matches the host reference exactly on NaN / signed-zero inputs.
+    return a < b ? b : a;
+#else
     if constexpr (std::is_same_v<U, unsigned int>)
     {
         return ::umax(a, b);
@@ -314,6 +328,7 @@ __device__ __forceinline__ U DeviceMaxImpl(U a, U b)
     {
         return ::max(a, b);
     }
+#endif
 }
 
 template<typename U, typename S>
@@ -357,6 +372,21 @@ __device__ __forceinline__ U DeviceExpImpl(U u)
 template<typename U>
 __device__ __forceinline__ U DeviceSqrtImpl(U u)
 {
+#if defined(__HIP_DEVICE_COMPILE__)
+    // gfx90a's single-precision __fsqrt_rn is not always correctly rounded (it can
+    // be 1 ULP off, e.g. sqrt(93606.0f)), whereas CUDA's sqrt.rn.f32 and the host
+    // std::sqrt are correctly rounded. Route 32-bit sqrt through the correctly
+    // rounded f64 sqrt and round once to keep results identical to the CUDA
+    // build and the CPU references (CDNA has fast f64 sqrt).
+    if constexpr (std::is_same_v<U, double>)
+    {
+        return __dsqrt_rn(u);
+    }
+    else
+    {
+        return static_cast<U>(__dsqrt_rn(static_cast<double>(u)));
+    }
+#else
     if constexpr (std::is_same_v<U, float>)
     {
         return __fsqrt_rn(u);
@@ -373,6 +403,7 @@ __device__ __forceinline__ U DeviceSqrtImpl(U u)
     {
         return static_cast<U>(__dsqrt_rn(static_cast<double>(u)));
     }
+#endif
 }
 
 template<typename U>
@@ -417,7 +448,7 @@ inline __host__ U RoundEvenImpl(U u)
 template<typename T, typename U, int RM = FE_TONEAREST>
 inline __host__ __device__ T RoundImpl(U u)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DeviceRoundImpl<T, U, RM>(u);
 #else
     // In host we use C++ to do round depending on round mode by selecting at compile time the correct function:
@@ -444,7 +475,7 @@ inline __host__ __device__ T RoundImpl(U u)
 template<typename U>
 inline __host__ __device__ U MinImpl(U a, U b)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DeviceMinImpl(a, b);
 #else
     return std::min(a, b);
@@ -454,7 +485,7 @@ inline __host__ __device__ U MinImpl(U a, U b)
 template<typename U>
 inline __host__ __device__ U MaxImpl(U a, U b)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DeviceMaxImpl(a, b);
 #else
     return std::max(a, b);
@@ -464,7 +495,7 @@ inline __host__ __device__ U MaxImpl(U a, U b)
 template<typename U, typename S>
 inline __host__ __device__ U PowImpl(U x, S y)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DevicePowImpl(x, y);
 #else
     return std::pow(x, y);
@@ -474,7 +505,7 @@ inline __host__ __device__ U PowImpl(U x, S y)
 template<typename U>
 inline __host__ __device__ U ExpImpl(U u)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DeviceExpImpl(u);
 #else
     return std::exp(u);
@@ -484,7 +515,7 @@ inline __host__ __device__ U ExpImpl(U u)
 template<typename U>
 inline __host__ __device__ U SqrtImpl(U u)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     return DeviceSqrtImpl(u);
 #else
     return std::sqrt(u);
@@ -500,7 +531,7 @@ inline __host__ __device__ U AbsImpl(U u)
     }
     else
     {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         return DeviceAbsImpl(u);
 #else
         return std::abs(u);
diff --git a/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp b/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp
index fc0a9f719..528b79c4b 100644
--- a/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp
+++ b/src/cvcuda/include/cvcuda/cuda_tools/detail/Metaprogramming.hpp
@@ -93,7 +93,15 @@ NVCV_CUDA_TYPE_TRAITS(double, double, 0, 1, DBL_MIN, DBL_MAX);
     NVCV_CUDA_TYPE_TRAITS_1_TO_3(COMPOUND_TYPE, BASE_TYPE, MIN_VAL, MAX_VAL);    \
     NVCV_CUDA_TYPE_TRAITS(COMPOUND_TYPE##4, BASE_TYPE, 4, 4, MIN_VAL, MAX_VAL)
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+// HIP's char1..4 are HIP_vector_type<char,N>, whose members are plain `char`
+// (not `signed char` as in CUDA's vector types), so the base_type must be
+// `char` for GetElement's reference return to bind. `char` is signed on the
+// supported targets, so the numeric range is unchanged.
+NVCV_CUDA_TYPE_TRAITS_1_TO_4(char, char, SCHAR_MIN, SCHAR_MAX);
+#else
 NVCV_CUDA_TYPE_TRAITS_1_TO_4(char, signed char, SCHAR_MIN, SCHAR_MAX);
+#endif
 NVCV_CUDA_TYPE_TRAITS_1_TO_4(uchar, unsigned char, 0, UCHAR_MAX);
 NVCV_CUDA_TYPE_TRAITS_1_TO_4(short, short, SHRT_MIN, SHRT_MAX);
 NVCV_CUDA_TYPE_TRAITS_1_TO_4(ushort, unsigned short, 0, USHRT_MAX);
diff --git a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp
index c413a4685..609334cde 100644
--- a/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp
+++ b/src/cvcuda/include/cvcuda/cuda_tools/math/LinAlg.hpp
@@ -219,8 +219,12 @@ class Vector
         return v;
     }
 
-    // On-purpose public data to allow POD-class direct initialization.
-#ifdef __CUDA_ARCH__
+    // On-purpose public data to allow POD-class direct initialization. The
+    // default member initializer is omitted on HIP in BOTH compiler passes (not
+    // just the device pass): Vector is used as a __shared__ variable, and a
+    // __shared__ object's type must have no initializer, which clang enforces in
+    // the host pass too.
+#if defined(__CUDA_ARCH__) || defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
     T m_data[N];
 #else
     T m_data[N] = {};
@@ -466,7 +470,7 @@ namespace detail {
 template<class T>
 constexpr __host__ __device__ void swap(T &a, T &b)
 {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     T c = a;
     a   = b;
     b   = c;
diff --git a/src/cvcuda/priv/CMakeLists.txt b/src/cvcuda/priv/CMakeLists.txt
index deac31c27..c93345216 100644
--- a/src/cvcuda/priv/CMakeLists.txt
+++ b/src/cvcuda/priv/CMakeLists.txt
@@ -91,6 +91,12 @@ else()
     list(APPEND CV_CUDA_PRIV_FILES ${CV_CUDA_PRIV_OP_FILES})
 endif()
 
+# cuOSD is a prebuilt CUDA-only static lib with no source, so the operators that
+# depend on it (OSD, bounding box, box blur) are scoped out of the ROCm build.
+if(USE_HIP)
+    list(FILTER CV_CUDA_PRIV_FILES EXCLUDE REGEX "OpOSD|OpBndBox|OpBoxBlur")
+endif()
+
 add_library(cvcuda_priv STATIC
     ${CV_CUDA_PRIV_FILES}
 )
diff --git a/src/cvcuda/priv/OpBrightnessContrast.cu b/src/cvcuda/priv/OpBrightnessContrast.cu
index 6313b45a5..b049446c2 100644
--- a/src/cvcuda/priv/OpBrightnessContrast.cu
+++ b/src/cvcuda/priv/OpBrightnessContrast.cu
@@ -120,7 +120,13 @@ inline __device__ void DoBrightnessContrast(SrcWrapper src, DstWrapper dst, cons
 {
     using SrcT                       = std::remove_const_t<typename SrcWrapper::ValueType>;
     using DstT                       = typename DstWrapper::ValueType;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    // std::declval is __host__-only; use the __host__ __device__ shim in this
+    // device-side decltype (clang rejects host fns in __device__ unevaluated ctx).
+    using IntermediateT = decltype(nvcv::cuda::compat::declval<ArgT>() * nvcv::cuda::compat::declval<SrcT>());
+#else
     using IntermediateT              = decltype(std::declval<ArgT>() * std::declval<SrcT>());
+#endif
     using SrcBT                      = cuda::BaseType<SrcT>;
     using DstBT                      = cuda::BaseType<DstT>;
     using BI                         = cuda::BaseType<IntermediateT>;
diff --git a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu
index cfae84add..6fe40804f 100644
--- a/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu
+++ b/src/cvcuda/priv/OpCropFlipNormalizeReformat.cu
@@ -71,7 +71,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap<const T1, B> srcWrap, Dst
         {
             float base  = get_base_value(baseWrap, c, base_channels);
             float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags);
-            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast<DstWrapper::ValueType>(
+            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast<typename DstWrapper::ValueType>(
                 (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift);
         }
     }
@@ -81,7 +81,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrap<const T1, B> srcWrap, Dst
         {
             float base  = get_base_value(baseWrap, c, base_channels);
             float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags);
-            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast<DstWrapper::ValueType>(
+            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast<typename DstWrapper::ValueType>(
                 (srcWrap[(int4){src_idx.x, src_idx.y, c, batchidx}] - base) * scale * global_scale + global_shift);
         }
     }
@@ -99,7 +99,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC<const T1, B> srcWrap,
         {
             float base  = get_base_value(baseWrap, c, base_channels);
             float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags);
-            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast<DstWrapper::ValueType>(
+            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::SaturateCast<typename DstWrapper::ValueType>(
                 (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift);
         }
     }
@@ -109,7 +109,7 @@ __device__ void transfer_data(cuda::BorderVarShapeWrapNHWC<const T1, B> srcWrap,
         {
             float base  = get_base_value(baseWrap, c, base_channels);
             float scale = get_scale_value(scaleWrap, c, scale_channels, epsilon, flags);
-            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast<DstWrapper::ValueType>(
+            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::SaturateCast<typename DstWrapper::ValueType>(
                 (srcWrap[(int4){batchidx, src_idx.y, src_idx.x, c}] - base) * scale * global_scale + global_shift);
         }
     }
@@ -122,14 +122,14 @@ __device__ void set_data(DstWrapper dstWrap, int2 dst_idx, int batchidx, int ch,
     {
         for (int c = 0; c < ch; c++)
         {
-            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast<DstWrapper::ValueType>(val);
+            dstWrap[(int4){dst_idx.x, dst_idx.y, c, batchidx}] = cuda::StaticCast<typename DstWrapper::ValueType>(val);
         }
     }
     else
     {
         for (int c = 0; c < ch; c++)
         {
-            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast<DstWrapper::ValueType>(val);
+            dstWrap[(int4){c, dst_idx.x, dst_idx.y, batchidx}] = cuda::StaticCast<typename DstWrapper::ValueType>(val);
         }
     }
 }
diff --git a/src/cvcuda/priv/OpFindHomography.cu b/src/cvcuda/priv/OpFindHomography.cu
index abd413dbc..3829c041e 100644
--- a/src/cvcuda/priv/OpFindHomography.cu
+++ b/src/cvcuda/priv/OpFindHomography.cu
@@ -384,7 +384,11 @@ __device__ void reducef(float *data, cuda::math::Vector<float, 32> &warpSums, fl
     int      tid    = threadIdx.x;
     int      idx    = threadIdx.x + blockIdx.x * blockDim.x;
     float    val    = 0.0f;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    unsigned long long mask = NVCV_WARP_FULL_MASK;
+#else
     unsigned mask   = 0xFFFFFFFFU;
+#endif
     int      lane   = threadIdx.x % warpSize;
     int      warpID = threadIdx.x / warpSize;
     while (idx < numPoints)
@@ -417,7 +421,11 @@ __device__ void reducef2(float2 *data, cuda::math::Vector<float2, 32> &warpSums,
     int      tid    = threadIdx.x;
     int      idx    = threadIdx.x + blockIdx.x * blockDim.x;
     float2   val    = {0.0f, 0.0f};
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    unsigned long long mask = NVCV_WARP_FULL_MASK;
+#else
     unsigned mask   = 0xFFFFFFFFU;
+#endif
     int      lane   = threadIdx.x % warpSize;
     int      warpID = threadIdx.x / warpSize;
     while (idx < numPoints)
@@ -462,7 +470,11 @@ __device__ void reduceLtL(float2 *src, float2 *dst, cuda::math::Vector<float, 32
     int   idx = threadIdx.x + blockIdx.x * blockDim.x;
     float val = 0.0f;
     ;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    unsigned long long mask = NVCV_WARP_FULL_MASK;
+#else
     unsigned mask   = 0xFFFFFFFFU;
+#endif
     int      lane   = threadIdx.x % warpSize;
     int      warpID = threadIdx.x / warpSize;
     while (idx < numPoints)
@@ -511,7 +523,11 @@ __device__ void calculate_Jtx_matvec(float *A, float *B, float *result, matrix8x
     __syncthreads();
 
     float    val[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    unsigned long long mask = NVCV_WARP_FULL_MASK;
+#else
     unsigned mask   = 0xFFFFFFFFU;
+#endif
     int      lane   = threadIdx.x % warpSize;
     int      warpID = threadIdx.x / warpSize;
     while (idx < numPoints)
@@ -760,7 +776,11 @@ __device__ void max(float *data, vector32 &warpSums, float *result, int numPoint
     int      tid    = threadIdx.x;
     int      idx    = threadIdx.x + blockIdx.x * blockDim.x;
     float    val    = 0.0f;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    unsigned long long mask = NVCV_WARP_FULL_MASK;
+#else
     unsigned mask   = 0xFFFFFFFFU;
+#endif
     int      lane   = threadIdx.x % warpSize;
     int      warpID = threadIdx.x / warpSize;
     while (idx < numPoints)
diff --git a/src/cvcuda/priv/OpHQResize.cu b/src/cvcuda/priv/OpHQResize.cu
index dfdc8bb03..b122923f1 100644
--- a/src/cvcuda/priv/OpHQResize.cu
+++ b/src/cvcuda/priv/OpHQResize.cu
@@ -343,7 +343,14 @@ std::enable_if_t<NumChannelsT::kHasStaticChannels, typename Wrap::ValueType> __f
 
     if constexpr (kSupportsLdg)
     {
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+        // HIP's __ldg has no overload for every vector element type (e.g.
+        // ushort4); a plain load is correct (the read-only cache hint is
+        // advisory and maps to a normal load on CDNA anyway).
+        return *GetWrapPtr(wrap, idxs...);
+#else
         return __ldg(GetWrapPtr(wrap, idxs...));
+#endif
     }
     else if constexpr (!kSupportsLdg)
     {
diff --git a/src/cvcuda/priv/OpLabel.cu b/src/cvcuda/priv/OpLabel.cu
index 726550512..12f273281 100644
--- a/src/cvcuda/priv/OpLabel.cu
+++ b/src/cvcuda/priv/OpLabel.cu
@@ -66,6 +66,18 @@
 namespace cuda = nvcv::cuda;
 namespace util = nvcv::util;
 
+// Connected-component labelling shifts the per-row left neighbour with a
+// delta-1 warp shuffle. The block is (BW=32, BH=4[, BD=2]) so each image row is
+// one 32-lane subgroup; the shuffles below pass an explicit width of 32 so they
+// stay within a row on a 64-lane CDNA wavefront (two rows per wavefront) exactly
+// as they do within a 32-lane NVIDIA warp. The mask only marks participants and
+// must be 64-bit on ROCm.
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK
+#else
+#define NVCV_SHFL_MASK 0xffffffff
+#endif
+
 namespace {
 
 constexpr int REGION_NOT_MARKED  = 0;
@@ -135,7 +147,12 @@ __global__ void BlockLabel2D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThresh, Ar
     using DT = typename DstWrap::ValueType;
     __shared__ DT labels[BW * BH];
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    // threadIdx is __hip_builtin_threadIdx_t (no NVCV TypeTraits); convert to uint3.
+    int2 tc = cuda::StaticCast<int>(cuda::DropCast<2>(uint3{threadIdx.x, threadIdx.y, threadIdx.z}));
+#else
     int2 tc = cuda::StaticCast<int>(cuda::DropCast<2>(threadIdx));
+#endif
     int3 gc{(int)(blockIdx.x * BW) + tc.x, (int)(blockIdx.y * BH) + tc.y, (int)blockIdx.z};
 
     bool nym1x, nyxm1, nym1xm1;
@@ -167,8 +184,8 @@ __global__ void BlockLabel2D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThresh, Ar
             pym1x = (tc.y > 0) ? (pym1x > maxThreshold ? 0 : 1) : 0;
         }
 
-        ST pyxm1   = __shfl_up_sync(__activemask(), pyx, 1);
-        ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1);
+        ST pyxm1   = __shfl_up_sync(__activemask(), pyx, 1, 32);
+        ST pym1xm1 = __shfl_up_sync(__activemask(), pym1x, 1, 32);
 
         nym1x   = (tc.y > 0) ? (pyx == pym1x) : false;
         nyxm1   = (tc.x > 0) ? (pyx == pyxm1) : false;
@@ -252,8 +269,8 @@ __global__ void YLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThres
         pym1x = pym1x > maxThreshold ? 0 : 1;
     }
 
-    ST pyxm1   = __shfl_up_sync(0xffffffff, pyx, 1);
-    ST pym1xm1 = __shfl_up_sync(0xffffffff, pym1x, 1);
+    ST pyxm1   = __shfl_up_sync(NVCV_SHFL_MASK, pyx, 1, 32);
+    ST pym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pym1x, 1, 32);
 
     if ((pyx == pym1x) && ((threadIdx.x == 0) || (pyx != pyxm1) || (pyx != pym1xm1)))
     {
@@ -305,8 +322,8 @@ __global__ void XLabelReduction2D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThres
 
     bool thread_y = (gc.y % blockDim.y) == 0;
 
-    ST pym1x   = __shfl_up_sync(0xffffffff, pyx, 1);
-    ST pym1xm1 = __shfl_up_sync(0xffffffff, pyxm1, 1);
+    ST pym1x   = __shfl_up_sync(NVCV_SHFL_MASK, pyx, 1, 32);
+    ST pym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pyxm1, 1, 32);
 
     if ((pyx == pyxm1) && (thread_y || (pyx != pym1x) || (pyx != pym1xm1)))
     {
@@ -679,7 +696,11 @@ __global__ void BlockLabel3D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThresh, Ar
 
     __shared__ DT labels[BW * BH * BD];
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    int3 tc = cuda::StaticCast<int>(uint3{threadIdx.x, threadIdx.y, threadIdx.z});
+#else
     int3 tc = cuda::StaticCast<int>(threadIdx);
+#endif
     int4 gc{(int)blockIdx.x * BW + tc.x, (int)blockIdx.y * BH + tc.y, (int)blockIdx.z * BD + tc.z, 0};
 
     bool nzm1yx, nzym1x, nzyxm1, nzym1xm1, nzm1yxm1, nzm1ym1x;
@@ -722,9 +743,9 @@ __global__ void BlockLabel3D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThresh, Ar
                 pzm1ym1x = (tc.z > 0 && tc.y > 0) ? (pzm1ym1x > maxThreshold ? 0 : 1) : 0;
             }
 
-            ST pzyxm1   = __shfl_up_sync(__activemask(), pzyx, 1);
-            ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1);
-            ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1);
+            ST pzyxm1   = __shfl_up_sync(__activemask(), pzyx, 1, 32);
+            ST pzym1xm1 = __shfl_up_sync(__activemask(), pzym1x, 1, 32);
+            ST pzm1yxm1 = __shfl_up_sync(__activemask(), pzm1yx, 1, 32);
 
             nzm1yx = (tc.z > 0) && (pzyx == pzm1yx);
             nzym1x = (tc.y > 0) && (pzyx == pzym1x);
@@ -827,8 +848,8 @@ __global__ void ZLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThres
             pzm1yx = pzm1yx > maxThreshold ? 0 : 1;
         }
 
-        ST pzyxm1   = __shfl_up_sync(0xffffffff, pzyx, 1);
-        ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzm1yx, 1);
+        ST pzyxm1   = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32);
+        ST pzm1yxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzm1yx, 1, 32);
 
         if (pzyx == pzm1yx)
         {
@@ -910,8 +931,8 @@ __global__ void YLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThres
             pzym1x = pzym1x > maxThreshold ? 0 : 1;
         }
 
-        ST pzyxm1   = __shfl_up_sync(0xffffffff, pzyx, 1);
-        ST pzym1xm1 = __shfl_up_sync(0xffffffff, pzym1x, 1);
+        ST pzyxm1   = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32);
+        ST pzym1xm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzym1x, 1, 32);
 
         if (pzyx == pzym1x)
         {
@@ -993,8 +1014,8 @@ __global__ void XLabelReduction3D(DstWrap dst, SrcWrap src, ArgWrap<ST> minThres
             pzyxm1 = pzyxm1 > maxThreshold ? 0 : 1;
         }
 
-        ST pzm1yx   = __shfl_up_sync(0xffffffff, pzyx, 1);
-        ST pzm1yxm1 = __shfl_up_sync(0xffffffff, pzyxm1, 1);
+        ST pzm1yx   = __shfl_up_sync(NVCV_SHFL_MASK, pzyx, 1, 32);
+        ST pzm1yxm1 = __shfl_up_sync(NVCV_SHFL_MASK, pzyxm1, 1, 32);
 
         if (pzyx == pzyxm1)
         {
diff --git a/src/cvcuda/priv/OpMinMaxLoc.cu b/src/cvcuda/priv/OpMinMaxLoc.cu
index 85bc71cd2..5f56d8ea9 100644
--- a/src/cvcuda/priv/OpMinMaxLoc.cu
+++ b/src/cvcuda/priv/OpMinMaxLoc.cu
@@ -112,12 +112,19 @@ struct OpMin
     using OutType     = OutputType<T>;
     using BaseOutType = cuda::BaseType<OutType>;
 
-    static constexpr OutType init = {cuda::TypeTraits<OutType>::max};
+    // A function rather than a static constexpr member: OutType is a vector type
+    // whose ctor is non-constexpr under HIP (HIP_vector_type), so a static const
+    // initializer cannot be emitted on the device (clang). Returning it by value
+    // keeps the same codegen after inlining on both backends.
+    __host__ __device__ static OutType init()
+    {
+        return OutType{cuda::TypeTraits<OutType>::max};
+    }
 
     template<class OutWrapper>
     __device__ inline static void initFill(OutWrapper out, int z)
     {
-        get<0>(out)[z] = init;
+        get<0>(out)[z] = init();
     }
 
     template<typename U>
@@ -141,12 +148,15 @@ struct OpMax
     using OutType     = OutputType<T>;
     using BaseOutType = cuda::BaseType<OutType>;
 
-    static constexpr OutType init = {cuda::Lowest<OutType>};
+    __host__ __device__ static OutType init()
+    {
+        return OutType{cuda::Lowest<OutType>};
+    }
 
     template<class OutWrapper>
     __device__ inline static void initFill(OutWrapper out, int z)
     {
-        get<0>(out)[z] = init;
+        get<0>(out)[z] = init();
     }
 
     template<typename U>
@@ -170,13 +180,23 @@ struct OpMinMax
     using BaseOutType = cuda::BaseType<OutputType<T>>;
     using OutType     = cuda::MakeType<BaseOutType, 2>;
 
-    static constexpr OutType init = {cuda::TypeTraits<OutType>::max, cuda::Lowest<OutType>};
+    __host__ __device__ static OutType init()
+    {
+        return OutType{cuda::TypeTraits<OutType>::max, cuda::Lowest<OutType>};
+    }
 
     template<class OutWrapper>
     __device__ inline static void initFill(OutWrapper out, int z)
     {
-        get<0>(out)[z] = {init.x};
-        get<1>(out)[z] = {init.y};
+        OutType v = init();
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+        // HIP_vector_type has no braced-init-list operator=; set the lane member.
+        get<0>(out)[z].x = v.x;
+        get<1>(out)[z].x = v.y;
+#else
+        get<0>(out)[z] = {v.x};
+        get<1>(out)[z] = {v.y};
+#endif
     }
 
     template<typename U>
@@ -349,7 +369,7 @@ __global__ __launch_bounds__(BW *BH) void FindMinMax(InWrapper in, int2 size, Ou
     int x = (blockIdx.x * blockDim.x + threadIdx.x) * TW;
     int y = blockIdx.y * BH * TH + threadIdx.y;
 
-    auto threadRet = OP::init;
+    auto threadRet = OP::init();
 
 #pragma unroll
     for (int i = 0; i < TH; ++i)
diff --git a/src/cvcuda/priv/OpPairwiseMatcher.cu b/src/cvcuda/priv/OpPairwiseMatcher.cu
index 9e6af5f6a..b4098ba5e 100644
--- a/src/cvcuda/priv/OpPairwiseMatcher.cu
+++ b/src/cvcuda/priv/OpPairwiseMatcher.cu
@@ -100,17 +100,44 @@ public:
 #pragma unroll
         for (int i = 0; i < kNumElem && i < util::DivUp(numDim * (int)sizeof(T), (int)sizeof(RT)); ++i)
         {
+#if defined(__HIP__)
+            data.words[i]
+                = *reinterpret_cast<const RT *>(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T))));
+#else
             data[i] = *reinterpret_cast<const RT *>(set.ptr(sampleIdx, setIdx, i * (int)(sizeof(RT) / sizeof(T))));
+#endif
         }
     }
 
     inline __device__ T &operator[](int i) const
     {
+#if defined(__HIP__)
+        // The cache is filled as RT (uint32) words in load() but read back as type
+        // T here. Punning through a union (instead of reinterpret_cast on a private
+        // RT[] array) keeps the access well defined so the clang/HIP device
+        // optimizer does not treat the T reads as non-aliasing with the RT stores
+        // and elide them -- which on HIP left every computed distance unset. nvcc
+        // does not elide the reinterpret_cast read, and a union with a const-qualified
+        // variant member would delete this class's default constructor under nvcc, so
+        // the CUDA path keeps the original array spelling unchanged.
+        return const_cast<T &>(data.elems[i]);
+#else
         return reinterpret_cast<T *>(&data[0])[i];
+#endif
     }
 
 private:
+#if defined(__HIP__)
+    union Cache
+    {
+        RT words[kNumElem];
+        T  elems[kNumElem * (int)(sizeof(RT) / sizeof(T))];
+    };
+
+    Cache data;
+#else
     RT data[kNumElem];
+#endif
 };
 
 // Is compatible checks if a {numDim}-dimensional point fits in the corresponding Point T class (above)
@@ -249,6 +276,11 @@ inline __device__ void SortKeyValue(float &sortedDist, int &sortedIdx, const Poi
             sortedIdx  = values[0];
         }
     }
+
+    // SortKeyValue is called twice in the crossCheck path and the compiler aliases
+    // the two function-local CUB TempStorage allocations. CUB requires a
+    // __syncthreads() before that shared storage is reused by the next collective.
+    __syncthreads();
 }
 
 // Write a match of (set1Idx, set2Idx) with (distance) found at matchIdx inside output matches and distances
diff --git a/src/cvcuda/priv/legacy/CMakeLists.txt b/src/cvcuda/priv/legacy/CMakeLists.txt
index 4a265c8ea..56fe44fde 100644
--- a/src/cvcuda/priv/legacy/CMakeLists.txt
+++ b/src/cvcuda/priv/legacy/CMakeLists.txt
@@ -104,6 +104,12 @@ else()
     list(APPEND CV_CUDA_PRIV_LEGACY_FILES ${CV_CUDA_PRIV_LEGACY_OP_FILES})
 endif()
 
+# cuOSD-backed kernels (osd, box_blur, the text backend) are scoped out of the
+# ROCm build (cuOSD is a prebuilt CUDA-only static lib with no source).
+if(USE_HIP)
+    list(FILTER CV_CUDA_PRIV_LEGACY_FILES EXCLUDE REGEX "osd\\.cu|box_blur\\.cu|textbackend/")
+endif()
+
 add_library(cvcuda_legacy STATIC
     ${CV_CUDA_PRIV_LEGACY_FILES}
 )
diff --git a/src/cvcuda/priv/legacy/CvCudaLegacy.h b/src/cvcuda/priv/legacy/CvCudaLegacy.h
index fd9ad2515..32cd43b32 100644
--- a/src/cvcuda/priv/legacy/CvCudaLegacy.h
+++ b/src/cvcuda/priv/legacy/CvCudaLegacy.h
@@ -18,7 +18,11 @@
 #ifndef CV_CUDA_LEGACY_H
 #define CV_CUDA_LEGACY_H
 
+// CvCudaOSD.hpp pulls in cuOSD, a prebuilt CUDA-only static lib with no source.
+// The OSD/BndBox/BoxBlur operators it backs are scoped out of the ROCm build.
+#if !defined(USE_HIP)
 #include "CvCudaOSD.hpp"
+#endif
 
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
@@ -2043,6 +2047,7 @@ class JointBilateralFilterVarShape : public CudaBaseOp
                     NVCVBorderType borderMode, cudaStream_t stream);
 };
 
+#if !defined(USE_HIP)
 class OSD : public CudaBaseOp
 {
 public:
@@ -2111,6 +2116,7 @@ class BoxBlur : public CudaBaseOp
 private:
     nvcv::cuda::osd::cuOSDContext_t m_context;
 };
+#endif // !USE_HIP
 
 class CvtColor : public CudaBaseOp
 {
diff --git a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu
index c8193ccc5..1c9a7bd1e 100644
--- a/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu
+++ b/src/cvcuda/priv/legacy/histogram_eq_var_shape.cu
@@ -261,6 +261,12 @@ ErrorCode HistogramEqVarShape::infer(const nvcv::ImageBatchVarShapeDataStridedCu
     cuda::ImageBatchVarShapeWrapNHWC<uchar> src(inData, channels);
     auto histo = nvcv::cuda::Tensor2DWrap<int, int32_t>(m_histoArray, (int)(256 * channels * sizeof(int)));
 
+    // hist_kernel accumulates into m_histoArray with atomicAdd, so it must start
+    // at zero. The tensor HistogramEq path memsets it (see HistogramEq::infer);
+    // this varshape path relied on freshly cudaMalloc'd memory reading back as
+    // zero, which does not hold for recycled hipMalloc allocations.
+    checkCudaErrors(cudaMemsetAsync(m_histoArray, 0, m_sizeOfHisto, stream));
+
     {
         //compute the histogram for each image in the batch into m_histoArray
         int bsX = 32; //1024 ( 4 ch of 256 bins)
diff --git a/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh b/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh
index 197170549..1936f6d56 100644
--- a/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh
+++ b/src/cvcuda/priv/legacy/reduce_kernel_utils.cuh
@@ -28,7 +28,11 @@
 
 namespace nvcv::legacy::cuda_op {
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#define FINAL_MASK 0xffffffffffffffffULL
+#else
 #define FINAL_MASK 0xffffffff
+#endif
 
 template<typename T>
 __inline__ __device__ T warpReduceSum(T val)
diff --git a/src/cvcuda/priv/legacy/resize_var_shape.cu b/src/cvcuda/priv/legacy/resize_var_shape.cu
index e7342bb16..01456d938 100644
--- a/src/cvcuda/priv/legacy/resize_var_shape.cu
+++ b/src/cvcuda/priv/legacy/resize_var_shape.cu
@@ -277,7 +277,11 @@ __global__ void resize_area_ocv_align(const cuda::ImageBatchVarShapeWrap<const T
             int sy2 = cuda::round<cuda::RoundMode::DOWN, int>(fsy2);
 
             using work_type = cuda::ConvertBaseTypeTo<float, T>;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+            work_type out = {}; // HIP_vector_type's single-arg ctor is explicit; value-init zeroes all lanes
+#else
             work_type out   = {0};
+#endif
 
             int3 srcCoord = {0, 0, batch_idx};
 
@@ -312,7 +316,11 @@ __global__ void resize_area_ocv_align(const cuda::ImageBatchVarShapeWrap<const T
             = 1.f / (fminf(scale_x, src.width(batch_idx) - fsx1) * fminf(scale_y, src.height(batch_idx) - fsy1));
 
         using work_type = cuda::ConvertBaseTypeTo<float, T>;
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+        work_type out = {}; // HIP_vector_type's single-arg ctor is explicit; value-init zeroes all lanes
+#else
         work_type out   = {0};
+#endif
 
         int3 srcCoord = {0, 0, batch_idx};
 
diff --git a/src/cvcuda/priv/legacy/threshold.cu b/src/cvcuda/priv/legacy/threshold.cu
index c002d0121..b5918e3a7 100644
--- a/src/cvcuda/priv/legacy/threshold.cu
+++ b/src/cvcuda/priv/legacy/threshold.cu
@@ -31,6 +31,16 @@ using namespace nvcv::legacy::cuda_op;
 
 using namespace nvcv::cuda;
 
+// Otsu's per-warp scans partition the 256-thread block as warp = localid / 32,
+// so the butterfly/scan shuffles below pass an explicit width of 32 to stay
+// within a 32-lane subgroup on a 64-lane CDNA wavefront. The mask only marks
+// participants and must be 64-bit on ROCm.
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK
+#else
+#define NVCV_SHFL_MASK 0xffffffff
+#endif
+
 template<typename P, typename SrcWrap, typename DstWrap>
 __global__ void Binary_overflow(SrcWrap src, DstWrap dst, Tensor1DWrap<double, int32_t> _thresh,
                                 Tensor1DWrap<double, int32_t> _maxval, int height, int width, int channel)
@@ -493,6 +503,17 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     if (localid < 64)
         reduce[localid] = reduce[localid] + reduce[localid + 64];
     __syncthreads();
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    // The 64-lane CDNA wavefront gives no warp-synchronous lockstep guarantee
+    // across the unsynchronized tail below, so keep the __syncthreads tree down
+    // to one element (same add order, block-wide barrier each step).
+    for (int s = 32; s > 0; s >>= 1)
+    {
+        if (localid < s)
+            reduce[localid] = reduce[localid] + reduce[localid + s];
+        __syncthreads();
+    }
+#else
     if (localid < 32)
     {
         reduce[localid] = reduce[localid] + reduce[localid + 32];
@@ -503,6 +524,7 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
         reduce[localid] = reduce[localid] + reduce[localid + 1];
     }
     __syncthreads();
+#endif
 
     mu = reduce[0] * scale;
     __syncthreads();
@@ -513,11 +535,11 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     int    lane = localid % 32, warp = localid / 32;
     // sum of q1 in warp
     double temp = q1;
-    temp += __shfl_xor_sync(0xffffffff, temp, 1);
-    temp += __shfl_xor_sync(0xffffffff, temp, 2);
-    temp += __shfl_xor_sync(0xffffffff, temp, 4);
-    temp += __shfl_xor_sync(0xffffffff, temp, 8);
-    temp += __shfl_xor_sync(0xffffffff, temp, 16);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32);
     if (lane == 0)
         reduce[warp] = temp;
     __syncthreads();
@@ -536,19 +558,19 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     }
     __syncthreads();
     // prefix scan in warp
-    temp = __shfl_up_sync(0xffffffff, q1, 1);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 1, 32);
     if (lane >= 1)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 2);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 2, 32);
     if (lane >= 2)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 4);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 4, 32);
     if (lane >= 4)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 8);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 8, 32);
     if (lane >= 8)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 16);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 16, 32);
     if (lane >= 16)
         q1 += temp;
     q1 += reduce[warp];
@@ -560,11 +582,11 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     double one = localid * hist[localid] * scale;
     // sum of q1 in warp
     temp = one;
-    temp += __shfl_xor_sync(0xffffffff, temp, 1);
-    temp += __shfl_xor_sync(0xffffffff, temp, 2);
-    temp += __shfl_xor_sync(0xffffffff, temp, 4);
-    temp += __shfl_xor_sync(0xffffffff, temp, 8);
-    temp += __shfl_xor_sync(0xffffffff, temp, 16);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32);
     if (lane == 0)
         reduce[warp] = temp;
     __syncthreads();
@@ -583,19 +605,19 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     }
     __syncthreads();
     // prefix scan in warp
-    temp = __shfl_up_sync(0xffffffff, one, 1);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 1, 32);
     if (lane >= 1)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 2);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 2, 32);
     if (lane >= 2)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 4);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 4, 32);
     if (lane >= 4)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 8);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 8, 32);
     if (lane >= 8)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 16);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 16, 32);
     if (lane >= 16)
         one += temp;
     one += reduce[warp];
@@ -635,6 +657,20 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
     }
     __syncthreads();
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    for (int s = 32; s > 0; s >>= 1)
+    {
+        if (localid < s && reduce[localid + s] >= reduce[localid])
+        {
+            if (reduce[localid + s] == reduce[localid])
+                idx[localid] = min(idx[localid], idx[localid + s]);
+            else
+                idx[localid] = idx[localid + s];
+            reduce[localid] = reduce[localid + s];
+        }
+        __syncthreads();
+    }
+#else
     if (localid < 32)
     {
         if (reduce[localid + 32] >= reduce[localid])
@@ -687,6 +723,7 @@ __global__ void otsu_cal(int *histogram, Tensor1DWrap<double, int32_t> thresh, i
         }
     }
     __syncthreads();
+#endif
 
     // write to gpu memory
     if (localid == 0)
diff --git a/src/cvcuda/priv/legacy/threshold_var_shape.cu b/src/cvcuda/priv/legacy/threshold_var_shape.cu
index e0c865101..551d70681 100644
--- a/src/cvcuda/priv/legacy/threshold_var_shape.cu
+++ b/src/cvcuda/priv/legacy/threshold_var_shape.cu
@@ -31,6 +31,15 @@ using namespace nvcv::legacy::cuda_op;
 
 using namespace nvcv::cuda;
 
+// See threshold.cu: Otsu's per-warp scans use warp = localid / 32, so the
+// shuffles pass an explicit width of 32 to stay within a 32-lane subgroup on a
+// 64-lane CDNA wavefront. The mask must be 64-bit on ROCm.
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#define NVCV_SHFL_MASK NVCV_WARP_FULL_MASK
+#else
+#define NVCV_SHFL_MASK 0xffffffff
+#endif
+
 template<typename T, typename P = MakeType<T, sizeof(T) == 8 ? 2 : 4>>
 __global__ void Binary_overflow(ImageBatchVarShapeWrapNHWC<T> src, ImageBatchVarShapeWrapNHWC<T> dst,
                                 Tensor1DWrap<double, int32_t> _thresh, Tensor1DWrap<double, int32_t> _maxval,
@@ -699,6 +708,16 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     if (localid < 64)
         reduce[localid] = reduce[localid] + reduce[localid + 64];
     __syncthreads();
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    // No warp-synchronous lockstep guarantee on a 64-lane CDNA wavefront; keep
+    // the __syncthreads tree down to one element (same add order each step).
+    for (int s = 32; s > 0; s >>= 1)
+    {
+        if (localid < s)
+            reduce[localid] = reduce[localid] + reduce[localid + s];
+        __syncthreads();
+    }
+#else
     if (localid < 32)
     {
         reduce[localid] = reduce[localid] + reduce[localid + 32];
@@ -709,6 +728,7 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
         reduce[localid] = reduce[localid] + reduce[localid + 1];
     }
     __syncthreads();
+#endif
 
     mu = reduce[0] * scale;
     __syncthreads();
@@ -719,11 +739,11 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     int    lane = localid % 32, warp = localid / 32;
     // sum of q1 in warp
     double temp = q1;
-    temp += __shfl_xor_sync(0xffffffff, temp, 1);
-    temp += __shfl_xor_sync(0xffffffff, temp, 2);
-    temp += __shfl_xor_sync(0xffffffff, temp, 4);
-    temp += __shfl_xor_sync(0xffffffff, temp, 8);
-    temp += __shfl_xor_sync(0xffffffff, temp, 16);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32);
     if (lane == 0)
         reduce[warp] = temp;
     __syncthreads();
@@ -742,19 +762,19 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     }
     __syncthreads();
     // prefix scan in warp
-    temp = __shfl_up_sync(0xffffffff, q1, 1);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 1, 32);
     if (lane >= 1)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 2);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 2, 32);
     if (lane >= 2)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 4);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 4, 32);
     if (lane >= 4)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 8);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 8, 32);
     if (lane >= 8)
         q1 += temp;
-    temp = __shfl_up_sync(0xffffffff, q1, 16);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, q1, 16, 32);
     if (lane >= 16)
         q1 += temp;
     q1 += reduce[warp];
@@ -766,11 +786,11 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     double one = localid * hist[localid] * scale;
     // sum of q1 in warp
     temp = one;
-    temp += __shfl_xor_sync(0xffffffff, temp, 1);
-    temp += __shfl_xor_sync(0xffffffff, temp, 2);
-    temp += __shfl_xor_sync(0xffffffff, temp, 4);
-    temp += __shfl_xor_sync(0xffffffff, temp, 8);
-    temp += __shfl_xor_sync(0xffffffff, temp, 16);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 1, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 2, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 4, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 8, 32);
+    temp += __shfl_xor_sync(NVCV_SHFL_MASK, temp, 16, 32);
     if (lane == 0)
         reduce[warp] = temp;
     __syncthreads();
@@ -789,19 +809,19 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     }
     __syncthreads();
     // prefix scan in warp
-    temp = __shfl_up_sync(0xffffffff, one, 1);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 1, 32);
     if (lane >= 1)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 2);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 2, 32);
     if (lane >= 2)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 4);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 4, 32);
     if (lane >= 4)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 8);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 8, 32);
     if (lane >= 8)
         one += temp;
-    temp = __shfl_up_sync(0xffffffff, one, 16);
+    temp = __shfl_up_sync(NVCV_SHFL_MASK, one, 16, 32);
     if (lane >= 16)
         one += temp;
     one += reduce[warp];
@@ -841,6 +861,20 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
     }
     __syncthreads();
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    for (int s = 32; s > 0; s >>= 1)
+    {
+        if (localid < s && reduce[localid + s] >= reduce[localid])
+        {
+            if (reduce[localid + s] == reduce[localid])
+                idx[localid] = min(idx[localid], idx[localid + s]);
+            else
+                idx[localid] = idx[localid + s];
+            reduce[localid] = reduce[localid + s];
+        }
+        __syncthreads();
+    }
+#else
     if (localid < 32)
     {
         if (reduce[localid + 32] >= reduce[localid])
@@ -893,6 +927,7 @@ __global__ void otsu_cal_varshape(int *histogram, Tensor1DWrap<double, int32_t>
         }
     }
     __syncthreads();
+#endif
 
     // write to gpu memory
     if (localid == 0)
diff --git a/src/cvcuda/util/CMakeLists.txt b/src/cvcuda/util/CMakeLists.txt
index 346ab73b4..c5e953ee0 100644
--- a/src/cvcuda/util/CMakeLists.txt
+++ b/src/cvcuda/util/CMakeLists.txt
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 add_library(cvcuda_util STATIC
     Event.cpp
diff --git a/src/cvcuda/util/StreamId.cpp b/src/cvcuda/util/StreamId.cpp
index baff3f54c..b0a736820 100644
--- a/src/cvcuda/util/StreamId.cpp
+++ b/src/cvcuda/util/StreamId.cpp
@@ -21,6 +21,32 @@
 #include <cuda_runtime.h>
 #include <nvcv/Exception.hpp>
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+
+// ROCm exposes hipStreamGetId directly, which is all the stream-identity cache
+// needs. Fall back to the stream-handle pointer value (a stable per-stream key)
+// if the runtime cannot supply an id.
+namespace nvcv::util {
+
+bool IsCudaStreamIdHintUnambiguous()
+{
+    return true;
+}
+
+uint64_t GetCudaStreamIdHint(hipStream_t stream)
+{
+    unsigned long long id  = 0;
+    hipError_t         err = hipStreamGetId(stream, &id);
+    if (err == hipSuccess)
+        return id;
+    (void)hipGetLastError();
+    return reinterpret_cast<uint64_t>(stream);
+}
+
+} // namespace nvcv::util
+
+#else
+
 using cuStreamGetId_t = CUresult(CUstream, unsigned long long *);
 
 #if CUDA_VERSION >= 12000
@@ -148,3 +174,5 @@ uint64_t GetCudaStreamIdHint(CUstream stream)
 }
 
 } // namespace nvcv::util
+
+#endif // __HIP_PLATFORM_AMD__ || USE_HIP
diff --git a/src/nvcv/cmake/ConfigBuildTree.cmake b/src/nvcv/cmake/ConfigBuildTree.cmake
index 31b5fe98b..770cf4956 100644
--- a/src/nvcv/cmake/ConfigBuildTree.cmake
+++ b/src/nvcv/cmake/ConfigBuildTree.cmake
@@ -76,7 +76,10 @@ include(CMakeDependentOption)
 
 option(EXPOSE_CODE "Expose in resulting binaries parts of our code" ${DEFAULT_EXPOSE_CODE})
 option(WARNINGS_AS_ERRORS "Treat compilation warnings as errors" OFF)
-cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64" OFF)
+# The old-glibc compat shim links prebuilt glibc-2.17 stub .so files (shipped via
+# Git LFS); it is a deployment-portability feature unrelated to GPU correctness,
+# so default it OFF on the ROCm build.
+cmake_dependent_option(ENABLE_COMPAT_OLD_GLIBC "Generates binaries that work with old distros, with old glibc" ON "NOT ARCH_AARCH64;NOT USE_HIP" OFF)
 cmake_dependent_option(ENABLE_QNX "Enable QNX support" OFF "ARCH_AARCH64" OFF)
 
 if(ENABLE_QNX)
@@ -86,7 +89,9 @@ else()
 endif()
 
 # Needed to get cuda version
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 # Are we inside a git repo and it has submodules enabled?
 if(EXISTS ${CMAKE_SOURCE_DIR}/.git AND EXISTS ${CMAKE_SOURCE_DIR}/.gitmodules)
@@ -104,7 +109,11 @@ else()
                         "CV-CUDA only supports Linux and QNX platforms.")
 endif()
 
-set(NVCV_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${NVCV_SYSTEM_NAME}")
+if(USE_HIP)
+    set(NVCV_BUILD_SUFFIX "hip-${NVCV_SYSTEM_NAME}")
+else()
+    set(NVCV_BUILD_SUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}-${NVCV_SYSTEM_NAME}")
+endif()
 
 function(setup_dso target version)
     string(REGEX MATCHALL "[0-9]+" version_list "${version}")
diff --git a/src/nvcv/src/include/nvcv/detail/CudaFwd.h b/src/nvcv/src/include/nvcv/detail/CudaFwd.h
index 79dac8dee..6aa543961 100644
--- a/src/nvcv/src/include/nvcv/detail/CudaFwd.h
+++ b/src/nvcv/src/include/nvcv/detail/CudaFwd.h
@@ -13,7 +13,13 @@
 #ifndef NVCV_DETAIL_FWD_CUDA_H
 #define NVCV_DETAIL_FWD_CUDA_H
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+#include <hip/hip_runtime_api.h>
+typedef hipStream_t CUstream;
+typedef hipArray_t  cudaArray_t;
+#else
 typedef struct CUstream_st *CUstream;
 typedef struct cudaArray   *cudaArray_t;
+#endif
 
 #endif // NVCV_DETAIL_FWD_CUDA_H
diff --git a/src/nvcv/src/priv/CMakeLists.txt b/src/nvcv/src/priv/CMakeLists.txt
index 7e298d9af..e46dd7798 100644
--- a/src/nvcv/src/priv/CMakeLists.txt
+++ b/src/nvcv/src/priv/CMakeLists.txt
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 add_library(nvcv_types_priv STATIC
     Context.cpp
diff --git a/src/nvcv/src/priv/DefaultAllocator.cpp b/src/nvcv/src/priv/DefaultAllocator.cpp
index df9a2c921..c382cef8d 100644
--- a/src/nvcv/src/priv/DefaultAllocator.cpp
+++ b/src/nvcv/src/priv/DefaultAllocator.cpp
@@ -65,6 +65,17 @@ void *DefaultAllocator::doAllocCudaMem(int64_t size, int32_t align)
     void *ptr = nullptr;
     NVCV_CHECK_THROW(::cudaMalloc(&ptr, size));
 
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    // hipMalloc returns recycled device memory with stale contents, whereas
+    // freshly cudaMalloc'd memory on the supported NVIDIA setups reads back as
+    // zero. Several gtests fill a tensor's valid region and then compare the
+    // whole strided buffer (including the row-stride padding the operator never
+    // writes) against a zero-initialized CPU reference, so they assume device
+    // padding is zero. Zero new device allocations to keep that contract; the
+    // operators themselves write every valid pixel and are unaffected.
+    NVCV_CHECK_THROW(::cudaMemset(ptr, 0, size));
+#endif
+
     // TODO: can we do better than this?
     if (reinterpret_cast<uintptr_t>(ptr) % align != 0)
     {
diff --git a/src/nvcv/util/CMakeLists.txt b/src/nvcv/util/CMakeLists.txt
index aa72802b9..113cff968 100644
--- a/src/nvcv/util/CMakeLists.txt
+++ b/src/nvcv/util/CMakeLists.txt
@@ -13,7 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-find_package(CUDAToolkit REQUIRED)
+if(NOT USE_HIP)
+    find_package(CUDAToolkit REQUIRED)
+endif()
 
 # nvcv_util_sanitizer ---------------------------------
 add_library(nvcv_util_sanitizer STATIC
diff --git a/tests/cvcuda/system/CMakeLists.txt b/tests/cvcuda/system/CMakeLists.txt
index 134a1502f..dc9d26f79 100644
--- a/tests/cvcuda/system/CMakeLists.txt
+++ b/tests/cvcuda/system/CMakeLists.txt
@@ -111,29 +111,35 @@ set(CVCUDA_TEST_SOURCES_CUOSD
     OsdUtils.cu
 )
 
-# Build smoke tests unconditionally - they work on all compilers
-add_executable(cvcuda_test_system_smoke ${CVCUDA_SMOKE_TEST_SOURCES})
-
-target_link_libraries(cvcuda_test_system_smoke
-    PUBLIC
-        cvcuda
-        nvcv_test_common_system
-        # Note: Does NOT link against cuosd
-)
+# The smoke tests exercise the OSD/BndBox/BoxBlur operators, which depend on the
+# prebuilt CUDA-only cuOSD lib and are scoped out of the ROCm build.
+if(NOT USE_HIP)
+    # Build smoke tests unconditionally - they work on all compilers
+    add_executable(cvcuda_test_system_smoke ${CVCUDA_SMOKE_TEST_SOURCES})
+
+    target_link_libraries(cvcuda_test_system_smoke
+        PUBLIC
+            cvcuda
+            nvcv_test_common_system
+            # Note: Does NOT link against cuosd
+    )
 
-nvcv_add_test(cvcuda_test_system_smoke cvcuda cpp)
+    nvcv_add_test(cvcuda_test_system_smoke cvcuda cpp)
+endif()
 
 # Include C++20-requiring tests and libcuosd tests only if compiler supports them properly
 if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0))
     list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CPP20})
-    list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CUOSD})
+    if(NOT USE_HIP)
+        list(APPEND CVCUDA_TEST_SOURCES ${CVCUDA_TEST_SOURCES_CUOSD})
+    endif()
 endif()
 
 # Build cvcuda_test_system (works with GCC-10+ but with reduced test coverage for GCC-10)
 add_executable(cvcuda_test_system ${CVCUDA_TEST_SOURCES})
 
 # Link against libcuosd only for GCC-11+ (needed for OSD tests)
-if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0))
+if(NOT USE_HIP AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0))
     target_link_libraries(cvcuda_test_system
         PUBLIC
             cvcuda
diff --git a/tests/cvcuda/system/TestOpMinMaxLoc.cpp b/tests/cvcuda/system/TestOpMinMaxLoc.cpp
index 98fbbaa04..c5d6a8e91 100644
--- a/tests/cvcuda/system/TestOpMinMaxLoc.cpp
+++ b/tests/cvcuda/system/TestOpMinMaxLoc.cpp
@@ -124,8 +124,8 @@ inline void FindMinMax(InContainerType &in, InStridesType &inStrides, InShapeTyp
             }
         }
 
-        test::ValueAt<OutVT>(minVal, valStrides, {z}).x = min;
-        test::ValueAt<OutVT>(maxVal, valStrides, {z}).x = max;
+        test::ValueAt<OutVT>(minVal, valStrides, int1{z}).x = min;
+        test::ValueAt<OutVT>(maxVal, valStrides, int1{z}).x = max;
 
         int nMin{0}, nMax{0};
 
@@ -161,8 +161,8 @@ inline void FindMinMax(InContainerType &in, InStridesType &inStrides, InShapeTyp
             }
         }
 
-        test::ValueAt<int1>(numMin, numStrides, {z}).x = nMin;
-        test::ValueAt<int1>(numMax, numStrides, {z}).x = nMax;
+        test::ValueAt<int1>(numMin, numStrides, int1{z}).x = nMin;
+        test::ValueAt<int1>(numMax, numStrides, int1{z}).x = nMax;
     }
 }
 
@@ -181,8 +181,8 @@ inline void LocSort(std::vector<std::vector<int2>> &minLocTest, std::vector<std:
 
     for (int z = 0; z < (int)minLocTest.size(); z++)
     {
-        int nMin = test::ValueAt<int1>(numMinVec, numStrides, {z}).x;
-        int nMax = test::ValueAt<int1>(numMaxVec, numStrides, {z}).x;
+        int nMin = test::ValueAt<int1>(numMinVec, numStrides, int1{z}).x;
+        int nMax = test::ValueAt<int1>(numMaxVec, numStrides, int1{z}).x;
 
         for (int i = 0; i < nMin && i < capacity; i++)
         {
@@ -225,9 +225,9 @@ inline void GoldMinMaxLoc(const nvcv::Tensor &minVal, const nvcv::Tensor &minLoc
 
     int    capacity   = minLocData->shape(1);
     int    numSamples = minValData->shape(0);
-    long1  valStrides = {minValData->stride(0)};
+    long1  valStrides{minValData->stride(0)};
     long2  locStrides = {minLocData->stride(0), minLocData->stride(1)};
-    long1  numStrides = {numMinData->stride(0)};
+    long1  numStrides{numMinData->stride(0)};
     size_t valBufSize = numSamples * valStrides.x;
     size_t locBufSize = numSamples * locStrides.x;
     size_t numBufSize = numSamples * numStrides.x;
diff --git a/tests/cvcuda/system/TestOpSIFT.cpp b/tests/cvcuda/system/TestOpSIFT.cpp
index 032c0d1c2..90f9f2062 100644
--- a/tests/cvcuda/system/TestOpSIFT.cpp
+++ b/tests/cvcuda/system/TestOpSIFT.cpp
@@ -809,7 +809,7 @@ inline void GoldSIFT(SIFTResults &outResults, const nvcv::Tensor &featCoords, co
     long2 featMetadataShape = {featMetadataData->shape(0), featMetadataData->shape(1)};
     long3 featDescriptorsShape
         = {featDescriptorsData->shape(0), featDescriptorsData->shape(1), featDescriptorsData->shape(2)};
-    long1 numFeaturesShape = {numFeaturesData->shape(0)};
+    long1 numFeaturesShape{numFeaturesData->shape(0)};
 
     ASSERT_TRUE((featCoordsShape == long2{srcShape.x, capacity} && featMetadataShape == long2{srcShape.x, capacity}
                  && featDescriptorsShape == long3{srcShape.x, capacity, 128} && numFeaturesShape == long1{srcShape.x}));
@@ -819,7 +819,7 @@ inline void GoldSIFT(SIFTResults &outResults, const nvcv::Tensor &featCoords, co
     long3 featDescriptorsStrides3
         = {featDescriptorsData->stride(0), featDescriptorsData->stride(1), featDescriptorsData->stride(2)};
     long2 featDescriptorsStrides = {featDescriptorsStrides3.x, featDescriptorsStrides3.y};
-    long1 numFeaturesStrides     = {numFeaturesData->stride(0)};
+    long1 numFeaturesStrides{numFeaturesData->stride(0)};
 
     ASSERT_TRUE(featCoordsStrides.y == sizeof(float4) && featMetadataStrides.y == sizeof(float3)
                 && featDescriptorsStrides3.z == sizeof(uint8_t) && featDescriptorsStrides.y == 128 * sizeof(uint8_t)
diff --git a/tests/nvcv_types/CMakeLists.txt b/tests/nvcv_types/CMakeLists.txt
index fab8cd3b8..7149796be 100644
--- a/tests/nvcv_types/CMakeLists.txt
+++ b/tests/nvcv_types/CMakeLists.txt
@@ -53,8 +53,11 @@ if(BUILD_TESTS_CPP)
     )
 
     # Skip standalone tests with GCC-10 due to ABI incompatibility with system libgtest
-    # These tests use find_package(GTest) which requires system GTest, causing ABI issues with GCC-10
-    if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0))
+    # These tests use find_package(GTest) which requires system GTest, causing ABI issues with GCC-10.
+    # Also skip on ROCm: the standalone consumer project re-runs cmake on a copied nvcv via
+    # ExternalProject without the top-level USE_HIP/enable_language(HIP) setup, so it would try to
+    # find the CUDA toolkit. It is a build-packaging check, not a GPU operator test.
+    if(NOT USE_HIP AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11.0))
         # Test NVCV can be built as a shared library via the standalone test
         ExternalProject_Add(
             nvcv_standalone
diff --git a/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu b/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu
index 4e32fdf47..7411d9579 100644
--- a/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu
+++ b/tests/nvcv_types/cudatools_system/DeviceFullTensorWrap.cu
@@ -30,7 +30,9 @@ namespace cuda = nvcv::cuda;
 template<class DstWrapper, class SrcWrapper>
 __global__ void Copy(DstWrapper dst, SrcWrapper src)
 {
-    int1 coord = cuda::StaticCast<int>(cuda::DropCast<1>(threadIdx));
+    // threadIdx is a builtin struct without NVCV TypeTraits on HIP; uint3
+    // brace-init binds the NVCV helpers and is unchanged on CUDA.
+    int1 coord = cuda::StaticCast<int>(cuda::DropCast<1>(uint3{threadIdx.x, threadIdx.y, threadIdx.z}));
     dst[coord] = src[coord];
 }
 
diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu
index fe5f2dcef..5e93ceb16 100644
--- a/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu
+++ b/tests/nvcv_types/cudatools_system/DeviceTensorBatchWrap.cu
@@ -107,7 +107,7 @@ __global__ void SetReferenceKernel(TensorBatchWrapT wrap)
             coords[d] = tmp_i % shape[d];
             tmp_i /= shape[d];
         }
-        SetValue::Set(wrap, sample, coords, cuda::SetAll<TensorBatchWrapT::ValueType>(index % 255));
+        SetValue::Set(wrap, sample, coords, cuda::SetAll<typename TensorBatchWrapT::ValueType>(index % 255));
     }
 }
 
diff --git a/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu b/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu
index afff2b95d..60ffa1b05 100644
--- a/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu
+++ b/tests/nvcv_types/cudatools_system/DeviceTensorWrap.cu
@@ -31,14 +31,16 @@ template<class DstWrapper, class SrcWrapper>
 __global__ void Copy(DstWrapper dst, SrcWrapper src)
 {
     using DimType = cuda::MakeType<int, SrcWrapper::kNumDimensions>;
-    DimType coord = cuda::StaticCast<int>(cuda::DropCast<SrcWrapper::kNumDimensions>(threadIdx));
+    // threadIdx is a builtin struct without NVCV TypeTraits on HIP; uint3
+    // brace-init binds the NVCV helpers and is unchanged on CUDA.
+    DimType coord = cuda::StaticCast<int>(cuda::DropCast<SrcWrapper::kNumDimensions>(uint3{threadIdx.x, threadIdx.y, threadIdx.z}));
     dst[coord]    = src[coord];
 }
 
 template<typename ValueType>
 __global__ void Copy(cuda::Tensor4DWrap<ValueType> dst, cuda::Tensor4DWrap<const ValueType> src, int lastDimSize)
 {
-    int3 c3 = cuda::StaticCast<int>(threadIdx);
+    int3 c3 = cuda::StaticCast<int>(uint3{threadIdx.x, threadIdx.y, threadIdx.z});
     for (int k = 0; k < lastDimSize; k++)
     {
         int4 c4{k, c3.x, c3.y, c3.z};
diff --git a/tests/nvcv_types/cudatools_system/TestMathOps.cpp b/tests/nvcv_types/cudatools_system/TestMathOps.cpp
index 360ecc996..811c00efe 100644
--- a/tests/nvcv_types/cudatools_system/TestMathOps.cpp
+++ b/tests/nvcv_types/cudatools_system/TestMathOps.cpp
@@ -141,7 +141,9 @@ NVCV_TYPED_TEST_SUITE(
     ttype::Types<ttype::Value<short1{-1}>, ttype::Value<int1{1}>>,
     ttype::Types<ttype::Value<uchar2{1, 2}>, ttype::Value<int2{-1, -2}>>,
     ttype::Types<ttype::Value<int3{-1, 0, 1}>, ttype::Value<int3{1, 0, -1}>>,
-    ttype::Types<ttype::Value<float4{-1.23f, 0.12f, 1.23f}>, ttype::Value<float4{1.23f, -0.12f, -1.23f}>>
+    // HIP_vector_type has no partial-init constructor (CUDA aggregate zero-fills
+    // the unspecified component); spell the 4th element so both backends match.
+    ttype::Types<ttype::Value<float4{-1.23f, 0.12f, 1.23f, 0.f}>, ttype::Value<float4{1.23f, -0.12f, -1.23f, 0.f}>>
 >);
 
 // clang-format on
@@ -162,7 +164,7 @@ NVCV_TYPED_TEST_SUITE(
     ttype::Types<ttype::Value<short1{-1}>, ttype::Value<int1{-1}>>,
     ttype::Types<ttype::Value<uchar2{1, 2}>, ttype::Value<int2{1, 2}>>,
     ttype::Types<ttype::Value<int3{-1, 0, 1}>, ttype::Value<int3{-1, 0, 1}>>,
-    ttype::Types<ttype::Value<float4{-1.23f, 0.12f, 1.23f}>, ttype::Value<float4{-1.23f, 0.12f, 1.23f}>>
+    ttype::Types<ttype::Value<float4{-1.23f, 0.12f, 1.23f, 0.f}>, ttype::Value<float4{-1.23f, 0.12f, 1.23f, 0.f}>>
 >);
 
 // clang-format on
diff --git a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp
index 96320483e..433434883 100644
--- a/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp
+++ b/tests/nvcv_types/cudatools_system/TestTypeTraits.cpp
@@ -328,7 +328,10 @@ class TypeTraitsGetElementTest : public TypeTraitsBaseTest<T>
         }
         else if constexpr (NumElements == 1)
         {
-            pix = {1};
+            // A 1-element vector type's scalar constructor is explicit on HIP
+            // (HIP_vector_type), so copy-list-init from {1} is rejected; SetAll
+            // is the portable NVCV builder and is unchanged on CUDA.
+            pix = cuda::SetAll<DataType>(1);
         }
     }
 };