From 89dd918ae5d801cfa50218b77f6ca44304615b19 Mon Sep 17 00:00:00 2001 From: "M. Eric Irrgang" Date: Wed, 12 Nov 2025 17:56:42 -0800 Subject: [PATCH 1/7] sort file lists for readability --- src/CMakeLists.txt | 95 +++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2e2a41e..bcd604c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,31 +27,48 @@ endif() #set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/..) set(HEADER - tomographic_models.h + analytic_ray_tracing.h + analytic_ray_tracing_gpu.cuh + backprojectors_VD.cuh + bilateral_filter.cuh + cpu_utils.h + cuda_utils.h + file_io.h + filtered_backprojection.h + find_center_cpu.h + geometric_calibration.cuh + guided_filter.cuh + leap_defines.h list_of_tomographic_models.h - tomographic_models_c_interface.h + log.h + matching_pursuit.cuh + noise_filters.cuh parameters.h + phantom.h projectors.h - filtered_backprojection.h + projectors_Joseph.cuh + projectors_Joseph_cpu.h projectors_SF.cuh projectors_SF_cpu.h + projectors_Siddon.cuh + projectors_Siddon_cpu.h + projectors_attenuated.cuh projectors_extendedSF.cuh - projectors_Joseph.cuh - projectors_Joseph_cpu.h projectors_symmetric.cuh projectors_symmetric_cpu.h - projectors_attenuated.cuh - projectors_Siddon.cuh - projectors_Siddon_cpu.h - backprojectors_VD.cuh - sensitivity_cpu.h - sensitivity.cuh ramp_filter.cuh ramp_filter_cpu.h ray_weighting.cuh ray_weighting_cpu.h + rebin.h + resample.cuh + resample_cpu.h scatter_models.cuh - noise_filters.cuh + sensitivity.cuh + sensitivity_cpu.h + sinogram_replacement.h + tomographic_models.h + tomographic_models_c_interface.h total_variation.cuh matching_pursuit.cuh bilateral_filter.cuh @@ -73,50 +90,50 @@ set(HEADER ) set(SRC_CPP - tomographic_models.cpp + analytic_ray_tracing.cpp + cpu_utils.cpp + file_io.cpp + filtered_backprojection.cpp + find_center_cpu.cpp list_of_tomographic_models.cpp - 
tomographic_models_c_interface.cpp parameters.cpp + phantom.cpp projectors.cpp - filtered_backprojection.cpp - projectors_SF_cpu.cpp projectors_Joseph_cpu.cpp - projectors_symmetric_cpu.cpp + projectors_SF_cpu.cpp projectors_Siddon_cpu.cpp - sensitivity_cpu.cpp + projectors_symmetric_cpu.cpp ramp_filter_cpu.cpp ray_weighting_cpu.cpp - find_center_cpu.cpp - sinogram_replacement.cpp - resample_cpu.cpp - cpu_utils.cpp - phantom.cpp - analytic_ray_tracing.cpp rebin.cpp - file_io.cpp + resample_cpu.cpp + sensitivity_cpu.cpp + sinogram_replacement.cpp + tomographic_models.cpp + tomographic_models_c_interface.cpp ) set(SRC_CU + analytic_ray_tracing_gpu.cu + backprojectors_VD.cu + bilateral_filter.cu + cuda_utils.cu + geometric_calibration.cu + guided_filter.cu + matching_pursuit.cu + noise_filters.cu + projectors_Joseph.cu projectors_SF.cu + projectors_Siddon.cu + projectors_attenuated.cu projectors_extendedSF.cu - projectors_Joseph.cu projectors_symmetric.cu - projectors_attenuated.cu - projectors_Siddon.cu - backprojectors_VD.cu - sensitivity.cu ramp_filter.cu - resample.cu - noise_filters.cu - total_variation.cu - matching_pursuit.cu - bilateral_filter.cu - guided_filter.cu - geometric_calibration.cu ray_weighting.cu + resample.cu scatter_models.cu - analytic_ray_tracing_gpu.cu - cuda_utils.cu + sensitivity.cu + total_variation.cu ) include_directories( From de3a1c6ecb0f224ecf21a0fb49e33261021c2670 Mon Sep 17 00:00:00 2001 From: "M. Eric Irrgang" Date: Thu, 13 Nov 2025 16:10:05 -0800 Subject: [PATCH 2/7] Modern Python packaging. Use pyproject.toml and scikit-build-core to drive the CMake build. Minor CMake modernization. Add some CMake infrastructure to try to handle the three target build types (cuda, hip, and cpu-only) but the project infrastructure may not be set up for non-cuda builds at this point. 
--- .gitignore | 4 +- CMakeLists.txt | 56 ++++++- pyproject.toml | 44 ++++++ setup.cfg | 4 - setup.py | 141 ------------------ src/CMakeLists.txt | 108 +++++++------- .../__init__.py} | 0 .../__init__.py} | 0 src/{leapctype.py => leapctype/__init__.py} | 0 src/{leaptorch.py => leaptorch/__init__.py} | 0 10 files changed, 149 insertions(+), 208 deletions(-) create mode 100644 pyproject.toml delete mode 100644 setup.cfg delete mode 100644 setup.py rename src/{leap_filter_sequence.py => leap_filter_sequence/__init__.py} (100%) rename src/{leap_preprocessing_algorithms.py => leap_preprocessing_algorithms/__init__.py} (100%) rename src/{leapctype.py => leapctype/__init__.py} (100%) rename src/{leaptorch.py => leaptorch/__init__.py} (100%) diff --git a/.gitignore b/.gitignore index dc69e64..ffd7efd 100644 --- a/.gitignore +++ b/.gitignore @@ -182,7 +182,7 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ @@ -645,4 +645,4 @@ FodyWeavers.xsd *.msp # JetBrains Rider -*.sln.iml \ No newline at end of file +*.sln.iml diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a0f8d4..7b1e846 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,60 @@ -#cmake_minimum_required(VERSION 3.23 FATAL_ERROR) -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +# cmake version 3.23 or higher is needed to support +# the argument CUDA_ARCHITECTURES all-major +cmake_minimum_required(VERSION 3.23 FATAL_ERROR) -project(leapct) +project(leapct LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -ENABLE_TESTING() +# Find a default GPU accelerator language. 
+set(_default_accelerator_type "NONE") +include(CheckLanguage) +check_language(HIP) +if(CMAKE_HIP_COMPILER) + set(_default_accelerator_type "AMD") +endif() +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + set(_default_accelerator_type "NVIDIA") +endif() + +set(LEAP_GPU "NVIDIA" CACHE STRING "GPU acceleration type") +unset(_default_accelerator_type) + +set_property(CACHE LEAP_GPU PROPERTY STRINGS "NVIDIA" "AMD" "None") +# Extract "STRINGS" property of the parameter +get_property(OPT_STRINGS CACHE LEAP_GPU PROPERTY STRINGS) +# Check that value of the parameter is inside "STRINGS" list. +if (NOT LEAP_GPU IN_LIST OPT_STRINGS) + message(FATAL_ERROR "Wrong value of the parameter 'LEAP_GPU'") +endif () + +string(TOUPPER "${LEAP_GPU}" LEAP_GPU) +if (LEAP_GPU STREQUAL "NVIDIA") + # To minimize binary size and compile time, users are suggested to + # set the CUDAARCHS environment variable or define the CMake variable CMAKE_CUDA_ARCHITECTURES + # to "native" if building specifically for the GPU visible to the build environment, + # or to an explicit numerical architecture code, such as "70" + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES all-major) + endif() + enable_language(CUDA) + set(LEAP_CUDA ON) +elseif (LEAP_GPU STREQUAL "AMD") + # Users are advised to set CMAKE_HIP_ARCHITECTURES to a minimal subset. 
+ # See also https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html#using-hip-in-cmake + enable_language(HIP) + set(LEAP_HIP ON) +elseif (LEAP_GPU STREQUAL "NONE") + set(LEAP_CPU_ONLY ON) +else () + message(FATAL_ERROR "CMake scripting error: ${LEAP_GPU} didn't match.") +endif () + +if((BUILD_TESTING) OR (NOT DEFINED BUILD_TESTING)) + ENABLE_TESTING() +endif() set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..df8f71d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["scikit-build-core>=0.10"] +build-backend = "scikit_build_core.build" + +[project] +name = "leapct" +dynamic = ["version"] +description = "LivermorE AI Projector for Computed Tomography (LEAPCT)" +authors = [ + { name = "Kyle Champley", email = "champley@gmail.com" }, + { name = "Hyojin Kim", email = "hkim@llnl.gov" } +] +readme = "README.md" +license-files = ["LICENSE"] +keywords = [ + "Machine Learning", "ML", "AI", "Computed Tomography", "CT", + "Differentiable Project", "Forward Project", "Back Project" +] +requires-python = ">=3.6" +dependencies = [ + "imageio", + "matplotlib", + "numpy", + "scipy", +] + +[tool.scikit-build] +minimum-version = "build-system.requires" +cmake.version = ">=3.23" +metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" + +wheel.packages = ["src/leap_filter_sequence", "src/leap_preprocessing_algorithms", "src/leapctype", "src/leaptorch"] +wheel.py-api = "py3" +# If we have to issue revised packages for the same software version, +# we can increment a build number: +wheel.build-tag = 0 + +[tool.scikit-build.cmake.define] +BUILD_TESTING = false +LEAP_GPU = "NVIDIA" + +[tool.setuptools_scm] +# There are several packages. Is there an appropriate place to put a package version? 
+#write_to = "src/leap/_version.py" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index a71d070..0000000 --- a/setup.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[metadata] -description_file=README.md -license_files=LICENSE.rst - diff --git a/setup.py b/setup.py deleted file mode 100644 index 840c4ec..0000000 --- a/setup.py +++ /dev/null @@ -1,141 +0,0 @@ -################################################################################ -# Copyright 2022-2023 Lawrence Livermore National Security, LLC and other -# LEAP project developers. See the LICENSE file for details. -# SPDX-License-Identifier: MIT -# -# LivermorE AI Projector for Computed Tomography (LEAP) -# setup.py for pytorch module -################################################################################ - -from setuptools import setup, find_packages -from setuptools.command.install import install -from torch.utils.cpp_extension import CUDAExtension -from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import BuildExtension -import os -import pybind11 -import torch -from sys import platform as _platform - - -## todo -# compiler options (optimization flags, cufft options) -# leapctype : libleap.so - -cpp_files=[ - 'analytic_ray_tracing.cpp', - 'cpu_utils.cpp', - 'file_io.cpp', - 'filtered_backprojection.cpp', - 'find_center_cpu.cpp', - 'list_of_tomographic_models.cpp', - 'parameters.cpp', - 'phantom.cpp', - 'projectors.cpp', - 'projectors_Joseph_cpu.cpp', - 'projectors_SF_cpu.cpp', - 'projectors_Siddon_cpu.cpp', - 'projectors_symmetric_cpu.cpp', - 'ramp_filter_cpu.cpp', - 'ray_weighting_cpu.cpp', - 'rebin.cpp', - 'sensitivity_cpu.cpp', - 'resample_cpu.cpp', - 'sinogram_replacement.cpp', - 'tomographic_models_c_interface.cpp', - 'tomographic_models.cpp', -] - -cuda_files=[ - 'bilateral_filter.cu', - 'guided_filter.cu', - 'cuda_utils.cu', - 'matching_pursuit.cu', - 'noise_filters.cu', - 'projectors_attenuated.cu', - 'projectors_extendedSF.cu', - 
'projectors_Joseph.cu', - 'projectors_SF.cu', - 'projectors_Siddon.cu', - 'projectors_symmetric.cu', - 'ramp_filter.cu', - 'ray_weighting.cu', - 'scatter_models.cu', - 'sensitivity.cu', - 'resample.cu', - 'total_variation.cu', - 'geometric_calibration.cu', - 'analytic_ray_tracing_gpu.cu', - 'backprojectors_VD.cu', -] - -cuda = torch.cuda.is_available() -if cuda: - source_files = [] - for cpp_file in cpp_files: - source_files.append(os.path.join('src', cpp_file)) - for cuda_file in cuda_files: - source_files.append(os.path.join('src', cuda_file)) - - # optionally we could add '-O3' - # or extra_link_args=["-std=c++11"] - rocm = "AMD" in torch.cuda.get_device_name(0) - if rocm: # AMD ROCM GPU - #print("########## AMD ROCM architecture found! ##########") - extra_compile_args={'cxx': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3', '-lhipfft', '-D__INCLUDE_CUFFT'], - 'nvcc': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3', '-lhipfft', '-D__INCLUDE_CUFFT']} - libraries = ['hipfft'] - else: # CUDA GPU - #print("########## NVIDIA CUDA architecture found! 
##########") - # for debug - #extra_compile_args={'cxx': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3'], - # 'nvcc': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3']} #, '-arch=compute_61' - #libraries = [] - - # for debug - extra_compile_args={'cxx': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3', '-lcufft', '-D__INCLUDE_CUFFT'], - 'nvcc': ['-D__USE_GPU', '-D__USE_NOTEX', '-O3', '-lcufft', '-D__INCLUDE_CUFFT']} #, '-arch=compute_61' - libraries = ['cufft'] - - # for release - #extra_compile_args={'cxx': ['-D__USE_GPU', '-O3', '-lcufft', '-D__INCLUDE_CUFFT'], - # 'nvcc': ['-D__USE_GPU', '-O3', '-lcufft', '-D__INCLUDE_CUFFT']} - #libraries = ['cufft'] - ext_mod = CUDAExtension( - name='leapct', - sources=source_files, - extra_compile_args=extra_compile_args, - libraries = libraries, - #extra_link_args=["-lcufft"], - extra_cflags=['-O3']) -else: - source_files = [] - for cpp_file in cpp_files: - source_files.append(os.path.join('src', cpp_file)) - - ext_mod = CppExtension( - name='leapct', - sources=source_files, - extra_cflags=['-O3'], - #extra_link_args=["-lcufft"], - extra_compile_args={'cxx': ['-D__USE_CPU']} - #extra_compile_args=['-g', '-D__USE_CPU'], - ) - -setup( - name='leapct', - version='1.27', - author='Kyle Champley, Hyojin Kim', - author_email='champley@gmail.com, hkim@llnl.gov', - description='LivermorE AI Projector for Computed Tomography (LEAPCT)', - keywords='Machine Learning, ML, AI, Computed Tomography, CT, Differentiable Project, Forward Project, Back Project', - python_requires='>=3.6', - packages=find_packages("src"), - package_dir={'': 'src'}, - #install_requires=['numpy', 'torch'], - py_modules=['leaptorch','leapctype', 'leap_filter_sequence', 'leap_preprocessing_algorithms'], - ext_modules=[ext_mod], - cmdclass={'build_ext': BuildExtension}, - #package_data={'': [lib_fname]}, -) - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bcd604c..457018d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,21 +1,7 @@ -# cmake version 3.23 or higher is 
needed to support -# the argument CUDA_ARCHITECTURES all-major -# You could use cmake as old as version 3.18 if you replaced -# the line below that specifies all-major with the line below -# it which specifies the list of CUDA architectures -#cmake_minimum_required(VERSION 3.23 FATAL_ERROR) -cmake_minimum_required(VERSION 3.18 FATAL_ERROR) - -project(leapct CXX CUDA) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -find_package(CUDA 11.7 REQUIRED) find_package(OpenMP REQUIRED) -#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -O3) -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -D__INCLUDE_CUFFT) -add_compile_options(-D__INCLUDE_CUFFT) +if(LEAP_CUDA) + find_package(CUDAToolkit 11.7 REQUIRED) +endif() if (WIN32) set(CMAKE_SHARED_LIBRARY_PREFIX "lib") @@ -70,23 +56,6 @@ set(HEADER tomographic_models.h tomographic_models_c_interface.h total_variation.cuh - matching_pursuit.cuh - bilateral_filter.cuh - guided_filter.cuh - geometric_calibration.cuh - find_center_cpu.h - sinogram_replacement.h - resample_cpu.h - resample.cuh - cuda_utils.h - cpu_utils.h - phantom.h - analytic_ray_tracing.h - analytic_ray_tracing_gpu.cuh - rebin.h - file_io.h - leap_defines.h - log.h ) set(SRC_CPP @@ -146,9 +115,12 @@ include_directories( add_library(leapct SHARED ${HEADER} ${SRC_CPP} - ${SRC_CU} ) +if(LEAP_CUDA) + target_sources(leapct PRIVATE ${SRC_CU}) +endif() + # <<<--- Create filters for subfolders for Visual Studio set_property(GLOBAL PROPERTY USE_FOLDERS ON) @@ -167,31 +139,53 @@ foreach(_source IN ITEMS ${_source_list}) endforeach() # --->>> -target_link_libraries(${PROJECT_NAME} - OpenMP::OpenMP_CXX - ${CUDA_LIBRARIES} - ${CUDA_cublas_LIBRARY} - ${CUDA_cufft_LIBRARY} -) +if (LEAP_CUDA) + target_compile_definitions( + leapct PRIVATE + __USE_GPU + __USE_NOTEX + __INCLUDE_CUFFT + ) + target_link_libraries(${PROJECT_NAME} + OpenMP::OpenMP_CXX + CUDA::cudart + CUDA::cublas + CUDA::cufft + ) +elseif (LEAP_HIP) + target_compile_definitions( + leapct PRIVATE + __USE_GPU + 
__USE_NOTEX + __INCLUDE_CUFFT + ) + target_link_libraries(${PROJECT_NAME} + OpenMP::OpenMP_CXX + hip::hipfft + ) +else () + target_compile_definitions( + leapct PRIVATE + __USE_CPU + ) + target_link_libraries(${PROJECT_NAME} + OpenMP::OpenMP_CXX + ) +endif() target_compile_options(leapct PRIVATE $<$: --use_fast_math >) -#set_property(TARGET leapct PROPERTY CXX_STANDARD 14) -#set_property(TARGET leapct PROPERTY CUDA_ARCHITECTURES native) -if(CMAKE_VERSION VERSION_GREATER "3.23") - message("Building for all major cuda architectures") - set_property(TARGET leapct PROPERTY CUDA_ARCHITECTURES all-major) +if(SKBUILD) + install(TARGETS ${PROJECT_NAME} + DESTINATION ${SKBUILD_PLATLIB_DIR}/leapctype/ + ) +else() + install(TARGETS ${PROJECT_NAME} + PUBLIC_HEADER DESTINATION include + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + ) endif() -#set_property(TARGET leapct PROPERTY CUDA_ARCHITECTURES all-major) -#set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 60 61 62 70 72 75 80 86 87 89) -#set_property(TARGET ${PROJECT_NAME} PROPERTY CUDA_ARCHITECTURES 75 86 87 89) -#set_property(TARGET leapct PROPERTY CUDA_ARCHITECTURES OFF) - -install(TARGETS ${PROJECT_NAME} - PUBLIC_HEADER DESTINATION include - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib -) diff --git a/src/leap_filter_sequence.py b/src/leap_filter_sequence/__init__.py similarity index 100% rename from src/leap_filter_sequence.py rename to src/leap_filter_sequence/__init__.py diff --git a/src/leap_preprocessing_algorithms.py b/src/leap_preprocessing_algorithms/__init__.py similarity index 100% rename from src/leap_preprocessing_algorithms.py rename to src/leap_preprocessing_algorithms/__init__.py diff --git a/src/leapctype.py b/src/leapctype/__init__.py similarity index 100% rename from src/leapctype.py rename to src/leapctype/__init__.py diff --git a/src/leaptorch.py b/src/leaptorch/__init__.py similarity index 100% rename from src/leaptorch.py 
rename to src/leaptorch/__init__.py From 18f775b5bcba791bd930c9b4f9cf34b7bc4cc279 Mon Sep 17 00:00:00 2001 From: "M. Eric Irrgang" Date: Wed, 19 Nov 2025 12:29:42 -0800 Subject: [PATCH 3/7] Only need CXX component for OpenMP --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 457018d..6cc47bd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -find_package(OpenMP REQUIRED) +find_package(OpenMP REQUIRED COMPONENTS CXX) if(LEAP_CUDA) find_package(CUDAToolkit 11.7 REQUIRED) endif() From 0faa6b961e8ab92db0085bd73fa9cd61fb0104cd Mon Sep 17 00:00:00 2001 From: Eric Vardar-Irrgang Date: Tue, 3 Mar 2026 14:12:56 -0800 Subject: [PATCH 4/7] Improve GPU handling. - Fix default accelerator framework. - Only define `__USE_NOTEX` for AMD --- CMakeLists.txt | 4 ++-- src/CMakeLists.txt | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b1e846..5abdd57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) # Find a default GPU accelerator language. 
-set(_default_accelerator_type "NONE") +set(_default_accelerator_type "None") include(CheckLanguage) check_language(HIP) if(CMAKE_HIP_COMPILER) @@ -19,7 +19,7 @@ if(CMAKE_CUDA_COMPILER) set(_default_accelerator_type "NVIDIA") endif() -set(LEAP_GPU "NVIDIA" CACHE STRING "GPU acceleration type") +set(LEAP_GPU "$_default_accelerator_type" CACHE STRING "GPU acceleration type") unset(_default_accelerator_type) set_property(CACHE LEAP_GPU PROPERTY STRINGS "NVIDIA" "AMD" "None") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6cc47bd..2c38822 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -143,7 +143,6 @@ if (LEAP_CUDA) target_compile_definitions( leapct PRIVATE __USE_GPU - __USE_NOTEX __INCLUDE_CUFFT ) target_link_libraries(${PROJECT_NAME} From 6effaa976c52207d5c08ba8987187cb8d3b83d84 Mon Sep 17 00:00:00 2001 From: Eric Vardar-Irrgang Date: Wed, 4 Mar 2026 15:33:06 -0800 Subject: [PATCH 5/7] Report the selected accelerator flavor. --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5abdd57..f32e759 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,8 +27,9 @@ set_property(CACHE LEAP_GPU PROPERTY STRINGS "NVIDIA" "AMD" "None") get_property(OPT_STRINGS CACHE LEAP_GPU PROPERTY STRINGS) # Check that value of the parameter is inside "STRINGS" list. if (NOT LEAP_GPU IN_LIST OPT_STRINGS) - message(FATAL_ERROR "Wrong value of the parameter 'LEAP_GPU'") + message(FATAL_ERROR "Wrong value of the parameter 'LEAP_GPU': ${LEAP_GPU}") endif () +message(STATUS "LEAP_GPU selected accelerator type ${LEAP_GPU}") string(TOUPPER "${LEAP_GPU}" LEAP_GPU) if (LEAP_GPU STREQUAL "NVIDIA") From 0577a9cc9d593fb9d8222fd1264f5f46cffd87d7 Mon Sep 17 00:00:00 2001 From: Eric Vardar-Irrgang Date: Mon, 23 Mar 2026 16:28:12 -0700 Subject: [PATCH 6/7] Add a fallback version. 
--- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index df8f71d..289e0d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,5 +40,10 @@ BUILD_TESTING = false LEAP_GPU = "NVIDIA" [tool.setuptools_scm] +# Ref https://setuptools-scm.readthedocs.io/en/latest/config/#configuration-parameters +# WARNING: A fallback version allows the package to be built and installed without errors from a downloaded archive +# (i.e. without using git to clone a repository), but the fallback version identifier then becomes an extra point +# of maintenance. +fallback_version = "1.26+untagged" # There are several packages. Is there an appropriate place to put a package version? #write_to = "src/leap/_version.py" From 6de22007ea27e7b8935c2de58154b49b0b82737b Mon Sep 17 00:00:00 2001 From: Eric Vardar-Irrgang Date: Mon, 27 Apr 2026 14:45:33 -0700 Subject: [PATCH 7/7] Rework HIP source generation. Normalize some math functions to improve compatibility and consistency. Try to use the hipify wrappers in `torch.utils`, if available, else try to call `hipify-clang` directly. Prefer torch-based hipify into `/hipified_src`. Keep `hipify-clang` as an explicitly experimental fallback, and stage fallback inputs under a separate build-local `hipify_stage` tree. Add `gpu_runtime.h` and `gpu_fft.h` compatibility shims with comments explaining why LEAP still needs them even after source translation. Update the translated source set and include ordering so generated sources consistently see translated headers and copied support headers before the original src tree. Harden tools/run_hipify_clang.py by recording retry-aware manifest files, removing stale outputs, retrying known -p/-o conflicts without compile_commands.json context, accepting stdout-only output as a compatibility fallback, and surfacing clearer diagnostics for likely CUDA-arch propagation failures inside hipify-clang. 
Document the supported build and wheel paths in the README, including CPU, CUDA, and AMD usage, when visible GPUs are or are not required on the build host, and the caveats around isolated builds and the experimental hipify-clang fallback. --- CMakeLists.txt | 296 ++++++++++++++++- README.md | 143 +++++++++ cmake/LeapHipifyClang.cmake | 238 ++++++++++++++ pyproject.toml | 19 +- src/CMakeLists.txt | 532 ++++++++++++++++++++++--------- src/analytic_ray_tracing_gpu.cu | 11 +- src/analytic_ray_tracing_gpu.cuh | 2 +- src/backprojectors_VD.cu | 2 +- src/bilateral_filter.cu | 2 +- src/cuda_utils.cu | 2 +- src/cuda_utils.h | 4 +- src/file_io.h | 1 + src/geometric_calibration.cu | 2 +- src/gpu_fft.h | 29 ++ src/gpu_runtime.h | 76 +++++ src/guided_filter.cu | 2 +- src/matching_pursuit.cu | 2 +- src/noise_filters.cu | 2 +- src/projectors_Joseph.cu | 2 +- src/projectors_Joseph_cpu.h | 2 +- src/projectors_SF.cu | 2 +- src/projectors_Siddon.cu | 2 +- src/projectors_attenuated.cu | 2 +- src/projectors_extendedSF.cu | 2 +- src/projectors_symmetric.cu | 2 +- src/ramp_filter.cu | 5 +- src/ramp_filter.cuh | 4 +- src/ray_weighting.cu | 2 +- src/resample.cu | 2 +- src/scatter_models.cu | 2 +- src/scatter_models.cuh | 2 +- src/scatter_models_old.cu | 2 +- src/sensitivity.cu | 3 +- src/total_variation.cu | 2 +- tools/hipify_torch.py | 467 +++++++++++++++++++++++++++ tools/run_hipify_clang.py | 418 ++++++++++++++++++++++++ 36 files changed, 2090 insertions(+), 198 deletions(-) create mode 100644 cmake/LeapHipifyClang.cmake create mode 100644 src/gpu_fft.h create mode 100644 src/gpu_runtime.h create mode 100644 tools/hipify_torch.py create mode 100644 tools/run_hipify_clang.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f32e759..a12eb0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,58 +7,320 @@ project(leapct LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) -# Find a default GPU accelerator language. 
+# Keep project-local helper modules under version control in one place. +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +# ------------------------------------------------------------------------------ +# GPU backend selection +# +# We select a default accelerator type by probing for available languages. +# Users can always override this at configure time, for example: +# +# cmake -S . -B build -DLEAP_GPU=NVIDIA +# cmake -S . -B build -DLEAP_GPU=AMD +# cmake -S . -B build -DLEAP_GPU=None +# +# NOTE: +# The default is chosen at configure time based on available compilers/toolchains, +# not based on a runtime-visible GPU device. This is more reliable than the old +# setup.py approach that queried torch.cuda.get_device_name(0). +# ------------------------------------------------------------------------------ + set(_default_accelerator_type "None") include(CheckLanguage) + check_language(HIP) if(CMAKE_HIP_COMPILER) set(_default_accelerator_type "AMD") endif() + check_language(CUDA) if(CMAKE_CUDA_COMPILER) set(_default_accelerator_type "NVIDIA") endif() -set(LEAP_GPU "$_default_accelerator_type" CACHE STRING "GPU acceleration type") +set(LEAP_GPU "${_default_accelerator_type}" CACHE STRING "GPU acceleration type") unset(_default_accelerator_type) set_property(CACHE LEAP_GPU PROPERTY STRINGS "NVIDIA" "AMD" "None") -# Extract "STRINGS" property of the parameter get_property(OPT_STRINGS CACHE LEAP_GPU PROPERTY STRINGS) -# Check that value of the parameter is inside "STRINGS" list. + if (NOT LEAP_GPU IN_LIST OPT_STRINGS) - message(FATAL_ERROR "Wrong value of the parameter 'LEAP_GPU': ${LEAP_GPU}") + message(FATAL_ERROR "Wrong value of the parameter 'LEAP_GPU': ${LEAP_GPU}") endif () + message(STATUS "LEAP_GPU selected accelerator type ${LEAP_GPU}") +# Normalize for branch comparisons below. 
string(TOUPPER "${LEAP_GPU}" LEAP_GPU) + +# ------------------------------------------------------------------------------ +# Shared path variables used by src/CMakeLists.txt +# +# LEAP_SELECTED_SRC_DIR: +# The source root that src/CMakeLists.txt should build from. +# - original src/ for CPU or CUDA builds +# - torch-generated hipified tree when torch helper succeeds +# +# LEAP_HIPIFIED_SRC_DIR: +# Generated files directory under the build tree. +# +# LEAP_HIPIFY_STAGE_DIR: +# Build-local staging tree containing copied raw inputs used by hipify-clang. +# +# HIPIFY_METHOD: +# none, torch, hipify-clang +# ------------------------------------------------------------------------------ + +set(LEAP_ORIGINAL_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src") +set(LEAP_SELECTED_SRC_DIR "${LEAP_ORIGINAL_SRC_DIR}") +set(LEAP_HIPIFIED_SRC_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipified_src") +set(LEAP_HIPIFY_STAGE_DIR "${CMAKE_CURRENT_BINARY_DIR}/hipify_stage") +set(HIPIFY_METHOD "none") + +# ------------------------------------------------------------------------------ +# NVIDIA CUDA build +# ------------------------------------------------------------------------------ + if (LEAP_GPU STREQUAL "NVIDIA") # To minimize binary size and compile time, users are suggested to - # set the CUDAARCHS environment variable or define the CMake variable CMAKE_CUDA_ARCHITECTURES - # to "native" if building specifically for the GPU visible to the build environment, - # or to an explicit numerical architecture code, such as "70" + # set the CUDAARCHS environment variable or define the CMake variable + # CMAKE_CUDA_ARCHITECTURES to "native" if building specifically for the + # GPU visible to the build environment, or to an explicit numerical + # architecture code, such as "70". 
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES all-major) + set(CMAKE_CUDA_ARCHITECTURES all-major) endif() + enable_language(CUDA) set(LEAP_CUDA ON) -elseif (LEAP_GPU STREQUAL "AMD") - # Users are advised to set CMAKE_HIP_ARCHITECTURES to a minimal subset. - # See also https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html#using-hip-in-cmake + +# ------------------------------------------------------------------------------ +# AMD HIP build +# +# For AMD builds, we do not try to compile raw CUDA sources directly. Instead: +# 1. Prefer torch-based hipify if torch is importable at build time +# 2. Otherwise fall back to per-file hipify-clang via cmake/LeapHipifyClang.cmake +# 3. Otherwise fail early +# +# In this repository, step 1 is the supported path. It writes translated files +# into ${CMAKE_BINARY_DIR}/hipified_src, keeps generated artifacts out of src/, +# and is the only path that has been validated end-to-end for ROCm 7.2.1. +# The hipify-clang branch remains available as an escape hatch for other +# installations, but it is still experimental here. +# +# IMPORTANT: +# ROCm 6.4 hipify-clang does not support the directory-wide "-i -o src dst" +# style used in some examples. Therefore we only detect hipify-clang here. +# The helper is selected here and invoked from src/CMakeLists.txt. +# ------------------------------------------------------------------------------ + +elseif(LEAP_GPU STREQUAL "AMD") enable_language(HIP) set(LEAP_HIP ON) -elseif (LEAP_GPU STREQUAL "NONE") + + # Controls for hipify behavior. + # + # HIPIFY_PREFER_TORCH: + # Try torch.utils.hipify first if torch is importable. + # + # HIPIFY_ALLOW_TOOLCHAIN: + # Allow the experimental hipify-clang fallback if torch hipify is + # unavailable or intentionally disabled. + # + # HIPIFY_FAIL_IF_MISSING: + # Recommended ON for AMD builds, since compiling untranslated CUDA + # sources as HIP is usually not what you want. 
+ option(HIPIFY_PREFER_TORCH "Prefer torch-based hipify when torch is importable" ON) + option(HIPIFY_ALLOW_TOOLCHAIN "Allow fallback to hipify-clang" ON) + option(HIPIFY_FAIL_IF_MISSING "Fail if AMD build is requested and hipify cannot run" ON) + + # These are only used for the torch-helper branch. The per-file clang + # branch manages its own staging and generated trees inside src/CMakeLists.txt. + file(MAKE_DIRECTORY "${LEAP_HIPIFIED_SRC_DIR}") + + set(HIPIFY_DONE_FILE "${LEAP_HIPIFIED_SRC_DIR}/hipify.done") + set(HIPIFY_SKIPPED_FILE "${LEAP_HIPIFIED_SRC_DIR}/hipify.skipped") + set(HIPIFY_FAILED_FILE "${LEAP_HIPIFIED_SRC_DIR}/hipify.failed") + + file(REMOVE "${HIPIFY_DONE_FILE}") + file(REMOVE "${HIPIFY_SKIPPED_FILE}") + file(REMOVE "${HIPIFY_FAILED_FILE}") + + # Optional explicit request for hipify behavior. + # + # Example: + # HIPIFY_AT_BUILD=1 cmake -S . -B build -DLEAP_GPU=AMD + # + # The helper script may use this to force a torch hipify attempt even if + # the installed torch is not ROCm-enabled. + set(HIPIFY_REQUESTED OFF) + if(DEFINED ENV{HIPIFY_AT_BUILD}) + if("$ENV{HIPIFY_AT_BUILD}" STREQUAL "1" OR + "$ENV{HIPIFY_AT_BUILD}" STREQUAL "true" OR + "$ENV{HIPIFY_AT_BUILD}" STREQUAL "TRUE" OR + "$ENV{HIPIFY_AT_BUILD}" STREQUAL "ON") + set(HIPIFY_REQUESTED ON) + endif() + endif() + + # -------------------------------------------------------------------------- + # Path A, torch-based hipify + # + # This is the preferred AMD path in this repository. It uses PyTorch's + # hipify utilities when torch is available in the build environment and + # generates a full build-local source tree under ${LEAP_HIPIFIED_SRC_DIR}. + # + # IMPORTANT: + # In default isolated PEP 517 builds, torch is NOT available unless the + # build environment provides it explicitly. In that common case, this + # branch will be skipped and we will try hipify-clang instead. 
+ # -------------------------------------------------------------------------- + if(HIPIFY_PREFER_TORCH) + find_package(Python3 COMPONENTS Interpreter QUIET) + + if(Python3_Interpreter_FOUND) + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import torch; print('yes')" + RESULT_VARIABLE TORCH_IMPORT_RESULT + OUTPUT_QUIET + ERROR_QUIET + ) + + if(TORCH_IMPORT_RESULT EQUAL 0) + message(STATUS "torch import succeeded, trying torch-based hipify") + + execute_process( + COMMAND + ${Python3_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/tools/hipify_torch.py + ${LEAP_ORIGINAL_SRC_DIR} + ${LEAP_HIPIFIED_SRC_DIR} + --project-root ${CMAKE_CURRENT_SOURCE_DIR} + --include src/*.cu + --include src/*.cuh + --include src/*.cpp + --include src/*.h + --header-include-dir ${LEAP_ORIGINAL_SRC_DIR} + --ignore build/** + --ignore .git/** + --ignore **/CMakeFiles/** + --copy-tree-first + RESULT_VARIABLE HIPIFY_TORCH_RESULT + OUTPUT_VARIABLE HIPIFY_TORCH_OUT + ERROR_VARIABLE HIPIFY_TORCH_ERR + ) + + message(STATUS "torch hipify stdout:\n${HIPIFY_TORCH_OUT}") + + if(HIPIFY_TORCH_RESULT EQUAL 0 AND EXISTS "${HIPIFY_DONE_FILE}") + set(HIPIFY_METHOD "torch") + set(LEAP_SELECTED_SRC_DIR "${LEAP_HIPIFIED_SRC_DIR}") + elseif(HIPIFY_TORCH_RESULT EQUAL 0 AND EXISTS "${HIPIFY_SKIPPED_FILE}") + message(STATUS "torch hipify helper skipped transformation") + else() + message(WARNING "torch hipify helper failed: ${HIPIFY_TORCH_ERR}") + endif() + else() + message(STATUS "torch not importable in build environment") + endif() + else() + message(STATUS "Python3 interpreter not found, skipping torch hipify path") + endif() + endif() + + # -------------------------------------------------------------------------- + # Path B, hipify-clang fallback + # + # We only locate the executable here and record that this method is + # available. The actual per-file translation is performed in + # src/CMakeLists.txt. 
+ # + # This path is intentionally retained for toolchain experiments, but on a + # ROCm 7.2.1 install it has shown multiple tool-side failure modes: + # missing outputs in direct -o mode, "-p ... -o ..." conflicts, and CUDA + # arch flags not always propagating into the tool's internal compile step. + # Treat it as poorly tested unless you have verified your local ROCm build. + # If a future hipify-clang release improves this, first re-check whether: + # - directory-oriented translation can replace the per-file flow + # - -p and -o now work together for the file types we translate + # - CUDA arch / extra-arg settings propagate into the tool's internal + # compile step consistently + # - LEAP still needs the local compatibility shim headers afterward + # + # If later you discover that your hipify-clang installation needs extra + # arguments, add them here as cache variables and pass them down to src/. + # For example: + # + # set(LEAP_HIPIFY_EXTRA_ARGS "--some-flag" CACHE STRING "extra hipify args") + # + # and then consume them in src/CMakeLists.txt. + # -------------------------------------------------------------------------- + if(HIPIFY_METHOD STREQUAL "none" AND HIPIFY_ALLOW_TOOLCHAIN) + if(DEFINED ENV{ROCM_HOME}) + set(ROCM_HOME $ENV{ROCM_HOME}) + endif() + + find_program(HIPIFY_CLANG hipify-clang + HINTS + $ENV{ROCM_HOME}/bin + PATHS + ENV PATH + ) + + if(HIPIFY_CLANG) + message(STATUS "Found hipify-clang fallback: ${HIPIFY_CLANG}") + set(HIPIFY_METHOD "hipify-clang") + set(LEAP_HIPIFY_CLANG_EXECUTABLE "${HIPIFY_CLANG}") + else() + message(STATUS "hipify-clang not found") + endif() + endif() + + # -------------------------------------------------------------------------- + # AMD build must have a successful translation path + # -------------------------------------------------------------------------- + if(HIPIFY_METHOD STREQUAL "none") + if(HIPIFY_FAIL_IF_MISSING) + message(FATAL_ERROR + "LEAP_GPU=AMD was requested, but no hipify path succeeded. 
" + "Install torch in a non-isolated build environment, or install ROCm hipify-clang." + ) + else() + message(WARNING + "LEAP_GPU=AMD requested, but hipify did not run. " + "Build may fail if raw CUDA sources are not HIP-compatible." + ) + endif() + else() + message(STATUS "HIPIFY_METHOD selected: ${HIPIFY_METHOD}") + endif() + +# ------------------------------------------------------------------------------ +# CPU-only build +# ------------------------------------------------------------------------------ + +elseif(LEAP_GPU STREQUAL "NONE") set(LEAP_CPU_ONLY ON) -else () + +else() message(FATAL_ERROR "CMake scripting error: ${LEAP_GPU} didn't match.") -endif () +endif() + +# ------------------------------------------------------------------------------ +# Testing and output directories +# ------------------------------------------------------------------------------ if((BUILD_TESTING) OR (NOT DEFINED BUILD_TESTING)) - ENABLE_TESTING() + enable_testing() endif() set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - + +# ------------------------------------------------------------------------------ +# Delegate actual target creation and per-file source handling to src/ +# ------------------------------------------------------------------------------ + add_subdirectory(src) diff --git a/README.md b/README.md index 88c2390..e05095e 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,149 @@ Demo scripts for most functionality in the [demo_leapctype](https://github.com/L Demo scripts for AI/ML/DL applications in the [demo_leaptorch](https://github.com/LLNL/LEAP/tree/main/demo_leaptorch) directory +## Installation + +If your compilers are discoverable and your GPU is accessible, installation +may be as simple as running `pip install .` from a copy of the source code +repository. 
+
+Instead of calling `pip install` directly on the source directory (`.`),
+you can build a "wheel" binary distribution first with `python -m build ...`
+or `python -m build --wheel ...`.
+
+If you find that the simplest install does not work for you, see below for
+additional hints you may need to provide under different circumstances.
+The `-C` arguments described below for `python -m build` should also work for
+`pip install .` command lines.
+
+If you are using a GPU and PyTorch, we recommend that you install PyTorch
+**first**, and make sure that PyTorch and your GPU are accessible in the build
+environment. On a high-performance computing environment, this may mean that
+you need to build the package in an HPC job rather than on a login node.
+
+To make sure that PyTorch is available to the build system, use the
+`--no-build-isolation` option to `pip` or the `--no-isolation` option to
+`python -m build`.
+
+## Build and Wheel Notes
+
+LEAP can be built directly with CMake for development, or packaged as a Python
+wheel through `scikit-build-core` with `python -m build --wheel`.
+The wheel path is the supported way to produce something installable with `pip`.
+
+The build host does not usually need to have the target GPU visible.
+CPU builds never do.
+AMD builds only need the ROCm toolchain and a Python environment with the
+required packaging tools.
+CUDA builds only need a visible GPU if you choose an auto-detected architecture
+mode such as `CMAKE_CUDA_ARCHITECTURES=native`;
+if you set an explicit architecture list such as `80` or accept LEAP's
+`all-major` default, GPU visibility is not required at build time.
+ +### Simple wheel builds + +CPU-only wheel: + +```bash +python -m build --wheel -Ccmake.define.LEAP_GPU=None +``` + +CUDA wheel: + +```bash +python -m build --wheel \ + -Ccmake.define.LEAP_GPU=NVIDIA \ + -Ccmake.define.CMAKE_CUDA_ARCHITECTURES=80 +``` + +AMD wheel, using the currently supported HIP path in this repository: + +```bash +python -m build --wheel --no-isolation \ + -Ccmake.define.LEAP_GPU=AMD \ + -Ccmake.define.CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ +``` + +Install a built wheel with: + +```bash +pip install dist/*.whl +``` + +### Direct CMake builds + +Direct CMake builds are useful when debugging the native build before packaging: + +```bash +cmake -S . -B build-cpu -G Ninja -DLEAP_GPU=None +ninja -C build-cpu leapct +``` + +```bash +cmake -S . -B build-cuda -G Ninja \ + -DLEAP_GPU=NVIDIA \ + -DCMAKE_CUDA_ARCHITECTURES=80 +ninja -C build-cuda leapct +``` + +```bash +cmake -S . -B build-hip -G Ninja \ + -DLEAP_GPU=AMD \ + -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ \ + -DPython3_EXECUTABLE=$(which python) +ninja -C build-hip leapct +``` + +For AMD, the preferred flow uses `tools/hipify_torch.py` to copy `src/` into +`build/.../hipified_src` and rewrite the translated files there. Generated HIP +sources are intentionally kept out of the repository's tracked `src/` tree. + +### AMD-specific caveats + +The preferred AMD path depends on `torch.utils.hipify`, so `torch` must be +importable in the build environment. +That is why `--no-isolation` is often the simplest choice for AMD wheel builds: +a default isolated PEP 517 build environment installs `scikit-build-core`, +but it does not automatically include PyTorch. + +This repository carries a `hipify-clang` fallback controlled by +`HIPIFY_ALLOW_TOOLCHAIN`, but it is not the recommended path +(fails testing on a ROCm 7.2.1 environment, at least). +It is retained for experimentation and for sites where PyTorch is unavailable +at build time. 
+Known problems observed in that ROCm 7.2.1 environment include:
+
+- direct `hipify-clang -o` runs that exit successfully without writing an output file
+- `-p <build-path>` invocations that report `conflict: -o and multiple source files are specified`
+- CUDA arch flags accepted by `hipify-clang` but not always propagated into its internal CUDA compile step
+
+If you intentionally want to investigate that fallback,
+set `HIPIFY_PREFER_TORCH=FALSE`.
+Otherwise, leave the default torch-first behavior in place.
+
+### More control
+
+CMake variables can be passed through the Python packaging front-end with,
+e.g. `python -m build . -Ccmake.define.LEAP_GPU=AMD ...`
+
+Useful configuration knobs when the simple commands are not enough:
+
+- `LEAP_GPU=None|NVIDIA|AMD` selects the backend explicitly instead of relying
+  on toolchain detection.
+- `CMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++` is the usual AMD compiler override.
+- `CMAKE_CUDA_ARCHITECTURES=<list>` or `CUDAARCHS=<list>` controls which NVIDIA
+  architectures are compiled.
+- `HIPIFY_PREFER_TORCH=ON|OFF` chooses whether the AMD configure step tries
+  PyTorch hipify first.
+- `HIPIFY_ALLOW_TOOLCHAIN=ON|OFF` controls whether CMake may fall back to the
+  experimental `hipify-clang` path.
+- `CMAKE_EXPORT_COMPILE_COMMANDS=ON` is already enabled for packaging and is
+  useful when diagnosing translation failures.
+- `python -m build --no-isolation` is often required on restricted systems where
+  the isolated build environment cannot install dependencies, or when an AMD
+  build needs to reuse an existing Python environment that already has ROCm
+  PyTorch installed.
+
 ## Example Results
 
 As a simple demonstration of the accuracy of our projectors we show below the results of FDK reconstructions using ASTRA and LEAP of the walnut CT data. The LEAP reconstruction has 1.7 times higher SNR than ASTRA. An explanation for this improvement in SNR can be found [here](https://github.com/LLNL/LEAP/blob/main/results/SF_vs_VD.md).
diff --git a/cmake/LeapHipifyClang.cmake b/cmake/LeapHipifyClang.cmake new file mode 100644 index 0000000..90cc389 --- /dev/null +++ b/cmake/LeapHipifyClang.cmake @@ -0,0 +1,238 @@ +function(leap_configure_hipify_clang_sources) + if(NOT DEFINED LEAP_HIPIFY_CLANG_EXECUTABLE OR LEAP_HIPIFY_CLANG_EXECUTABLE STREQUAL "") + message(FATAL_ERROR "HIPIFY_METHOD=hipify-clang, but LEAP_HIPIFY_CLANG_EXECUTABLE is not set") + endif() + + if(NOT DEFINED Python3_EXECUTABLE OR Python3_EXECUTABLE STREQUAL "") + # The wrapper script is Python-based because add_custom_command() should + # not rely on shell redirection like: + # hipify-clang input.cu > output.cu + # + # The wrapper captures stdout portably and writes the output file. + find_package(Python3 REQUIRED COMPONENTS Interpreter) + endif() + + set(LEAP_HIPIFY_WRAPPER "${PROJECT_SOURCE_DIR}/tools/run_hipify_clang.py") + if(NOT EXISTS "${LEAP_HIPIFY_WRAPPER}") + message(FATAL_ERROR "Missing wrapper script: ${LEAP_HIPIFY_WRAPPER}") + endif() + + set(LEAP_HIPIFY_SUPPORT_HEADER_NAMES + gpu_runtime.h + gpu_fft.h + ) + + file(MAKE_DIRECTORY "${LEAP_HIPIFIED_SRC_DIR}") + file(MAKE_DIRECTORY "${LEAP_HIPIFY_STAGE_DIR}") + + # -------------------------------------------------------------------------- + # Include dirs and extra arguments for hipify-clang + # + # START SIMPLE: + # Most projects should start with just the original src/ include dir. + # + # IF YOU LATER NEED MORE: + # Append more paths here, for example generated headers, third-party + # include trees, or PyTorch/ROCm include locations if translation appears + # to depend on them. + # + # EXAMPLES: + # list(APPEND LEAP_HIPIFY_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/include") + # list(APPEND LEAP_HIPIFY_EXTRA_ARGS "-D__USE_GPU") + # list(APPEND LEAP_HIPIFY_EXTRA_ARGS "-D__USE_NOTEX") + # + # In many projects, include dirs are the first thing to adjust if hipified + # output looks incomplete or incorrect. 
+ # -------------------------------------------------------------------------- + set(LEAP_HIPIFY_INCLUDE_DIRS + "${LEAP_HIPIFY_STAGE_DIR}" + "${LEAP_ORIG_SRC_ROOT}" + ) + + set(LEAP_HIPIFY_CUDA_GPU_ARCH "sm_80" CACHE STRING + "CUDA GPU architecture passed to hipify-clang for parsing untranslated CUDA sources" + ) + + set(LEAP_HIPIFY_DEFINES + "__USE_GPU" + "__USE_NOTEX" + "__INCLUDE_CUFFT" + CACHE STRING + "Semicolon-separated preprocessor definitions passed directly to hipify-clang" + ) + + set(LEAP_HIPIFY_HIPIFY_ARGS + "--default-preprocessor" + "--cuda-gpu-arch=${LEAP_HIPIFY_CUDA_GPU_ARCH}" + ) + + foreach(definition IN LISTS LEAP_HIPIFY_DEFINES) + if(NOT definition STREQUAL "") + list(APPEND LEAP_HIPIFY_HIPIFY_ARGS "-D${definition}") + endif() + endforeach() + + set(LEAP_HIPIFY_USE_BUILD_PATH "${CMAKE_EXPORT_COMPILE_COMMANDS}") + if(LEAP_HIPIFY_USE_BUILD_PATH) + message(STATUS "hipify-clang fallback will try compile_commands.json context for source files") + else() + message(WARNING + "hipify-clang fallback is active without CMAKE_EXPORT_COMPILE_COMMANDS. " + "Source translation will run without compile_commands.json context, which is less tested." + ) + endif() + + # ROCm 7.2's direct per-file hipify-clang mode accepts the CUDA arch flag + # reliably, but its clang passthrough handling is still inconsistent across + # file types. Keep the default passthrough list empty unless a concrete file + # requires more. Future hipify-clang releases may let this become simpler. 
+ set(LEAP_HIPIFY_EXTRA_ARGS) + + set(ALL_STAGED_FILES) + foreach(name IN LISTS HEADER_NAMES HIPIFY_GENERATED_CPP_NAMES SRC_CU_NAMES LEAP_HIPIFY_SUPPORT_HEADER_NAMES) + list(APPEND _hipify_stage_names "${name}") + endforeach() + list(REMOVE_DUPLICATES _hipify_stage_names) + + foreach(name IN LISTS _hipify_stage_names) + set(infile "${LEAP_ORIG_SRC_ROOT}/${name}") + set(stagefile "${LEAP_HIPIFY_STAGE_DIR}/${name}") + + add_custom_command( + OUTPUT "${stagefile}" + COMMAND "${CMAKE_COMMAND}" -E make_directory "${LEAP_HIPIFY_STAGE_DIR}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${infile}" "${stagefile}" + DEPENDS "${infile}" + VERBATIM + ) + + list(APPEND ALL_STAGED_FILES "${stagefile}") + endforeach() + + leap_prefix_paths(STAGED_HIPIFY_CPP "${LEAP_HIPIFY_STAGE_DIR}" ${HIPIFY_GENERATED_CPP_NAMES}) + leap_prefix_paths(STAGED_HIPIFY_CU "${LEAP_HIPIFY_STAGE_DIR}" ${SRC_CU_NAMES}) + + add_library(leapct_hipify_probe OBJECT EXCLUDE_FROM_ALL + ${STAGED_HIPIFY_CPP} + ${STAGED_HIPIFY_CU} + ) + + # Populate compile_commands.json with representative staged translations so + # the wrapper can optionally pass -p for source files. This is mainly a + # parsing aid for hipify-clang; it is not part of the shipped library. 
+ set_source_files_properties(${STAGED_HIPIFY_CU} PROPERTIES LANGUAGE HIP) + + target_include_directories(leapct_hipify_probe PRIVATE + "${LEAP_HIPIFY_STAGE_DIR}" + "${LEAP_ORIG_SRC_ROOT}" + ) + + target_compile_definitions(leapct_hipify_probe PRIVATE + __USE_GPU + __USE_NOTEX + __INCLUDE_CUFFT + ) + + target_link_libraries(leapct_hipify_probe PRIVATE + hip::host + hip::device + hip::hipfft + ) + + set(GENERATED_HEADER) + set(GENERATED_CPP) + set(GENERATED_CU) + set(COPIED_SUPPORT_HEADERS) + set(ALL_GENERATED_FILES) + + foreach(name IN LISTS LEAP_HIPIFY_SUPPORT_HEADER_NAMES) + set(infile "${LEAP_ORIG_SRC_ROOT}/${name}") + set(outfile "${LEAP_HIPIFIED_SRC_DIR}/${name}") + + add_custom_command( + OUTPUT "${outfile}" + COMMAND "${CMAKE_COMMAND}" -E make_directory "${LEAP_HIPIFIED_SRC_DIR}" + COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${infile}" "${outfile}" + DEPENDS "${infile}" + VERBATIM + ) + + list(APPEND COPIED_SUPPORT_HEADERS "${outfile}") + list(APPEND ALL_GENERATED_FILES "${outfile}") + endforeach() + + foreach(name IN LISTS HIPIFY_GENERATED_HEADER_NAMES HIPIFY_GENERATED_CPP_NAMES SRC_CU_NAMES) + set(stagefile "${LEAP_HIPIFY_STAGE_DIR}/${name}") + set(outfile "${LEAP_HIPIFIED_SRC_DIR}/${name}") + + set(_cmd + "${Python3_EXECUTABLE}" "${LEAP_HIPIFY_WRAPPER}" + "${LEAP_HIPIFY_CLANG_EXECUTABLE}" + "${stagefile}" + "${outfile}" + ) + + # Source files benefit from the probe compile database. Headers are + # translated directly because some hipify-clang builds reject -p/-o + # combinations for non-source inputs. 
+ if(LEAP_HIPIFY_USE_BUILD_PATH AND name MATCHES "\\.(cpp|cu)$") + list(APPEND _cmd "--build-path=${PROJECT_BINARY_DIR}") + endif() + + foreach(inc IN LISTS LEAP_HIPIFY_INCLUDE_DIRS) + list(APPEND _cmd -I "${inc}") + endforeach() + + foreach(arg IN LISTS LEAP_HIPIFY_HIPIFY_ARGS) + list(APPEND _cmd "--hipify-arg=${arg}") + endforeach() + + foreach(arg IN LISTS LEAP_HIPIFY_EXTRA_ARGS) + list(APPEND _cmd "--extra-arg=${arg}") + endforeach() + + list(APPEND _cmd + "--manifest=${outfile}.manifest.json" + "--stderr-to-manifest" + ) + + add_custom_command( + OUTPUT "${outfile}" + COMMAND "${CMAKE_COMMAND}" -E make_directory "${LEAP_HIPIFIED_SRC_DIR}" + COMMAND ${_cmd} + DEPENDS "${stagefile}" "${LEAP_HIPIFY_WRAPPER}" + VERBATIM + ) + + if(name MATCHES "\\.(h|cuh)$") + list(APPEND GENERATED_HEADER "${outfile}") + elseif(name MATCHES "\\.cpp$") + list(APPEND GENERATED_CPP "${outfile}") + elseif(name MATCHES "\\.cu$") + list(APPEND GENERATED_CU "${outfile}") + endif() + + list(APPEND ALL_GENERATED_FILES "${outfile}") + endforeach() + + leap_prefix_paths(PASSTHROUGH_HEADER "${LEAP_ORIG_SRC_ROOT}" ${PASSTHROUGH_HEADER_NAMES}) + leap_prefix_paths(PASSTHROUGH_CPP "${LEAP_ORIG_SRC_ROOT}" ${PASSTHROUGH_CPP_NAMES}) + + set(_header + ${GENERATED_HEADER} + ${COPIED_SUPPORT_HEADERS} + ${PASSTHROUGH_HEADER} + ) + set(_src_cpp + ${GENERATED_CPP} + ${PASSTHROUGH_CPP} + ) + + # Group all generated files into one target so compile steps can depend on + # translated headers and sources before any CXX/HIP compilation begins. 
+ add_custom_target(leapct_hipify_sources DEPENDS ${ALL_GENERATED_FILES}) + + set(HEADER "${_header}" PARENT_SCOPE) + set(SRC_CPP "${_src_cpp}" PARENT_SCOPE) + set(SRC_CU "${GENERATED_CU}" PARENT_SCOPE) +endfunction() diff --git a/pyproject.toml b/pyproject.toml index 289e0d1..aad820c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,10 +23,22 @@ dependencies = [ "numpy", "scipy", ] +[project.optional-dependencies] +dev = [ + "build", + "ninja", + "pip", + "setuptools" +] +hip = [ + "torch" +] [tool.scikit-build] -minimum-version = "build-system.requires" +build-dir = "build/{wheel_tag}" +cmake.build-type = "Release" cmake.version = ">=3.23" +minimum-version = "build-system.requires" metadata.version.provider = "scikit_build_core.metadata.setuptools_scm" wheel.packages = ["src/leap_filter_sequence", "src/leap_preprocessing_algorithms", "src/leapctype", "src/leaptorch"] @@ -37,7 +49,10 @@ wheel.build-tag = 0 [tool.scikit-build.cmake.define] BUILD_TESTING = false -LEAP_GPU = "NVIDIA" +CMAKE_EXPORT_COMPILE_COMMANDS = "ON" +HIPIFY_PREFER_TORCH = "ON" +HIPIFY_ALLOW_TOOLCHAIN = "ON" +HIPIFY_FAIL_IF_MISSING = "ON" [tool.setuptools_scm] # Ref https://setuptools-scm.readthedocs.io/en/latest/config/#configuration-parameters diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c38822..65a5079 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,190 +1,434 @@ find_package(OpenMP REQUIRED COMPONENTS CXX) + if(LEAP_CUDA) find_package(CUDAToolkit 11.7 REQUIRED) endif() -if (WIN32) - set(CMAKE_SHARED_LIBRARY_PREFIX "lib") - set(CMAKE_STATIC_LIBRARY_PREFIX "lib") +if(LEAP_HIP) + # Depending on your ROCm installation, these package names and imported + # targets can vary slightly. If configuration fails here, inspect the + # ROCm CMake package files on your system and adjust accordingly. 
+ find_package(hip REQUIRED CONFIG) + find_package(hipfft REQUIRED CONFIG) +endif() + +if(WIN32) + set(CMAKE_SHARED_LIBRARY_PREFIX "lib") + set(CMAKE_STATIC_LIBRARY_PREFIX "lib") endif() -#set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/..) -#set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/..) -#set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/..) - -set(HEADER - analytic_ray_tracing.h - analytic_ray_tracing_gpu.cuh - backprojectors_VD.cuh - bilateral_filter.cuh - cpu_utils.h - cuda_utils.h - file_io.h - filtered_backprojection.h - find_center_cpu.h - geometric_calibration.cuh - guided_filter.cuh - leap_defines.h - list_of_tomographic_models.h - log.h - matching_pursuit.cuh - noise_filters.cuh - parameters.h - phantom.h - projectors.h - projectors_Joseph.cuh - projectors_Joseph_cpu.h - projectors_SF.cuh - projectors_SF_cpu.h - projectors_Siddon.cuh - projectors_Siddon_cpu.h - projectors_attenuated.cuh - projectors_extendedSF.cuh - projectors_symmetric.cuh - projectors_symmetric_cpu.h - ramp_filter.cuh - ramp_filter_cpu.h - ray_weighting.cuh - ray_weighting_cpu.h - rebin.h - resample.cuh - resample_cpu.h - scatter_models.cuh - sensitivity.cuh - sensitivity_cpu.h - sinogram_replacement.h - tomographic_models.h - tomographic_models_c_interface.h - total_variation.cuh +# ------------------------------------------------------------------------------ +# Source lists +# +# Keep these explicit. This is one of the biggest strengths of your current +# layout, because it makes per-file hipify deterministic and easy to debug. +# +# If you add new source files later, update these lists directly. 
+# ------------------------------------------------------------------------------ + +set(HEADER_NAMES + analytic_ray_tracing.h + analytic_ray_tracing_gpu.cuh + backprojectors_VD.cuh + bilateral_filter.cuh + cpu_utils.h + cuda_utils.h + file_io.h + filtered_backprojection.h + find_center_cpu.h + geometric_calibration.cuh + guided_filter.cuh + leap_defines.h + list_of_tomographic_models.h + log.h + matching_pursuit.cuh + noise_filters.cuh + parameters.h + phantom.h + projectors.h + projectors_Joseph.cuh + projectors_Joseph_cpu.h + projectors_SF.cuh + projectors_SF_cpu.h + projectors_Siddon.cuh + projectors_Siddon_cpu.h + projectors_attenuated.cuh + projectors_extendedSF.cuh + projectors_symmetric.cuh + projectors_symmetric_cpu.h + ramp_filter.cuh + ramp_filter_cpu.h + ray_weighting.cuh + ray_weighting_cpu.h + rebin.h + resample.cuh + resample_cpu.h + scatter_models.cuh + sensitivity.cuh + sensitivity_cpu.h + sinogram_replacement.h + tomographic_models.h + tomographic_models_c_interface.h + total_variation.cuh ) -set(SRC_CPP - analytic_ray_tracing.cpp - cpu_utils.cpp - file_io.cpp - filtered_backprojection.cpp - find_center_cpu.cpp - list_of_tomographic_models.cpp - parameters.cpp - phantom.cpp - projectors.cpp - projectors_Joseph_cpu.cpp - projectors_SF_cpu.cpp - projectors_Siddon_cpu.cpp - projectors_symmetric_cpu.cpp - ramp_filter_cpu.cpp - ray_weighting_cpu.cpp - rebin.cpp - resample_cpu.cpp - sensitivity_cpu.cpp - sinogram_replacement.cpp - tomographic_models.cpp - tomographic_models_c_interface.cpp +set(SRC_CPP_NAMES + analytic_ray_tracing.cpp + cpu_utils.cpp + file_io.cpp + filtered_backprojection.cpp + find_center_cpu.cpp + list_of_tomographic_models.cpp + parameters.cpp + phantom.cpp + projectors.cpp + projectors_Joseph_cpu.cpp + projectors_SF_cpu.cpp + projectors_Siddon_cpu.cpp + projectors_symmetric_cpu.cpp + ramp_filter_cpu.cpp + ray_weighting_cpu.cpp + rebin.cpp + resample_cpu.cpp + sensitivity_cpu.cpp + sinogram_replacement.cpp + tomographic_models.cpp + 
tomographic_models_c_interface.cpp ) -set(SRC_CU - analytic_ray_tracing_gpu.cu - backprojectors_VD.cu - bilateral_filter.cu - cuda_utils.cu - geometric_calibration.cu - guided_filter.cu - matching_pursuit.cu - noise_filters.cu - projectors_Joseph.cu - projectors_SF.cu - projectors_Siddon.cu - projectors_attenuated.cu - projectors_extendedSF.cu - projectors_symmetric.cu - ramp_filter.cu - ray_weighting.cu - resample.cu - scatter_models.cu - sensitivity.cu - total_variation.cu +set(SRC_CU_NAMES + analytic_ray_tracing_gpu.cu + backprojectors_VD.cu + bilateral_filter.cu + cuda_utils.cu + geometric_calibration.cu + guided_filter.cu + matching_pursuit.cu + noise_filters.cu + projectors_Joseph.cu + projectors_SF.cu + projectors_Siddon.cu + projectors_attenuated.cu + projectors_extendedSF.cu + projectors_symmetric.cu + ramp_filter.cu + ray_weighting.cu + resample.cu + scatter_models.cu + sensitivity.cu + total_variation.cu ) -include_directories( - ./ - ${CUDA_INCLUDE_DIRS} +# ------------------------------------------------------------------------------ +# Helper function for prefixing a list of relative file names with a root dir. +# ------------------------------------------------------------------------------ + +function(leap_prefix_paths out_var root_dir) + set(result) + foreach(name IN LISTS ARGN) + list(APPEND result "${root_dir}/${name}") + endforeach() + set(${out_var} "${result}" PARENT_SCOPE) +endfunction() + +include(LeapHipifyClang) + +set(LEAP_ORIG_SRC_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") + +# By default, build from the source root selected by the top-level file. +# +# For: +# - CPU builds, this is the original src/ +# - CUDA builds, this is the original src/ +# - torch hipify builds, this is the torch-generated hipified tree +# +# For hipify-clang builds, we override this logic below and generate files +# per source instead of building directly from LEAP_SELECTED_SRC_DIR. 
+set(LEAP_SRC_ROOT "${LEAP_SELECTED_SRC_DIR}") + +# ------------------------------------------------------------------------------ +# hipify-clang translation policy +# +# Keep the master source inventories above as the only explicit file lists. +# The hipify-clang fallback derives translated subsets from short exclusion +# lists so the common case stays readable and in sync. +# ------------------------------------------------------------------------------ + +set(HIPIFY_HEADER_EXCLUDE_NAMES + file_io.h + leap_defines.h + projectors_Joseph_cpu.h ) -#add_library(leapct SHARED main_projector_ctype.cpp parameters.cpp projectors_cpu.cpp projectors.cu projectors_SF.cu) -#target_link_libraries(leapct "${TORCH_LIBRARIES}") -add_library(leapct SHARED - ${HEADER} - ${SRC_CPP} +set(HIPIFY_CPP_EXCLUDE_NAMES + file_io.cpp + parameters.cpp + projectors_Joseph_cpu.cpp ) +set(HIPIFY_GENERATED_HEADER_NAMES ${HEADER_NAMES}) +list(REMOVE_ITEM HIPIFY_GENERATED_HEADER_NAMES ${HIPIFY_HEADER_EXCLUDE_NAMES}) + +set(HIPIFY_GENERATED_CPP_NAMES ${SRC_CPP_NAMES}) +list(REMOVE_ITEM HIPIFY_GENERATED_CPP_NAMES ${HIPIFY_CPP_EXCLUDE_NAMES}) + +set(PASSTHROUGH_HEADER_NAMES ${HEADER_NAMES}) +list(REMOVE_ITEM PASSTHROUGH_HEADER_NAMES ${HIPIFY_GENERATED_HEADER_NAMES}) + +set(PASSTHROUGH_CPP_NAMES ${SRC_CPP_NAMES}) +list(REMOVE_ITEM PASSTHROUGH_CPP_NAMES ${HIPIFY_GENERATED_CPP_NAMES}) + +if(LEAP_HIP AND HIPIFY_METHOD STREQUAL "hipify-clang") + leap_configure_hipify_clang_sources() + +# ------------------------------------------------------------------------------ +# Non hipify-clang cases +# +# This covers: +# - CPU-only builds +# - CUDA builds +# - torch-generated hipified tree builds +# ------------------------------------------------------------------------------ + +else() + leap_prefix_paths(HEADER "${LEAP_SRC_ROOT}" ${HEADER_NAMES}) + leap_prefix_paths(SRC_CPP "${LEAP_SRC_ROOT}" ${SRC_CPP_NAMES}) + leap_prefix_paths(SRC_CU "${LEAP_SRC_ROOT}" ${SRC_CU_NAMES}) +endif() + +# 
------------------------------------------------------------------------------ +# Target definition +# ------------------------------------------------------------------------------ + +add_library(leapct_host OBJECT + ${HEADER} + ${SRC_CPP} +) + +set_target_properties(leapct_host PROPERTIES + POSITION_INDEPENDENT_CODE ON +) + +if(LEAP_HIP AND HIPIFY_METHOD STREQUAL "torch") + target_include_directories(leapct_host PRIVATE + "${LEAP_HIPIFIED_SRC_DIR}" + "${LEAP_SELECTED_SRC_DIR}" + "${LEAP_ORIG_SRC_ROOT}" + ) +elseif(LEAP_HIP AND HIPIFY_METHOD STREQUAL "hipify-clang") + target_include_directories(leapct_host PRIVATE + "${LEAP_HIPIFIED_SRC_DIR}" + "${LEAP_ORIG_SRC_ROOT}" + ) +else() + target_include_directories(leapct_host PRIVATE + "${LEAP_SRC_ROOT}" + "${LEAP_ORIG_SRC_ROOT}" + ) +endif() + if(LEAP_CUDA) - target_sources(leapct PRIVATE ${SRC_CU}) + target_include_directories(leapct_host PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ) endif() -# <<<--- Create filters for subfolders for Visual Studio -set_property(GLOBAL PROPERTY USE_FOLDERS ON) +if(LEAP_CUDA) + target_compile_definitions(leapct_host PRIVATE + __USE_GPU + __INCLUDE_CUFFT + ) +elseif(LEAP_HIP) + target_compile_definitions(leapct_host PRIVATE + LEAP_HIP_BUILD + __USE_GPU + __USE_NOTEX + __INCLUDE_CUFFT + ) +else() + target_compile_definitions(leapct_host PRIVATE + __USE_CPU + ) +endif() + +target_link_libraries(leapct_host PRIVATE + OpenMP::OpenMP_CXX +) + +if(LEAP_HIP) + target_link_libraries(leapct_host PRIVATE + hip::host + hip::hipfft + ) +endif() + +if(LEAP_CUDA OR LEAP_HIP) + add_library(leapct_gpu OBJECT + ${SRC_CU} + ) + + set_target_properties(leapct_gpu PROPERTIES + POSITION_INDEPENDENT_CODE ON + ) -file(GLOB_RECURSE _source_list - LIST_DIRECTORIES false - ${HEADER} - ${SRC_CPP} - ${SRC_CU} + if(LEAP_HIP AND HIPIFY_METHOD STREQUAL "torch") + target_include_directories(leapct_gpu PRIVATE + "${LEAP_HIPIFIED_SRC_DIR}" + "${LEAP_SELECTED_SRC_DIR}" + "${LEAP_ORIG_SRC_ROOT}" + ) + elseif(LEAP_HIP AND 
HIPIFY_METHOD STREQUAL "hipify-clang")
+    target_include_directories(leapct_gpu PRIVATE
+      "${LEAP_HIPIFIED_SRC_DIR}"
+      "${LEAP_ORIG_SRC_ROOT}"
+    )
+  else()
+    target_include_directories(leapct_gpu PRIVATE
+      "${LEAP_SRC_ROOT}"
+      "${LEAP_ORIG_SRC_ROOT}"
+    )
+  endif()
+
+  if(LEAP_CUDA)
+    target_include_directories(leapct_gpu PRIVATE
+      ${CUDAToolkit_INCLUDE_DIRS}
+    )
+  endif()
+
+  if(LEAP_HIP)
+    set_source_files_properties(${SRC_CU} PROPERTIES LANGUAGE HIP)
+    target_compile_definitions(leapct_gpu PRIVATE
+      LEAP_HIP_BUILD
+      __USE_GPU
+      __USE_NOTEX
+      __INCLUDE_CUFFT
+    )
+    target_link_libraries(leapct_gpu PRIVATE
+      hip::device
+      hip::hipfft
+    )
+  else()
+    set_source_files_properties(${SRC_CU} PROPERTIES LANGUAGE CUDA)
+    target_compile_definitions(leapct_gpu PRIVATE
+      __USE_GPU
+      __INCLUDE_CUFFT
+    )
+  endif()
+endif()
+
+add_library(leapct SHARED
+  $<TARGET_OBJECTS:leapct_host>
 )
+if(TARGET leapct_gpu)
+  target_sources(leapct PRIVATE
+    $<TARGET_OBJECTS:leapct_gpu>
+  )
+endif()
+
+if(TARGET leapct_hipify_sources)
+  add_dependencies(leapct_host leapct_hipify_sources)
+  add_dependencies(leapct_gpu leapct_hipify_sources)
+endif()
+
+# ------------------------------------------------------------------------------
+# Visual Studio source groups
+# ------------------------------------------------------------------------------
+
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+set(_source_list ${HEADER} ${SRC_CPP} ${SRC_CU})
 foreach(_source IN ITEMS ${_source_list})
-    get_filename_component(_source_path "${_source}" PATH)
-    file(RELATIVE_PATH _source_path_rel "${PROJECT_SOURCE_DIR}" "${_source_path}")
-    string(REPLACE "/" "\\" _group_path "${_source_path_rel}")
-    source_group("${_group_path}" FILES "${_source}")
+  get_filename_component(_source_path "${_source}" PATH)
+  file(RELATIVE_PATH _source_path_rel "${PROJECT_SOURCE_DIR}" "${_source_path}")
+  string(REPLACE "/" "\\" _group_path "${_source_path_rel}")
+  source_group("${_group_path}" FILES "${_source}")
 endforeach()
 
-# --->>>
-if (LEAP_CUDA)
-    target_compile_definitions(
-        leapct 
PRIVATE +# ------------------------------------------------------------------------------ +# Compile definitions and link libraries +# +# NOTE: +# The legacy setup.py passed some things like "-lhipfft" and "-lcufft" via +# compile arguments. In CMake, it is better to express those as link libraries, +# which is what we do here. +# +# If you later need additional compile definitions or options, prefer: +# - target_compile_definitions() +# - target_compile_options() +# - target_link_libraries() +# +# instead of global variables. +# ------------------------------------------------------------------------------ + +if(LEAP_CUDA) + target_compile_definitions(leapct PRIVATE __USE_GPU __INCLUDE_CUFFT ) - target_link_libraries(${PROJECT_NAME} - OpenMP::OpenMP_CXX - CUDA::cudart - CUDA::cublas - CUDA::cufft + + target_link_libraries(leapct PRIVATE + OpenMP::OpenMP_CXX + CUDA::cudart + CUDA::cublas + CUDA::cufft ) -elseif (LEAP_HIP) - target_compile_definitions( - leapct PRIVATE + +elseif(LEAP_HIP) + target_compile_definitions(leapct PRIVATE __USE_GPU __USE_NOTEX __INCLUDE_CUFFT ) - target_link_libraries(${PROJECT_NAME} - OpenMP::OpenMP_CXX - hip::hipfft + + # If your ROCm package exports a different imported target name for hipfft, + # adjust this section. Some environments differ slightly. + target_link_libraries(leapct PRIVATE + OpenMP::OpenMP_CXX + hip::device + hip::hipfft ) -else () - target_compile_definitions( - leapct PRIVATE + +else() + target_compile_definitions(leapct PRIVATE __USE_CPU ) - target_link_libraries(${PROJECT_NAME} - OpenMP::OpenMP_CXX + + target_link_libraries(leapct PRIVATE + OpenMP::OpenMP_CXX ) endif() -target_compile_options(leapct PRIVATE $<$: - --use_fast_math ->) +# ------------------------------------------------------------------------------ +# Compile options +# +# We keep the CUDA fast-math option from your prior build logic. 
+# +# If later you want a HIP-specific fast-math or optimization option, add a +# parallel generator expression here, for example: +# +# $<$:-ffast-math> +# +# but only after confirming the desired flag for your ROCm toolchain. +# ------------------------------------------------------------------------------ + +target_compile_options(leapct PRIVATE + $<$:--use_fast_math> +) + +# ------------------------------------------------------------------------------ +# Installation +# ------------------------------------------------------------------------------ if(SKBUILD) - install(TARGETS ${PROJECT_NAME} - DESTINATION ${SKBUILD_PLATLIB_DIR}/leapctype/ + install(TARGETS leapct + DESTINATION ${SKBUILD_PLATLIB_DIR}/leapctype/ ) else() - install(TARGETS ${PROJECT_NAME} - PUBLIC_HEADER DESTINATION include - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib + install(TARGETS leapct + PUBLIC_HEADER DESTINATION include + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib ) endif() diff --git a/src/analytic_ray_tracing_gpu.cu b/src/analytic_ray_tracing_gpu.cu index 9cb13e4..bb21cc8 100644 --- a/src/analytic_ray_tracing_gpu.cu +++ b/src/analytic_ray_tracing_gpu.cu @@ -8,9 +8,10 @@ //////////////////////////////////////////////////////////////////////////////// #include +#include #include "analytic_ray_tracing_gpu.cuh" #include "analytic_ray_tracing.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "cuda_utils.h" @@ -863,12 +864,12 @@ void setConstantMemoryGeometryParameters(parameters* params, int oversampling) bool rayTrace_gpu(float* g, parameters* params, phantom* aPhantom, bool data_on_cpu, int oversampling) { - if (g == NULL || params == NULL || params->geometryDefined() == false) + if (g == NULL || params == NULL || params->geometryDefined() == false) return false; - oversampling = max(1, min(oversampling, 11)); + oversampling = std::clamp(oversampling, 1, 11); if 
(oversampling % 2 == 0) oversampling += 1; - oversampling = max(1, min(oversampling, 11)); + oversampling = std::clamp(oversampling, 1, 11); cudaSetDevice(params->whichGPU); cudaError_t cudaStatus; @@ -952,7 +953,7 @@ bool rayTrace_gpu(float* g, parameters* params, phantom* aPhantom, bool data_on_ } delete[] solids; - int num_gpu_cores = max(1024, getSPcores(params->whichGPU)); + int num_gpu_cores = std::max(1024, getSPcores(params->whichGPU)); if (params->projectionData_numberOfElements() < uint64(num_gpu_cores)) num_gpu_cores = int(params->projectionData_numberOfElements()); uint64 numChunks = uint64(ceil(double(params->projectionData_numberOfElements()) / double(num_gpu_cores))); diff --git a/src/analytic_ray_tracing_gpu.cuh b/src/analytic_ray_tracing_gpu.cuh index cd1fde7..742c273 100644 --- a/src/analytic_ray_tracing_gpu.cuh +++ b/src/analytic_ray_tracing_gpu.cuh @@ -18,7 +18,7 @@ #include #include "parameters.h" #include "phantom.h" -#include "vector_types.h" +#include "gpu_runtime.h" /** * This class provides GPU-based implementations to perform analytic ray tracing simulation through geometric solids. 
diff --git a/src/backprojectors_VD.cu b/src/backprojectors_VD.cu index ff926dd..9df97ff 100644 --- a/src/backprojectors_VD.cu +++ b/src/backprojectors_VD.cu @@ -12,7 +12,7 @@ #include #include #include "log.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "backprojectors_VD.cuh" #include "cuda_utils.h" diff --git a/src/bilateral_filter.cu b/src/bilateral_filter.cu index b3a3f47..f5d5177 100644 --- a/src/bilateral_filter.cu +++ b/src/bilateral_filter.cu @@ -9,7 +9,7 @@ // (Kyle) several years ago in a package called "3Ddensoing" //////////////////////////////////////////////////////////////////////////////// #include "bilateral_filter.cuh" -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "cuda_utils.h" #include "noise_filters.cuh" diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu index 25214fa..7de0df7 100644 --- a/src/cuda_utils.cu +++ b/src/cuda_utils.cu @@ -11,7 +11,7 @@ #ifndef __USE_CPU #include "cuda_utils.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" diff --git a/src/cuda_utils.h b/src/cuda_utils.h index ee50618..286762a 100644 --- a/src/cuda_utils.h +++ b/src/cuda_utils.h @@ -25,7 +25,7 @@ #define GPU_MEMORY_SAFETY_MULTIPLIER 0.9 #ifndef __USE_CPU -#include "cuda_runtime.h" +#include "gpu_runtime.h" #define CUDA_CHECK(expr) cudaSafeCall((expr), __FILE__, __LINE__, #expr) #define CHECK_LAST_ERROR() cudaCall(cudaGetLastError(), NULL, __FILE__, __LINE__) @@ -80,7 +80,7 @@ extern dim3 setGridSize(int4 N, dim3 dimBlock); //// these linear interpolation functions are for GPUs/APUs with no texture memory support (e.g., AMD MI300) //// -#ifdef __USE_NOTEX +#if defined(__USE_NOTEX) && (defined(__CUDACC__) || defined(__HIPCC__)) #define __MIN__(a, b) ((a) < (b) ? (a) : (b)) #define __MAX__(a, b) ((a) > (b) ? 
(a) : (b)) diff --git a/src/file_io.h b/src/file_io.h index 5d2bc1c..d161e34 100644 --- a/src/file_io.h +++ b/src/file_io.h @@ -15,6 +15,7 @@ #endif //#define USE_OLD_PARAM_NAMES +#include #include "parameters.h" #include "leap_defines.h" diff --git a/src/geometric_calibration.cu b/src/geometric_calibration.cu index cd66d7d..9c39472 100644 --- a/src/geometric_calibration.cu +++ b/src/geometric_calibration.cu @@ -11,7 +11,7 @@ #include #include #include "cuda_utils.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "leap_defines.h" #include "log.h" #include "geometric_calibration.cuh" diff --git a/src/gpu_fft.h b/src/gpu_fft.h new file mode 100644 index 0000000..568bffc --- /dev/null +++ b/src/gpu_fft.h @@ -0,0 +1,29 @@ +#ifndef __GPU_FFT_H +#define __GPU_FFT_H + +#pragma once + +#if defined(LEAP_HIP_BUILD) || defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) +#include + +// Same rationale as gpu_runtime.h: hipify does not synthesize a compatibility +// shim, and generated LEAP sources can still retain cufft* spellings. +using cufftHandle = hipfftHandle; +using cufftResult = hipfftResult; +using cufftComplex = hipfftComplex; +using cufftReal = hipfftReal; + +#define CUFFT_SUCCESS HIPFFT_SUCCESS +#define CUFFT_R2C HIPFFT_R2C +#define CUFFT_C2R HIPFFT_C2R + +#define cufftPlan1d hipfftPlan1d +#define cufftPlan2d hipfftPlan2d +#define cufftExecR2C hipfftExecR2C +#define cufftExecC2R hipfftExecC2R +#define cufftDestroy hipfftDestroy +#else +#include +#endif + +#endif diff --git a/src/gpu_runtime.h b/src/gpu_runtime.h new file mode 100644 index 0000000..f4d1f0e --- /dev/null +++ b/src/gpu_runtime.h @@ -0,0 +1,76 @@ +#ifndef __GPU_RUNTIME_H +#define __GPU_RUNTIME_H + +#pragma once + +#if defined(LEAP_HIP_BUILD) || defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) +#include + +// LEAP still needs this CUDA->HIP compatibility layer even when sources are +// generated with hipify. 
Both torch.utils.hipify and hipify-clang are +// source-to-source rewriters: they try to replace CUDA spellings directly in +// translated files, but they do not inject a fallback shim header for names +// that survive translation. In this codebase the generated HIP sources still +// contain some CUDA runtime identifiers such as cudaError_t, cudaSetDevice, +// and cudaMalloc, so HIP builds need these aliases as a final compatibility +// layer. The torch path may have appeared not to need a shim because it ships a +// broad mapping table and rewrites many common CUDA names directly, but that +// rewriting is not complete for LEAP for all versions of torch, ROCM, and CUDA. + +using cudaError_t = hipError_t; +using cudaStream_t = hipStream_t; +using cudaArray = hipArray; +using cudaArray_t = hipArray_t; +using cudaChannelFormatDesc = hipChannelFormatDesc; +using cudaExtent = hipExtent; +using cudaMemcpy3DParms = hipMemcpy3DParms; +using cudaMemcpyKind = hipMemcpyKind; +using cudaPitchedPtr = hipPitchedPtr; +using cudaPos = hipPos; +using cudaResourceDesc = hipResourceDesc; +using cudaTextureAddressMode = hipTextureAddressMode; +using cudaTextureDesc = hipTextureDesc; +using cudaTextureFilterMode = hipTextureFilterMode; +using cudaTextureObject_t = hipTextureObject_t; +using cudaDeviceProp = hipDeviceProp_t; + +#define cudaSuccess hipSuccess +#define cudaAddressModeBorder hipAddressModeBorder +#define cudaAddressModeClamp hipAddressModeClamp +#define cudaFilterModeLinear hipFilterModeLinear +#define cudaFilterModePoint hipFilterModePoint +#define cudaReadModeElementType hipReadModeElementType +#define cudaResourceTypeArray hipResourceTypeArray +#define cudaCreateChannelDesc hipCreateChannelDesc +#define cudaCreateTextureObject hipCreateTextureObject +#define cudaDestroyTextureObject hipDestroyTextureObject +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaFree hipFree +#define cudaFreeArray hipFreeArray +#define cudaGetDeviceCount hipGetDeviceCount 
+#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorName hipGetErrorName +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMalloc3DArray hipMalloc3DArray +#define cudaMallocArray hipMallocArray +#define cudaMemGetInfo hipMemGetInfo +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2D hipMemcpy2D +#define cudaMemcpy3D hipMemcpy3D +#define cudaMemcpy3DAsync hipMemcpy3DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyToArray hipMemcpyToArray +#define cudaMemcpyToSymbol hipMemcpyToSymbol +#define cudaMemset hipMemset +#define cudaPeekAtLastError hipPeekAtLastError +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#else +#include "cuda_runtime.h" +#endif + +#endif diff --git a/src/guided_filter.cu b/src/guided_filter.cu index a25f9d5..b953ac7 100644 --- a/src/guided_filter.cu +++ b/src/guided_filter.cu @@ -7,7 +7,7 @@ // cuda module for guided filter algorithms //////////////////////////////////////////////////////////////////////////////// #include "guided_filter.cuh" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cuda_utils.h" #include diff --git a/src/matching_pursuit.cu b/src/matching_pursuit.cu index f5477e2..23ada20 100644 --- a/src/matching_pursuit.cu +++ b/src/matching_pursuit.cu @@ -9,7 +9,7 @@ // (Kyle) several years ago in a package called "3Ddensoing" //////////////////////////////////////////////////////////////////////////////// #include "matching_pursuit.cuh" -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "cuda_utils.h" diff --git a/src/noise_filters.cu b/src/noise_filters.cu index f3b240a..51ecc9d 100644 --- a/src/noise_filters.cu +++ b/src/noise_filters.cu @@ -11,7 +11,7 @@ #include #include "cuda_utils.h" 
-#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "parameters.h" diff --git a/src/projectors_Joseph.cu b/src/projectors_Joseph.cu index 78eb871..0cc5863 100644 --- a/src/projectors_Joseph.cu +++ b/src/projectors_Joseph.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors.h" #include "projectors_Joseph.cuh" diff --git a/src/projectors_Joseph_cpu.h b/src/projectors_Joseph_cpu.h index e77b386..6c4b371 100644 --- a/src/projectors_Joseph_cpu.h +++ b/src/projectors_Joseph_cpu.h @@ -36,7 +36,7 @@ struct double3 double3 make_double3(double, double, double); #else - #include "cuda_runtime.h" + #include "gpu_runtime.h" #endif bool project_Joseph_cpu(float*, float*, parameters*); diff --git a/src/projectors_SF.cu b/src/projectors_SF.cu index c9f2f9f..8129add 100644 --- a/src/projectors_SF.cu +++ b/src/projectors_SF.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors.h" #include "projectors_SF.cuh" diff --git a/src/projectors_Siddon.cu b/src/projectors_Siddon.cu index b592a1d..05060f4 100644 --- a/src/projectors_Siddon.cu +++ b/src/projectors_Siddon.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors_Siddon.cuh" #include "cuda_utils.h" diff --git a/src/projectors_attenuated.cu b/src/projectors_attenuated.cu index dcc3a0d..1ee1632 100644 --- a/src/projectors_attenuated.cu +++ b/src/projectors_attenuated.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors_attenuated.cuh" #include "cuda_utils.h" diff --git a/src/projectors_extendedSF.cu b/src/projectors_extendedSF.cu index 97bc981..82de0e9 100644 
--- a/src/projectors_extendedSF.cu +++ b/src/projectors_extendedSF.cu @@ -12,7 +12,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors.h" #include "cuda_utils.h" diff --git a/src/projectors_symmetric.cu b/src/projectors_symmetric.cu index 13fda0c..31696c2 100644 --- a/src/projectors_symmetric.cu +++ b/src/projectors_symmetric.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "projectors_symmetric.cuh" #include "cuda_utils.h" diff --git a/src/ramp_filter.cu b/src/ramp_filter.cu index 7264bd5..1e7b164 100644 --- a/src/ramp_filter.cu +++ b/src/ramp_filter.cu @@ -13,7 +13,7 @@ #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "cuda_utils.h" #include "cpu_utils.h" @@ -25,7 +25,7 @@ #define NUM_RAYS_PER_THREAD 8 #ifdef __INCLUDE_CUFFT -#include +#include "gpu_fft.h" #endif __global__ void zeroPadForOffsetScanKernel(float* g, float* g_pad, const int3 N, const int N_add, const bool padOnLeft, const float* offsetScanWeights) @@ -2370,4 +2370,3 @@ float* zeroPadForOffsetScan_GPU(float* g, parameters* params, float* g_out, bool return NULL; } } - diff --git a/src/ramp_filter.cuh b/src/ramp_filter.cuh index 9d45c8a..bd9edd9 100644 --- a/src/ramp_filter.cuh +++ b/src/ramp_filter.cuh @@ -23,7 +23,7 @@ */ #ifdef __INCLUDE_CUFFT -#include +#include "gpu_fft.h" cufftComplex* HilbertTransformFrequencyResponse(int N, parameters* params, float scalar = 1.0, float sampleShift = 0.0); float* rampFilterFrequencyResponseMagnitude(int N, parameters* params); #endif @@ -54,4 +54,4 @@ bool parallelRay_derivative_chunk(float*& g, parameters* params, bool data_on_cp float* zeroPadForOffsetScan_GPU(float* g, parameters* params, float* g_out = NULL, bool data_on_cpu = false); -#endif \ No newline at end of file +#endif diff --git 
a/src/ray_weighting.cu b/src/ray_weighting.cu index f60224e..0c1a967 100644 --- a/src/ray_weighting.cu +++ b/src/ray_weighting.cu @@ -8,7 +8,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "ray_weighting.cuh" #include "ray_weighting_cpu.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cuda_utils.h" #include "log.h" diff --git a/src/resample.cu b/src/resample.cu index 4785fda..2fff7ed 100644 --- a/src/resample.cu +++ b/src/resample.cu @@ -11,7 +11,7 @@ #include #include #include "cuda_utils.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "leap_defines.h" #include "resample.cuh" diff --git a/src/scatter_models.cu b/src/scatter_models.cu index cdbc90b..55ce8c5 100644 --- a/src/scatter_models.cu +++ b/src/scatter_models.cu @@ -14,7 +14,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cuda_utils.h" #include "scatter_models.cuh" diff --git a/src/scatter_models.cuh b/src/scatter_models.cuh index e6df216..db68e76 100644 --- a/src/scatter_models.cuh +++ b/src/scatter_models.cuh @@ -15,7 +15,7 @@ #endif #include "parameters.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cuda_utils.h" /** diff --git a/src/scatter_models_old.cu b/src/scatter_models_old.cu index 108b4ad..f004b5e 100644 --- a/src/scatter_models_old.cu +++ b/src/scatter_models_old.cu @@ -11,7 +11,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cuda_utils.h" #include "scatter_models.cuh" diff --git a/src/sensitivity.cu b/src/sensitivity.cu index 2340707..0773586 100644 --- a/src/sensitivity.cu +++ b/src/sensitivity.cu @@ -12,7 +12,7 @@ #include #include #include -#include "cuda_runtime.h" +#include "gpu_runtime.h" //#include "device_launch_parameters.h" #include "cuda_utils.h" @@ -531,4 +531,3 @@ bool sensitivity_modular_gpu(float*& f, parameters* params, bool data_on_cpu) return true; } - diff --git a/src/total_variation.cu 
b/src/total_variation.cu index 6d6f9f1..15d3b40 100644 --- a/src/total_variation.cu +++ b/src/total_variation.cu @@ -11,7 +11,7 @@ #include #include "cuda_utils.h" -#include "cuda_runtime.h" +#include "gpu_runtime.h" #include "cpu_utils.h" //#include "device_launch_parameters.h" diff --git a/tools/hipify_torch.py b/tools/hipify_torch.py new file mode 100644 index 0000000..68f28ed --- /dev/null +++ b/tools/hipify_torch.py @@ -0,0 +1,467 @@ +"""Run torch.utils.hipify into a build-local source tree. + +This is the preferred AMD source-generation path in LEAP. The script can copy +the current source tree into a build directory first, then rewrite HIP-relevant +files in place there. That keeps generated artifacts out of the repository's +tracked src/ tree and avoids accidentally consuming stale ``*.hip`` sidecars. +""" + +from __future__ import annotations + +import argparse +import importlib +import inspect +import json +import os +import shutil +import sys +import traceback +from pathlib import Path +from typing import Any, Callable + + +def eprint(*args: object) -> None: + print(*args, file=sys.stderr) + + +def parse_bool_env(name: str, default: bool = False) -> bool: + value = os.environ.get(name) + if value is None: + return default + return value.strip().lower() in {"1", "true", "on", "yes"} + + +def unique_strs(items: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for item in items: + if item not in seen: + seen.add(item) + result.append(item) + return result + + +def write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def write_json(path: Path, data: dict[str, Any]) -> None: + write_text(path, json.dumps(data, indent=2, sort_keys=True) + "\n") + + +def is_generated_hip_sidecar(path: Path) -> bool: + name = path.name + return name.endswith((".cu.hip", ".cuh.hip", ".cpp.hip", ".h.hip")) + + +def copy_tree_contents(src_dir: Path, out_dir: Path, 
verbose: bool = False) -> None: + """ + Copy the source tree into the output tree before running hipify. + + This is useful because some torch hipify flows are tree-oriented and the + output directory may need the full project structure present, not only the + transformed files. We also skip generated *.hip sidecars so the build-local + tree starts from canonical checked-in sources rather than prior byproducts. + """ + for src_path in src_dir.rglob("*"): + if src_path.is_file() and is_generated_hip_sidecar(src_path): + if verbose: + print(f"[hipify_torch] skipped generated sidecar: {src_path}") + continue + + rel = src_path.relative_to(src_dir) + dst_path = out_dir / rel + + if src_path.is_dir(): + dst_path.mkdir(parents=True, exist_ok=True) + continue + + dst_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src_path, dst_path) + if verbose: + print(f"[hipify_torch] copied: {src_path} -> {dst_path}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run torch hipify with project-aware options and version-tolerant API handling." + ) + parser.add_argument("src_dir", help="Source directory to hipify") + parser.add_argument("out_dir", help="Output directory for hipified sources") + + parser.add_argument( + "--project-root", + default=None, + help="Project root. 
Defaults to the parent of src_dir.", + ) + parser.add_argument( + "--include", + action="append", + default=[], + help="Glob of files to include, may be repeated", + ) + parser.add_argument( + "--ignore", + action="append", + default=[], + help="Glob of files to ignore, may be repeated", + ) + parser.add_argument( + "--header-include-dir", + action="append", + default=[], + help="Header include directory, may be repeated", + ) + parser.add_argument( + "--extra-json", + default=None, + help="Optional JSON file containing extra keyword arguments for the hipify call", + ) + parser.add_argument( + "--copy-tree-first", + action="store_true", + help="Copy the source tree into out_dir before hipify", + ) + parser.add_argument( + "--copy-if-skipped", + action="store_true", + help="If hipify is skipped, still copy src_dir into out_dir", + ) + parser.add_argument( + "--force", + action="store_true", + help="Force hipify even if torch does not appear ROCm-enabled", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging", + ) + return parser.parse_args() + + +def load_extra_json(path: str | None) -> dict[str, Any]: + if not path: + return {} + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + if not isinstance(data, dict): + raise TypeError("extra JSON file must contain a JSON object") + return data + + +def import_optional(module_name: str): + try: + return importlib.import_module(module_name) + except Exception: + return None + + +def try_resolve_attr(module_name: str, attr_name: str): + try: + module = importlib.import_module(module_name) + return getattr(module, attr_name, None) + except Exception: + return None + + +def discover_hipify_callable(verbose: bool = False) -> tuple[Callable[..., Any], str]: + """ + Try a range of historically plausible import locations. + + We avoid assuming a single import path because torch packaging has varied. 
+ The common case remains torch.utils.hipify, but we probe a few candidates. + """ + candidates: list[tuple[str, str]] = [ + ("torch.utils.hipify", "hipify"), + ("torch.utils.cpp_extension", "hipify"), + ("torch.utils.hipify.hipify_python", "hipify"), + ("torch.utils.hipify.hipify_python", "main"), + ] + + errors: list[str] = [] + + for module_name, attr_name in candidates: + try: + module = importlib.import_module(module_name) + value = getattr(module, attr_name, None) + if callable(value): + if verbose: + print(f"[hipify_torch] using callable {module_name}.{attr_name}") + return value, f"{module_name}.{attr_name}" + errors.append(f"{module_name}.{attr_name}: found but not callable") + except Exception as exc: + errors.append(f"{module_name}.{attr_name}: {exc}") + + raise ImportError( + "Could not locate a usable hipify callable. Tried:\n - " + + "\n - ".join(errors) + ) + + +def filter_supported_kwargs(func: Callable[..., Any], kwargs: dict[str, Any]) -> dict[str, Any]: + """ + Keep only kwargs accepted by the resolved hipify callable, unless it accepts **kwargs. 
+ """ + try: + sig = inspect.signature(func) + except Exception: + return kwargs + + has_var_kw = any( + p.kind == inspect.Parameter.VAR_KEYWORD + for p in sig.parameters.values() + ) + if has_var_kw: + return kwargs + + accepted = set(sig.parameters.keys()) + return {k: v for k, v in kwargs.items() if k in accepted} + + +def infer_torch_environment(verbose: bool = False) -> dict[str, Any]: + import torch # local import on purpose + + version_hip = getattr(getattr(torch, "version", None), "hip", None) + version_cuda = getattr(getattr(torch, "version", None), "cuda", None) + + cuda_is_available = None + try: + cuda_is_available = bool(torch.cuda.is_available()) + except Exception: + cuda_is_available = None + + device_name = None + if cuda_is_available: + try: + device_name = torch.cuda.get_device_name(0) + except Exception: + device_name = None + + info = { + "torch_version": getattr(torch, "__version__", None), + "torch_version_hip": version_hip, + "torch_version_cuda": version_cuda, + "torch_cuda_is_available": cuda_is_available, + "torch_device_name_0": device_name, + "torch_rocm_detected": version_hip is not None, + } + + if verbose: + print("[hipify_torch] torch environment:") + print(json.dumps(info, indent=2, sort_keys=True)) + + return info + + +def maybe_copy_if_requested(src_dir: Path, out_dir: Path, should_copy: bool, verbose: bool) -> None: + if should_copy: + copy_tree_contents(src_dir, out_dir, verbose=verbose) + + +def build_candidate_kwargs( + project_root: Path, + src_dir: Path, + out_dir: Path, + include_patterns: list[str], + ignore_patterns: list[str], + header_include_dirs: list[str], + verbose: bool, + extra_kwargs: dict[str, Any], +) -> dict[str, Any]: + """ + Build a superset of kwargs from multiple known / plausible hipify APIs. + + The resolved callable may accept only a subset. We filter later using its signature. 
+ """ + candidate_kwargs: dict[str, Any] = { + "project_directory": str(project_root), + "output_directory": str(out_dir), + "includes": include_patterns, + "ignores": ignore_patterns, + "header_include_dirs": header_include_dirs, + "show_detailed": verbose, + "is_pytorch_extension": True, + "hip_clang_launch": False, + "clean_ctx": False, + "from_dir": str(src_dir), + "to_dir": str(out_dir), + "src_dir": str(src_dir), + } + candidate_kwargs.update(extra_kwargs) + return candidate_kwargs + + +def main() -> int: + args = parse_args() + + src_dir = Path(args.src_dir).resolve() + out_dir = Path(args.out_dir).resolve() + project_root = ( + Path(args.project_root).resolve() + if args.project_root + else src_dir.parent.resolve() + ) + + out_dir.mkdir(parents=True, exist_ok=True) + + done_file = out_dir / "hipify.done" + skipped_file = out_dir / "hipify.skipped" + failed_file = out_dir / "hipify.failed" + manifest_file = out_dir / "hipify.manifest.json" + + for p in [done_file, skipped_file, failed_file, manifest_file]: + if p.exists(): + p.unlink() + + manifest: dict[str, Any] = { + "status": "starting", + "src_dir": str(src_dir), + "out_dir": str(out_dir), + "project_root": str(project_root), + "copy_tree_first": bool(args.copy_tree_first), + "copy_if_skipped": bool(args.copy_if_skipped), + "force_flag": bool(args.force), + "env_force": parse_bool_env("HIPIFY_AT_BUILD", default=False), + } + + try: + import torch # noqa: F401 + except Exception as exc: + msg = f"torch import failed: {exc}" + manifest["status"] = "failed" + manifest["error"] = msg + write_json(manifest_file, manifest) + write_text(failed_file, msg + "\n") + eprint(msg) + return 1 + + try: + torch_info = infer_torch_environment(verbose=args.verbose) + manifest["torch_info"] = torch_info + except Exception as exc: + msg = f"failed to inspect torch environment: {exc}" + manifest["status"] = "failed" + manifest["error"] = msg + write_json(manifest_file, manifest) + write_text(failed_file, msg + "\n") + 
eprint(msg) + return 1 + + force = bool(args.force) or parse_bool_env("HIPIFY_AT_BUILD", default=False) + is_rocm = bool(manifest["torch_info"].get("torch_rocm_detected")) + + include_patterns = unique_strs( + args.include or [ + "src/**/*.cu", + "src/**/*.cuh", + "src/**/*.cpp", + "src/**/*.h", + ] + ) + ignore_patterns = unique_strs( + args.ignore or [ + "build/**", + ".git/**", + "**/CMakeFiles/**", + "**/__pycache__/**", + ] + ) + header_include_dirs = unique_strs( + [str(Path(p).resolve()) for p in (args.header_include_dir or [str(src_dir)])] + ) + + manifest["include_patterns"] = include_patterns + manifest["ignore_patterns"] = ignore_patterns + manifest["header_include_dirs"] = header_include_dirs + + if args.copy_tree_first: + # This is the normal LEAP path: stage a full build-local tree first, + # then let torch hipify rewrite only the files it understands. + maybe_copy_if_requested(src_dir, out_dir, should_copy=True, verbose=args.verbose) + + if not is_rocm and not force: + msg = "torch is not ROCm-enabled and hipify was not forced" + manifest["status"] = "skipped" + manifest["reason"] = msg + if args.copy_if_skipped: + maybe_copy_if_requested(src_dir, out_dir, should_copy=True, verbose=args.verbose) + manifest["copied_tree_when_skipped"] = True + write_json(manifest_file, manifest) + write_text(skipped_file, msg + "\n") + if args.verbose: + print("[hipify_torch] HIPIFY_STATUS=SKIPPED") + print(f"[hipify_torch] reason: {msg}") + return 0 + + try: + hipify_callable, hipify_name = discover_hipify_callable(verbose=args.verbose) + except Exception as exc: + msg = str(exc) + manifest["status"] = "failed" + manifest["error"] = msg + write_json(manifest_file, manifest) + write_text(failed_file, msg + "\n") + eprint(msg) + return 1 + + manifest["hipify_callable"] = hipify_name + + try: + extra_kwargs = load_extra_json(args.extra_json) + except Exception as exc: + msg = f"failed to load extra JSON config: {exc}" + manifest["status"] = "failed" + manifest["error"] 
= msg + write_json(manifest_file, manifest) + write_text(failed_file, msg + "\n") + eprint(msg) + return 1 + + candidate_kwargs = build_candidate_kwargs( + project_root=project_root, + src_dir=src_dir, + out_dir=out_dir, + include_patterns=include_patterns, + ignore_patterns=ignore_patterns, + header_include_dirs=header_include_dirs, + verbose=args.verbose, + extra_kwargs=extra_kwargs, + ) + final_kwargs = filter_supported_kwargs(hipify_callable, candidate_kwargs) + + manifest["candidate_kwargs_keys"] = sorted(candidate_kwargs.keys()) + manifest["used_kwargs_keys"] = sorted(final_kwargs.keys()) + + if args.verbose: + print(f"[hipify_torch] calling {hipify_name} with supported kwargs:") + for key in sorted(final_kwargs.keys()): + print(f" {key} = {final_kwargs[key]!r}") + + try: + result = hipify_callable(**final_kwargs) + manifest["status"] = "done" + manifest["result_repr"] = repr(result) + except Exception as exc: + tb = traceback.format_exc() + msg = f"hipify invocation failed: {exc}" + manifest["status"] = "failed" + manifest["error"] = msg + manifest["traceback"] = tb + write_json(manifest_file, manifest) + write_text(failed_file, msg + "\n" + tb) + eprint(msg) + return 1 + + write_json(manifest_file, manifest) + write_text(done_file, "done\n") + + if args.verbose: + print("[hipify_torch] HIPIFY_STATUS=DONE") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/run_hipify_clang.py b/tools/run_hipify_clang.py new file mode 100644 index 0000000..6ae9bf1 --- /dev/null +++ b/tools/run_hipify_clang.py @@ -0,0 +1,418 @@ +"""Run hipify-clang for one file and insist on a real output artifact. + +This wrapper exists because the direct CLI has not been reliable enough to use +bare from CMake on ROCm 7.2. 
Observed failure modes include:

- success exit status with no output file when using direct ``-o`` mode
- ``-p <build-path>`` rejecting some invocations with
  ``conflict: -o and multiple source files are specified``
- CUDA arch flags accepted by hipify-clang but not always propagated into its
  internal CUDA compile step

The wrapper therefore records a manifest, prints the exact command on failure,
and treats "success without a translated file" as an error.

The optional manifest is meant to be a compact postmortem record for one
requested translation. Top-level fields summarize the final outcome that CMake
should care about, while ``attempts`` preserves the detailed history when the
wrapper retries. In practice:

- ``status`` is the final wrapper result: ``done`` or ``failed``
- ``command`` is the command from the selected or last attempt
- ``selected_attempt_index`` and ``used_build_path`` explain which retry path
  actually produced the final result
- ``output_mode`` distinguishes normal ``-o`` output from the stdout fallback
- ``attempts`` contains per-attempt commands, return codes, warnings, stderr,
  and any retry reason

When debugging, read the manifest from the top-level summary first, then inspect
the relevant entry in ``attempts`` for the exact tool invocation and stderr.

If a future hipify-clang release stabilizes the CLI, re-check whether direct
``-o`` mode always materializes the file, whether ``-p`` works across the file
types LEAP translates, and whether CUDA arch / extra-arg handling matches the
tool's final internal compile behavior.
+""" + +from __future__ import annotations + +import argparse +import json +import pathlib +import shlex +import subprocess +import sys +import traceback +from typing import Any + + +BUILD_PATH_CONFLICT_MARKERS = ( + "conflict: -o and multiple source files are specified", +) + +ARCH_PROPAGATION_FAILURE_MARKERS = ( + "ptxas", + "nvlink", + "unsupported gpu architecture", + "unsupported .version", +) + + +def eprint(*args: object) -> None: + print(*args, file=sys.stderr) + + +def write_text(path: pathlib.Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +def write_json(path: pathlib.Path, data: dict[str, Any]) -> None: + write_text(path, json.dumps(data, indent=2, sort_keys=True) + "\n") + + +def build_command( + *, + hipify: pathlib.Path, + infile: pathlib.Path, + outfile: pathlib.Path, + build_path: str | None, + include_dirs: list[str], + response_file: str | None, + hipify_args: list[str], + extra_args: list[str], +) -> list[str]: + cmd: list[str] = [str(hipify)] + + if build_path: + # Some source translations need compile_commands.json context, but this + # mode is also one of the areas where hipify-clang has been flaky. + cmd.extend(["-p", str(pathlib.Path(build_path).resolve())]) + + # Prefer hipify-clang's own file output mode rather than redirecting stdout + # from CMake, then validate that the tool actually created something. 
+ cmd.extend(["-o", str(outfile)]) + + for inc in include_dirs: + cmd.append(f"-I{pathlib.Path(inc).resolve()}") + + cmd.extend(load_response_file(response_file)) + cmd.extend(hipify_args) + + for extra in extra_args: + cmd.append(f"--extra-arg={extra}") + + cmd.append(str(infile)) + return cmd + + +def remove_stale_output(path: pathlib.Path) -> None: + if path.exists(): + path.unlink() + + +def stderr_has_marker(stderr: str, markers: tuple[str, ...]) -> bool: + lowered = stderr.lower() + return any(marker in lowered for marker in markers) + + +def write_stdout_fallback(outfile: pathlib.Path, stdout: str) -> bool: + if not stdout: + return False + write_text(outfile, stdout) + return True + + +def summarize_arch_failure(stderr: str) -> str: + return ( + "hipify-clang failed inside its internal CUDA compile/parsing step. " + "This usually means the tool did not honor the requested CUDA arch or " + "related clang arguments consistently. Try the torch hipify path first; " + "otherwise re-check LEAP_HIPIFY_CUDA_GPU_ARCH, ROCm version, and the " + "tool's internal compiler defaults." + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run hipify-clang on a single file and write translated output." 
+ ) + parser.add_argument("hipify_clang", help="Path to hipify-clang executable") + parser.add_argument("input", help="Input source/header file") + parser.add_argument("output", help="Output translated file") + parser.add_argument( + "--build-path", + default=None, + help="Optional build directory containing compile_commands.json for hipify-clang -p", + ) + + parser.add_argument( + "-I", + "--include-dir", + action="append", + default=[], + help="Include directory to pass to hipify-clang, may be repeated", + ) + parser.add_argument( + "--extra-arg", + action="append", + default=[], + help="Extra clang argument to pass through via hipify-clang --extra-arg=..., may be repeated", + ) + parser.add_argument( + "--hipify-arg", + action="append", + default=[], + help="Raw top-level argument to pass directly to hipify-clang, may be repeated", + ) + parser.add_argument( + "--response-file", + default=None, + help="Optional text file containing additional arguments, one shell-style string or line", + ) + parser.add_argument( + "--manifest", + default=None, + help=( + "Optional JSON sidecar file. Records the final wrapper outcome plus " + "per-attempt commands, return codes, warnings, and stderr for retries." 
+ ), + ) + parser.add_argument( + "--print-command", + action="store_true", + help="Print the final hipify-clang command to stderr", + ) + parser.add_argument( + "--stderr-to-manifest", + action="store_true", + help="Store stderr in the manifest on success as well as failure", + ) + parser.add_argument( + "--allow-empty-stdout", + action="store_true", + help="Deprecated compatibility flag; retained for older invocations", + ) + return parser.parse_args() + + +def load_response_file(path: str | None) -> list[str]: + if not path: + return [] + text = pathlib.Path(path).read_text(encoding="utf-8") + args: list[str] = [] + for line in text.splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + args.extend(shlex.split(line)) + return args + + +def main() -> int: + args = parse_args() + + hipify = pathlib.Path(args.hipify_clang).resolve() + infile = pathlib.Path(args.input).resolve() + outfile = pathlib.Path(args.output).resolve() + manifest_path = pathlib.Path(args.manifest).resolve() if args.manifest else None + + outfile.parent.mkdir(parents=True, exist_ok=True) + + # Keep top-level fields focused on the final outcome, and put retry-specific + # diagnostics in manifest["attempts"] so callers can inspect one stable + # summary record without losing the detailed execution history. 
+ manifest: dict[str, Any] = { + "hipify_clang": str(hipify), + "input": str(infile), + "output": str(outfile), + "build_path": str(pathlib.Path(args.build_path).resolve()) if args.build_path else None, + "cwd": str(pathlib.Path.cwd()), + "include_dirs": [str(pathlib.Path(p).resolve()) for p in args.include_dir], + "hipify_args": args.hipify_arg, + "extra_args": args.extra_arg, + "response_file": str(pathlib.Path(args.response_file).resolve()) if args.response_file else None, + "status": "starting", + "attempts": [], + } + + attempt_build_paths: list[str | None] = [args.build_path] + if args.build_path: + attempt_build_paths.append(None) + + final_proc: subprocess.CompletedProcess[str] | None = None + + for attempt_index, attempt_build_path in enumerate(attempt_build_paths, start=1): + cmd = build_command( + hipify=hipify, + infile=infile, + outfile=outfile, + build_path=attempt_build_path, + include_dirs=args.include_dir, + response_file=args.response_file, + hipify_args=args.hipify_arg, + extra_args=args.extra_arg, + ) + manifest["command"] = cmd + + if args.print_command: + eprint("[run_hipify_clang] command:") + eprint(" " + " ".join(shlex.quote(x) for x in cmd)) + + remove_stale_output(outfile) + + attempt_record: dict[str, Any] = { + "attempt_index": attempt_index, + "build_path": str(pathlib.Path(attempt_build_path).resolve()) if attempt_build_path else None, + "command": cmd, + } + + try: + proc = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + except Exception as exc: + tb = traceback.format_exc() + attempt_record["status"] = "failed_to_start" + attempt_record["error"] = f"failed to start hipify-clang: {exc}" + attempt_record["traceback"] = tb + manifest["attempts"].append(attempt_record) + manifest["status"] = "failed" + manifest["error"] = attempt_record["error"] + manifest["traceback"] = tb + if manifest_path: + write_json(manifest_path, manifest) + eprint(manifest["error"]) + return 1 + + final_proc = proc 
+ attempt_record["returncode"] = proc.returncode + attempt_record["stdout_size"] = len(proc.stdout) + attempt_record["stderr_size"] = len(proc.stderr) + if proc.stdout: + attempt_record["stdout"] = proc.stdout + if proc.stderr: + attempt_record["stderr"] = proc.stderr + + if proc.returncode != 0: + if attempt_build_path and stderr_has_marker(proc.stderr, BUILD_PATH_CONFLICT_MARKERS): + attempt_record["status"] = "retry_without_build_path" + attempt_record["warning"] = ( + "hipify-clang rejected the -p/-o combination; retrying without compile_commands.json context" + ) + manifest["attempts"].append(attempt_record) + continue + + attempt_record["status"] = "failed" + if stderr_has_marker(proc.stderr, ARCH_PROPAGATION_FAILURE_MARKERS): + attempt_record["error_hint"] = summarize_arch_failure(proc.stderr) + manifest["attempts"].append(attempt_record) + manifest["status"] = "failed" + manifest["returncode"] = proc.returncode + manifest["stdout"] = proc.stdout + manifest["stderr"] = proc.stderr + manifest["error"] = attempt_record.get("error_hint", "hipify-clang failed") + if manifest_path: + write_json(manifest_path, manifest) + eprint("[run_hipify_clang] command:") + eprint(" " + " ".join(shlex.quote(x) for x in cmd)) + if manifest.get("error"): + eprint(manifest["error"]) + if proc.stderr: + sys.stderr.write(proc.stderr) + return proc.returncode + + if not outfile.exists(): + if write_stdout_fallback(outfile, proc.stdout): + attempt_record["status"] = "done" + attempt_record["output_mode"] = "stdout_fallback" + elif attempt_build_path: + attempt_record["status"] = "retry_without_build_path" + attempt_record["warning"] = ( + "hipify-clang returned success without creating the requested output file; retrying without compile_commands.json context" + ) + manifest["attempts"].append(attempt_record) + continue + else: + attempt_record["status"] = "failed" + attempt_record["error"] = "hipify-clang succeeded but did not create the requested output file" + 
manifest["attempts"].append(attempt_record) + manifest["status"] = "failed" + manifest["stdout"] = proc.stdout + manifest["stderr"] = proc.stderr + manifest["error"] = attempt_record["error"] + manifest["output_mode"] = "missing_output_file" + if manifest_path: + write_json(manifest_path, manifest) + eprint("[run_hipify_clang] command:") + eprint(" " + " ".join(shlex.quote(x) for x in cmd)) + eprint(manifest["error"]) + if proc.stderr: + sys.stderr.write(proc.stderr) + return 1 + else: + attempt_record["status"] = "done" + attempt_record["output_mode"] = "hipify_o" + + output_size = outfile.stat().st_size + if output_size == 0: + if attempt_build_path: + attempt_record["status"] = "retry_without_build_path" + attempt_record["warning"] = ( + "hipify-clang created an empty output file; retrying without compile_commands.json context" + ) + manifest["attempts"].append(attempt_record) + continue + + attempt_record["status"] = "failed" + attempt_record["error"] = "hipify-clang created an empty output file" + manifest["attempts"].append(attempt_record) + manifest["status"] = "failed" + manifest["stdout"] = proc.stdout + manifest["stderr"] = proc.stderr + manifest["error"] = attempt_record["error"] + manifest["output_mode"] = "empty_output_file" + if manifest_path: + write_json(manifest_path, manifest) + eprint("[run_hipify_clang] command:") + eprint(" " + " ".join(shlex.quote(x) for x in cmd)) + eprint(manifest["error"]) + if proc.stderr: + sys.stderr.write(proc.stderr) + return 1 + + attempt_record["output_size"] = output_size + manifest["attempts"].append(attempt_record) + manifest["status"] = "done" + manifest["returncode"] = proc.returncode + manifest["output_mode"] = attempt_record["output_mode"] + manifest["stdout_size"] = len(proc.stdout) + manifest["output_size"] = output_size + manifest["selected_attempt_index"] = attempt_index + manifest["used_build_path"] = attempt_build_path is not None + if proc.stdout: + manifest["stdout"] = proc.stdout + if 
args.stderr_to_manifest and proc.stderr: + manifest["stderr"] = proc.stderr + break + + if manifest["status"] != "done": + manifest["status"] = "failed" + manifest["error"] = "hipify-clang exhausted its retry strategy without producing a usable output file" + if manifest_path: + write_json(manifest_path, manifest) + if final_proc and final_proc.stderr: + sys.stderr.write(final_proc.stderr) + return 1 + + if manifest_path: + write_json(manifest_path, manifest) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())