From 8d672f5fccd147d4f7797dbd8694cfd8eb434f6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Sat, 25 Apr 2026 21:20:47 +0200
Subject: [PATCH 01/24] Initial port of CUDA support

---
 cmake/typeartToolchainOptions.cmake           |  7 ++
 lib/passes/TypeARTPass.cpp                    | 10 +++
 lib/passes/analysis/MemOpData.h               |  7 ++
 lib/passes/analysis/MemOpVisitor.cpp          | 12 ++-
 lib/passes/configuration/PassBuilderUtil.h    |  7 +-
 .../CallBackFunctionInserter.cpp              |  8 +-
 .../instrumentation/MemOpArgCollector.cpp     |  8 ++
 .../instrumentation/MemOpInstrumentation.cpp  | 28 ++++++-
 .../instrumentation/TypeARTFunctions.cpp      | 48 ++++++-----
 lib/passes/instrumentation/TypeARTFunctions.h | 15 +++-
 lib/passes/support/CudaUtil.h                 | 84 +++++++++++++++++++
 lib/passes/support/TypeUtil.cpp               | 15 ++++
 lib/passes/support/TypeUtil.h                 |  4 +
 lib/passes/typegen/ir/TypeManager.cpp         | 50 ++++++++++-
 lib/runtime/AllocationTracking.cpp            | 12 +++
 lib/runtime/CMakeLists.txt                    | 22 +++++
 lib/runtime/CallbackInterface.h               |  3 +
 lib/runtime/CudaRuntimeInterface.h            | 36 ++++++++
 lib/runtime/CudaSupport.cpp                   | 43 ++++++++++
 test/CMakeLists.txt                           | 65 ++++++++++++++
 test/cuda/pass/01_cudamalloc.c                | 16 ++++
 test/cuda/pass/02_cudafree.c                  | 14 ++++
 test/cuda/pass/03_cudahostalloc.c             | 14 ++++
 test/cuda/pass/06_cudamalloc_nonvoid.cpp      | 25 ++++++
 test/cuda/pass/07_axpy.c                      | 32 +++++++
 test/lit.cfg                                  | 12 +++
 test/lit.site.cfg.in                          |  4 +
 test/pass/misc/05_make_all_callbacks.c        |  4 +-
 28 files changed, 565 insertions(+), 40 deletions(-)
 create mode 100644 lib/passes/support/CudaUtil.h
 create mode 100644 lib/runtime/CudaRuntimeInterface.h
 create mode 100644 lib/runtime/CudaSupport.cpp
 create mode 100644 test/cuda/pass/01_cudamalloc.c
 create mode 100644 test/cuda/pass/02_cudafree.c
 create mode 100644 test/cuda/pass/03_cudahostalloc.c
 create mode 100644 test/cuda/pass/06_cudamalloc_nonvoid.cpp
 create mode 100644 test/cuda/pass/07_axpy.c

diff --git a/cmake/typeartToolchainOptions.cmake b/cmake/typeartToolchainOptions.cmake
index 741c9b3e..e0f3f8ed 100644
--- a/cmake/typeartToolchainOptions.cmake
+++ b/cmake/typeartToolchainOptions.cmake
@@ -163,6 +163,13 @@ set_package_properties(Python3 PROPERTIES
   "The Python3 interpreter is used for lit-testing and the MPI interceptor tool code generation."
 )
 
+find_package(CUDAToolkit QUIET)
+set_package_properties(CUDAToolkit PROPERTIES
+  TYPE OPTIONAL
+  PURPOSE
+  "CUDA toolkit enables host-side CUDA instrumentation and runtime helpers."
+)
+
 typeart_find_llvm_progs(TYPEART_CLANG_EXEC "clang-${LLVM_VERSION_MAJOR};clang" DEFAULT_EXE "clang")
 typeart_find_llvm_progs(TYPEART_CLANGCXX_EXEC "clang++-${LLVM_VERSION_MAJOR};clang++" DEFAULT_EXE "clang++")
 typeart_find_llvm_progs(TYPEART_LLC_EXEC "llc-${LLVM_VERSION_MAJOR};llc" DEFAULT_EXE "llc")
diff --git a/lib/passes/TypeARTPass.cpp b/lib/passes/TypeARTPass.cpp
index cf63ac3b..79256a4b 100644
--- a/lib/passes/TypeARTPass.cpp
+++ b/lib/passes/TypeARTPass.cpp
@@ -25,6 +25,7 @@
 #include "instrumentation/TypeARTFunctions.h"
 #include "instrumentation/TypeIDProvider.h"
 #include "support/ConfigurationBase.h"
+#include "support/CudaUtil.h"
 #include "support/Logger.h"
 #include "support/ModuleDumper.h"
 #include "support/Table.h"
@@ -280,6 +281,11 @@ run(llvm::Module& m, llvm::ModuleAnalysisManager&) {
 }
 
 bool runOnModule(llvm::Module& m) {
+  if (cuda::is_device_module(m)) {
+    LOG_DEBUG("Skipping CUDA device module: " << m.getName());
+    return false;
+  }
+
   meminst_finder->runOnModule(m);
   const bool instrument_global = configuration()[config::ConfigStdArgs::global];
   bool globals_were_instrumented{false};
@@ -324,6 +330,10 @@ bool runOnFunc(llvm::Function& f) {
     return false;
   }
 
+  if (cuda::is_cuda_helper_function(f)) {
+    return false;
+  }
+
   if (!meminst_finder->hasFunctionData(f)) {
     LOG_WARNING("No allocation data could be retrieved for function: " << f.getName());
     return false;
diff --git a/lib/passes/analysis/MemOpData.h b/lib/passes/analysis/MemOpData.h
index 34d173e5..b96db234 100644
--- a/lib/passes/analysis/MemOpData.h
+++ b/lib/passes/analysis/MemOpData.h
@@ -39,6 +39,7 @@ enum class MemOpKind : uint8_t {
   ReallocLike        = 1 << 4,            // re-allocated (existing) memory
   FreeLike           = 1 << 5,            // free memory
   DeleteLike         = 1 << 6,            // delete (cpp) memory
+  CudaMallocLike     = 1 << 7,            // cuda out-parameter allocation
   MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike,
   AllocLike          = MallocOrCallocLike,
   AnyAlloc           = AllocLike | ReallocLike,
@@ -101,6 +102,10 @@ struct MemOps {
       {"_ZnajSt11align_val_tRKSt9nothrow_t", MemOpKind::MallocLike}, /*new[](unsigned int, align_val_t, nothrow)*/
       {"_ZnamSt11align_val_t", MemOpKind::NewLike},                  /*new[](unsigned long, align_val_t)*/
       {"_ZnamSt11align_val_tRKSt9nothrow_t", MemOpKind::MallocLike}, /*new[](unsigned long, align_val_t, nothrow)*/
+      {"cudaMalloc", MemOpKind::CudaMallocLike},
+      {"cudaHostAlloc", MemOpKind::CudaMallocLike},
+      {"cudaMallocHost", MemOpKind::CudaMallocLike},
+      {"cudaMallocManaged", MemOpKind::CudaMallocLike},
   };
 
   const llvm::StringMap<MemOpKind> dealloc_map{
@@ -119,6 +124,8 @@ struct MemOps {
       {"_ZdlPvmSt11align_val_t", MemOpKind::DeleteLike},              /* delete(void*, unsigned long, align_val_t) */
       {"_ZdaPvjSt11align_val_t", MemOpKind::DeleteLike},              /* delete[](void*, unsigned int, align_val_t) */
       {"_ZdaPvmSt11align_val_t", MemOpKind::DeleteLike},              /* delete[](void*, unsigned long, align_val_t) */
+      {"cudaFree", MemOpKind::FreeLike},
+      {"cudaFreeHost", MemOpKind::FreeLike},
   };
   //clang-format off
 };
diff --git a/lib/passes/analysis/MemOpVisitor.cpp b/lib/passes/analysis/MemOpVisitor.cpp
index 9b835cc8..c42a92ea 100644
--- a/lib/passes/analysis/MemOpVisitor.cpp
+++ b/lib/passes/analysis/MemOpVisitor.cpp
@@ -16,6 +16,7 @@
 #include "compat/CallSite.h"
 #include "configuration/Configuration.h"
 #include "support/ConfigurationBase.h"
+#include "support/CudaUtil.h"
 #include "support/Error.h"
 #include "support/Logger.h"
 #include "support/TypeUtil.h"
@@ -222,10 +223,17 @@ void collect_casts_from_stack(llvm::StoreInst* store_inst, MallocBcasts& out_bca
   }
 }
 
-std::pair<MallocGeps, MallocBcasts> collectRelevantMallocUsers(llvm::CallBase& call_inst) {
+std::pair<MallocGeps, MallocBcasts> collectRelevantMallocUsers(llvm::CallBase& call_inst, MemOpKind kind) {
   auto geps   = MallocGeps{};
   auto bcasts = MallocBcasts{};
 
+  if (kind == MemOpKind::CudaMallocLike) {
+    if (auto bitcast = cuda::bitcast_for(call_inst); bitcast.has_value()) {
+      bcasts.insert(*bitcast);
+    }
+    return {geps, bcasts};
+  }
+
   for (auto* user : call_inst.users()) {
     if (auto* bit_cast = llvm::dyn_cast<llvm::BitCastInst>(user)) {
       bcasts.insert(bit_cast);
@@ -325,7 +333,7 @@ std::optional<ArrayCookieData> handleArrayCookie(llvm::CallBase& ci, const Mallo
 }
 
 void MemOpVisitor::visitMallocLike(llvm::CallBase& ci, MemOpKind k) {
-  auto [geps, bcasts] = collectRelevantMallocUsers(ci);
+  auto [geps, bcasts] = collectRelevantMallocUsers(ci, k);
   auto primary_cast   = bcasts.empty() ? nullptr : *bcasts.begin();
   auto array_cookie   = handleArrayCookie(ci, geps, bcasts, primary_cast);
   if (primary_cast == nullptr) {
diff --git a/lib/passes/configuration/PassBuilderUtil.h b/lib/passes/configuration/PassBuilderUtil.h
index cea89fcd..94fac7ce 100644
--- a/lib/passes/configuration/PassBuilderUtil.h
+++ b/lib/passes/configuration/PassBuilderUtil.h
@@ -16,6 +16,7 @@
 #define TYPEART_PASS_BUILDER_UTIL_H
 
 #include "support/Logger.h"
+#include "support/Util.h"
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
@@ -29,11 +30,7 @@ inline bool checkParametrizedPassName(llvm::StringRef Name, llvm::StringRef Pass
   // normal pass name w/o parameters == default parameters
   if (Name.empty())
     return true;
-#if LLVM_VERSION_MAJOR > 15
-  return Name.starts_with("<") && Name.ends_with(">");
-#else
-  return Name.startswith("<") && Name.endswith(">");
-#endif
+  return starts_with_any_of(Name, "<") && ends_with_any_of(Name, ">");
 }
 
 /// This performs customized parsing of pass name with parameters.
diff --git a/lib/passes/instrumentation/CallBackFunctionInserter.cpp b/lib/passes/instrumentation/CallBackFunctionInserter.cpp
index cd89f9f7..b9c1d361 100644
--- a/lib/passes/instrumentation/CallBackFunctionInserter.cpp
+++ b/lib/passes/instrumentation/CallBackFunctionInserter.cpp
@@ -5,6 +5,7 @@
 #include "support/ConfigurationBase.h"
 #include "support/Logger.h"
 
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
@@ -55,8 +56,9 @@ llvm::CallInst* CallbackFunctionInserter::create_instrumentation_call(llvm::IRBu
   const auto callback_id = ifunc_for_function(callback_type, instruction_or_value);
   auto type_id_param_out = type_id_handler_->getOrRegister(args.typeid_value);
 
-  const auto mode = llvm::isa<llvm::GlobalVariable>(type_id_param_out) ? mode_ : TypeSerializationImplementation::FILE;
-  auto function   = function_query_->getFunctionFor(callback_id, mode);
+  const bool has_global_type_payload = !llvm::isa<llvm::ConstantInt>(type_id_param_out);
+  const auto mode                    = has_global_type_payload ? mode_ : TypeSerializationImplementation::FILE;
+  auto function                      = function_query_->getFunctionFor(callback_id, mode);
 
   return IRB.CreateCall(function,
                         llvm::ArrayRef<llvm::Value*>{args.pointer_value, type_id_param_out, args.element_count});
@@ -91,4 +93,4 @@ std::unique_ptr<InstrumentationInserter> make_callback_inserter(const config::Co
   return std::make_unique<CallbackFunctionInserter>(configuration, std::move(type_id_handler), function_query);
 }
 
-}  // namespace typeart
\ No newline at end of file
+}  // namespace typeart
diff --git a/lib/passes/instrumentation/MemOpArgCollector.cpp b/lib/passes/instrumentation/MemOpArgCollector.cpp
index bd6315ed..b70ba713 100644
--- a/lib/passes/instrumentation/MemOpArgCollector.cpp
+++ b/lib/passes/instrumentation/MemOpArgCollector.cpp
@@ -104,6 +104,14 @@ HeapArgList MemOpArgCollector::collectHeap(const MallocDataList& mallocs) {
       case MemOpKind::AlignedAllocLike:
         byte_count = malloc_call->getArgOperand(1);
         break;
+      case MemOpKind::CudaMallocLike:
+        byte_count = malloc_call->getArgOperand(1);
+        if (mdata.primary != nullptr) {
+          pointer = mdata.primary->getOperand(0);
+        } else {
+          pointer = malloc_call->getArgOperand(0);
+        }
+        break;
       default:
         LOG_ERROR("Unknown malloc kind. Not instrumenting. " << util::dump(*malloc_call));
         // TODO see above continues
diff --git a/lib/passes/instrumentation/MemOpInstrumentation.cpp b/lib/passes/instrumentation/MemOpInstrumentation.cpp
index 950bfd8d..d16a99f5 100644
--- a/lib/passes/instrumentation/MemOpInstrumentation.cpp
+++ b/lib/passes/instrumentation/MemOpInstrumentation.cpp
@@ -72,8 +72,13 @@ InstrCount MemOpInstrumentation::instrumentHeap(const HeapArgList& heap) {
   const bool is_llvm_ir_type = static_cast<int>(type_gen) == static_cast<int>(TypegenImplementation::IR);
 
   for (const auto& [malloc, args] : heap) {
-    auto kind         = malloc.kind;
-    auto* malloc_call = args.get_as<Instruction>(ArgMap::ID::pointer);
+    auto kind = malloc.kind;
+    Instruction* malloc_call{nullptr};
+    if (malloc.kind == MemOpKind::CudaMallocLike) {
+      malloc_call = llvm::cast<Instruction>(malloc.call);
+    } else {
+      malloc_call = args.get_as<llvm::Instruction>(ArgMap::ID::pointer);
+    }
 
     Instruction* insertBefore = malloc_call->getNextNode();
     if (malloc.array_cookie) {
@@ -89,6 +94,7 @@ InstrCount MemOpInstrumentation::instrumentHeap(const HeapArgList& heap) {
 
     auto typeid_value    = args.get_as<ConstantInt>(ArgMap::ID::type_id);
     auto type_size_value = args.get_value(ArgMap::ID::type_size);
+    Value* pointer_value = args.get_value(ArgMap::ID::pointer);
 
     bool single_byte_type{false};
     if (auto* const_int = llvm::dyn_cast<ConstantInt>(type_size_value)) {
@@ -143,12 +149,26 @@ InstrCount MemOpInstrumentation::instrumentHeap(const HeapArgList& heap) {
                                                             target_memory_address);
         break;
       }
+      case MemOpKind::CudaMallocLike: {
+        auto* runtime_ptr_type = instrumentation_helper->getTypeFor(IType::ptr);
+#if LLVM_VERSION_MAJOR >= 15
+        auto* loaded_ptr = IRB.CreateLoad(runtime_ptr_type, pointer_value);
+#else
+        auto* pointer_slot_type = llvm::PointerType::get(runtime_ptr_type, 0);
+        auto* pointer_slot      = IRB.CreateBitOrPointerCast(pointer_value, pointer_slot_type);
+        auto* loaded_ptr        = IRB.CreateLoad(runtime_ptr_type, pointer_slot);
+#endif
+        pointer_value = IRB.CreateBitOrPointerCast(loaded_ptr, instrumentation_helper->getTypeFor(IType::ptr));
+        auto bytes    = args.get_value(ArgMap::ID::byte_count);
+        element_count = calculate_element_count(bytes);
+        break;
+      }
       default:
         LOG_ERROR("Unknown malloc kind. Not instrumenting. " << util::dump(*malloc_call));
         continue;
     }
 
-    function_instrumenter_->insert_heap_instrumentation(IRB, malloc.call, {malloc_call, element_count, typeid_value});
+    function_instrumenter_->insert_heap_instrumentation(IRB, malloc.call, {pointer_value, element_count, typeid_value});
 
     // const auto callback_id = ifunc_for_function(IFunc::heap, malloc.call);
     // auto type_id_param     = function_instrumenter->getOrRegister(typeid_value);
@@ -294,4 +314,4 @@ InstrCount MemOpInstrumentation::instrumentGlobal(const GlobalArgList& globals)
   return counter;
 }
 
-}  // namespace typeart
\ No newline at end of file
+}  // namespace typeart
diff --git a/lib/passes/instrumentation/TypeARTFunctions.cpp b/lib/passes/instrumentation/TypeARTFunctions.cpp
index 0a1a8ea5..67168d47 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.cpp
+++ b/lib/passes/instrumentation/TypeARTFunctions.cpp
@@ -16,6 +16,7 @@
 #include "configuration/Configuration.h"
 #include "instrumentation/TypeIDProvider.h"
 #include "support/ConfigurationBase.h"
+#include "support/CudaUtil.h"
 #include "support/Logger.h"
 #include "support/OmpUtil.h"
 
@@ -49,9 +50,9 @@ namespace typeart {
 namespace detail {
 std::string get_func_suffix(IFunc id) {
   switch (id) {
-    // case IFunc::free_cuda:
-    // case IFunc::heap_cuda:
-    //   return "_cuda";
+    case IFunc::free_cuda:
+    case IFunc::heap_cuda:
+      return "_cuda";
     case IFunc::free_omp:
     case IFunc::heap_omp:
     case IFunc::stack_omp:
@@ -69,6 +70,10 @@ IFuncType ifunc_type_for(llvm::Function* f) {
     return IFuncType::standard;
   }
 
+  if (cuda::is_cuda_function(*f)) {
+    return IFuncType::cuda;
+  }
+
   if (util::omp::isOmpContext(f)) {
     return IFuncType::omp;
   }
@@ -88,28 +93,27 @@ IFunc ifunc_for_function(IFunc general_type, llvm::Value* value) {
   } else if (llvm::isa<GlobalVariable>(value)) {
     type = detail::ifunc_type_for(nullptr);
   } else if (auto callbase = llvm::dyn_cast<CallBase>(value)) {
-    type = detail::ifunc_type_for(callbase->getFunction());
-    // auto maybe_cuda = detail::ifunc_type_for(callbase->getCalledFunction());
-    // if (maybe_cuda == detail::IFuncType::cuda) {
-    //   type = detail::IFuncType::cuda;
-    // }
+    type            = detail::ifunc_type_for(callbase->getFunction());
+    auto maybe_cuda = detail::ifunc_type_for(callbase->getCalledFunction());
+    if (maybe_cuda == detail::IFuncType::cuda) {
+      type = detail::IFuncType::cuda;
+    }
   }
 
   if (detail::IFuncType::standard == type) {
     return general_type;
   }
 
-  // if (detail::IFuncType::cuda == type) {
-  //   switch (general_type) {
-  //     case IFunc::heap:
-  //       return IFunc::heap_cuda;
-  //     case IFunc::free:
-  //       return IFunc::free_cuda;
-  //     default:
-  //       return general_type;
-  //       //        llvm_unreachable("IFunc not supported for CUDA.");
-  //   }
-  // }
+  if (detail::IFuncType::cuda == type) {
+    switch (general_type) {
+      case IFunc::heap:
+        return IFunc::heap_cuda;
+      case IFunc::free:
+        return IFunc::free_cuda;
+      default:
+        return general_type;
+    }
+  }
 
   switch (general_type) {
     case IFunc::stack:
@@ -287,6 +291,8 @@ TypeArtFunc typeart_alloc_omp        = typeart_alloc;
 TypeArtFunc typeart_alloc_stacks_omp = typeart_alloc_stack;
 TypeArtFunc typeart_free_omp         = typeart_free;
 TypeArtFunc typeart_leave_scope_omp  = typeart_leave_scope;
+TypeArtFunc typeart_alloc_cuda       = typeart_alloc;
+TypeArtFunc typeart_free_cuda        = typeart_free;
 
 TypeArtFunc typeart_alloc_mty{"__typeart_alloc_mty"};
 TypeArtFunc typeart_alloc_stack_mty{"__typeart_alloc_stack_mty"};
@@ -318,6 +324,7 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
       decl_alternatives.make_function(IFunc::stack, typeart_alloc_stack_mty.name, alloc_arg_types_mty);
   typeart_alloc_global_mty.f =
       decl_alternatives.make_function(IFunc::global, typeart_alloc_global_mty.name, alloc_arg_types_mty);
+  functions_alternative.putFunctionFor(IFunc::heap_cuda, llvm::cast<llvm::Function>(typeart_alloc_mty.f));
   typeart_register_type.f = decl.make_function(IFunc::type, typeart_register_type.name, free_arg_types);
 
   typeart_alloc.f        = decl.make_function(IFunc::heap, typeart_alloc.name, alloc_arg_types);
@@ -332,6 +339,9 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
 
   typeart_leave_scope_omp.f = decl.make_function(IFunc::scope_omp, typeart_leave_scope_omp.name, leavescope_arg_types);
 
+  typeart_alloc_cuda.f = decl.make_function(IFunc::heap_cuda, typeart_alloc_cuda.name, alloc_arg_types);
+  typeart_free_cuda.f  = decl.make_function(IFunc::free_cuda, typeart_free_cuda.name, free_arg_types);
+
   typeart_alloc_omp_mty.f =
       decl_alternatives.make_function(IFunc::heap_omp, typeart_alloc_omp_mty.name, alloc_arg_types_mty);
   typeart_alloc_stacks_omp_mty.f =
diff --git a/lib/passes/instrumentation/TypeARTFunctions.h b/lib/passes/instrumentation/TypeARTFunctions.h
index 21c135da..3128d92e 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.h
+++ b/lib/passes/instrumentation/TypeARTFunctions.h
@@ -32,7 +32,20 @@ namespace config {
 class Configuration;
 }
 
-enum class IFunc : unsigned { heap, stack, global, free, scope, heap_omp, stack_omp, free_omp, scope_omp, type };
+enum class IFunc : unsigned {
+  heap,
+  stack,
+  global,
+  free,
+  scope,
+  heap_omp,
+  stack_omp,
+  free_omp,
+  scope_omp,
+  heap_cuda,
+  free_cuda,
+  type
+};
 
 IFunc ifunc_for_function(IFunc general_type, llvm::Value* value);
 
diff --git a/lib/passes/support/CudaUtil.h b/lib/passes/support/CudaUtil.h
new file mode 100644
index 00000000..69eba5a0
--- /dev/null
+++ b/lib/passes/support/CudaUtil.h
@@ -0,0 +1,84 @@
+// TypeART library
+//
+// Copyright (c) 2017-2026 TypeART Authors
+// Distributed under the BSD 3-Clause license.
+// (See accompanying file LICENSE.txt or copy at
+// https://opensource.org/licenses/BSD-3-Clause)
+//
+// Project home: https://github.com/tudasc/TypeART
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#ifndef TYPEART_CUDAUTIL_H
+#define TYPEART_CUDAUTIL_H
+
+#include "support/Util.h"
+
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#include <optional>
+#include <string>
+
+namespace typeart::cuda {
+
+inline std::optional<llvm::BitCastInst*> bitcast_for(llvm::Value* cuda_ptr) {
+  std::optional<llvm::BitCastInst*> fallback;
+  for (auto& use : cuda_ptr->uses()) {
+    auto* use_value = use.get();
+    auto* bitcast   = llvm::dyn_cast<llvm::BitCastInst>(use_value);
+    if (bitcast == nullptr) {
+      continue;
+    }
+
+    if (auto* primary_bitcast = llvm::dyn_cast<llvm::BitCastInst>(bitcast->getOperand(0))) {
+      return primary_bitcast;
+    }
+
+    fallback = bitcast;
+    return fallback;
+  }
+  return fallback;
+}
+
+inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& cuda_call) {
+  return bitcast_for(cuda_call.getArgOperand(0));
+}
+
+inline bool is_device_module(const llvm::Module& module) {
+#if LLVM_VERSION_MAJOR >= 20
+  const auto triple = module.getTargetTriple().str();
+#else
+  const auto triple = module.getTargetTriple();
+#endif
+  return llvm::StringRef{triple}.find("nvptx") != llvm::StringRef::npos;
+}
+
+inline bool is_device_stub(const llvm::Function& function) {
+  const auto function_name = util::demangle(function.getName());
+  return function_name.find("__device_stub__") != std::string::npos;
+}
+
+inline bool is_dim3_init(const llvm::Function& function) {
+  const auto function_name = util::demangle(function.getName());
+  return function_name.find("dim3::dim3") != std::string::npos;
+}
+
+inline bool is_cuda_function(const llvm::Function& function) {
+  const auto function_name = llvm::StringRef{function.getName()};
+  return util::starts_with_any_of(function_name, "cuda");
+}
+
+inline bool is_cuda_helper_function(const llvm::Function& function) {
+  if (is_device_stub(function) || is_dim3_init(function)) {
+    return true;
+  }
+  const auto function_name = llvm::StringRef{function.getName()};
+  return util::starts_with_any_of(function_name, "__cuda");
+}
+
+}  // namespace typeart::cuda
+
+#endif  // TYPEART_CUDAUTIL_H
diff --git a/lib/passes/support/TypeUtil.cpp b/lib/passes/support/TypeUtil.cpp
index 94c9fe75..653ec0d4 100644
--- a/lib/passes/support/TypeUtil.cpp
+++ b/lib/passes/support/TypeUtil.cpp
@@ -24,6 +24,8 @@
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include <optional>
+
 using namespace llvm;
 
 namespace typeart::util::type {
@@ -106,4 +108,17 @@ unsigned getPointerSizeInBytes(llvm::Type* /*ptrT*/, const llvm::DataLayout& dl)
   return dl.getPointerSizeInBits() / 8;
 }
 
+std::optional<llvm::Type*> getPointerElementType(llvm::Type* ptr_type) {
+  auto* pointer_type = dyn_cast_or_null<llvm::PointerType>(ptr_type);
+  if (pointer_type == nullptr) {
+    return {};
+  }
+
+#if LLVM_VERSION_MAJOR < 15
+  return pointer_type->getPointerElementType();
+#else
+  return {};
+#endif
+}
+
 }  // namespace typeart::util::type
diff --git a/lib/passes/support/TypeUtil.h b/lib/passes/support/TypeUtil.h
index 129f6d99..684e4055 100644
--- a/lib/passes/support/TypeUtil.h
+++ b/lib/passes/support/TypeUtil.h
@@ -20,6 +20,8 @@ class AllocaInst;
 class LLVMContext;
 }  // namespace llvm
 
+#include <optional>
+
 namespace typeart::util::type {
 
 #if LLVM_VERSION_MAJOR < 15
@@ -44,6 +46,8 @@ unsigned getStructSizeInBytes(llvm::Type* structT, const llvm::DataLayout& dl);
 
 unsigned getPointerSizeInBytes(llvm::Type* ptrT, const llvm::DataLayout& dl);
 
+std::optional<llvm::Type*> getPointerElementType(llvm::Type* ptr_type);
+
 }  // namespace typeart::util::type
 
 #endif  // TYPEART_TYPE_UTIL_H
diff --git a/lib/passes/typegen/ir/TypeManager.cpp b/lib/passes/typegen/ir/TypeManager.cpp
index daf26dbb..1927f920 100644
--- a/lib/passes/typegen/ir/TypeManager.cpp
+++ b/lib/passes/typegen/ir/TypeManager.cpp
@@ -15,6 +15,7 @@
 #include "IRTypeGen.h"
 #include "StructTypeHandler.h"
 #include "VectorTypeHandler.h"
+#include "support/CudaUtil.h"
 #include "support/Logger.h"
 #include "support/TypeUtil.h"
 #include "support/Util.h"
@@ -277,9 +278,27 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
   auto malloc_call            = mdata.call;
   const llvm::DataLayout& dl  = malloc_call->getModule()->getDataLayout();
   BitCastInst* primaryBitcast = mdata.primary;
+  llvm::Type* allocation_type = nullptr;
+
+  if (mdata.kind == MemOpKind::CudaMallocLike && primaryBitcast == nullptr) {
+    if (auto bitcast = cuda::bitcast_for(*malloc_call); bitcast.has_value()) {
+      primaryBitcast = *bitcast;
+    }
+  }
+
+  if (mdata.kind == MemOpKind::CudaMallocLike) {
+    allocation_type = llvm::Type::getInt8Ty(malloc_call->getContext());
+  } else {
+    auto pointee_type = tu::getPointerElementType(malloc_call->getType());
+    allocation_type   = !pointee_type ? llvm::Type::getInt8Ty(malloc_call->getContext()) : *pointee_type;
+  }
+
+  int typeId = getOrRegisterType(allocation_type, dl);  // retrieveTypeID(tu::getVoidType(c));
+
+  if (mdata.kind == MemOpKind::CudaMallocLike) {
+    typeId = TYPEART_POINTER;
+  }
 
-  int typeId = getOrRegisterType(malloc_call->getType()->getPointerElementType(),
-                                 dl);  // retrieveTypeID(tu::getVoidType(c));
   if (typeId == TYPEART_UNKNOWN_TYPE) {
     LOG_ERROR("Unknown heap type. Not instrumenting. " << util::dump(*malloc_call));
     // TODO notify caller that we skipped: via lambda callback function
@@ -287,11 +306,34 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
   };
 
   // Number of bytes per element, 1 for void*
-  unsigned typeSize = tu::getTypeSizeInBytes(malloc_call->getType()->getPointerElementType(), dl);
+  unsigned typeSize = tu::getTypeSizeInBytes(allocation_type, dl);
+
+  if (mdata.kind == MemOpKind::CudaMallocLike) {
+    typeSize = 1;
+  }
 
   // Use the first cast as the determining type (if there is any)
   if (primaryBitcast != nullptr) {
-    auto* dstPtrType = primaryBitcast->getDestTy()->getPointerElementType();
+    llvm::Type* dstPtrType = nullptr;
+    if (auto pointee_type = tu::getPointerElementType(primaryBitcast->getDestTy()); pointee_type.has_value()) {
+      dstPtrType = *pointee_type;
+    }
+    // Basically: getSrcTy()->getPointerElementType()->getPointerElementType():
+    if (mdata.kind == MemOpKind::CudaMallocLike && dstPtrType == nullptr) {
+      if (auto pointee_type = tu::getPointerElementType(primaryBitcast->getSrcTy()); pointee_type.has_value()) {
+        dstPtrType = *pointee_type;
+      }
+      if (dstPtrType != nullptr && dstPtrType->isPointerTy()) {
+        if (auto nested = tu::getPointerElementType(dstPtrType); nested.has_value()) {
+          dstPtrType = *nested;
+        }
+      }
+    }
+
+    if (dstPtrType == nullptr) {
+      LOG_WARNING("Could not resolve non-opaque pointee type for allocation cast. Keeping fallback type.")
+      return {typeId, 0};
+    }
 
     typeSize = tu::getTypeSizeInBytes(dstPtrType, dl);
 
diff --git a/lib/runtime/AllocationTracking.cpp b/lib/runtime/AllocationTracking.cpp
index afcbf973..d27b5d0f 100644
--- a/lib/runtime/AllocationTracking.cpp
+++ b/lib/runtime/AllocationTracking.cpp
@@ -289,6 +289,18 @@ void __typeart_leave_scope_omp(int alloca_count) {
   typeart::RuntimeSystem::get().allocation_tracker().onLeaveScope(alloca_count, retAddr);
 }
 
+void __typeart_alloc_cuda(const void* addr, int typeId, size_t count) {
+  TYPEART_RUNTIME_GUARD;
+  const void* retAddr = __builtin_return_address(0);
+  typeart::RuntimeSystem::get().allocation_tracker().onAlloc(addr, typeId, count, retAddr);
+}
+
+void __typeart_free_cuda(const void* addr) {
+  TYPEART_RUNTIME_GUARD;
+  const void* retAddr = __builtin_return_address(0);
+  typeart::RuntimeSystem::get().allocation_tracker().onFreeHeap(addr, retAddr);
+}
+
 void __typeart_alloc_mty(const void* addr, const void* info, size_t count) {
   TYPEART_RUNTIME_GUARD;
   const void* retAddr = __builtin_return_address(0);
diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt
index c889e7d7..c8467f71 100644
--- a/lib/runtime/CMakeLists.txt
+++ b/lib/runtime/CMakeLists.txt
@@ -18,6 +18,7 @@ add_custom_command(
 set(RUNTIME_LIB_SOURCES
     AccessCounter.h
     CallbackInterface.h
+    CudaRuntimeInterface.h
     RuntimeData.h
     RuntimeInterface.h
     TypeResolution.cpp
@@ -27,6 +28,7 @@ set(RUNTIME_LIB_SOURCES
     TypeResolution.h
     Runtime.cpp
     Runtime.h
+    CudaSupport.cpp
     ${TYPEART_META_SOURCE}
     $<$<BOOL:${TYPEART_MPI_LOGGER}>:../support/MPILogger.cpp>
 )
@@ -99,6 +101,10 @@ target_include_directories(
 
 target_include_directories(${TYPEART_PREFIX}_Runtime SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS})
 
+if(CUDAToolkit_FOUND)
+  target_include_directories(${TYPEART_PREFIX}_Runtime SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+endif()
+
 target_compile_definitions(
   ${TYPEART_PREFIX}_Runtime
   PRIVATE TYPEART_LOG_LEVEL=${TYPEART_LOG_LEVEL_RT}
@@ -108,8 +114,18 @@ target_compile_definitions(
           $<$<BOOL:${TYPEART_ABSEIL}>:TYPEART_ABSEIL>
           $<$<BOOL:${TYPEART_SAFEPTR}>:USE_SAFEPTR>
           $<$<BOOL:${TYPEART_DISABLE_THREAD_SAFETY}>:TYPEART_DISABLE_THREAD_SAFETY>
+          $<$<BOOL:${CUDAToolkit_FOUND}>:TYPEART_HAS_CUDA=1>
 )
 
+if(CUDAToolkit_FOUND)
+  if(TARGET CUDA::cudart)
+    target_link_libraries(${TYPEART_PREFIX}_Runtime PRIVATE CUDA::cudart)
+  endif()
+  if(TARGET CUDA::cuda_driver)
+    target_link_libraries(${TYPEART_PREFIX}_Runtime PRIVATE CUDA::cuda_driver)
+  endif()
+endif()
+
 typeart_target_compile_options(${TYPEART_PREFIX}_Runtime)
 typeart_target_define_file_basename(${TYPEART_PREFIX}_Runtime)
 typeart_target_coverage_options(${TYPEART_PREFIX}_Runtime)
@@ -137,6 +153,12 @@ install(FILES
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
 )
 
+if(CUDAToolkit_FOUND)
+  install(FILES CudaRuntimeInterface.h
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
+  )
+endif()
+
 install(
   TARGETS ${TYPEART_PREFIX}_Runtime
   EXPORT ${TARGETS_EXPORT_NAME}
diff --git a/lib/runtime/CallbackInterface.h b/lib/runtime/CallbackInterface.h
index 83ec2d06..b9db02b9 100644
--- a/lib/runtime/CallbackInterface.h
+++ b/lib/runtime/CallbackInterface.h
@@ -39,6 +39,9 @@ TYPEART_EXPORT void __typeart_free_omp(const void* addr);
 TYPEART_EXPORT void __typeart_alloc_stack_omp(const void* addr, int type_id, size_t count);
 TYPEART_EXPORT void __typeart_leave_scope_omp(int alloca_count);
 
+TYPEART_EXPORT void __typeart_alloc_cuda(const void* addr, int type_id, size_t count);
+TYPEART_EXPORT void __typeart_free_cuda(const void* addr);
+
 // Called for inlined type definitions mode
 TYPEART_EXPORT void __typeart_alloc_mty(const void* addr, const void* info, size_t count);
 TYPEART_EXPORT void __typeart_alloc_global_mty(const void* addr, const void* info, size_t count);
diff --git a/lib/runtime/CudaRuntimeInterface.h b/lib/runtime/CudaRuntimeInterface.h
new file mode 100644
index 00000000..2b42ee96
--- /dev/null
+++ b/lib/runtime/CudaRuntimeInterface.h
@@ -0,0 +1,36 @@
+// TypeART library
+//
+// Copyright (c) 2017-2026 TypeART Authors
+// Distributed under the BSD 3-Clause license.
+// (See accompanying file LICENSE.txt or copy at
+// https://opensource.org/licenses/BSD-3-Clause)
+//
+// Project home: https://github.com/tudasc/TypeART
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#ifndef TYPEART_CUDARUNTIMEINTERFACE_H
+#define TYPEART_CUDARUNTIMEINTERFACE_H
+
+#include "RuntimeExport.h"
+#include "RuntimeInterface.h"
+
+#ifdef __cplusplus
+#include <cstddef>
+#else
+#include <stdbool.h>
+#include <stddef.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+TYPEART_EXPORT typeart_status typeart_cuda_is_device_ptr(const void* addr, bool* is_device_ptr_flag);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TYPEART_CUDARUNTIMEINTERFACE_H
diff --git a/lib/runtime/CudaSupport.cpp b/lib/runtime/CudaSupport.cpp
new file mode 100644
index 00000000..3f358bc5
--- /dev/null
+++ b/lib/runtime/CudaSupport.cpp
@@ -0,0 +1,43 @@
+// TypeART library
+//
+// Copyright (c) 2017-2026 TypeART Authors
+// Distributed under the BSD 3-Clause license.
+// (See accompanying file LICENSE.txt or copy at
+// https://opensource.org/licenses/BSD-3-Clause)
+//
+// Project home: https://github.com/tudasc/TypeART
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#include "CudaRuntimeInterface.h"
+
+#ifdef TYPEART_HAS_CUDA
+#include <cuda.h>
+#endif
+
+typeart_status typeart_cuda_is_device_ptr(const void* addr, bool* is_device_ptr_flag) {
+  if (is_device_ptr_flag == nullptr) {
+    return TYPEART_ERROR;
+  }
+
+#ifdef TYPEART_HAS_CUDA
+  CUmemorytype mem_type;
+  CUresult status =
+      cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, reinterpret_cast<CUdeviceptr>(addr));
+  if (status != CUDA_SUCCESS) {
+    *is_device_ptr_flag = false;
+    if (status == CUDA_ERROR_INVALID_VALUE) {
+      return TYPEART_OK;
+    }
+    return TYPEART_ERROR;
+  }
+
+  *is_device_ptr_flag = (mem_type == CU_MEMORYTYPE_DEVICE);
+  return TYPEART_OK;
+#else
+  (void)addr;
+  *is_device_ptr_flag = false;
+  return TYPEART_OK;
+#endif
+}
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 13ae0faa..361044d0 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -14,6 +14,37 @@ typeart_find_llvm_progs(TYPEART_LIT_EXEC
 
 # add_subdirectory(compat)
 
+function(cuda_runnable_detect cuda_is_runnable)
+  set(${cuda_is_runnable} 0 PARENT_SCOPE)
+  if(NOT CUDAToolkit_FOUND)
+    return()
+  endif()
+
+  find_program(TYPEART_NVIDIA_SMI_EXEC nvidia-smi)
+  mark_as_advanced(TYPEART_NVIDIA_SMI_EXEC)
+  if(NOT TYPEART_NVIDIA_SMI_EXEC)
+    return()
+  endif()
+
+  execute_process(
+    COMMAND ${TYPEART_NVIDIA_SMI_EXEC} -L
+    RESULT_VARIABLE smi_result
+    OUTPUT_VARIABLE smi_out
+    ERROR_QUIET
+  )
+  if(NOT smi_result EQUAL 0)
+    return()
+  endif()
+
+  string(REGEX MATCHALL "GPU [0-9]+:" smi_gpu_list ${smi_out})
+  list(LENGTH smi_gpu_list TYPEART_CUDA_GPU_COUNT)
+  if(TYPEART_CUDA_GPU_COUNT GREATER 0)
+    set(${cuda_is_runnable} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
+cuda_runnable_detect(TYPEART_CUDA_RUNNABLE)
+
 macro(pythonize_bool truth_var var)
   if(${truth_var})
     set(${var} True)
@@ -43,6 +74,37 @@ function(typeart_configure_lit_site input output)
   pythonize_bool(Threads_FOUND TYPEARTPASS_THREADS)
   pythonize_bool(TYPEART_DISABLE_THREAD_SAFETY TYPEARTPASS_THREAD_UNSAFE)
 
+  set(TYPEARTPASS_CUDA_FLAGS "")
+  set(TYPEARTPASS_CUDA_PATH "")
+  if(CUDAToolkit_FOUND)
+    if(DEFINED CUDAToolkit_TARGET_DIR AND NOT "${CUDAToolkit_TARGET_DIR}" STREQUAL "")
+      set(TYPEARTPASS_CUDA_PATH "${CUDAToolkit_TARGET_DIR}")
+    elseif(DEFINED CUDAToolkit_LIBRARY_ROOT AND NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "")
+      set(TYPEARTPASS_CUDA_PATH "${CUDAToolkit_LIBRARY_ROOT}")
+    elseif(DEFINED CUDAToolkit_BIN_DIR AND NOT "${CUDAToolkit_BIN_DIR}" STREQUAL "")
+      cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH TYPEARTPASS_CUDA_PATH)
+    endif()
+
+    foreach(cuda_inc ${CUDAToolkit_INCLUDE_DIRS})
+      string(APPEND TYPEARTPASS_CUDA_FLAGS " -isystem ${cuda_inc}")
+    endforeach()
+  else()
+    find_path(TYPEART_CUDA_RUNTIME_HEADER_DIR cuda_runtime_api.h
+      HINTS /usr/local/cuda/include /usr/include /opt/cuda/include
+    )
+    if(TYPEART_CUDA_RUNTIME_HEADER_DIR)
+      cmake_path(GET TYPEART_CUDA_RUNTIME_HEADER_DIR PARENT_PATH TYPEARTPASS_CUDA_PATH)
+    endif()
+  endif()
+
+  set(TYPEART_CUDA_STATIC_AVAILABLE FALSE)
+  if(CUDAToolkit_FOUND OR NOT "${TYPEARTPASS_CUDA_PATH}" STREQUAL "")
+    set(TYPEART_CUDA_STATIC_AVAILABLE TRUE)
+  endif()
+
+  pythonize_bool(TYPEART_CUDA_STATIC_AVAILABLE TYPEARTPASS_CUDA_STATIC)
+  pythonize_bool(TYPEART_CUDA_RUNNABLE TYPEARTPASS_CUDA_RUNTIME)
+
   pythonize_bool(MPI_C_FOUND TYPEARTPASS_MPI_C)
   pythonize_bool(MPI_CXX_FOUND TYPEARTPASS_MPI_CXX)
   pythonize_bool(TYPEART_CI_RUN TYPEARTPASS_CI_RUN)
@@ -143,6 +205,7 @@ endif()
 set(TYPEART_SUITES
   all
   pass
+  cuda
   runtime
   script
   typemapping
@@ -155,8 +218,10 @@ set(TYPEART_SUITES_WORKERS
   ${NUM_CPU}
   1
   1
+  1
   ${NUM_CPU}
   1
+  1
 )
 
 typeart_add_lit_target(SUITES ${TYPEART_SUITES} WORKERS ${TYPEART_SUITES_WORKERS})
diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
new file mode 100644
index 00000000..a5b5f183
--- /dev/null
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -0,0 +1,16 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: cuda_static
+
+// CHECK: call i32 @cudaMalloc
+// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z]+]] = load {{.*}}, {{.*}}
+// CHECK-NEXT: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaMalloc((void**)&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/cuda/pass/02_cudafree.c b/test/cuda/pass/02_cudafree.c
new file mode 100644
index 00000000..b91c4b9a
--- /dev/null
+++ b/test/cuda/pass/02_cudafree.c
@@ -0,0 +1,14 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: cuda_static
+
+// CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]])
+// CHECK-NEXT: __typeart_free_cuda({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+
+int main() {
+  float* d_x;
+
+  cudaFree(d_x);
+
+  return 0;
+}
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
new file mode 100644
index 00000000..ab7d9f2c
--- /dev/null
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -0,0 +1,14 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: cuda_static
+
+// CHECK: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 20)
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaHostAlloc((void**)&d_x, N * sizeof(float), cudaHostAllocDefault);
+
+  return 0;
+}
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
new file mode 100644
index 00000000..1f07164f
--- /dev/null
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -0,0 +1,25 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: cuda_static
+
+// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+
+struct X {
+  int a;
+  int b;
+};
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  double* dd_y;
+  X* sd_z;
+
+  cudaMalloc(&d_x, N * sizeof(float));
+  cudaMalloc(&dd_y, N * sizeof(double));
+  cudaMalloc(&sd_z, N * sizeof(X));
+
+  return 0;
+}
diff --git a/test/cuda/pass/07_axpy.c b/test/cuda/pass/07_axpy.c
new file mode 100644
index 00000000..279053cf
--- /dev/null
+++ b/test/cuda/pass/07_axpy.c
@@ -0,0 +1,32 @@
+// RUN: %apply %s -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s
+
+// REQUIRES: cuda
+
+// CHECK: Malloc :   2
+
+__global__ void axpy(float a, float* x, float* y) {
+  y[threadIdx.x] = a * x[threadIdx.x];
+}
+
+int main(int argc, char* argv[]) {
+  const int kDataLen = 4;
+
+  float a                = 2.0f;
+  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float host_y[kDataLen];
+
+  float* device_x;
+  float* device_y;
+  cudaMalloc((void**)&device_x, kDataLen * sizeof(float));
+  cudaMalloc((void**)&device_y, kDataLen * sizeof(float));
+
+  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
+
+  axpy<<<1, kDataLen>>>(a, device_x, device_y);
+
+  cudaDeviceSynchronize();
+  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
+
+  cudaDeviceReset();
+  return 0;
+}
diff --git a/test/lit.cfg b/test/lit.cfg
index fc8dbd45..ea2158a7 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -58,6 +58,11 @@ if config.has_legacy_wrapper:
 if config.is_ci:
   config.available_features.add('ci')
 
+if getattr(config, 'cuda_static', False):
+  config.available_features.add('cuda')
+if getattr(config, 'cuda_runtime', False):
+  config.available_features.add('cuda_runtime')
+
 profile_files       = getattr(config, 'profile_file', None)
 typeart_base_lib_dir= getattr(config, 'typeart_base_lib_dir', None)
 typeart_lib_root    = getattr(config, 'typeart_lib_dir', None)
@@ -69,6 +74,11 @@ transform_pass      = '{}/{}'.format(typeart_lib_root, transform_name)
 # std_plugin_args     = 'typeart --typeart-stats=true'
 std_plugin_args_newpm = 'typeart<stats'
 to_llvm_args        = '-O1 -g -Xclang -disable-llvm-passes -S -emit-llvm -o -'
+cuda_path           = getattr(config, 'cuda_path', '')
+cuda_host_args      = '-x cuda --cuda-host-only -nocudalib'
+if cuda_path:
+  cuda_host_args += ' --cuda-path={}'.format(cuda_path)
+cuda_flags          = getattr(config, 'cuda_flags', '')
 type_file           = 'typeart-types.yaml'
 openmp_c_flags      = getattr(config, 'openmp_c_flags', None)
 openmp_cxx_flags    = getattr(config, 'openmp_cxx_flags', None)
@@ -138,6 +148,8 @@ config.substitutions.append(('%apply-typeart', '{} {} -load-pass-plugin {} -pass
 
 config.substitutions.append(('%c-to-llvm', '{} {}'.format(clang_cc, to_llvm_args)))
 config.substitutions.append(('%cpp-to-llvm', '{} {}'.format(clang_cpp, to_llvm_args)))
+config.substitutions.append(('%cuda-c-to-llvm', '{} {} {} {}'.format(clang_cc, cuda_host_args, cuda_flags, to_llvm_args)))
+config.substitutions.append(('%cuda-cpp-to-llvm', '{} {} {} {}'.format(clang_cpp, cuda_host_args, cuda_flags, to_llvm_args)))
 config.substitutions.append(('%run', '{}/run.sh'.format(typeart_script_dir)))
 config.substitutions.append(('%apply', '{}/apply.sh'.format(typeart_script_dir)))
 
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 7a37f7cc..96fb248e 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -39,6 +39,10 @@ config.tsan=@TYPEARTPASS_TSAN@
 config.asan=@TYPEARTPASS_ASAN@
 config.ubsan=@TYPEARTPASS_UBSAN@
 config.coverage=@TYPEART_COVERAGE@
+config.cuda_static=@TYPEARTPASS_CUDA_STATIC@
+config.cuda_runtime=@TYPEARTPASS_CUDA_RUNTIME@
+config.cuda_flags="@TYPEARTPASS_CUDA_FLAGS@"
+config.cuda_path="@TYPEARTPASS_CUDA_PATH@"
 
 config.python_interp = "@Python3_EXECUTABLE@"
 
diff --git a/test/pass/misc/05_make_all_callbacks.c b/test/pass/misc/05_make_all_callbacks.c
index 4a27a617..f54e3258 100644
--- a/test/pass/misc/05_make_all_callbacks.c
+++ b/test/pass/misc/05_make_all_callbacks.c
@@ -1,5 +1,5 @@
-// RUN: %c-to-llvm -Wimplicit-function-declaration %s -I%runtime_path -I%base_path/lib/runtime | %apply-typeart --typeart-stack=true -S 2>&1 | \
-// RUN: %filecheck %s
+// RUN: %c-to-llvm -Wimplicit-function-declaration %s -I%runtime_path -I%base_path/lib/runtime | %apply-typeart
+// --typeart-stack=true -S 2>&1 | \ RUN: %filecheck %s
 
 #include "CallbackInterface.h"
 

From 35b2ea8a39fdbe07ffb2101acdfa6ee207b4be64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Sun, 26 Apr 2026 10:54:43 +0200
Subject: [PATCH 02/24] Fix tests

---
 test/cuda/pass/07_axpy.c | 4 ++--
 test/lit.cfg             | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/cuda/pass/07_axpy.c b/test/cuda/pass/07_axpy.c
index 279053cf..c661593f 100644
--- a/test/cuda/pass/07_axpy.c
+++ b/test/cuda/pass/07_axpy.c
@@ -1,6 +1,6 @@
-// RUN: %apply %s -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
 
-// REQUIRES: cuda
+// REQUIRES: cuda_static
 
 // CHECK: Malloc :   2
 
diff --git a/test/lit.cfg b/test/lit.cfg
index ea2158a7..a4fdda17 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -58,9 +58,9 @@ if config.has_legacy_wrapper:
 if config.is_ci:
   config.available_features.add('ci')
 
-if getattr(config, 'cuda_static', False):
-  config.available_features.add('cuda')
-if getattr(config, 'cuda_runtime', False):
+if config.cuda_static:
+  config.available_features.add('cuda_static')
+if config.cuda_runtime:
   config.available_features.add('cuda_runtime')
 
 profile_files       = getattr(config, 'profile_file', None)

From 26005ffeaa53e104158cdfaf33556ee157fc5a1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Sun, 26 Apr 2026 11:20:18 +0200
Subject: [PATCH 03/24] Initial tests pass

---
 lib/passes/TypeARTPass.cpp  |  9 ++++-----
 test/cuda/runtime/07_axpy.c | 34 ++++++++++++++++++++++++++++++++++
 test/lit.cfg                |  3 +++
 test/lit.site.cfg.in        |  1 +
 4 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 test/cuda/runtime/07_axpy.c

diff --git a/lib/passes/TypeARTPass.cpp b/lib/passes/TypeARTPass.cpp
index 79256a4b..ba3d4b2f 100644
--- a/lib/passes/TypeARTPass.cpp
+++ b/lib/passes/TypeARTPass.cpp
@@ -270,6 +270,10 @@ class TypeArtPass : public llvm::PassInfoMixin<TypeArtPass> {
 
 llvm::PreservedAnalyses
 run(llvm::Module& m, llvm::ModuleAnalysisManager&) {
+  if (cuda::is_device_module(m)) {
+    LOG_DEBUG("Skipping CUDA device module: " << m.getName());
+    return llvm::PreservedAnalyses::all();
+  }
   bool changed{false};
   changed |= doInitialization(m);
   const bool heap = configuration()[config::ConfigStdArgs::heap];  // Must happen after doInit
@@ -281,11 +285,6 @@ run(llvm::Module& m, llvm::ModuleAnalysisManager&) {
 }
 
 bool runOnModule(llvm::Module& m) {
-  if (cuda::is_device_module(m)) {
-    LOG_DEBUG("Skipping CUDA device module: " << m.getName());
-    return false;
-  }
-
   meminst_finder->runOnModule(m);
   const bool instrument_global = configuration()[config::ConfigStdArgs::global];
   bool globals_were_instrumented{false};
diff --git a/test/cuda/runtime/07_axpy.c b/test/cuda/runtime/07_axpy.c
new file mode 100644
index 00000000..2ca6a2e6
--- /dev/null
+++ b/test/cuda/runtime/07_axpy.c
@@ -0,0 +1,34 @@
+// RUN: %wrapper-cc -x cuda --cuda-gpu-arch=sm_50 %cuda_link %s -o %s.exe
+// RUN: %s.exe 2>&1 | %filecheck %s
+
+// REQUIRES: cuda_runtime
+// UNSUPPORTED: sanitizer
+
+// CHECK: Total heap{{[ ]*}}:   2 ,    2 ,    -
+
+__global__ void axpy(float a, float* x, float* y) {
+  y[threadIdx.x] = a * x[threadIdx.x];
+}
+
+int main(int argc, char* argv[]) {
+  const int kDataLen = 4;
+
+  float a                = 2.0f;
+  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float host_y[kDataLen];
+
+  float* device_x;
+  float* device_y;
+  cudaMalloc((void**)&device_x, kDataLen * sizeof(float));
+  cudaMalloc((void**)&device_y, kDataLen * sizeof(float));
+
+  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
+
+  axpy<<<1, kDataLen>>>(a, device_x, device_y);
+
+  cudaDeviceSynchronize();
+  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
+
+  cudaDeviceReset();
+  return 0;
+}
diff --git a/test/lit.cfg b/test/lit.cfg
index a4fdda17..928bf8e8 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -79,6 +79,7 @@ cuda_host_args      = '-x cuda --cuda-host-only -nocudalib'
 if cuda_path:
   cuda_host_args += ' --cuda-path={}'.format(cuda_path)
 cuda_flags          = getattr(config, 'cuda_flags', '')
+cuda_link_flags     = f"-L{config.cuda_lib} -Wl,-rpath,{config.cuda_lib} -lcudart"
 type_file           = 'typeart-types.yaml'
 openmp_c_flags      = getattr(config, 'openmp_c_flags', None)
 openmp_cxx_flags    = getattr(config, 'openmp_cxx_flags', None)
@@ -140,6 +141,8 @@ config.substitutions.append(('%arg_std', std_plugin_args_newpm))
 config.substitutions.append(('%omp_c_flags', openmp_c_flags))
 config.substitutions.append(('%omp_cpp_flags', openmp_cxx_flags))
 
+config.substitutions.append(('%cuda_link', cuda_link_flags))
+
 # TODO refactor typeart arguments (and add args for enabling heap/stack/globals etc.)
 # config.substitutions.append(('%arg_stack', '-typeart-stack'))
 # config.substitutions.append(('%arg_heap', '-typeart-heap'))
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 96fb248e..b399460c 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -43,6 +43,7 @@ config.cuda_static=@TYPEARTPASS_CUDA_STATIC@
 config.cuda_runtime=@TYPEARTPASS_CUDA_RUNTIME@
 config.cuda_flags="@TYPEARTPASS_CUDA_FLAGS@"
 config.cuda_path="@TYPEARTPASS_CUDA_PATH@"
+config.cuda_lib="@CUDAToolkit_LIBRARY_DIR@"
 
 config.python_interp = "@Python3_EXECUTABLE@"
 

From a37e9619ae17474ddc208f852289ffb4e1a55c32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Mon, 27 Apr 2026 15:11:29 +0200
Subject: [PATCH 04/24] Workaround handling of templated cuda allocation fn

---
 lib/passes/typegen/dimeta/DimetaTypeGen.cpp | 64 +++++++++++++++------
 test/cuda/pass/01_cudamalloc.c              |  4 +-
 test/cuda/pass/03_cudahostalloc.c           |  4 +-
 test/cuda/pass/06_cudamalloc_nonvoid.cpp    |  6 +-
 test/cuda/runtime/07_axpy.c                 |  8 +++
 5 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
index 90a19cc3..cf19791d 100644
--- a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
+++ b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
@@ -13,6 +13,7 @@
 #include "../TypeIDGenerator.h"
 #include "Dimeta.h"
 #include "DimetaData.h"
+#include "analysis/MemOpData.h"
 #include "support/Logger.h"
 #include "typegen/TypeGenerator.h"
 #include "typelib/TypeDatabase.h"
@@ -97,6 +98,29 @@ auto apply_function(const Type& type, Func&& handle_qualified_type) {
 }  // namespace detail
 
 namespace workaround {
+namespace detail {
+void remove_pointer_level_impl(dimeta::LocatedType& val) {
+  const auto remove_pointer_level = [](auto& qual) {
+    auto pointer_like_iter = llvm::find_if(qual, [](auto qualifier) {
+      switch (qualifier) {
+        case dimeta::Qualifier::kPtr:
+        case dimeta::Qualifier::kRef:
+        case dimeta::Qualifier::kPtrToMember:
+          return true;
+        default:
+          break;
+      }
+      return false;
+    });
+    if (pointer_like_iter != std::end(qual)) {
+      LOG_DEBUG("Removing pointer level " << static_cast<int>(*pointer_like_iter))
+      qual.erase(pointer_like_iter);
+    }
+  };
+  std::visit([&](auto&& qualified_type) { remove_pointer_level(qualified_type.qual); }, val.type);
+}
+}  // namespace detail
+
 void remove_pointer_level(const llvm::AllocaInst* alloc, dimeta::LocatedType& val) {
   // If the alloca instruction is not a pointer, but the located_type has a pointer-like qualifier, we remove it.
   // Workaround for inlining issue, see test typemapping/05_milc_inline_metadata.c
@@ -105,27 +129,16 @@ void remove_pointer_level(const llvm::AllocaInst* alloc, dimeta::LocatedType& va
   // this will cause MPI handle arrays (typedef "ptr to opaque struct") to be considered a pointer
   if (!alloc->getAllocatedType()->isPointerTy() && !alloc->getAllocatedType()->isArrayTy()) {
     LOG_DEBUG("Alloca is not a pointer type: " << *alloc->getAllocatedType())
-
-    const auto remove_pointer_level = [](auto& qual) {
-      auto pointer_like_iter = llvm::find_if(qual, [](auto qualifier) {
-        switch (qualifier) {
-          case dimeta::Qualifier::kPtr:
-          case dimeta::Qualifier::kRef:
-          case dimeta::Qualifier::kPtrToMember:
-            return true;
-          default:
-            break;
-        }
-        return false;
-      });
-      if (pointer_like_iter != std::end(qual)) {
-        LOG_DEBUG("Removing pointer level " << static_cast<int>(*pointer_like_iter))
-        qual.erase(pointer_like_iter);
-      }
-    };
-    std::visit([&](auto&& qualified_type) { remove_pointer_level(qualified_type.qual); }, val.type);
+    detail::remove_pointer_level_impl(val);
   }
 }
+
+void remove_pointer_level(const llvm::CallBase* call, dimeta::LocatedType& val) {
+  // If the call base is a templated cudaMalloc<...> call, current we need to remove a single pointer level to correct
+  // determine the allocated type
+  detail::remove_pointer_level_impl(val);
+}
+
 }  // namespace workaround
 
 template <typename Type>
@@ -479,6 +492,19 @@ class DimetaTypeManager final : public TypeIDGenerator {
 
       if (val) {
         LOG_DEBUG("Registering malloc-like")
+        const auto is_template_fn = [](const auto& func_str) {
+          return !func_str.empty() && func_str.back() == '>' && func_str.find('<') != std::string::npos;
+        };
+
+        const auto function_name = val->location.function;
+        if (call->getCalledFunction() != nullptr && is_template_fn(function_name)) {
+          MemOps mem_operations;
+          auto kind = mem_operations.kind(call->getCalledFunction()->getName());
+          if (kind && kind.value() == MemOpKind::CudaMallocLike) {
+            LOG_DEBUG("Workaround for pointer level of call base " << function_name)
+            workaround::remove_pointer_level(call, val.value());
+          }
+        }
 
         return {getOrRegister(val->type, true), array_size(val->type)};
       }
diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
index a5b5f183..efceb49a 100644
--- a/test/cuda/pass/01_cudamalloc.c
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -2,8 +2,8 @@
 
 // REQUIRES: cuda_static
 
-// CHECK: call i32 @cudaMalloc
-// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z]+]] = load {{.*}}, {{.*}}
+// CHECK: call i32 @cudaMalloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
 // CHECK-NEXT: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
 
 int main() {
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
index ab7d9f2c..01c25109 100644
--- a/test/cuda/pass/03_cudahostalloc.c
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -2,7 +2,9 @@
 
 // REQUIRES: cuda_static
 
-// CHECK: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 20)
+// CHECK: call i32 @cudaHostAlloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
+// CHECK-NEXT: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
 
 int main() {
   const int N = 20;
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
index 1f07164f..07e9297d 100644
--- a/test/cuda/pass/06_cudamalloc_nonvoid.cpp
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -2,9 +2,9 @@
 
 // REQUIRES: cuda_static
 
-// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
-// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
-// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
+// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
+// CHECK: __typeart_alloc_mty({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, {{(ptr|i32)}} {{[@_a-z0-9A-Z]+}}, i64 {{.*}})
 
 struct X {
   int a;
diff --git a/test/cuda/runtime/07_axpy.c b/test/cuda/runtime/07_axpy.c
index 2ca6a2e6..fb89b8f7 100644
--- a/test/cuda/runtime/07_axpy.c
+++ b/test/cuda/runtime/07_axpy.c
@@ -4,8 +4,10 @@
 // REQUIRES: cuda_runtime
 // UNSUPPORTED: sanitizer
 
+// CHECK: [0]=2 [1]=4 [2]=6 [3]=8
 // CHECK: Total heap{{[ ]*}}:   2 ,    2 ,    -
 
+#include <stdio.h>
 __global__ void axpy(float a, float* x, float* y) {
   y[threadIdx.x] = a * x[threadIdx.x];
 }
@@ -30,5 +32,11 @@ int main(int argc, char* argv[]) {
   cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
 
   cudaDeviceReset();
+
+  for (int i = 0; i < kDataLen; ++i) {
+    printf("[%i]=%.0f ", i, host_y[i]);
+  }
+  printf("\n");
+
   return 0;
 }

From 4fb4d795bc6b6fe0eec599d701c124891724e742 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Mon, 27 Apr 2026 15:31:08 +0200
Subject: [PATCH 05/24] Instead of cuda, use gpu for callbacks

---
 lib/passes/instrumentation/TypeARTFunctions.cpp |  9 +++++++--
 lib/runtime/AllocationTracking.cpp              | 13 +++++++++++--
 lib/runtime/CallbackInterface.h                 |  6 ++++--
 test/cuda/pass/01_cudamalloc.c                  |  2 +-
 test/cuda/pass/02_cudafree.c                    |  2 +-
 test/cuda/pass/03_cudahostalloc.c               |  2 +-
 test/cuda/pass/06_cudamalloc_nonvoid.cpp        |  6 +++---
 7 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/lib/passes/instrumentation/TypeARTFunctions.cpp b/lib/passes/instrumentation/TypeARTFunctions.cpp
index 67168d47..17c295fa 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.cpp
+++ b/lib/passes/instrumentation/TypeARTFunctions.cpp
@@ -52,7 +52,7 @@ std::string get_func_suffix(IFunc id) {
   switch (id) {
     case IFunc::free_cuda:
     case IFunc::heap_cuda:
-      return "_cuda";
+      return "_gpu";
     case IFunc::free_omp:
     case IFunc::heap_omp:
     case IFunc::stack_omp:
@@ -301,6 +301,8 @@ TypeArtFunc typeart_register_type{"__typeart_register_type"};
 TypeArtFunc typeart_alloc_omp_mty        = typeart_alloc_mty;
 TypeArtFunc typeart_alloc_stacks_omp_mty = typeart_alloc_stack_mty;
 
+TypeArtFunc typeart_alloc_cuda_mty = typeart_alloc_mty;
+
 }  // namespace callbacks
 
 std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module& m,
@@ -324,7 +326,7 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
       decl_alternatives.make_function(IFunc::stack, typeart_alloc_stack_mty.name, alloc_arg_types_mty);
   typeart_alloc_global_mty.f =
       decl_alternatives.make_function(IFunc::global, typeart_alloc_global_mty.name, alloc_arg_types_mty);
-  functions_alternative.putFunctionFor(IFunc::heap_cuda, llvm::cast<llvm::Function>(typeart_alloc_mty.f));
+  // functions_alternative.putFunctionFor(IFunc::heap_cuda, llvm::cast<llvm::Function>(typeart_alloc_mty.f));
   typeart_register_type.f = decl.make_function(IFunc::type, typeart_register_type.name, free_arg_types);
 
   typeart_alloc.f        = decl.make_function(IFunc::heap, typeart_alloc.name, alloc_arg_types);
@@ -347,6 +349,9 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
   typeart_alloc_stacks_omp_mty.f =
       decl_alternatives.make_function(IFunc::stack_omp, typeart_alloc_stacks_omp_mty.name, alloc_arg_types_mty);
 
+  typeart_alloc_cuda_mty.f =
+      decl_alternatives.make_function(IFunc::heap_cuda, typeart_alloc_cuda_mty.name, alloc_arg_types_mty);
+
   return std::make_unique<TAFunctionAlternatives>(functions, functions_alternative);
 }
 
diff --git a/lib/runtime/AllocationTracking.cpp b/lib/runtime/AllocationTracking.cpp
index d27b5d0f..ce03c931 100644
--- a/lib/runtime/AllocationTracking.cpp
+++ b/lib/runtime/AllocationTracking.cpp
@@ -289,13 +289,13 @@ void __typeart_leave_scope_omp(int alloca_count) {
   typeart::RuntimeSystem::get().allocation_tracker().onLeaveScope(alloca_count, retAddr);
 }
 
-void __typeart_alloc_cuda(const void* addr, int typeId, size_t count) {
+void __typeart_alloc_gpu(const void* addr, int typeId, size_t count) {
   TYPEART_RUNTIME_GUARD;
   const void* retAddr = __builtin_return_address(0);
   typeart::RuntimeSystem::get().allocation_tracker().onAlloc(addr, typeId, count, retAddr);
 }
 
-void __typeart_free_cuda(const void* addr) {
+void __typeart_free_gpu(const void* addr) {
   TYPEART_RUNTIME_GUARD;
   const void* retAddr = __builtin_return_address(0);
   typeart::RuntimeSystem::get().allocation_tracker().onFreeHeap(addr, retAddr);
@@ -310,6 +310,15 @@ void __typeart_alloc_mty(const void* addr, const void* info, size_t count) {
   rt.allocation_tracker().onAlloc(addr, type_id, count, retAddr);
 }
 
+void __typeart_alloc_mty_gpu(const void* addr, const void* info, size_t count) {
+  TYPEART_RUNTIME_GUARD;
+  const void* retAddr = __builtin_return_address(0);
+  const auto type_id  = reinterpret_cast<const typeart::global_types::GlobalTypeInfo*>(info)->type_id;
+  auto& rt            = typeart::RuntimeSystem::get();
+  assert(type_id == rt.type_translator().get_type_id_for(info) && "Type ID of global and lookup must match");
+  rt.allocation_tracker().onAlloc(addr, type_id, count, retAddr);
+}
+
 void __typeart_alloc_stack_mty(const void* addr, const void* info, size_t count) {
   TYPEART_RUNTIME_GUARD;
   const void* retAddr = __builtin_return_address(0);
diff --git a/lib/runtime/CallbackInterface.h b/lib/runtime/CallbackInterface.h
index b9db02b9..95a3f885 100644
--- a/lib/runtime/CallbackInterface.h
+++ b/lib/runtime/CallbackInterface.h
@@ -39,8 +39,8 @@ TYPEART_EXPORT void __typeart_free_omp(const void* addr);
 TYPEART_EXPORT void __typeart_alloc_stack_omp(const void* addr, int type_id, size_t count);
 TYPEART_EXPORT void __typeart_leave_scope_omp(int alloca_count);
 
-TYPEART_EXPORT void __typeart_alloc_cuda(const void* addr, int type_id, size_t count);
-TYPEART_EXPORT void __typeart_free_cuda(const void* addr);
+TYPEART_EXPORT void __typeart_alloc_gpu(const void* addr, int type_id, size_t count);
+TYPEART_EXPORT void __typeart_free_gpu(const void* addr);
 
 // Called for inlined type definitions mode
 TYPEART_EXPORT void __typeart_alloc_mty(const void* addr, const void* info, size_t count);
@@ -50,6 +50,8 @@ TYPEART_EXPORT void __typeart_register_type(const void* type);
 
 TYPEART_EXPORT void __typeart_alloc_global_mty_omp(const void* addr, const void* info, size_t count);
 TYPEART_EXPORT void __typeart_alloc_stack_mty_omp(const void* addr, const void* info, size_t count);
+
+TYPEART_EXPORT void __typeart_alloc_mty_gpu(const void* addr, const void* info, size_t count);
 #ifdef __cplusplus
 }
 #endif
diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
index efceb49a..2e1c7822 100644
--- a/test/cuda/pass/01_cudamalloc.c
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -4,7 +4,7 @@
 
 // CHECK: call i32 @cudaMalloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
-// CHECK-NEXT: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
+// CHECK-NEXT: call void @__typeart_alloc_gpu({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
 
 int main() {
   const int N = 20;
diff --git a/test/cuda/pass/02_cudafree.c b/test/cuda/pass/02_cudafree.c
index b91c4b9a..be1da9ed 100644
--- a/test/cuda/pass/02_cudafree.c
+++ b/test/cuda/pass/02_cudafree.c
@@ -3,7 +3,7 @@
 // REQUIRES: cuda_static
 
 // CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]])
-// CHECK-NEXT: __typeart_free_cuda({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+// CHECK-NEXT: __typeart_free_gpu({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
 
 int main() {
   float* d_x;
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
index 01c25109..097611ae 100644
--- a/test/cuda/pass/03_cudahostalloc.c
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -4,7 +4,7 @@
 
 // CHECK: call i32 @cudaHostAlloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
-// CHECK-NEXT: call void @__typeart_alloc_cuda({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
+// CHECK-NEXT: call void @__typeart_alloc_gpu({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
 
 int main() {
   const int N = 20;
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
index 07e9297d..53551de3 100644
--- a/test/cuda/pass/06_cudamalloc_nonvoid.cpp
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -2,9 +2,9 @@
 
 // REQUIRES: cuda_static
 
-// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
-// CHECK: __typeart_alloc_cuda({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
-// CHECK: __typeart_alloc_mty({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, {{(ptr|i32)}} {{[@_a-z0-9A-Z]+}}, i64 {{.*}})
+// CHECK: __typeart_alloc_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
+// CHECK: __typeart_alloc_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
+// CHECK: __typeart_alloc_mty_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, {{(ptr|i32)}} {{[@_a-z0-9A-Z]+}}, i64 {{.*}})
 
 struct X {
   int a;

From eaf5170e535772c8ba44dfbe4b1b6f96e317003e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 11:02:57 +0200
Subject: [PATCH 06/24] Solidify the cuda template workaround

---
 lib/passes/support/CudaUtil.h               | 17 +++++++++++++++++
 lib/passes/typegen/dimeta/DimetaTypeGen.cpp |  8 +++-----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/lib/passes/support/CudaUtil.h b/lib/passes/support/CudaUtil.h
index 69eba5a0..af4add08 100644
--- a/lib/passes/support/CudaUtil.h
+++ b/lib/passes/support/CudaUtil.h
@@ -13,12 +13,14 @@
 #ifndef TYPEART_CUDAUTIL_H
 #define TYPEART_CUDAUTIL_H
 
+#include "analysis/MemOpData.h"
 #include "support/Util.h"
 
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 
+#include <algorithm>
 #include <optional>
 #include <string>
 
@@ -79,6 +81,21 @@ inline bool is_cuda_helper_function(const llvm::Function& function) {
   return util::starts_with_any_of(function_name, "__cuda");
 }
 
+inline bool is_templated_malloc_like(llvm::StringRef name) {
+  const auto templ_start_pos = name.find_first_of('<');
+  if (templ_start_pos == llvm::StringRef::npos) {
+    return false;
+  }
+  auto extracted_fn = name.substr(0, templ_start_pos);
+  MemOps ops;
+  return ops.allocKind(extracted_fn) == MemOpKind::CudaMallocLike;
+}
+
+inline bool is_templated_malloc_like(const llvm::Function& function) {
+  const std::string name = util::try_demangle(function);
+  return is_templated_malloc_like(name);
+}
+
 }  // namespace typeart::cuda
 
 #endif  // TYPEART_CUDAUTIL_H
diff --git a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
index cf19791d..528bce9d 100644
--- a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
+++ b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
@@ -14,6 +14,7 @@
 #include "Dimeta.h"
 #include "DimetaData.h"
 #include "analysis/MemOpData.h"
+#include "support/CudaUtil.h"
 #include "support/Logger.h"
 #include "typegen/TypeGenerator.h"
 #include "typelib/TypeDatabase.h"
@@ -492,15 +493,12 @@ class DimetaTypeManager final : public TypeIDGenerator {
 
       if (val) {
         LOG_DEBUG("Registering malloc-like")
-        const auto is_template_fn = [](const auto& func_str) {
-          return !func_str.empty() && func_str.back() == '>' && func_str.find('<') != std::string::npos;
-        };
 
         const auto function_name = val->location.function;
-        if (call->getCalledFunction() != nullptr && is_template_fn(function_name)) {
+        if (call->getCalledFunction() != nullptr && cuda::is_templated_malloc_like(function_name)) {
           MemOps mem_operations;
           auto kind = mem_operations.kind(call->getCalledFunction()->getName());
-          if (kind && kind.value() == MemOpKind::CudaMallocLike) {
+          if (kind == MemOpKind::CudaMallocLike) {
             LOG_DEBUG("Workaround for pointer level of call base " << function_name)
             workaround::remove_pointer_level(call, val.value());
           }

From 627794f2dee46c0bacc8f97958714b2dd37f3135 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 12:04:16 +0200
Subject: [PATCH 07/24] Tests pass with LLVM 14

---
 test/cuda/pass/01_cudamalloc.c           | 14 ++++++++++----
 test/cuda/pass/03_cudahostalloc.c        | 14 ++++++++++----
 test/cuda/pass/06_cudamalloc_nonvoid.cpp | 12 ++++++++----
 test/lit.cfg                             |  4 ++++
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
index 2e1c7822..9aa7cde3 100644
--- a/test/cuda/pass/01_cudamalloc.c
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -1,10 +1,16 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
-// CHECK: call i32 @cudaMalloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
-// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
-// CHECK-NEXT: call void @__typeart_alloc_gpu({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
+// LLVM: call i32 @cudaMalloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr {{.*}}[[CUDA_PTR]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMalloc(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
 
 int main() {
   const int N = 20;
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
index 097611ae..9ea9477c 100644
--- a/test/cuda/pass/03_cudahostalloc.c
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -1,10 +1,16 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
-// CHECK: call i32 @cudaHostAlloc({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
-// CHECK-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
-// CHECK-NEXT: call void @__typeart_alloc_gpu({{(ptr|i8\*)}} {{.*}}[[CUDA_PTR]],
+// LLVM: call i32 @cudaHostAlloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr {{.*}}[[CUDA_PTR]],
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaHostAlloc(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
 
 int main() {
   const int N = 20;
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
index 53551de3..1c58cf8d 100644
--- a/test/cuda/pass/06_cudamalloc_nonvoid.cpp
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -1,10 +1,14 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
-// CHECK: __typeart_alloc_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
-// CHECK: __typeart_alloc_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
-// CHECK: __typeart_alloc_mty_gpu({{(ptr|i8\*)}} %{{[0-9a-z_]+}}, {{(ptr|i32)}} {{[@_a-z0-9A-Z]+}}, i64 {{.*}})
+// LLVM: __typeart_alloc_gpu(ptr %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
+// LLVM: __typeart_alloc_gpu(ptr %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
+// LLVM: __typeart_alloc_mty_gpu(ptr %{{[0-9a-z_]+}}, ptr {{[@_a-z0-9A-Z]+}}, i64 {{.*}})
+
+// LLVM_LEGACY: __typeart_alloc_gpu(i8* %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
+// LLVM_LEGACY: __typeart_alloc_gpu(i8* %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
+// LLVM_LEGACY: __typeart_alloc_gpu(i8* %{{[0-9a-z_]+}}, i32 2{{[0-9][0-9]}}, i64 {{.*}})
 
 struct X {
   int a;
diff --git a/test/lit.cfg b/test/lit.cfg
index 928bf8e8..d565c392 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -117,6 +117,10 @@ config.substitutions.append(('%clang-cc', clang_cc))
 config.substitutions.append(('%opt', opt))
 config.substitutions.append(('%filecheck', filecheck))
 config.substitutions.append(('%llc', llc))
+if config.llvm_version < 15:
+  config.substitutions.append(('%llvm-version-check', 'LLVM_LEGACY'))
+else:
+  config.substitutions.append(('%llvm-version-check', 'LLVM'))
 
 # Substitutions: executables use "-" separator, variables use underscore
 config.substitutions.append(('%base_path', config.typeart_project_dir))

From 0faa9b124756c3b692eeee24488305ec9f31e93b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 13:10:56 +0200
Subject: [PATCH 08/24] Extended allocation function set

---
 lib/passes/analysis/MemOpData.h        |  2 ++
 lib/runtime/CMakeLists.txt             |  1 -
 test/cuda/pass/04_cudamalloc_host.c    | 22 ++++++++++++++
 test/cuda/pass/05_cudamalloc_managed.c | 22 ++++++++++++++
 test/cuda/pass/08_cudamalloc_async.c   | 41 ++++++++++++++++++++++++++
 5 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 test/cuda/pass/04_cudamalloc_host.c
 create mode 100644 test/cuda/pass/05_cudamalloc_managed.c
 create mode 100644 test/cuda/pass/08_cudamalloc_async.c

diff --git a/lib/passes/analysis/MemOpData.h b/lib/passes/analysis/MemOpData.h
index b96db234..f2d2145b 100644
--- a/lib/passes/analysis/MemOpData.h
+++ b/lib/passes/analysis/MemOpData.h
@@ -106,6 +106,8 @@ struct MemOps {
       {"cudaHostAlloc", MemOpKind::CudaMallocLike},
       {"cudaMallocHost", MemOpKind::CudaMallocLike},
       {"cudaMallocManaged", MemOpKind::CudaMallocLike},
+      {"cudaMallocAsync", MemOpKind::CudaMallocLike},
+      {"cudaMallocFromPoolAsync", MemOpKind::CudaMallocLike},
   };
 
   const llvm::StringMap<MemOpKind> dealloc_map{
diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt
index c8467f71..9e2c0e46 100644
--- a/lib/runtime/CMakeLists.txt
+++ b/lib/runtime/CMakeLists.txt
@@ -28,7 +28,6 @@ set(RUNTIME_LIB_SOURCES
     TypeResolution.h
     Runtime.cpp
     Runtime.h
-    CudaSupport.cpp
     ${TYPEART_META_SOURCE}
     $<$<BOOL:${TYPEART_MPI_LOGGER}>:../support/MPILogger.cpp>
 )
diff --git a/test/cuda/pass/04_cudamalloc_host.c b/test/cuda/pass/04_cudamalloc_host.c
new file mode 100644
index 00000000..fc0923ea
--- /dev/null
+++ b/test/cuda/pass/04_cudamalloc_host.c
@@ -0,0 +1,22 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+
+// REQUIRES: cuda_static
+
+// LLVM: call i32 @cudaMallocHost(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr {{.*}}[[CUDA_PTR]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocHost(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaMallocHost((void**)&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/cuda/pass/05_cudamalloc_managed.c b/test/cuda/pass/05_cudamalloc_managed.c
new file mode 100644
index 00000000..47e21d32
--- /dev/null
+++ b/test/cuda/pass/05_cudamalloc_managed.c
@@ -0,0 +1,22 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+
+// REQUIRES: cuda_static
+
+// LLVM: call i32 @cudaMallocManaged(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr {{.*}}[[CUDA_PTR]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocManaged(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaMallocManaged((void**)&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/cuda/pass/08_cudamalloc_async.c b/test/cuda/pass/08_cudamalloc_async.c
new file mode 100644
index 00000000..655450e0
--- /dev/null
+++ b/test/cuda/pass/08_cudamalloc_async.c
@@ -0,0 +1,41 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+
+// REQUIRES: cuda_static
+
+// clang-format off
+// LLVM: call i32 @cudaMallocAsync(ptr {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64{{.*}} 80, ptr {{.*}})
+// LLVM-NEXT: [[CUDA_PTR_X:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_X]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_X]], i32 23, i64 20)
+
+// LLVM: call i32 @cudaMallocFromPoolAsync(ptr {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64{{.*}} 80, ptr {{.*}}, ptr {{.*}})
+// LLVM-NEXT: [[CUDA_PTR_Y:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_Y]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_Y]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocAsync(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocFromPoolAsync(i8** {{.*}}[[CAST1]],
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
+// clang-format on
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  float* d_y;
+
+  cudaStream_t stream;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+
+  cudaMallocAsync((void**)&d_x, N * sizeof(float), stream);
+
+  cudaMemPool_t pool;
+  cudaMallocFromPoolAsync((void**)&d_y, N * sizeof(float), pool, stream);
+
+  return 0;
+}
\ No newline at end of file

From fb8058ae390b871ca9b9d5fc074280a2fc08b553 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 13:22:24 +0200
Subject: [PATCH 09/24] cudaFree variation

---
 lib/passes/analysis/MemOpData.h |  1 +
 test/cuda/pass/02_cudafree.c    | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/lib/passes/analysis/MemOpData.h b/lib/passes/analysis/MemOpData.h
index f2d2145b..4fcadeea 100644
--- a/lib/passes/analysis/MemOpData.h
+++ b/lib/passes/analysis/MemOpData.h
@@ -128,6 +128,7 @@ struct MemOps {
       {"_ZdaPvmSt11align_val_t", MemOpKind::DeleteLike},              /* delete[](void*, unsigned long, align_val_t) */
       {"cudaFree", MemOpKind::FreeLike},
       {"cudaFreeHost", MemOpKind::FreeLike},
+      {"cudaFreeAsync", MemOpKind::FreeLike},
   };
   //clang-format off
 };
diff --git a/test/cuda/pass/02_cudafree.c b/test/cuda/pass/02_cudafree.c
index be1da9ed..12e5cbf6 100644
--- a/test/cuda/pass/02_cudafree.c
+++ b/test/cuda/pass/02_cudafree.c
@@ -5,10 +5,21 @@
 // CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]])
 // CHECK-NEXT: __typeart_free_gpu({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
 
+// CHECK: call i32 @cudaFreeHost({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]])
+// CHECK-NEXT: __typeart_free_gpu({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+
+// CHECK: call i32 @cudaFreeAsync({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]],
+// CHECK-NEXT: __typeart_free_gpu({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+
 int main() {
   float* d_x;
 
   cudaFree(d_x);
 
+  cudaFreeHost(d_x);
+
+  cudaStream_t stream;
+  cudaFreeAsync(d_x, stream);
+
   return 0;
 }

From 46adda7bedf25ea25f00d24686189cedda4ed05b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 14:06:05 +0200
Subject: [PATCH 10/24] Add GPU flag to enable CUDA instrumentation

---
 lib/passes/Commandline.cpp                     | 13 ++++++++++---
 lib/passes/analysis/MemOpVisitor.cpp           | 18 +++++++++++++-----
 lib/passes/analysis/MemOpVisitor.h             |  2 ++
 .../configuration/EnvironmentConfiguration.cpp |  3 +++
 lib/passes/configuration/PassConfiguration.cpp |  8 +++++++-
 lib/passes/configuration/TypeARTOptions.cpp    |  3 +++
 lib/passes/configuration/TypeARTOptions.h      |  1 +
 lib/support/ConfigurationBaseOptions.h         |  1 +
 test/cuda/pass/01_cudamalloc.c                 |  2 +-
 test/cuda/pass/02_cudafree.c                   |  2 +-
 test/cuda/pass/03_cudahostalloc.c              |  2 +-
 test/cuda/pass/04_cudamalloc_host.c            |  2 +-
 test/cuda/pass/05_cudamalloc_managed.c         |  2 +-
 test/cuda/pass/06_cudamalloc_nonvoid.cpp       |  2 +-
 test/cuda/pass/07_axpy.c                       |  2 +-
 test/cuda/pass/08_cudamalloc_async.c           |  2 +-
 test/cuda/runtime/07_axpy.c                    |  2 +-
 test/pass/misc/05_make_all_callbacks.c         |  4 ++--
 test/pass/misc/07_config_file.c                |  1 +
 test/pass/misc/08_config_file_default.c        |  1 +
 test/pass/misc/11_env_flags.c                  |  1 +
 test/pass/misc/12_env_flags_single_override.c  |  1 +
 test/pass/misc/13_env_flags_phase_heap.c       |  1 +
 test/pass/misc/14_env_flags_phase_ignored.c    |  1 +
 24 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/lib/passes/Commandline.cpp b/lib/passes/Commandline.cpp
index 9619e166..8e4100ee 100644
--- a/lib/passes/Commandline.cpp
+++ b/lib/passes/Commandline.cpp
@@ -70,9 +70,14 @@ static cl::opt<ConfigStdArgTypes::stats_ty> cl_typeart_stats(CommandlineStdArgs:
                                                              cl::cat(typeart_category));
 
 static cl::opt<ConfigStdArgTypes::heap_ty> cl_typeart_instrument_heap(CommandlineStdArgs::heap,
-                                                                      cl::desc(ConfigStdArgDescriptions::heap),
-                                                                      cl::init(ConfigStdArgValues::heap),
-                                                                      cl::cat(typeart_category));
+                                                                       cl::desc(ConfigStdArgDescriptions::heap),
+                                                                       cl::init(ConfigStdArgValues::heap),
+                                                                       cl::cat(typeart_category));
+
+static cl::opt<ConfigStdArgTypes::gpu_ty> cl_typeart_instrument_gpu(CommandlineStdArgs::gpu,
+                                                                     cl::desc(ConfigStdArgDescriptions::gpu),
+                                                                     cl::init(ConfigStdArgValues::gpu),
+                                                                     cl::cat(typeart_category));
 
 static cl::opt<ConfigStdArgTypes::global_ty> cl_typeart_instrument_global(CommandlineStdArgs::global,
                                                                           cl::desc(ConfigStdArgDescriptions::global),
@@ -206,6 +211,7 @@ CommandLineOptions::CommandLineOptions() {
       make_entry(ConfigStdArgs::types, cl_typeart_type_file),
       make_entry(ConfigStdArgs::stats, cl_typeart_stats),
       make_entry(ConfigStdArgs::heap, cl_typeart_instrument_heap),
+      make_entry(ConfigStdArgs::gpu, cl_typeart_instrument_gpu),
       make_entry(ConfigStdArgs::global, cl_typeart_instrument_global),
       make_entry(ConfigStdArgs::stack, cl_typeart_instrument_stack),
       make_entry(ConfigStdArgs::type_serialization, cl_typeart_type_serialization),
@@ -226,6 +232,7 @@ CommandLineOptions::CommandLineOptions() {
       make_occurr_entry(ConfigStdArgs::types, cl_typeart_type_file),
       make_occurr_entry(ConfigStdArgs::stats, cl_typeart_stats),
       make_occurr_entry(ConfigStdArgs::heap, cl_typeart_instrument_heap),
+      make_occurr_entry(ConfigStdArgs::gpu, cl_typeart_instrument_gpu),
       make_occurr_entry(ConfigStdArgs::global, cl_typeart_instrument_global),
       make_occurr_entry(ConfigStdArgs::stack, cl_typeart_instrument_stack),
       make_occurr_entry(ConfigStdArgs::type_serialization, cl_typeart_type_serialization),
diff --git a/lib/passes/analysis/MemOpVisitor.cpp b/lib/passes/analysis/MemOpVisitor.cpp
index c42a92ea..e9c98811 100644
--- a/lib/passes/analysis/MemOpVisitor.cpp
+++ b/lib/passes/analysis/MemOpVisitor.cpp
@@ -55,9 +55,14 @@ MemOpVisitor::MemOpVisitor() : MemOpVisitor(true, true) {
 }
 
 MemOpVisitor::MemOpVisitor(const config::Configuration& config)
-    : MemOpVisitor(config[config::ConfigStdArgs::stack], config[config::ConfigStdArgs::heap]) {
+    : MemOpVisitor(config[config::ConfigStdArgs::stack], config[config::ConfigStdArgs::heap],
+                   config[config::ConfigStdArgs::gpu]) {
 }
-MemOpVisitor::MemOpVisitor(bool stack, bool heap) : collect_allocas(stack), collect_heap(heap) {
+MemOpVisitor::MemOpVisitor(bool stack, bool heap) : MemOpVisitor(stack, heap, true) {
+}
+
+MemOpVisitor::MemOpVisitor(bool stack, bool heap, bool gpu)
+    : collect_allocas(stack), collect_heap(heap), collect_gpu(gpu) {
 }
 
 void MemOpVisitor::collect(llvm::Function& function) {
@@ -92,14 +97,17 @@ void MemOpVisitor::visitCallBase(llvm::CallBase& cb) {
   if (!collect_heap) {
     return;
   }
+  const auto* called_function = cb.getCalledFunction();
+  if (!collect_gpu && called_function != nullptr && cuda::is_cuda_function(*called_function)) {
+    return;
+  }
   const auto isInSet = [&](const auto& fMap) -> std::optional<MemOpKind> {
-    const auto* f = cb.getCalledFunction();
-    if (!f) {
+    if (called_function == nullptr) {
       // TODO handle calls through, e.g., function pointers? - seems infeasible
       // LOG_INFO("Encountered indirect call, skipping.");
       return {};
     }
-    const auto name = f->getName().str();
+    const auto name = called_function->getName().str();
 
     const auto res = fMap.find(name);
     if (res != fMap.end()) {
diff --git a/lib/passes/analysis/MemOpVisitor.h b/lib/passes/analysis/MemOpVisitor.h
index fc982549..bc626854 100644
--- a/lib/passes/analysis/MemOpVisitor.h
+++ b/lib/passes/analysis/MemOpVisitor.h
@@ -39,11 +39,13 @@ struct MemOpVisitor : public llvm::InstVisitor<MemOpVisitor> {
   MemOps mem_operations{};
   bool collect_allocas;
   bool collect_heap;
+  bool collect_gpu;
 
  public:
   MemOpVisitor();
   explicit MemOpVisitor(const config::Configuration& config);
   MemOpVisitor(bool stack, bool heap);
+  MemOpVisitor(bool stack, bool heap, bool gpu);
   void collect(llvm::Function& function);
   void collectGlobals(llvm::Module& module);
   void clear();
diff --git a/lib/passes/configuration/EnvironmentConfiguration.cpp b/lib/passes/configuration/EnvironmentConfiguration.cpp
index 76182011..6eec7254 100644
--- a/lib/passes/configuration/EnvironmentConfiguration.cpp
+++ b/lib/passes/configuration/EnvironmentConfiguration.cpp
@@ -110,6 +110,8 @@ EnvironmentFlagsOptions::EnvironmentFlagsOptions() {
                                               EnvironmentStdArgsValues::stats),
       make_entry<ConfigStdArgTypes::heap_ty>(ConfigStdArgs::heap, EnvironmentStdArgs::heap,
                                              EnvironmentStdArgsValues::heap),
+      make_entry<ConfigStdArgTypes::gpu_ty>(ConfigStdArgs::gpu, EnvironmentStdArgs::gpu,
+                                            EnvironmentStdArgsValues::gpu),
       make_entry<ConfigStdArgTypes::global_ty>(ConfigStdArgs::global, EnvironmentStdArgs::global,
                                                EnvironmentStdArgsValues::global),
       make_entry<ConfigStdArgTypes::stack_ty>(ConfigStdArgs::stack, EnvironmentStdArgs::stack,
@@ -150,6 +152,7 @@ EnvironmentFlagsOptions::EnvironmentFlagsOptions() {
       make_occurr_entry(ConfigStdArgs::types, config::EnvironmentStdArgs::types),
       make_occurr_entry(ConfigStdArgs::stats, EnvironmentStdArgs::stats),
       make_occurr_entry(ConfigStdArgs::heap, EnvironmentStdArgs::heap),
+      make_occurr_entry(ConfigStdArgs::gpu, EnvironmentStdArgs::gpu),
       make_occurr_entry(ConfigStdArgs::global, EnvironmentStdArgs::global),
       make_occurr_entry(ConfigStdArgs::stack, EnvironmentStdArgs::stack),
       make_occurr_entry(ConfigStdArgs::type_serialization, EnvironmentStdArgs::type_serialization),
diff --git a/lib/passes/configuration/PassConfiguration.cpp b/lib/passes/configuration/PassConfiguration.cpp
index 5c067571..ad911f74 100644
--- a/lib/passes/configuration/PassConfiguration.cpp
+++ b/lib/passes/configuration/PassConfiguration.cpp
@@ -59,6 +59,12 @@ PassConfig parse_typeart_config_with_occurrence(llvm::StringRef parameters) {
       continue;
     }
 
+    if (parameter_name == ConfigStdArgs::gpu) {
+      result.gpu                         = enable;
+      occurrence_map[ConfigStdArgs::gpu] = true;
+      continue;
+    }
+
     if (parameter_name == ConfigStdArgs::stack) {
       result.stack                         = enable;
       occurrence_map[ConfigStdArgs::stack] = true;
@@ -162,4 +168,4 @@ PassConfig parse_typeart_config_with_occurrence(llvm::StringRef parameters) {
   return {result, occurrence_map};
 }
 
-}  // namespace typeart::config::pass
\ No newline at end of file
+}  // namespace typeart::config::pass
diff --git a/lib/passes/configuration/TypeARTOptions.cpp b/lib/passes/configuration/TypeARTOptions.cpp
index 1c82427f..30b0d0fd 100644
--- a/lib/passes/configuration/TypeARTOptions.cpp
+++ b/lib/passes/configuration/TypeARTOptions.cpp
@@ -90,6 +90,7 @@ struct llvm::yaml::MappingTraits<typeart::config::TypeARTConfigOptions> {
     using typeart::config::ConfigStdArgs;
     yml_io.mapRequired(ConfigStdArgs::types, info.types);
     yml_io.mapRequired(ConfigStdArgs::heap, info.heap);
+    yml_io.mapOptional(ConfigStdArgs::gpu, info.gpu);
     yml_io.mapRequired(ConfigStdArgs::stack, info.stack);
     yml_io.mapOptional(ConfigStdArgs::global, info.global);
     yml_io.mapOptional(ConfigStdArgs::stats, info.statistics);
@@ -138,6 +139,7 @@ TypeARTConfigOptions construct_with(Constructor&& make_entry) {
   make_entry(ConfigStdArgs::types, config.types);
   make_entry(ConfigStdArgs::stats, config.statistics);
   make_entry(ConfigStdArgs::heap, config.heap);
+  make_entry(ConfigStdArgs::gpu, config.gpu);
   make_entry(ConfigStdArgs::global, config.global);
   make_entry(ConfigStdArgs::stack, config.stack);
   make_entry(ConfigStdArgs::stack_lifetime, config.stack_lifetime);
@@ -186,6 +188,7 @@ OptionsMap options_to_map(const TypeARTConfigOptions& config) {
       make_entry(ConfigStdArgs::types, config.types),
       make_entry(ConfigStdArgs::stats, config.statistics),
       make_entry(ConfigStdArgs::heap, config.heap),
+      make_entry(ConfigStdArgs::gpu, config.gpu),
       make_entry(ConfigStdArgs::global, config.global),
       make_entry(ConfigStdArgs::stack, config.stack),
       make_entry(ConfigStdArgs::stack_lifetime, config.stack_lifetime),
diff --git a/lib/passes/configuration/TypeARTOptions.h b/lib/passes/configuration/TypeARTOptions.h
index 03e7884d..56b99424 100644
--- a/lib/passes/configuration/TypeARTOptions.h
+++ b/lib/passes/configuration/TypeARTOptions.h
@@ -46,6 +46,7 @@ struct TypeARTAnalysisOptions {
 struct TypeARTConfigOptions {
   std::string types{ConfigStdArgValues::types};
   bool heap{ConfigStdArgValues::heap};
+  bool gpu{ConfigStdArgValues::gpu};
   bool stack{ConfigStdArgValues::stack};
   bool global{ConfigStdArgValues::global};
   bool statistics{ConfigStdArgValues::stats};
diff --git a/lib/support/ConfigurationBaseOptions.h b/lib/support/ConfigurationBaseOptions.h
index f7386c8e..e8bc1fc3 100644
--- a/lib/support/ConfigurationBaseOptions.h
+++ b/lib/support/ConfigurationBaseOptions.h
@@ -18,6 +18,7 @@ TYPEART_CONFIG_OPTION(types, "types", std::string, "typeart-types.yaml", "Locati
                       "TYPES")
 TYPEART_CONFIG_OPTION(stats, "stats", bool, false, "Show statistics for TypeArt type pass.", "STATS")
 TYPEART_CONFIG_OPTION(heap, "heap", bool, true, "Instrument heap allocation/free instructions.", "HEAP")
+TYPEART_CONFIG_OPTION(gpu, "gpu", bool, false, "Instrument GPU allocation/free instructions.", "GPU")
 TYPEART_CONFIG_OPTION(stack, "stack", bool, false, "Instrument stack allocations.", "STACK")
 TYPEART_CONFIG_OPTION(global, "global", bool, false, "Instrument global allocations.", "GLOBAL")
 TYPEART_CONFIG_OPTION(stack_lifetime, "stack-lifetime", bool, true,
diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
index 9aa7cde3..0e180be3 100644
--- a/test/cuda/pass/01_cudamalloc.c
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/02_cudafree.c b/test/cuda/pass/02_cudafree.c
index 12e5cbf6..273b5674 100644
--- a/test/cuda/pass/02_cudafree.c
+++ b/test/cuda/pass/02_cudafree.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
index 9ea9477c..745b76ca 100644
--- a/test/cuda/pass/03_cudahostalloc.c
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/04_cudamalloc_host.c b/test/cuda/pass/04_cudamalloc_host.c
index fc0923ea..2930268f 100644
--- a/test/cuda/pass/04_cudamalloc_host.c
+++ b/test/cuda/pass/04_cudamalloc_host.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/05_cudamalloc_managed.c b/test/cuda/pass/05_cudamalloc_managed.c
index 47e21d32..74b69aa6 100644
--- a/test/cuda/pass/05_cudamalloc_managed.c
+++ b/test/cuda/pass/05_cudamalloc_managed.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
index 1c58cf8d..8f310fce 100644
--- a/test/cuda/pass/06_cudamalloc_nonvoid.cpp
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/07_axpy.c b/test/cuda/pass/07_axpy.c
index c661593f..f60ae8cc 100644
--- a/test/cuda/pass/07_axpy.c
+++ b/test/cuda/pass/07_axpy.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/pass/08_cudamalloc_async.c b/test/cuda/pass/08_cudamalloc_async.c
index 655450e0..25c1a060 100644
--- a/test/cuda/pass/08_cudamalloc_async.c
+++ b/test/cuda/pass/08_cudamalloc_async.c
@@ -1,4 +1,4 @@
-// RUN: %cuda-c-to-llvm %s | %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda_static
 
diff --git a/test/cuda/runtime/07_axpy.c b/test/cuda/runtime/07_axpy.c
index fb89b8f7..440d87c4 100644
--- a/test/cuda/runtime/07_axpy.c
+++ b/test/cuda/runtime/07_axpy.c
@@ -1,4 +1,4 @@
-// RUN: %wrapper-cc -x cuda --cuda-gpu-arch=sm_50 %cuda_link %s -o %s.exe
+// RUN: TYPEART_GPU=true %wrapper-cc -x cuda --cuda-gpu-arch=sm_50 %cuda_link %s -o %s.exe
 // RUN: %s.exe 2>&1 | %filecheck %s
 
 // REQUIRES: cuda_runtime
diff --git a/test/pass/misc/05_make_all_callbacks.c b/test/pass/misc/05_make_all_callbacks.c
index f54e3258..b287257d 100644
--- a/test/pass/misc/05_make_all_callbacks.c
+++ b/test/pass/misc/05_make_all_callbacks.c
@@ -1,5 +1,5 @@
-// RUN: %c-to-llvm -Wimplicit-function-declaration %s -I%runtime_path -I%base_path/lib/runtime | %apply-typeart
-// --typeart-stack=true -S 2>&1 | \ RUN: %filecheck %s
+// RUN: %c-to-llvm -Wimplicit-function-declaration %s -I%runtime_path -I%base_path/lib/runtime | %apply-typeart \
+// RUN: --typeart-stack=true -S 2>&1 | %filecheck %s
 
 #include "CallbackInterface.h"
 
diff --git a/test/pass/misc/07_config_file.c b/test/pass/misc/07_config_file.c
index b2386ebf..de6f0bc5 100644
--- a/test/pass/misc/07_config_file.c
+++ b/test/pass/misc/07_config_file.c
@@ -8,6 +8,7 @@ void test() {
 
 // CHECK:  types:           07_config_file.c.yaml
 // CHECK-NEXT:  heap:            false
+// CHECK-NEXT:  gpu:             false
 // CHECK-NEXT:  stack:           true
 // CHECK-NEXT:  global:          false
 // CHECK-NEXT:  stats:           {{.*}}
diff --git a/test/pass/misc/08_config_file_default.c b/test/pass/misc/08_config_file_default.c
index 6e908b29..cf426311 100644
--- a/test/pass/misc/08_config_file_default.c
+++ b/test/pass/misc/08_config_file_default.c
@@ -6,6 +6,7 @@
 
 // CHECK: types:           {{.*}}.yaml
 // CHECK-NEXT: heap:            true
+// CHECK-NEXT: gpu:             false
 // CHECK-NEXT: stack:           false
 // CHECK-NEXT: global:          false
 // CHECK-NEXT: stats:           true
diff --git a/test/pass/misc/11_env_flags.c b/test/pass/misc/11_env_flags.c
index 5aeceb80..a0b32f19 100644
--- a/test/pass/misc/11_env_flags.c
+++ b/test/pass/misc/11_env_flags.c
@@ -5,6 +5,7 @@
 // CHECK-NEXT: ---
 // CHECK-NEXT: types:           {{.*}}
 // CHECK-NEXT: heap:            false
+// CHECK-NEXT: gpu:             false
 // CHECK-NEXT: stack:           false
 // CHECK-NEXT: global:          false
 // CHECK-NEXT: stats:          false
diff --git a/test/pass/misc/12_env_flags_single_override.c b/test/pass/misc/12_env_flags_single_override.c
index 15a1f8b1..b2187556 100644
--- a/test/pass/misc/12_env_flags_single_override.c
+++ b/test/pass/misc/12_env_flags_single_override.c
@@ -5,6 +5,7 @@
 // CHECK-NEXT: ---
 // CHECK-NEXT: types:           {{.*}}
 // CHECK-NEXT: heap:            true
+// CHECK-NEXT: gpu:             false
 // CHECK-NEXT: stack:           false
 // CHECK-NEXT: global:          false
 // CHECK-NEXT: stats:          false
diff --git a/test/pass/misc/13_env_flags_phase_heap.c b/test/pass/misc/13_env_flags_phase_heap.c
index 5eb78c18..e2da7517 100644
--- a/test/pass/misc/13_env_flags_phase_heap.c
+++ b/test/pass/misc/13_env_flags_phase_heap.c
@@ -5,6 +5,7 @@
 // CHECK-NEXT: ---
 // CHECK-NEXT: types:           {{.*}}
 // CHECK-NEXT: heap:            true
+// CHECK-NEXT: gpu:             false
 // CHECK-NEXT: stack:           true
 // CHECK-NEXT: global:          true
 // CHECK-NEXT: stats:          false
diff --git a/test/pass/misc/14_env_flags_phase_ignored.c b/test/pass/misc/14_env_flags_phase_ignored.c
index 280887bb..358525f2 100644
--- a/test/pass/misc/14_env_flags_phase_ignored.c
+++ b/test/pass/misc/14_env_flags_phase_ignored.c
@@ -3,6 +3,7 @@
 
 // CHECK: Emitting TypeART configuration content
 // CHECK: heap:            true
+// CHECK: gpu:             false
 // CHECK-NOT: stack:           true
 // CHECK-NOT: {{^}}global:          true
 // CHECK-NOT: stats:          false

From 1ec4793cac11f91cdcd64b4842871cb7ef5ef383 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 28 Apr 2026 16:41:09 +0200
Subject: [PATCH 11/24] Initial HIP support

---
 cmake/typeartToolchainOptions.cmake           |  7 ++
 externals/dimeta/CMakeLists.txt               |  2 +-
 lib/passes/analysis/MemOpData.h               | 19 ++++-
 lib/passes/analysis/MemOpVisitor.cpp          |  8 +-
 .../instrumentation/MemOpArgCollector.cpp     |  2 +
 .../instrumentation/MemOpInstrumentation.cpp  |  6 +-
 .../instrumentation/TypeARTFunctions.cpp      | 32 ++++++--
 lib/passes/instrumentation/TypeARTFunctions.h |  2 +
 lib/passes/support/GpuUtil.h                  | 63 ++++++++++++++++
 lib/passes/support/HipUtil.h                  | 74 +++++++++++++++++++
 lib/passes/typegen/dimeta/DimetaTypeGen.cpp   | 15 ++--
 lib/passes/typegen/ir/TypeManager.cpp         | 14 ++--
 test/CMakeLists.txt                           | 43 ++++++++++-
 test/hip/pass/01_hipmalloc.c                  | 17 +++++
 test/hip/pass/02_hipfree.c                    | 15 ++++
 test/lit.cfg                                  |  7 ++
 test/lit.site.cfg.in                          |  2 +
 17 files changed, 297 insertions(+), 31 deletions(-)
 create mode 100644 lib/passes/support/GpuUtil.h
 create mode 100644 lib/passes/support/HipUtil.h
 create mode 100644 test/hip/pass/01_hipmalloc.c
 create mode 100644 test/hip/pass/02_hipfree.c

diff --git a/cmake/typeartToolchainOptions.cmake b/cmake/typeartToolchainOptions.cmake
index e0f3f8ed..a1444b4a 100644
--- a/cmake/typeartToolchainOptions.cmake
+++ b/cmake/typeartToolchainOptions.cmake
@@ -170,6 +170,13 @@ set_package_properties(CUDAToolkit PROPERTIES
   "CUDA toolkit enables host-side CUDA instrumentation and runtime helpers."
 )
 
+find_package(hip QUIET)
+set_package_properties(hip PROPERTIES
+  TYPE OPTIONAL
+  PURPOSE
+  "HIP enables host-side HIP instrumentation and runtime helpers."
+)
+
 typeart_find_llvm_progs(TYPEART_CLANG_EXEC "clang-${LLVM_VERSION_MAJOR};clang" DEFAULT_EXE "clang")
 typeart_find_llvm_progs(TYPEART_CLANGCXX_EXEC "clang++-${LLVM_VERSION_MAJOR};clang++" DEFAULT_EXE "clang++")
 typeart_find_llvm_progs(TYPEART_LLC_EXEC "llc-${LLVM_VERSION_MAJOR};llc" DEFAULT_EXE "llc")
diff --git a/externals/dimeta/CMakeLists.txt b/externals/dimeta/CMakeLists.txt
index 85d23e22..d0a19ddc 100644
--- a/externals/dimeta/CMakeLists.txt
+++ b/externals/dimeta/CMakeLists.txt
@@ -1,7 +1,7 @@
 FetchContent_Declare(
   llvm-dimeta
   GIT_REPOSITORY https://github.com/ahueck/llvm-dimeta
-  GIT_TAG v0.5.1
+  GIT_TAG feat/hip
   GIT_SHALLOW 1
 )
 
diff --git a/lib/passes/analysis/MemOpData.h b/lib/passes/analysis/MemOpData.h
index 4fcadeea..8255eb01 100644
--- a/lib/passes/analysis/MemOpData.h
+++ b/lib/passes/analysis/MemOpData.h
@@ -31,7 +31,7 @@ class IntrinsicInst;
 }  // namespace llvm
 
 namespace typeart {
-enum class MemOpKind : uint8_t {
+enum class MemOpKind : uint16_t {
   NewLike            = 1 << 0,            // allocates, never null
   MallocLike         = 1 << 1 | NewLike,  // allocates, maybe null
   AlignedAllocLike   = 1 << 2,            // allocates aligned, maybe null
@@ -40,12 +40,19 @@ enum class MemOpKind : uint8_t {
   FreeLike           = 1 << 5,            // free memory
   DeleteLike         = 1 << 6,            // delete (cpp) memory
   CudaMallocLike     = 1 << 7,            // cuda out-parameter allocation
+  HipMallocLike      = 1 << 8,            // hip out-parameter allocation
   MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike,
   AllocLike          = MallocOrCallocLike,
   AnyAlloc           = AllocLike | ReallocLike,
-  AnyFree            = FreeLike | DeleteLike
+  AnyFree            = FreeLike | DeleteLike,
+  GpuMallocLike      = CudaMallocLike | HipMallocLike
 };
 
+inline bool is_kind(MemOpKind kind, MemOpKind mask) {
+  return (static_cast<std::underlying_type_t<MemOpKind>>(kind) &
+          static_cast<std::underlying_type_t<MemOpKind>>(mask)) != 0;
+}
+
 struct MemOps {
   inline std::optional<MemOpKind> kind(llvm::StringRef function) const {
     if (auto alloc = allocKind(function)) {
@@ -108,6 +115,11 @@ struct MemOps {
       {"cudaMallocManaged", MemOpKind::CudaMallocLike},
       {"cudaMallocAsync", MemOpKind::CudaMallocLike},
       {"cudaMallocFromPoolAsync", MemOpKind::CudaMallocLike},
+      {"hipMalloc", MemOpKind::HipMallocLike},
+      {"hipHostMalloc", MemOpKind::HipMallocLike},
+      {"hipMallocManaged", MemOpKind::HipMallocLike},
+      {"hipMallocAsync", MemOpKind::HipMallocLike},
+      {"hipMallocFromPoolAsync", MemOpKind::HipMallocLike},
   };
 
   const llvm::StringMap<MemOpKind> dealloc_map{
@@ -129,6 +141,9 @@ struct MemOps {
       {"cudaFree", MemOpKind::FreeLike},
       {"cudaFreeHost", MemOpKind::FreeLike},
       {"cudaFreeAsync", MemOpKind::FreeLike},
+      {"hipFree", MemOpKind::FreeLike},
+      {"hipFreeHost", MemOpKind::FreeLike},
+      {"hipFreeAsync", MemOpKind::FreeLike},
   };
   //clang-format off
 };
diff --git a/lib/passes/analysis/MemOpVisitor.cpp b/lib/passes/analysis/MemOpVisitor.cpp
index e9c98811..e77762cd 100644
--- a/lib/passes/analysis/MemOpVisitor.cpp
+++ b/lib/passes/analysis/MemOpVisitor.cpp
@@ -16,7 +16,7 @@
 #include "compat/CallSite.h"
 #include "configuration/Configuration.h"
 #include "support/ConfigurationBase.h"
-#include "support/CudaUtil.h"
+#include "support/GpuUtil.h"
 #include "support/Error.h"
 #include "support/Logger.h"
 #include "support/TypeUtil.h"
@@ -98,7 +98,7 @@ void MemOpVisitor::visitCallBase(llvm::CallBase& cb) {
     return;
   }
   const auto* called_function = cb.getCalledFunction();
-  if (!collect_gpu && called_function != nullptr && cuda::is_cuda_function(*called_function)) {
+  if (!collect_gpu && called_function != nullptr && gpu::is_gpu_function(*called_function)) {
     return;
   }
   const auto isInSet = [&](const auto& fMap) -> std::optional<MemOpKind> {
@@ -235,8 +235,8 @@ std::pair<MallocGeps, MallocBcasts> collectRelevantMallocUsers(llvm::CallBase& c
   auto geps   = MallocGeps{};
   auto bcasts = MallocBcasts{};
 
-  if (kind == MemOpKind::CudaMallocLike) {
-    if (auto bitcast = cuda::bitcast_for(call_inst); bitcast.has_value()) {
+  if (is_kind(kind, MemOpKind::GpuMallocLike)) {
+    if (auto bitcast = gpu::bitcast_for(call_inst, kind); bitcast.has_value()) {
       bcasts.insert(*bitcast);
     }
     return {geps, bcasts};
diff --git a/lib/passes/instrumentation/MemOpArgCollector.cpp b/lib/passes/instrumentation/MemOpArgCollector.cpp
index b70ba713..f1531c1e 100644
--- a/lib/passes/instrumentation/MemOpArgCollector.cpp
+++ b/lib/passes/instrumentation/MemOpArgCollector.cpp
@@ -105,6 +105,8 @@ HeapArgList MemOpArgCollector::collectHeap(const MallocDataList& mallocs) {
         byte_count = malloc_call->getArgOperand(1);
         break;
       case MemOpKind::CudaMallocLike:
+        [[fallthrough]];
+      case MemOpKind::HipMallocLike:
         byte_count = malloc_call->getArgOperand(1);
         if (mdata.primary != nullptr) {
           pointer = mdata.primary->getOperand(0);
diff --git a/lib/passes/instrumentation/MemOpInstrumentation.cpp b/lib/passes/instrumentation/MemOpInstrumentation.cpp
index d16a99f5..2d742c80 100644
--- a/lib/passes/instrumentation/MemOpInstrumentation.cpp
+++ b/lib/passes/instrumentation/MemOpInstrumentation.cpp
@@ -74,7 +74,7 @@ InstrCount MemOpInstrumentation::instrumentHeap(const HeapArgList& heap) {
   for (const auto& [malloc, args] : heap) {
     auto kind = malloc.kind;
     Instruction* malloc_call{nullptr};
-    if (malloc.kind == MemOpKind::CudaMallocLike) {
+    if (is_kind(malloc.kind, MemOpKind::GpuMallocLike)) {
       malloc_call = llvm::cast<Instruction>(malloc.call);
     } else {
       malloc_call = args.get_as<llvm::Instruction>(ArgMap::ID::pointer);
@@ -149,7 +149,9 @@ InstrCount MemOpInstrumentation::instrumentHeap(const HeapArgList& heap) {
                                                             target_memory_address);
         break;
       }
-      case MemOpKind::CudaMallocLike: {
+      case MemOpKind::CudaMallocLike:
+        [[fallthrough]];
+      case MemOpKind::HipMallocLike: {
         auto* runtime_ptr_type = instrumentation_helper->getTypeFor(IType::ptr);
 #if LLVM_VERSION_MAJOR >= 15
         auto* loaded_ptr = IRB.CreateLoad(runtime_ptr_type, pointer_value);
diff --git a/lib/passes/instrumentation/TypeARTFunctions.cpp b/lib/passes/instrumentation/TypeARTFunctions.cpp
index 17c295fa..b2bc1cea 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.cpp
+++ b/lib/passes/instrumentation/TypeARTFunctions.cpp
@@ -17,6 +17,7 @@
 #include "instrumentation/TypeIDProvider.h"
 #include "support/ConfigurationBase.h"
 #include "support/CudaUtil.h"
+#include "support/HipUtil.h"
 #include "support/Logger.h"
 #include "support/OmpUtil.h"
 
@@ -52,6 +53,8 @@ std::string get_func_suffix(IFunc id) {
   switch (id) {
     case IFunc::free_cuda:
     case IFunc::heap_cuda:
+    case IFunc::free_hip:
+    case IFunc::heap_hip:
       return "_gpu";
     case IFunc::free_omp:
     case IFunc::heap_omp:
@@ -63,7 +66,7 @@ std::string get_func_suffix(IFunc id) {
   }
 }
 
-enum class IFuncType : unsigned { standard, omp, cuda };
+enum class IFuncType : unsigned { standard, omp, cuda, hip };
 
 IFuncType ifunc_type_for(llvm::Function* f) {
   if (f == nullptr) {
@@ -74,6 +77,10 @@ IFuncType ifunc_type_for(llvm::Function* f) {
     return IFuncType::cuda;
   }
 
+  if (hip::is_hip_function(*f)) {
+    return IFuncType::hip;
+  }
+
   if (util::omp::isOmpContext(f)) {
     return IFuncType::omp;
   }
@@ -93,10 +100,10 @@ IFunc ifunc_for_function(IFunc general_type, llvm::Value* value) {
   } else if (llvm::isa<GlobalVariable>(value)) {
     type = detail::ifunc_type_for(nullptr);
   } else if (auto callbase = llvm::dyn_cast<CallBase>(value)) {
-    type            = detail::ifunc_type_for(callbase->getFunction());
-    auto maybe_cuda = detail::ifunc_type_for(callbase->getCalledFunction());
-    if (maybe_cuda == detail::IFuncType::cuda) {
-      type = detail::IFuncType::cuda;
+    type               = detail::ifunc_type_for(callbase->getFunction());
+    auto called_context = detail::ifunc_type_for(callbase->getCalledFunction());
+    if (called_context == detail::IFuncType::cuda || called_context == detail::IFuncType::hip) {
+      type = called_context;
     }
   }
 
@@ -115,6 +122,17 @@ IFunc ifunc_for_function(IFunc general_type, llvm::Value* value) {
     }
   }
 
+  if (detail::IFuncType::hip == type) {
+    switch (general_type) {
+      case IFunc::heap:
+        return IFunc::heap_hip;
+      case IFunc::free:
+        return IFunc::free_hip;
+      default:
+        return general_type;
+    }
+  }
+
   switch (general_type) {
     case IFunc::stack:
       return IFunc::stack_omp;
@@ -180,6 +198,7 @@ llvm::Function* TAFunctionDeclarator::make_function(IFunc func_id, llvm::StringR
   const auto name = make_fname(basename, args);
 
   if (auto it = function_map.find(name); it != function_map.end()) {
+    typeart_functions.putFunctionFor(func_id, it->second);
     return it->second;
   }
 
@@ -343,6 +362,8 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
 
   typeart_alloc_cuda.f = decl.make_function(IFunc::heap_cuda, typeart_alloc_cuda.name, alloc_arg_types);
   typeart_free_cuda.f  = decl.make_function(IFunc::free_cuda, typeart_free_cuda.name, free_arg_types);
+  decl.make_function(IFunc::heap_hip, typeart_alloc_cuda.name, alloc_arg_types);
+  decl.make_function(IFunc::free_hip, typeart_free_cuda.name, free_arg_types);
 
   typeart_alloc_omp_mty.f =
       decl_alternatives.make_function(IFunc::heap_omp, typeart_alloc_omp_mty.name, alloc_arg_types_mty);
@@ -351,6 +372,7 @@ std::unique_ptr<TAFunctionQuery> declare_instrumentation_functions(llvm::Module&
 
   typeart_alloc_cuda_mty.f =
       decl_alternatives.make_function(IFunc::heap_cuda, typeart_alloc_cuda_mty.name, alloc_arg_types_mty);
+  decl_alternatives.make_function(IFunc::heap_hip, typeart_alloc_cuda_mty.name, alloc_arg_types_mty);
 
   return std::make_unique<TAFunctionAlternatives>(functions, functions_alternative);
 }
diff --git a/lib/passes/instrumentation/TypeARTFunctions.h b/lib/passes/instrumentation/TypeARTFunctions.h
index 3128d92e..e43a103c 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.h
+++ b/lib/passes/instrumentation/TypeARTFunctions.h
@@ -44,6 +44,8 @@ enum class IFunc : unsigned {
   scope_omp,
   heap_cuda,
   free_cuda,
+  heap_hip,
+  free_hip,
   type
 };
 
diff --git a/lib/passes/support/GpuUtil.h b/lib/passes/support/GpuUtil.h
new file mode 100644
index 00000000..5ebcf653
--- /dev/null
+++ b/lib/passes/support/GpuUtil.h
@@ -0,0 +1,63 @@
+// TypeART library
+//
+// Copyright (c) 2017-2026 TypeART Authors
+// Distributed under the BSD 3-Clause license.
+// (See accompanying file LICENSE.txt or copy at
+// https://opensource.org/licenses/BSD-3-Clause)
+//
+// Project home: https://github.com/tudasc/TypeART
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#ifndef TYPEART_GPUUTIL_H
+#define TYPEART_GPUUTIL_H
+
+#include "analysis/MemOpData.h"
+#include "support/CudaUtil.h"
+#include "support/HipUtil.h"
+
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+
+#include <optional>
+
+namespace typeart::gpu {
+
+inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& cb, MemOpKind kind) {
+  if (kind == MemOpKind::CudaMallocLike) {
+    return cuda::bitcast_for(cb);
+  }
+  if (kind == MemOpKind::HipMallocLike) {
+    return hip::bitcast_for(cb);
+  }
+  return std::nullopt;
+}
+
+inline bool is_templated_malloc_like(llvm::StringRef name, MemOpKind kind) {
+  if (kind == MemOpKind::CudaMallocLike) {
+    return cuda::is_templated_malloc_like(name);
+  }
+  if (kind == MemOpKind::HipMallocLike) {
+    return hip::is_templated_malloc_like(name);
+  }
+  return false;
+}
+
+inline bool is_templated_malloc_like(const llvm::Function& function, MemOpKind kind) {
+  if (kind == MemOpKind::CudaMallocLike) {
+    return cuda::is_templated_malloc_like(function);
+  }
+  if (kind == MemOpKind::HipMallocLike) {
+    return hip::is_templated_malloc_like(function);
+  }
+  return false;
+}
+
+inline bool is_gpu_function(const llvm::Function& function) {
+  return cuda::is_cuda_function(function) || hip::is_hip_function(function);
+}
+
+}  // namespace typeart::gpu
+
+#endif  // TYPEART_GPUUTIL_H
diff --git a/lib/passes/support/HipUtil.h b/lib/passes/support/HipUtil.h
new file mode 100644
index 00000000..1628db97
--- /dev/null
+++ b/lib/passes/support/HipUtil.h
@@ -0,0 +1,74 @@
+// TypeART library
+//
+// Copyright (c) 2017-2026 TypeART Authors
+// Distributed under the BSD 3-Clause license.
+// (See accompanying file LICENSE.txt or copy at
+// https://opensource.org/licenses/BSD-3-Clause)
+//
+// Project home: https://github.com/tudasc/TypeART
+//
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#ifndef TYPEART_HIPUTIL_H
+#define TYPEART_HIPUTIL_H
+
+#include "analysis/MemOpData.h"
+#include "support/Util.h"
+
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+
+namespace typeart::hip {
+
+inline std::optional<llvm::BitCastInst*> bitcast_for(llvm::Value* hip_ptr) {
+  std::optional<llvm::BitCastInst*> fallback;
+  for (auto& use : hip_ptr->uses()) {
+    auto* use_value = use.get();
+    auto* bitcast   = llvm::dyn_cast<llvm::BitCastInst>(use_value);
+    if (bitcast == nullptr) {
+      continue;
+    }
+
+    if (auto* primary_bitcast = llvm::dyn_cast<llvm::BitCastInst>(bitcast->getOperand(0))) {
+      return primary_bitcast;
+    }
+
+    fallback = bitcast;
+    return fallback;
+  }
+  return fallback;
+}
+
+inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& hip_call) {
+  return bitcast_for(hip_call.getArgOperand(0));
+}
+
+inline bool is_hip_function(const llvm::Function& function) {
+  const auto function_name = util::try_demangle(function);
+  return util::starts_with_any_of(function_name, "hip");
+}
+
+inline bool is_templated_malloc_like(llvm::StringRef name) {
+  const auto templ_start_pos = name.find_first_of('<');
+  if (templ_start_pos == llvm::StringRef::npos) {
+    return false;
+  }
+  auto extracted_fn = name.substr(0, templ_start_pos);
+  MemOps ops;
+  return ops.allocKind(extracted_fn) == MemOpKind::HipMallocLike;
+}
+
+inline bool is_templated_malloc_like(const llvm::Function& function) {
+  const std::string name = util::try_demangle(function);
+  return is_templated_malloc_like(name);
+}
+
+}  // namespace typeart::hip
+
+#endif  // TYPEART_HIPUTIL_H
diff --git a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
index 528bce9d..87b4002a 100644
--- a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
+++ b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
@@ -14,7 +14,7 @@
 #include "Dimeta.h"
 #include "DimetaData.h"
 #include "analysis/MemOpData.h"
-#include "support/CudaUtil.h"
+#include "support/GpuUtil.h"
 #include "support/Logger.h"
 #include "typegen/TypeGenerator.h"
 #include "typelib/TypeDatabase.h"
@@ -495,13 +495,12 @@ class DimetaTypeManager final : public TypeIDGenerator {
         LOG_DEBUG("Registering malloc-like")
 
         const auto function_name = val->location.function;
-        if (call->getCalledFunction() != nullptr && cuda::is_templated_malloc_like(function_name)) {
-          MemOps mem_operations;
-          auto kind = mem_operations.kind(call->getCalledFunction()->getName());
-          if (kind == MemOpKind::CudaMallocLike) {
-            LOG_DEBUG("Workaround for pointer level of call base " << function_name)
-            workaround::remove_pointer_level(call, val.value());
-          }
+        MemOps mem_operations;
+        auto kind = call->getCalledFunction() != nullptr ? mem_operations.kind(call->getCalledFunction()->getName()) : std::nullopt;
+
+        if (kind && is_kind(kind.value(), MemOpKind::GpuMallocLike) && gpu::is_templated_malloc_like(function_name, kind.value())) {
+          LOG_DEBUG("Workaround for pointer level of call base " << function_name)
+          workaround::remove_pointer_level(call, val.value());
         }
 
         return {getOrRegister(val->type, true), array_size(val->type)};
diff --git a/lib/passes/typegen/ir/TypeManager.cpp b/lib/passes/typegen/ir/TypeManager.cpp
index 1927f920..2c346700 100644
--- a/lib/passes/typegen/ir/TypeManager.cpp
+++ b/lib/passes/typegen/ir/TypeManager.cpp
@@ -15,7 +15,7 @@
 #include "IRTypeGen.h"
 #include "StructTypeHandler.h"
 #include "VectorTypeHandler.h"
-#include "support/CudaUtil.h"
+#include "support/GpuUtil.h"
 #include "support/Logger.h"
 #include "support/TypeUtil.h"
 #include "support/Util.h"
@@ -280,13 +280,13 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
   BitCastInst* primaryBitcast = mdata.primary;
   llvm::Type* allocation_type = nullptr;
 
-  if (mdata.kind == MemOpKind::CudaMallocLike && primaryBitcast == nullptr) {
-    if (auto bitcast = cuda::bitcast_for(*malloc_call); bitcast.has_value()) {
+  if (is_kind(mdata.kind, MemOpKind::GpuMallocLike) && primaryBitcast == nullptr) {
+    if (auto bitcast = gpu::bitcast_for(*malloc_call, mdata.kind); bitcast.has_value()) {
       primaryBitcast = *bitcast;
     }
   }
 
-  if (mdata.kind == MemOpKind::CudaMallocLike) {
+  if (is_kind(mdata.kind, MemOpKind::GpuMallocLike)) {
     allocation_type = llvm::Type::getInt8Ty(malloc_call->getContext());
   } else {
     auto pointee_type = tu::getPointerElementType(malloc_call->getType());
@@ -295,7 +295,7 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
 
   int typeId = getOrRegisterType(allocation_type, dl);  // retrieveTypeID(tu::getVoidType(c));
 
-  if (mdata.kind == MemOpKind::CudaMallocLike) {
+  if (is_kind(mdata.kind, MemOpKind::GpuMallocLike)) {
     typeId = TYPEART_POINTER;
   }
 
@@ -308,7 +308,7 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
   // Number of bytes per element, 1 for void*
   unsigned typeSize = tu::getTypeSizeInBytes(allocation_type, dl);
 
-  if (mdata.kind == MemOpKind::CudaMallocLike) {
+  if (is_kind(mdata.kind, MemOpKind::GpuMallocLike)) {
     typeSize = 1;
   }
 
@@ -319,7 +319,7 @@ TypeIdentifier TypeManager::getOrRegisterType(const MallocData& mdata) {
       dstPtrType = *pointee_type;
     }
     // Basically: getSrcTy()->getPointerElementType()->getPointerElementType():
-    if (mdata.kind == MemOpKind::CudaMallocLike && dstPtrType == nullptr) {
+    if (is_kind(mdata.kind, MemOpKind::GpuMallocLike) && dstPtrType == nullptr) {
       if (auto pointee_type = tu::getPointerElementType(primaryBitcast->getSrcTy()); pointee_type.has_value()) {
         dstPtrType = *pointee_type;
       }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 361044d0..6c3f6c53 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -45,6 +45,37 @@ endfunction()
 
 cuda_runnable_detect(TYPEART_CUDA_RUNNABLE)
 
+function(hip_runnable_detect hip_is_runnable)
+  set(${hip_is_runnable} 0 PARENT_SCOPE)
+  if(NOT hip_FOUND)
+    return()
+  endif()
+
+  find_program(TYPEART_ROCM_SMI_EXEC rocm-smi)
+  mark_as_advanced(TYPEART_ROCM_SMI_EXEC)
+  if(NOT TYPEART_ROCM_SMI_EXEC)
+    return()
+  endif()
+
+  execute_process(
+    COMMAND ${TYPEART_ROCM_SMI_EXEC} -L
+    RESULT_VARIABLE smi_result
+    OUTPUT_VARIABLE smi_out
+    ERROR_QUIET
+  )
+  if(NOT smi_result EQUAL 0)
+    return()
+  endif()
+
+  string(REGEX MATCHALL "GPU [0-9]+:" smi_gpu_list ${smi_out})
+  list(LENGTH smi_gpu_list TYPEART_HIP_GPU_COUNT)
+  if(TYPEART_HIP_GPU_COUNT GREATER 0)
+    set(${hip_is_runnable} 1 PARENT_SCOPE)
+  endif()
+endfunction()
+
+hip_runnable_detect(TYPEART_HIP_RUNNABLE)
+
 macro(pythonize_bool truth_var var)
   if(${truth_var})
     set(${var} True)
@@ -76,6 +107,7 @@ function(typeart_configure_lit_site input output)
 
   set(TYPEARTPASS_CUDA_FLAGS "")
   set(TYPEARTPASS_CUDA_PATH "")
+  
   if(CUDAToolkit_FOUND)
     if(DEFINED CUDAToolkit_TARGET_DIR AND NOT "${CUDAToolkit_TARGET_DIR}" STREQUAL "")
       set(TYPEARTPASS_CUDA_PATH "${CUDAToolkit_TARGET_DIR}")
@@ -97,9 +129,14 @@ function(typeart_configure_lit_site input output)
     endif()
   endif()
 
-  set(TYPEART_CUDA_STATIC_AVAILABLE FALSE)
+  set(TYPEART_CUDA_STATIC_AVAILABLE False)
   if(CUDAToolkit_FOUND OR NOT "${TYPEARTPASS_CUDA_PATH}" STREQUAL "")
-    set(TYPEART_CUDA_STATIC_AVAILABLE TRUE)
+    set(TYPEART_CUDA_STATIC_AVAILABLE True)
+  endif()
+
+  set(TYPEART_HIP_STATIC_AVAILABLE False)
+  if(hip_FOUND)
+    set(TYPEART_HIP_STATIC_AVAILABLE True)
   endif()
 
   pythonize_bool(TYPEART_CUDA_STATIC_AVAILABLE TYPEARTPASS_CUDA_STATIC)
@@ -206,6 +243,7 @@ set(TYPEART_SUITES
   all
   pass
   cuda
+  hip
   runtime
   script
   typemapping
@@ -219,6 +257,7 @@ set(TYPEART_SUITES_WORKERS
   1
   1
   1
+  1
   ${NUM_CPU}
   1
   1
diff --git a/test/hip/pass/01_hipmalloc.c b/test/hip/pass/01_hipmalloc.c
new file mode 100644
index 00000000..01833e53
--- /dev/null
+++ b/test/hip/pass/01_hipmalloc.c
@@ -0,0 +1,17 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+
+// REQUIRES: hip_static && !llvm-14
+
+// LLVM: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// LLVM: [[HIP_LOAD:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
+// LLVM: call void @__typeart_alloc_gpu(ptr [[HIP_LOAD]], i32 23
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipMalloc(&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/hip/pass/02_hipfree.c b/test/hip/pass/02_hipfree.c
new file mode 100644
index 00000000..b7efaa99
--- /dev/null
+++ b/test/hip/pass/02_hipfree.c
@@ -0,0 +1,15 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip_static && !llvm-14
+
+// CHECK: call i32 @hipFree(ptr {{.*}}[[HIP_POINTER:%[0-9a-z]+]])
+// CHECK-NEXT: __typeart_free_gpu(ptr {{.*}}[[HIP_POINTER]])
+
+#include <hip/hip_runtime.h>
+int main() {
+  float* d_x;
+
+  hipFree(d_x);
+
+  return 0;
+}
diff --git a/test/lit.cfg b/test/lit.cfg
index d565c392..9ee0e6c2 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -63,6 +63,9 @@ if config.cuda_static:
 if config.cuda_runtime:
   config.available_features.add('cuda_runtime')
 
+if config.has_hip:
+  config.available_features.add('hip_static')
+
 profile_files       = getattr(config, 'profile_file', None)
 typeart_base_lib_dir= getattr(config, 'typeart_base_lib_dir', None)
 typeart_lib_root    = getattr(config, 'typeart_lib_dir', None)
@@ -80,6 +83,8 @@ if cuda_path:
   cuda_host_args += ' --cuda-path={}'.format(cuda_path)
 cuda_flags          = getattr(config, 'cuda_flags', '')
 cuda_link_flags     = f"-L{config.cuda_lib} -Wl,-rpath,{config.cuda_lib} -lcudart"
+hip_host_args       = '-x hip --offload-host-only -nogpulib'
+hip_flags           = getattr(config, 'hip_flags', '')
 type_file           = 'typeart-types.yaml'
 openmp_c_flags      = getattr(config, 'openmp_c_flags', None)
 openmp_cxx_flags    = getattr(config, 'openmp_cxx_flags', None)
@@ -157,6 +162,8 @@ config.substitutions.append(('%c-to-llvm', '{} {}'.format(clang_cc, to_llvm_args
 config.substitutions.append(('%cpp-to-llvm', '{} {}'.format(clang_cpp, to_llvm_args)))
 config.substitutions.append(('%cuda-c-to-llvm', '{} {} {} {}'.format(clang_cc, cuda_host_args, cuda_flags, to_llvm_args)))
 config.substitutions.append(('%cuda-cpp-to-llvm', '{} {} {} {}'.format(clang_cpp, cuda_host_args, cuda_flags, to_llvm_args)))
+config.substitutions.append(('%hip-c-to-llvm', '{} {} {} {}'.format(clang_cc, hip_host_args, hip_flags, to_llvm_args)))
+config.substitutions.append(('%hip-cpp-to-llvm', '{} {} {} {}'.format(clang_cpp, hip_host_args, hip_flags, to_llvm_args)))
 config.substitutions.append(('%run', '{}/run.sh'.format(typeart_script_dir)))
 config.substitutions.append(('%apply', '{}/apply.sh'.format(typeart_script_dir)))
 
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index b399460c..8837a79c 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -45,6 +45,8 @@ config.cuda_flags="@TYPEARTPASS_CUDA_FLAGS@"
 config.cuda_path="@TYPEARTPASS_CUDA_PATH@"
 config.cuda_lib="@CUDAToolkit_LIBRARY_DIR@"
 
+config.has_hip = "@TYPEART_HIP_STATIC_AVAILABLE@"
+
 config.python_interp = "@Python3_EXECUTABLE@"
 
 config.llvm_version = @LLVM_VERSION_MAJOR@

From 780d892b01115a2dc88f3eb3cef77c848e87bfb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Wed, 29 Apr 2026 16:33:45 +0200
Subject: [PATCH 12/24] Simplify CMake GPU test setup

---
 lib/passes/TypeARTPass.cpp               |  5 +-
 lib/passes/support/GpuUtil.h             |  5 ++
 lib/passes/support/HipUtil.h             |  9 +++
 test/CMakeLists.txt                      | 74 +++++++++++++++++-------
 test/cuda/pass/01_cudamalloc.c           |  2 +-
 test/cuda/pass/02_cudafree.c             |  2 +-
 test/cuda/pass/03_cudahostalloc.c        |  2 +-
 test/cuda/pass/04_cudamalloc_host.c      |  2 +-
 test/cuda/pass/05_cudamalloc_managed.c   |  2 +-
 test/cuda/pass/06_cudamalloc_nonvoid.cpp |  2 +-
 test/cuda/pass/07_axpy.c                 |  2 +-
 test/cuda/pass/08_cudamalloc_async.c     |  4 +-
 test/hip/pass/01_hipmalloc.c             |  2 +-
 test/hip/pass/02_hipfree.c               |  2 +-
 test/lit.cfg                             | 16 +++--
 test/lit.site.cfg.in                     | 11 ++--
 16 files changed, 100 insertions(+), 42 deletions(-)

diff --git a/lib/passes/TypeARTPass.cpp b/lib/passes/TypeARTPass.cpp
index ba3d4b2f..19851132 100644
--- a/lib/passes/TypeARTPass.cpp
+++ b/lib/passes/TypeARTPass.cpp
@@ -26,6 +26,7 @@
 #include "instrumentation/TypeIDProvider.h"
 #include "support/ConfigurationBase.h"
 #include "support/CudaUtil.h"
+#include "support/GpuUtil.h"
 #include "support/Logger.h"
 #include "support/ModuleDumper.h"
 #include "support/Table.h"
@@ -270,8 +271,8 @@ class TypeArtPass : public llvm::PassInfoMixin<TypeArtPass> {
 
 llvm::PreservedAnalyses
 run(llvm::Module& m, llvm::ModuleAnalysisManager&) {
-  if (cuda::is_device_module(m)) {
-    LOG_DEBUG("Skipping CUDA device module: " << m.getName());
+  if (gpu::is_device_module(m)) {
+    LOG_DEBUG("Skipping GPU device module: " << m.getName());
     return llvm::PreservedAnalyses::all();
   }
   bool changed{false};
diff --git a/lib/passes/support/GpuUtil.h b/lib/passes/support/GpuUtil.h
index 5ebcf653..f695a1a4 100644
--- a/lib/passes/support/GpuUtil.h
+++ b/lib/passes/support/GpuUtil.h
@@ -19,11 +19,16 @@
 
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
 
 #include <optional>
 
 namespace typeart::gpu {
 
+inline bool is_device_module(const llvm::Module& module) {
+  return cuda::is_device_module(module) || hip::is_device_module(module);
+}
+
 inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& cb, MemOpKind kind) {
   if (kind == MemOpKind::CudaMallocLike) {
     return cuda::bitcast_for(cb);
diff --git a/lib/passes/support/HipUtil.h b/lib/passes/support/HipUtil.h
index 1628db97..360ad074 100644
--- a/lib/passes/support/HipUtil.h
+++ b/lib/passes/support/HipUtil.h
@@ -49,6 +49,15 @@ inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& hip_c
   return bitcast_for(hip_call.getArgOperand(0));
 }
 
+inline bool is_device_module(const llvm::Module& module) {
+#if LLVM_VERSION_MAJOR >= 20
+  const auto triple = module.getTargetTriple().str();
+#else
+  const auto triple = module.getTargetTriple();
+#endif
+  return llvm::StringRef{triple}.find("amdgcn") != llvm::StringRef::npos;
+}
+
 inline bool is_hip_function(const llvm::Function& function) {
   const auto function_name = util::try_demangle(function);
   return util::starts_with_any_of(function_name, "hip");
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 6c3f6c53..28451681 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -2,6 +2,10 @@ typeart_find_llvm_progs(TYPEART_FILECHECK_EXEC "FileCheck-${LLVM_VERSION_MAJOR};
 
 find_package(Python3 COMPONENTS Interpreter)
 
+set(TYPEART_HIP_ARCH_OVERRIDE "" CACHE STRING "Override HIP offload architecture for lit tests (e.g., gfx1031)")
+
+option(TYPEART_DEBUG_HIP_TARGETS "Print HIP target properties during configure" OFF)
+
 if(LLVM_EXTERNAL_LIT)
   cmake_path(GET LLVM_EXTERNAL_LIT PARENT_PATH LLVM_EXTERNAL_LIT_DIR)
 endif()
@@ -45,8 +49,9 @@ endfunction()
 
 cuda_runnable_detect(TYPEART_CUDA_RUNNABLE)
 
-function(hip_runnable_detect hip_is_runnable)
+function(hip_runnable_detect hip_is_runnable hip_arch)
   set(${hip_is_runnable} 0 PARENT_SCOPE)
+  set(${hip_arch} "" PARENT_SCOPE)
   if(NOT hip_FOUND)
     return()
   endif()
@@ -58,7 +63,7 @@ function(hip_runnable_detect hip_is_runnable)
   endif()
 
   execute_process(
-    COMMAND ${TYPEART_ROCM_SMI_EXEC} -L
+    COMMAND ${TYPEART_ROCM_SMI_EXEC} --showproductname --showuniqueid --json
     RESULT_VARIABLE smi_result
     OUTPUT_VARIABLE smi_out
     ERROR_QUIET
@@ -67,14 +72,25 @@ function(hip_runnable_detect hip_is_runnable)
     return()
   endif()
 
-  string(REGEX MATCHALL "GPU [0-9]+:" smi_gpu_list ${smi_out})
-  list(LENGTH smi_gpu_list TYPEART_HIP_GPU_COUNT)
+  string(FIND "${smi_out}" "{" json_start)
+  if(json_start EQUAL -1)
+    return()
+  endif()
+
+  string(SUBSTRING "${smi_out}" ${json_start} -1 clean_json)
+  string(JSON TYPEART_HIP_GPU_COUNT LENGTH "${clean_json}")
+
   if(TYPEART_HIP_GPU_COUNT GREATER 0)
     set(${hip_is_runnable} 1 PARENT_SCOPE)
+    string(JSON TYPEART_HIP_ARCH GET "${clean_json}" "card0" "GFX Version")
+    set(${hip_arch} ${TYPEART_HIP_ARCH} PARENT_SCOPE)
   endif()
 endfunction()
 
-hip_runnable_detect(TYPEART_HIP_RUNNABLE)
+hip_runnable_detect(TYPEART_HIP_RUNNABLE TYPEART_HIP_ARCH)
+if(TYPEART_HIP_ARCH_OVERRIDE)
+  set(TYPEART_HIP_ARCH ${TYPEART_HIP_ARCH_OVERRIDE})
+endif()
 
 macro(pythonize_bool truth_var var)
   if(${truth_var})
@@ -107,7 +123,9 @@ function(typeart_configure_lit_site input output)
 
   set(TYPEARTPASS_CUDA_FLAGS "")
   set(TYPEARTPASS_CUDA_PATH "")
-  
+  set(TYPEARTPASS_HIP_FLAGS "")
+  set(TYPEARTPASS_HIP_LINK_FLAGS "")
+
   if(CUDAToolkit_FOUND)
     if(DEFINED CUDAToolkit_TARGET_DIR AND NOT "${CUDAToolkit_TARGET_DIR}" STREQUAL "")
       set(TYPEARTPASS_CUDA_PATH "${CUDAToolkit_TARGET_DIR}")
@@ -117,30 +135,44 @@ function(typeart_configure_lit_site input output)
       cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH TYPEARTPASS_CUDA_PATH)
     endif()
 
-    foreach(cuda_inc ${CUDAToolkit_INCLUDE_DIRS})
-      string(APPEND TYPEARTPASS_CUDA_FLAGS " -isystem ${cuda_inc}")
-    endforeach()
-  else()
-    find_path(TYPEART_CUDA_RUNTIME_HEADER_DIR cuda_runtime_api.h
-      HINTS /usr/local/cuda/include /usr/include /opt/cuda/include
-    )
-    if(TYPEART_CUDA_RUNTIME_HEADER_DIR)
-      cmake_path(GET TYPEART_CUDA_RUNTIME_HEADER_DIR PARENT_PATH TYPEARTPASS_CUDA_PATH)
+    if(CUDAToolkit_INCLUDE_DIRS)
+      list(TRANSFORM CUDAToolkit_INCLUDE_DIRS PREPEND "-isystem ")
+      list(JOIN CUDAToolkit_INCLUDE_DIRS " " TYPEARTPASS_CUDA_FLAGS)
+      string(PREPEND TYPEARTPASS_CUDA_FLAGS " ")
     endif()
   endif()
 
-  set(TYPEART_CUDA_STATIC_AVAILABLE False)
-  if(CUDAToolkit_FOUND OR NOT "${TYPEARTPASS_CUDA_PATH}" STREQUAL "")
-    set(TYPEART_CUDA_STATIC_AVAILABLE True)
-  endif()
+  set(TYPEART_CUDA_STATIC_AVAILABLE ${CUDAToolkit_FOUND})
 
   set(TYPEART_HIP_STATIC_AVAILABLE False)
   if(hip_FOUND)
     set(TYPEART_HIP_STATIC_AVAILABLE True)
+
+    set(TYPEART_HIP_INCLUDE_DIR_INPUT "")
+    if(hip_INCLUDE_DIRS)
+      set(TYPEART_HIP_INCLUDE_DIR_INPUT ${hip_INCLUDE_DIRS})
+    elseif(hip_INCLUDE_DIR)
+      set(TYPEART_HIP_INCLUDE_DIR_INPUT ${hip_INCLUDE_DIR})
+    endif()
+
+    if(TYPEART_HIP_INCLUDE_DIR_INPUT)
+      list(TRANSFORM TYPEART_HIP_INCLUDE_DIR_INPUT PREPEND "-isystem ")
+      list(JOIN TYPEART_HIP_INCLUDE_DIR_INPUT " " TYPEARTPASS_HIP_FLAGS)
+      string(PREPEND TYPEARTPASS_HIP_FLAGS " ")
+    endif()
+
+    set(TYPEART_HIP_LINK_FLAGS_MISC "--hip-link")
+    if(hip_LIB_INSTALL_DIR)
+      list(APPEND TYPEART_HIP_LINK_FLAGS_MISC "-L${hip_LIB_INSTALL_DIR}")
+      list(APPEND TYPEART_HIP_LINK_FLAGS_MISC "-Wl,-rpath,${hip_LIB_INSTALL_DIR}")
+    endif()
+    list(JOIN TYPEART_HIP_LINK_FLAGS_MISC " " TYPEARTPASS_HIP_LINK_FLAGS)
   endif()
 
-  pythonize_bool(TYPEART_CUDA_STATIC_AVAILABLE TYPEARTPASS_CUDA_STATIC)
-  pythonize_bool(TYPEART_CUDA_RUNNABLE TYPEARTPASS_CUDA_RUNTIME)
+  pythonize_bool(TYPEART_CUDA_STATIC_AVAILABLE TYPEARTPASS_HAS_CUDA)
+  pythonize_bool(TYPEART_CUDA_RUNNABLE TYPEARTPASS_CUDA_RUNNABLE)
+  pythonize_bool(TYPEART_HIP_STATIC_AVAILABLE TYPEARTPASS_HAS_HIP)
+  pythonize_bool(TYPEART_HIP_RUNNABLE TYPEARTPASS_HIP_RUNNABLE)
 
   pythonize_bool(MPI_C_FOUND TYPEARTPASS_MPI_C)
   pythonize_bool(MPI_CXX_FOUND TYPEARTPASS_MPI_CXX)
diff --git a/test/cuda/pass/01_cudamalloc.c b/test/cuda/pass/01_cudamalloc.c
index 0e180be3..4a63c9f0 100644
--- a/test/cuda/pass/01_cudamalloc.c
+++ b/test/cuda/pass/01_cudamalloc.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // LLVM: call i32 @cudaMalloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
diff --git a/test/cuda/pass/02_cudafree.c b/test/cuda/pass/02_cudafree.c
index 273b5674..a14e7198 100644
--- a/test/cuda/pass/02_cudafree.c
+++ b/test/cuda/pass/02_cudafree.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[0-9a-z]+]])
 // CHECK-NEXT: __typeart_free_gpu({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
diff --git a/test/cuda/pass/03_cudahostalloc.c b/test/cuda/pass/03_cudahostalloc.c
index 745b76ca..34ef38e2 100644
--- a/test/cuda/pass/03_cudahostalloc.c
+++ b/test/cuda/pass/03_cudahostalloc.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // LLVM: call i32 @cudaHostAlloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER]]
diff --git a/test/cuda/pass/04_cudamalloc_host.c b/test/cuda/pass/04_cudamalloc_host.c
index 2930268f..11b1e998 100644
--- a/test/cuda/pass/04_cudamalloc_host.c
+++ b/test/cuda/pass/04_cudamalloc_host.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // LLVM: call i32 @cudaMallocHost(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
diff --git a/test/cuda/pass/05_cudamalloc_managed.c b/test/cuda/pass/05_cudamalloc_managed.c
index 74b69aa6..925a733e 100644
--- a/test/cuda/pass/05_cudamalloc_managed.c
+++ b/test/cuda/pass/05_cudamalloc_managed.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // LLVM: call i32 @cudaMallocManaged(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
 // LLVM-NEXT: [[CUDA_PTR:%[0-9a-z_]+]] = load {{.*}}, {{.*}}[[CU_POINTER]]
diff --git a/test/cuda/pass/06_cudamalloc_nonvoid.cpp b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
index 8f310fce..57296074 100644
--- a/test/cuda/pass/06_cudamalloc_nonvoid.cpp
+++ b/test/cuda/pass/06_cudamalloc_nonvoid.cpp
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // LLVM: __typeart_alloc_gpu(ptr %{{[0-9a-z_]+}}, i32 23, i64 {{.*}})
 // LLVM: __typeart_alloc_gpu(ptr %{{[0-9a-z_]+}}, i32 24, i64 {{.*}})
diff --git a/test/cuda/pass/07_axpy.c b/test/cuda/pass/07_axpy.c
index f60ae8cc..37be150d 100644
--- a/test/cuda/pass/07_axpy.c
+++ b/test/cuda/pass/07_axpy.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // CHECK: Malloc :   2
 
diff --git a/test/cuda/pass/08_cudamalloc_async.c b/test/cuda/pass/08_cudamalloc_async.c
index 25c1a060..667b6840 100644
--- a/test/cuda/pass/08_cudamalloc_async.c
+++ b/test/cuda/pass/08_cudamalloc_async.c
@@ -1,6 +1,6 @@
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: cuda_static
+// REQUIRES: cuda
 
 // clang-format off
 // LLVM: call i32 @cudaMallocAsync(ptr {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64{{.*}} 80, ptr {{.*}})
@@ -38,4 +38,4 @@ int main() {
   cudaMallocFromPoolAsync((void**)&d_y, N * sizeof(float), pool, stream);
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/test/hip/pass/01_hipmalloc.c b/test/hip/pass/01_hipmalloc.c
index 01833e53..ca7dd7c2 100644
--- a/test/hip/pass/01_hipmalloc.c
+++ b/test/hip/pass/01_hipmalloc.c
@@ -1,6 +1,6 @@
 // RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
-// REQUIRES: hip_static && !llvm-14
+// REQUIRES: hip && !llvm-14
 
 // LLVM: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
 // LLVM: [[HIP_LOAD:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
diff --git a/test/hip/pass/02_hipfree.c b/test/hip/pass/02_hipfree.c
index b7efaa99..24046408 100644
--- a/test/hip/pass/02_hipfree.c
+++ b/test/hip/pass/02_hipfree.c
@@ -1,6 +1,6 @@
 // RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
-// REQUIRES: hip_static && !llvm-14
+// REQUIRES: hip && !llvm-14
 
 // CHECK: call i32 @hipFree(ptr {{.*}}[[HIP_POINTER:%[0-9a-z]+]])
 // CHECK-NEXT: __typeart_free_gpu(ptr {{.*}}[[HIP_POINTER]])
diff --git a/test/lit.cfg b/test/lit.cfg
index 9ee0e6c2..46e787c7 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -58,13 +58,15 @@ if config.has_legacy_wrapper:
 if config.is_ci:
   config.available_features.add('ci')
 
-if config.cuda_static:
-  config.available_features.add('cuda_static')
-if config.cuda_runtime:
+if config.has_cuda:
+  config.available_features.add('cuda')
+if config.cuda_runnable:
   config.available_features.add('cuda_runtime')
 
 if config.has_hip:
-  config.available_features.add('hip_static')
+  config.available_features.add('hip')
+if config.hip_runnable:
+  config.available_features.add('hip_runtime')
 
 profile_files       = getattr(config, 'profile_file', None)
 typeart_base_lib_dir= getattr(config, 'typeart_base_lib_dir', None)
@@ -85,6 +87,10 @@ cuda_flags          = getattr(config, 'cuda_flags', '')
 cuda_link_flags     = f"-L{config.cuda_lib} -Wl,-rpath,{config.cuda_lib} -lcudart"
 hip_host_args       = '-x hip --offload-host-only -nogpulib'
 hip_flags           = getattr(config, 'hip_flags', '')
+hip_link_flags      = getattr(config, 'hip_link', '')
+hip_arch            = getattr(config, 'hip_arch', '')
+if not hip_arch:
+  hip_arch = 'native'
 type_file           = 'typeart-types.yaml'
 openmp_c_flags      = getattr(config, 'openmp_c_flags', None)
 openmp_cxx_flags    = getattr(config, 'openmp_cxx_flags', None)
@@ -151,6 +157,8 @@ config.substitutions.append(('%omp_c_flags', openmp_c_flags))
 config.substitutions.append(('%omp_cpp_flags', openmp_cxx_flags))
 
 config.substitutions.append(('%cuda_link', cuda_link_flags))
+config.substitutions.append(('%hip_link', hip_link_flags))
+config.substitutions.append(('%hip_arch', hip_arch))
 
 # TODO refactor typeart arguments (and add args for enabling heap/stack/globals etc.)
 # config.substitutions.append(('%arg_stack', '-typeart-stack'))
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 8837a79c..e1f852aa 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -39,13 +39,16 @@ config.tsan=@TYPEARTPASS_TSAN@
 config.asan=@TYPEARTPASS_ASAN@
 config.ubsan=@TYPEARTPASS_UBSAN@
 config.coverage=@TYPEART_COVERAGE@
-config.cuda_static=@TYPEARTPASS_CUDA_STATIC@
-config.cuda_runtime=@TYPEARTPASS_CUDA_RUNTIME@
+config.has_cuda=@TYPEARTPASS_HAS_CUDA@
+config.cuda_runnable=@TYPEARTPASS_CUDA_RUNNABLE@
 config.cuda_flags="@TYPEARTPASS_CUDA_FLAGS@"
 config.cuda_path="@TYPEARTPASS_CUDA_PATH@"
 config.cuda_lib="@CUDAToolkit_LIBRARY_DIR@"
-
-config.has_hip = "@TYPEART_HIP_STATIC_AVAILABLE@"
+config.has_hip=@TYPEARTPASS_HAS_HIP@
+config.hip_runnable=@TYPEARTPASS_HIP_RUNNABLE@
+config.hip_flags="@TYPEARTPASS_HIP_FLAGS@"
+config.hip_link="@TYPEARTPASS_HIP_LINK_FLAGS@"
+config.hip_arch="@TYPEART_HIP_ARCH@"
 
 config.python_interp = "@Python3_EXECUTABLE@"
 

From 5a9b5f95023d42541ce65ef2ef5b9836c615929d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Thu, 30 Apr 2026 15:56:40 +0200
Subject: [PATCH 13/24] More comprehensive testing

---
 externals/dimeta/CMakeLists.txt               |  2 +-
 lib/passes/analysis/MemOpData.h               |  1 +
 test/cuda/pass/07_axpy.c                      | 32 -----------------
 ...damalloc_async.c => 07_cudamalloc_async.c} |  0
 test/cuda/pass/08_cudamalloc_inline_types.c   | 34 +++++++++++++++++++
 test/cuda/pass/09_cudamalloc_async.c          | 34 +++++++++++++++++++
 test/cuda/pass/10_cudamalloc_gpu_disabled.c   | 19 +++++++++++
 test/cuda/runtime/07_axpy.c                   |  2 +-
 test/hip/pass/01_hipmalloc.c                  | 10 +++---
 test/hip/pass/02_hipfree.c                    | 11 ++++++
 test/hip/pass/03_hiphostalloc.c               | 17 ++++++++++
 test/hip/pass/04_hipmalloc_host.c             | 17 ++++++++++
 test/hip/pass/05_hipmalloc_managed.c          | 17 ++++++++++
 test/hip/pass/07_hipmalloc_async.c            | 30 ++++++++++++++++
 test/hip/pass/08_hipmalloc_inline_types.c     | 34 +++++++++++++++++++
 test/hip/pass/09_hipmalloc_async.c            | 26 ++++++++++++++
 test/hip/pass/10_hipmalloc_gpu_disabled.c     | 21 ++++++++++++
 17 files changed, 268 insertions(+), 39 deletions(-)
 delete mode 100644 test/cuda/pass/07_axpy.c
 rename test/cuda/pass/{08_cudamalloc_async.c => 07_cudamalloc_async.c} (100%)
 create mode 100644 test/cuda/pass/08_cudamalloc_inline_types.c
 create mode 100644 test/cuda/pass/09_cudamalloc_async.c
 create mode 100644 test/cuda/pass/10_cudamalloc_gpu_disabled.c
 create mode 100644 test/hip/pass/03_hiphostalloc.c
 create mode 100644 test/hip/pass/04_hipmalloc_host.c
 create mode 100644 test/hip/pass/05_hipmalloc_managed.c
 create mode 100644 test/hip/pass/07_hipmalloc_async.c
 create mode 100644 test/hip/pass/08_hipmalloc_inline_types.c
 create mode 100644 test/hip/pass/09_hipmalloc_async.c
 create mode 100644 test/hip/pass/10_hipmalloc_gpu_disabled.c

diff --git a/externals/dimeta/CMakeLists.txt b/externals/dimeta/CMakeLists.txt
index d0a19ddc..40fd1525 100644
--- a/externals/dimeta/CMakeLists.txt
+++ b/externals/dimeta/CMakeLists.txt
@@ -1,7 +1,7 @@
 FetchContent_Declare(
   llvm-dimeta
   GIT_REPOSITORY https://github.com/ahueck/llvm-dimeta
-  GIT_TAG feat/hip
+  GIT_TAG devel
   GIT_SHALLOW 1
 )
 
diff --git a/lib/passes/analysis/MemOpData.h b/lib/passes/analysis/MemOpData.h
index 8255eb01..2af24f4e 100644
--- a/lib/passes/analysis/MemOpData.h
+++ b/lib/passes/analysis/MemOpData.h
@@ -116,6 +116,7 @@ struct MemOps {
       {"cudaMallocAsync", MemOpKind::CudaMallocLike},
       {"cudaMallocFromPoolAsync", MemOpKind::CudaMallocLike},
       {"hipMalloc", MemOpKind::HipMallocLike},
+      {"hipMallocHost", MemOpKind::HipMallocLike},
       {"hipHostMalloc", MemOpKind::HipMallocLike},
       {"hipMallocManaged", MemOpKind::HipMallocLike},
       {"hipMallocAsync", MemOpKind::HipMallocLike},
diff --git a/test/cuda/pass/07_axpy.c b/test/cuda/pass/07_axpy.c
deleted file mode 100644
index 37be150d..00000000
--- a/test/cuda/pass/07_axpy.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
-
-// REQUIRES: cuda
-
-// CHECK: Malloc :   2
-
-__global__ void axpy(float a, float* x, float* y) {
-  y[threadIdx.x] = a * x[threadIdx.x];
-}
-
-int main(int argc, char* argv[]) {
-  const int kDataLen = 4;
-
-  float a                = 2.0f;
-  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
-  float host_y[kDataLen];
-
-  float* device_x;
-  float* device_y;
-  cudaMalloc((void**)&device_x, kDataLen * sizeof(float));
-  cudaMalloc((void**)&device_y, kDataLen * sizeof(float));
-
-  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
-
-  axpy<<<1, kDataLen>>>(a, device_x, device_y);
-
-  cudaDeviceSynchronize();
-  cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost);
-
-  cudaDeviceReset();
-  return 0;
-}
diff --git a/test/cuda/pass/08_cudamalloc_async.c b/test/cuda/pass/07_cudamalloc_async.c
similarity index 100%
rename from test/cuda/pass/08_cudamalloc_async.c
rename to test/cuda/pass/07_cudamalloc_async.c
diff --git a/test/cuda/pass/08_cudamalloc_inline_types.c b/test/cuda/pass/08_cudamalloc_inline_types.c
new file mode 100644
index 00000000..f006a72f
--- /dev/null
+++ b/test/cuda/pass/08_cudamalloc_inline_types.c
@@ -0,0 +1,34 @@
+// clang-format off
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=inline -S 2>&1 | %filecheck %s --check-prefix INLINE
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=hybrid -S 2>&1 | %filecheck %s --check-prefix HYBRID
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=file -S 2>&1 | %filecheck %s --check-prefix FILE
+// clang-format on
+
+// REQUIRES: cuda
+// REQUIRES: !llvm-14
+
+// INLINE: %struct._typeart_struct_layout_t = type { i32, i32, ptr }
+// INLINE: call void @__typeart_alloc_mty(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// INLINE: call void @__typeart_register_type(ptr @_typeart_{{.*}})
+
+// HYBRID: call void @__typeart_alloc_cuda(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// HYBRID: call void @__typeart_alloc_mty(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// HYBRID: call void @__typeart_register_type(ptr @_typeart_{{.*}})
+
+// FILE: call void @__typeart_alloc_cuda(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+
+typedef struct MyData {
+  int a;
+  double b;
+} MyData;
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  MyData* d_s;
+
+  cudaMalloc((void**)&d_x, N * sizeof(float));
+  cudaMalloc((void**)&d_s, N * sizeof(MyData));
+
+  return 0;
+}
diff --git a/test/cuda/pass/09_cudamalloc_async.c b/test/cuda/pass/09_cudamalloc_async.c
new file mode 100644
index 00000000..f46d1996
--- /dev/null
+++ b/test/cuda/pass/09_cudamalloc_async.c
@@ -0,0 +1,34 @@
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=true %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+
+// REQUIRES: cuda
+
+// clang-format off
+// LLVM: call i32 @{{.*}}(ptr {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 80, ptr {{.*}})
+// LLVM-NEXT: [[CUDA_PTR_X:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_X]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_X]], i32 23, i64 20)
+
+// LLVM: call i32 @{{.*}}(ptr {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 80, ptr {{.*}}, ptr {{.*}})
+// LLVM-NEXT: [[CUDA_PTR_Y:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_Y]]
+// LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_Y]], i32 23, i64 20)
+
+// LLVM_LEGACY: call i32 @{{.*}}({{(ptr|i8\*\*)}} {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 80, {{(ptr|i8\*)}} {{.*}})
+// LLVM_LEGACY: [[CUDA_PTR_X:%[0-9a-z_]+]] = load i8*, i8** [[CU_POINTER_X]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[CUDA_PTR_X]], i32 23, i64 20)
+
+// LLVM_LEGACY: call i32 @{{.*}}({{(ptr|i8\*\*)}} {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 80, {{(ptr|i8\*)}} {{.*}}, {{(ptr|i8\*)}} {{.*}})
+// LLVM_LEGACY: [[CUDA_PTR_Y:%[0-9a-z_]+]] = load i8*, i8** [[CU_POINTER_Y]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[CUDA_PTR_Y]], i32 23, i64 20)
+// clang-format on
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  float* d_y;
+  cudaStream_t stream = 0;
+  cudaMemPool_t pool  = 0;
+
+  cudaMallocAsync((void**)&d_x, N * sizeof(float), stream);
+  cudaMallocFromPoolAsync((void**)&d_y, N * sizeof(float), pool, stream);
+
+  return 0;
+}
diff --git a/test/cuda/pass/10_cudamalloc_gpu_disabled.c b/test/cuda/pass/10_cudamalloc_gpu_disabled.c
new file mode 100644
index 00000000..2c001879
--- /dev/null
+++ b/test/cuda/pass/10_cudamalloc_gpu_disabled.c
@@ -0,0 +1,19 @@
+// RUN: %cuda-c-to-llvm %s | %apply-typeart --typeart-gpu=false -S 2>&1 | %filecheck %s
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=false %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: cuda
+
+// CHECK: call i32 @{{.*}}({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+// CHECK-NOT: call void @__typeart_alloc_gpu(
+// CHECK-NOT: call void @__typeart_free_gpu(
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaMalloc((void**)&d_x, N * sizeof(float));
+  cudaFree(d_x);
+
+  return 0;
+}
diff --git a/test/cuda/runtime/07_axpy.c b/test/cuda/runtime/07_axpy.c
index 440d87c4..4cc61e02 100644
--- a/test/cuda/runtime/07_axpy.c
+++ b/test/cuda/runtime/07_axpy.c
@@ -1,7 +1,7 @@
 // RUN: TYPEART_GPU=true %wrapper-cc -x cuda --cuda-gpu-arch=sm_50 %cuda_link %s -o %s.exe
 // RUN: %s.exe 2>&1 | %filecheck %s
 
-// REQUIRES: cuda_runtime
+// REQUIRES: cuda_runtime && softcounter
 // UNSUPPORTED: sanitizer
 
 // CHECK: [0]=2 [1]=4 [2]=6 [3]=8
diff --git a/test/hip/pass/01_hipmalloc.c b/test/hip/pass/01_hipmalloc.c
index ca7dd7c2..ba4aeba3 100644
--- a/test/hip/pass/01_hipmalloc.c
+++ b/test/hip/pass/01_hipmalloc.c
@@ -1,17 +1,17 @@
-// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
 
 // REQUIRES: hip && !llvm-14
 
-// LLVM: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
-// LLVM: [[HIP_LOAD:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
-// LLVM: call void @__typeart_alloc_gpu(ptr [[HIP_LOAD]], i32 23
+// CHECK: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[HIP_PTR:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR]], i32 23, i64 20)
 
 #include <hip/hip_runtime.h>
 int main() {
   const int N = 20;
   float* d_x;
 
-  hipMalloc(&d_x, N * sizeof(float));
+  hipMalloc((void**)&d_x, N * sizeof(float));
 
   return 0;
 }
diff --git a/test/hip/pass/02_hipfree.c b/test/hip/pass/02_hipfree.c
index 24046408..5ff21f9f 100644
--- a/test/hip/pass/02_hipfree.c
+++ b/test/hip/pass/02_hipfree.c
@@ -5,11 +5,22 @@
 // CHECK: call i32 @hipFree(ptr {{.*}}[[HIP_POINTER:%[0-9a-z]+]])
 // CHECK-NEXT: __typeart_free_gpu(ptr {{.*}}[[HIP_POINTER]])
 
+// CHECK: call i32 @hipFreeHost(ptr {{.*}}[[HIP_POINTER:%[0-9a-z]+]])
+// CHECK-NEXT: __typeart_free_gpu(ptr {{.*}}[[HIP_POINTER]])
+
+// CHECK: call i32 @hipFreeAsync(ptr {{.*}}[[HIP_POINTER:%[0-9a-z]+]],
+// CHECK-NEXT: __typeart_free_gpu(ptr {{.*}}[[HIP_POINTER]])
+
 #include <hip/hip_runtime.h>
 int main() {
   float* d_x;
 
   hipFree(d_x);
 
+  hipFreeHost(d_x);
+
+  hipStream_t stream;
+  hipFreeAsync(d_x, stream);
+
   return 0;
 }
diff --git a/test/hip/pass/03_hiphostalloc.c b/test/hip/pass/03_hiphostalloc.c
new file mode 100644
index 00000000..80a0079a
--- /dev/null
+++ b/test/hip/pass/03_hiphostalloc.c
@@ -0,0 +1,17 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// CHECK: call i32 @hipHostMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[HIP_PTR:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr {{.*}}[[HIP_PTR]],
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipHostMalloc((void**)&d_x, N * sizeof(float), hipHostMallocDefault);
+
+  return 0;
+}
diff --git a/test/hip/pass/04_hipmalloc_host.c b/test/hip/pass/04_hipmalloc_host.c
new file mode 100644
index 00000000..cc5e4468
--- /dev/null
+++ b/test/hip/pass/04_hipmalloc_host.c
@@ -0,0 +1,17 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// CHECK: call i32 @hipMallocHost(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[HIP_PTR:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR]], i32 23, i64 20)
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipMallocHost((void**)&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/hip/pass/05_hipmalloc_managed.c b/test/hip/pass/05_hipmalloc_managed.c
new file mode 100644
index 00000000..861367fc
--- /dev/null
+++ b/test/hip/pass/05_hipmalloc_managed.c
@@ -0,0 +1,17 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// CHECK: call i32 @hipMallocManaged(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// CHECK-NEXT: [[HIP_PTR:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR]], i32 23, i64 20)
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipMallocManaged((void**)&d_x, N * sizeof(float));
+
+  return 0;
+}
diff --git a/test/hip/pass/07_hipmalloc_async.c b/test/hip/pass/07_hipmalloc_async.c
new file mode 100644
index 00000000..43547b5f
--- /dev/null
+++ b/test/hip/pass/07_hipmalloc_async.c
@@ -0,0 +1,30 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// clang-format off
+// CHECK: call i32 @hipMallocAsync(ptr {{.*}}[[HIP_POINTER_X:%[_0-9a-z]+]], i64{{.*}} 80, ptr {{.*}})
+// CHECK-NEXT: [[HIP_PTR_X:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER_X]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR_X]], i32 23, i64 20)
+
+// CHECK: call i32 @hipMallocFromPoolAsync(ptr {{.*}}[[HIP_POINTER_Y:%[_0-9a-z]+]], i64{{.*}} 80, ptr {{.*}}, ptr {{.*}})
+// CHECK-NEXT: [[HIP_PTR_Y:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER_Y]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR_Y]], i32 23, i64 20)
+// clang-format on
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+  float* d_y;
+
+  hipStream_t stream;
+  hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
+
+  hipMallocAsync((void**)&d_x, N * sizeof(float), stream);
+
+  hipMemPool_t pool;
+  hipMallocFromPoolAsync((void**)&d_y, N * sizeof(float), pool, stream);
+
+  return 0;
+}
diff --git a/test/hip/pass/08_hipmalloc_inline_types.c b/test/hip/pass/08_hipmalloc_inline_types.c
new file mode 100644
index 00000000..417acc5a
--- /dev/null
+++ b/test/hip/pass/08_hipmalloc_inline_types.c
@@ -0,0 +1,34 @@
+// clang-format off
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=inline -S 2>&1 | %filecheck %s --check-prefix INLINE
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=hybrid -S 2>&1 | %filecheck %s --check-prefix HYBRID
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=true %apply-typeart -typeart-type-serialization=file -S 2>&1 | %filecheck %s --check-prefix FILE
+// clang-format on
+
+// REQUIRES: hip && !llvm-14
+
+// INLINE: %struct._typeart_struct_layout_t = type { i32, i32, ptr }
+// INLINE: call void @__typeart_alloc_mty_gpu(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// INLINE: call void @__typeart_register_type(ptr @_typeart_{{.*}})
+
+// HYBRID: call void @__typeart_alloc_gpu(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// HYBRID: call void @__typeart_alloc_mty_gpu(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// HYBRID: call void @__typeart_register_type(ptr @_typeart_{{.*}})
+
+// FILE: call void @__typeart_alloc_gpu(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+
+#include <hip/hip_runtime.h>
+typedef struct MyData {
+  int a;
+  double b;
+} MyData;
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  MyData* d_s;
+
+  hipMalloc((void**)&d_x, N * sizeof(float));
+  hipMalloc((void**)&d_s, N * sizeof(MyData));
+
+  return 0;
+}
diff --git a/test/hip/pass/09_hipmalloc_async.c b/test/hip/pass/09_hipmalloc_async.c
new file mode 100644
index 00000000..7217efb6
--- /dev/null
+++ b/test/hip/pass/09_hipmalloc_async.c
@@ -0,0 +1,26 @@
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// CHECK: call i32 @{{.*}}(ptr {{.*}}[[HIP_POINTER_X:%[_0-9a-z]+]], i64 {{.*}}80, ptr {{.*}})
+// CHECK-NEXT: [[HIP_PTR_X:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER_X]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR_X]], i32 23, i64 20)
+
+// CHECK: call i32 @{{.*}}(ptr {{.*}}[[HIP_POINTER_Y:%[_0-9a-z]+]], i64 {{.*}}80, ptr {{.*}}, ptr {{.*}})
+// CHECK-NEXT: [[HIP_PTR_Y:%[0-9a-z_]+]] = load ptr, ptr [[HIP_POINTER_Y]]
+// CHECK-NEXT: call void @__typeart_alloc_gpu(ptr [[HIP_PTR_Y]], i32 23, i64 20)
+
+#include <hip/hip_runtime.h>
+
+int main() {
+  const int N = 20;
+  float* d_x;
+  float* d_y;
+  hipStream_t stream = 0;
+  hipMemPool_t pool  = 0;
+
+  hipMallocAsync((void**)&d_x, N * sizeof(float), stream);
+  hipMallocFromPoolAsync((void**)&d_y, N * sizeof(float), pool, stream);
+
+  return 0;
+}
diff --git a/test/hip/pass/10_hipmalloc_gpu_disabled.c b/test/hip/pass/10_hipmalloc_gpu_disabled.c
new file mode 100644
index 00000000..14e97ea9
--- /dev/null
+++ b/test/hip/pass/10_hipmalloc_gpu_disabled.c
@@ -0,0 +1,21 @@
+// RUN: %hip-cpp-to-llvm %s | %apply-typeart --typeart-gpu=false -S 2>&1 | %filecheck %s
+// RUN: %hip-cpp-to-llvm %s | TYPEART_GPU=0 %apply-typeart -S 2>&1 | %filecheck %s
+
+// REQUIRES: hip && !llvm-14
+
+// CHECK: call i32 @{{.*}}(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// CHECK: [[HIP_PTR:%[_0-9a-z]+]] = load ptr, ptr [[HIP_POINTER]]
+// CHECK: call i32 @hipFree(ptr {{.*}}[[HIP_PTR]])
+// CHECK-NOT: call void @__typeart_alloc_gpu(
+// CHECK-NOT: call void @__typeart_free_gpu(
+
+#include <hip/hip_runtime.h>
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipMalloc((void**)&d_x, N * sizeof(float));
+  hipFree(d_x);
+
+  return 0;
+}

From c76761920e88c78c60e129126354271129878758 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Thu, 30 Apr 2026 16:49:04 +0200
Subject: [PATCH 14/24] CUDA test fixes

---
 test/cuda/pass/08_cudamalloc_inline_types.c |  8 +++----
 test/cuda/pass/09_cudamalloc_async.c        | 24 ++++++++++++---------
 test/cuda/pass/10_cudamalloc_gpu_disabled.c |  6 +++---
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/test/cuda/pass/08_cudamalloc_inline_types.c b/test/cuda/pass/08_cudamalloc_inline_types.c
index f006a72f..29953c15 100644
--- a/test/cuda/pass/08_cudamalloc_inline_types.c
+++ b/test/cuda/pass/08_cudamalloc_inline_types.c
@@ -8,14 +8,14 @@
 // REQUIRES: !llvm-14
 
 // INLINE: %struct._typeart_struct_layout_t = type { i32, i32, ptr }
-// INLINE: call void @__typeart_alloc_mty(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// INLINE: call void @__typeart_alloc_mty_gpu(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
 // INLINE: call void @__typeart_register_type(ptr @_typeart_{{.*}})
 
-// HYBRID: call void @__typeart_alloc_cuda(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
-// HYBRID: call void @__typeart_alloc_mty(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
+// HYBRID: call void @__typeart_alloc_gpu(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// HYBRID: call void @__typeart_alloc_mty_gpu(ptr %{{[0-9a-z]+}}, ptr @_typeart_{{.*}}, i64 {{.*}})
 // HYBRID: call void @__typeart_register_type(ptr @_typeart_{{.*}})
 
-// FILE: call void @__typeart_alloc_cuda(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
+// FILE: call void @__typeart_alloc_gpu(ptr %{{[0-9a-z]+}}, i32 {{[0-9]+}}, i64 {{.*}})
 
 typedef struct MyData {
   int a;
diff --git a/test/cuda/pass/09_cudamalloc_async.c b/test/cuda/pass/09_cudamalloc_async.c
index f46d1996..b5c4967d 100644
--- a/test/cuda/pass/09_cudamalloc_async.c
+++ b/test/cuda/pass/09_cudamalloc_async.c
@@ -1,23 +1,27 @@
-// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=true %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
+// RUN: %cuda-c-to-llvm %s | TYPEART_GPU=1 %apply-typeart -S 2>&1 | %filecheck %s --check-prefix=%llvm-version-check
 
 // REQUIRES: cuda
 
 // clang-format off
-// LLVM: call i32 @{{.*}}(ptr {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 80, ptr {{.*}})
+// LLVM: call i32 @cudaMallocAsync(ptr {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 {{.*}}80, ptr {{.*}})
 // LLVM-NEXT: [[CUDA_PTR_X:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_X]]
 // LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_X]], i32 23, i64 20)
 
-// LLVM: call i32 @{{.*}}(ptr {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 80, ptr {{.*}}, ptr {{.*}})
+// LLVM: call i32 @cudaMallocFromPoolAsync(ptr {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 {{.*}}80, ptr {{.*}}, ptr {{.*}})
 // LLVM-NEXT: [[CUDA_PTR_Y:%[0-9a-z_]+]] = load ptr, ptr [[CU_POINTER_Y]]
 // LLVM-NEXT: call void @__typeart_alloc_gpu(ptr [[CUDA_PTR_Y]], i32 23, i64 20)
 
-// LLVM_LEGACY: call i32 @{{.*}}({{(ptr|i8\*\*)}} {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 80, {{(ptr|i8\*)}} {{.*}})
-// LLVM_LEGACY: [[CUDA_PTR_X:%[0-9a-z_]+]] = load i8*, i8** [[CU_POINTER_X]]
-// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[CUDA_PTR_X]], i32 23, i64 20)
-
-// LLVM_LEGACY: call i32 @{{.*}}({{(ptr|i8\*\*)}} {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 80, {{(ptr|i8\*)}} {{.*}}, {{(ptr|i8\*)}} {{.*}})
-// LLVM_LEGACY: [[CUDA_PTR_Y:%[0-9a-z_]+]] = load i8*, i8** [[CU_POINTER_Y]]
-// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[CUDA_PTR_Y]], i32 23, i64 20)
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocAsync(i8** {{.*}}[[CU_POINTER_X:%[_0-9a-z]+]], i64 {{.*}}80, 
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
+
+// LLVM_LEGACY: [[CAST1:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR:%[0-9a-zA-Z_]+]] to i8**
+// LLVM_LEGACY: call i32 @cudaMallocFromPoolAsync(i8** {{.*}}[[CU_POINTER_Y:%[_0-9a-z]+]], i64 {{.*}}80,
+// LLVM_LEGACY: [[CAST2:%[0-9a-z_]+]] = bitcast float** [[SRC_VAR]] to i8**
+// LLVM_LEGACY: [[LOADED_PTR:%[0-9a-z_]+]] = load i8*, i8** [[CAST2]]
+// LLVM_LEGACY: call void @__typeart_alloc_gpu(i8* [[LOADED_PTR]], i32 23, i64 20)
 // clang-format on
 
 int main() {
diff --git a/test/cuda/pass/10_cudamalloc_gpu_disabled.c b/test/cuda/pass/10_cudamalloc_gpu_disabled.c
index 2c001879..47f0a662 100644
--- a/test/cuda/pass/10_cudamalloc_gpu_disabled.c
+++ b/test/cuda/pass/10_cudamalloc_gpu_disabled.c
@@ -1,10 +1,10 @@
 // RUN: %cuda-c-to-llvm %s | %apply-typeart --typeart-gpu=false -S 2>&1 | %filecheck %s
 // RUN: %cuda-c-to-llvm %s | TYPEART_GPU=false %apply-typeart -S 2>&1 | %filecheck %s
 
-// REQUIRES: cuda
+// REQUIRES: cuda && !llvm-14
 
-// CHECK: call i32 @{{.*}}({{(ptr|i8\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
-// CHECK: call i32 @cudaFree({{(ptr|i8\*)}} {{.*}}[[CU_POINTER]])
+// CHECK: call i32 @cudaMalloc({{(ptr|i8\*\*)}} {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// CHECK: call i32 @cudaFree({{(ptr|i8\*)}}
 // CHECK-NOT: call void @__typeart_alloc_gpu(
 // CHECK-NOT: call void @__typeart_free_gpu(
 

From 3a66c4782d73c8613552522f10c163d2f2f5c31e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Thu, 30 Apr 2026 16:56:39 +0200
Subject: [PATCH 15/24] HIP runtime test

---
 test/hip/runtime/01_axpy.c | 43 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 test/hip/runtime/01_axpy.c

diff --git a/test/hip/runtime/01_axpy.c b/test/hip/runtime/01_axpy.c
new file mode 100644
index 00000000..89f61eb8
--- /dev/null
+++ b/test/hip/runtime/01_axpy.c
@@ -0,0 +1,43 @@
+// RUN: TYPEART_GPU=true %wrapper-cc -x hip --offload-arch=%hip_arch %hip_link %s -o %s.exe
+// RUN: %s.exe 2>&1 | %filecheck %s
+
+// REQUIRES: hip_runtime && !llvm-14 && softcounter
+// UNSUPPORTED: sanitizer
+
+// CHECK: [0]=2 [1]=4 [2]=6 [3]=8
+// CHECK: Total heap{{[ ]*}}:   2 ,    2 ,    -
+
+#include <hip/hip_runtime.h>
+#include <stdio.h>
+__global__ void axpy(float a, float* x, float* y) {
+  y[threadIdx.x] = a * x[threadIdx.x];
+}
+
+int main(int argc, char* argv[]) {
+  const int kDataLen = 4;
+
+  float a                = 2.0f;
+  float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f};
+  float host_y[kDataLen];
+
+  float* device_x;
+  float* device_y;
+  hipMalloc((void**)&device_x, kDataLen * sizeof(float));
+  hipMalloc((void**)&device_y, kDataLen * sizeof(float));
+
+  hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);
+
+  axpy<<<1, kDataLen>>>(a, device_x, device_y);
+
+  hipDeviceSynchronize();
+  hipMemcpy(host_y, device_y, kDataLen * sizeof(float), hipMemcpyDeviceToHost);
+
+  hipDeviceReset();
+
+  for (int i = 0; i < kDataLen; ++i) {
+    printf("[%i]=%.0f ", i, host_y[i]);
+  }
+  printf("\n");
+
+  return 0;
+}

From 61645e210dbde5328c08f2f0601055b8addbc8cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Thu, 30 Apr 2026 18:02:27 +0200
Subject: [PATCH 16/24] Enable CUDA support in CI

---
 .github/actions/setup-typeart/action.yml |  82 +++++++++++++++++
 .github/workflows/basic-ci.yml           | 109 ++++++++++++++---------
 2 files changed, 150 insertions(+), 41 deletions(-)
 create mode 100644 .github/actions/setup-typeart/action.yml

diff --git a/.github/actions/setup-typeart/action.yml b/.github/actions/setup-typeart/action.yml
new file mode 100644
index 00000000..cbe854a1
--- /dev/null
+++ b/.github/actions/setup-typeart/action.yml
@@ -0,0 +1,82 @@
+name: 'Setup TypeART Environment'
+description: 'Sets up LLVM, Clang, OpenMPI, and other dependencies for TypeART CI'
+
+inputs:
+  llvm-version:
+    description: 'LLVM version to install'
+    required: true
+  typeart-typegen-legacy:
+    description: 'Set TYPEART_TYPEGEN_IR env variable'
+    required: false
+  install-libcxx:
+    description: 'Install libc++'
+    required: false
+    default: 'false'
+  install-lcov:
+    description: 'Install lcov'
+    required: false
+    default: 'false'
+  install-omp:
+    description: 'Install LLVM OpenMP runtime'
+    required: false
+    default: 'true'
+  setup-mold:
+    description: 'Setup Mold Linker'
+    required: false
+    default: 'false'
+
+runs:
+  using: "composite"
+  steps:
+    - name: LLVM apt
+      if: ${{ inputs.llvm-version >= 19 }}
+      shell: bash
+      run: |
+        wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+        echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-${{ inputs.llvm-version }} main" | sudo tee /etc/apt/sources.list.d/llvm-${{ inputs.llvm-version }}.list
+
+    - name: Update apt
+      shell: bash
+      run: sudo apt-get update
+
+    - name: Install LLVM
+      shell: bash
+      run: sudo apt-get install -y libllvm${{ inputs.llvm-version }} llvm-${{ inputs.llvm-version }} llvm-${{ inputs.llvm-version }}-dev
+
+    - name: Install LLVM OpenMP runtime
+      if: ${{ inputs.install-omp == 'true' }}
+      shell: bash
+      run: sudo apt-get install -y libomp-${{ inputs.llvm-version }}-dev
+
+    - name: Install Clang
+      shell: bash
+      run: sudo apt-get install -y clang-${{ inputs.llvm-version }} clang-tidy-${{ inputs.llvm-version }}
+
+    - name: Install libc++
+      if: ${{ inputs.install-libcxx == 'true' }}
+      shell: bash
+      run: sudo apt-get install -y --no-install-recommends libc++-${{ inputs.llvm-version }}-dev libc++abi-${{ inputs.llvm-version }}-dev
+
+    - name: Install OpenMPI
+      shell: bash
+      run: sudo apt-get install -y libopenmpi-dev openmpi-bin
+
+    - name: Install lcov
+      if: ${{ inputs.install-lcov == 'true' }}
+      shell: bash
+      run: sudo apt-get install -y lcov
+
+    - name: Setup Mold Linker
+      if: ${{ inputs.setup-mold == 'true' && inputs.llvm-version > 14 }}
+      uses: rui314/setup-mold@v1
+
+    - name: Setup env
+      shell: bash
+      run: |
+        sudo ln -f -s /usr/bin/clang-${{ inputs.llvm-version }} /usr/bin/clang
+        sudo ln -f -s /usr/bin/clang++-${{ inputs.llvm-version }} /usr/bin/clang++
+        echo "LLVM_CMAKE_DIR=/usr/lib/llvm-${{ inputs.llvm-version }}/cmake" >> $GITHUB_ENV
+        echo "EXTERNAL_LIT=/usr/lib/llvm-${{ inputs.llvm-version >= 20 && 18 || inputs.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV
+        if [ -n "${{ inputs.typeart-typegen-legacy }}" ]; then
+          echo "TYPEART_TYPEGEN_IR=${{ inputs.typeart-typegen-legacy }}" >> $GITHUB_ENV
+        fi
diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index 8c420922..d7936d6a 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -81,80 +81,107 @@ jobs:
     steps:
       - uses: actions/checkout@v6
 
-      - name: LLVM apt
-        if: ${{ matrix.platform.llvm-version >= 19 }}
-        run: |
-          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-          echo "deb http://apt.llvm.org/noble/ llvm-toolchain-noble-${{ matrix.platform.llvm-version }} main" | sudo tee /etc/apt/sources.list.d/llvm-${{ matrix.platform.llvm-version }}.list
-
-      - name: Update apt
-        run: sudo apt-get update
+      - name: Setup TypeART Environment
+        uses: ./.github/actions/setup-typeart
+        with:
+          llvm-version: ${{ matrix.platform.llvm-version }}
+          typeart-typegen-legacy: ${{ matrix.platform.typeart-typegen-legacy }}
+          install-libcxx: ${{ matrix.preset.libcxx || 'false' }}
+          install-lcov: ${{ matrix.preset.coverage || 'false' }}
+          install-omp: 'true'
+          setup-mold: 'true'
 
-      - name: Install LLVM
-        run: sudo apt-get install libllvm${{ matrix.platform.llvm-version }} llvm-${{ matrix.platform.llvm-version }} llvm-${{ matrix.platform.llvm-version }}-dev
+      - name: Configure TypeART
+        run: cmake -B build --preset ${{ matrix.preset.name }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
 
-      - name: Install LLVM OpenMP runtime
-        run: sudo apt-get install libomp-${{ matrix.platform.llvm-version }}-dev
+      - name: Build TypeART
+        run: cmake --build build --parallel 2
 
-      - name: Install Clang
-        run: sudo apt-get install clang-${{ matrix.platform.llvm-version }} clang-tidy-${{ matrix.platform.llvm-version }}
+      - name: Prepare TypeART coverage
+        if: matrix.preset.coverage
+        run: cmake --build build --target typeart-lcov-clean
 
-      - name: Install libc++
-        if: matrix.preset.libcxx
-        run: sudo apt-get install --no-install-recommends libc++-${{ matrix.platform.llvm-version }}-dev libc++abi-${{ matrix.platform.llvm-version }}-dev
+      - name: Test TypeART lit-suite
+        if: matrix.preset.skip_test == false
+        run: cmake --build build --target check-typeart
 
-      - name: Install OpenMPI
-        run: sudo apt-get install libopenmpi-dev openmpi-bin
+      - name: Build coverage report
+        if: matrix.preset.coverage
+        run: cmake --build build --target typeart-lcov-html
 
-      - name: Install lcov
+      - name: Coveralls (parallel)
         if: matrix.preset.coverage
-        run: sudo apt-get install lcov
+        uses: coverallsapp/github-action@v2.3.6
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          path-to-lcov: build/typeart.coverage
+          flag-name: ${{ matrix.preset.name }}-${{ matrix.platform.llvm-version }}-${{ matrix.platform.typeart-typegen-legacy }}
+          parallel: true
 
-      - name: Setup Mold Linker
-        if: ${{ matrix.platform.llvm-version > 14 }}
-        uses: rui314/setup-mold@v1
+  cuda-suite:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - llvm-version: 14
+            os: ubuntu-22.04
+            cuda: 11.8.0
+          - llvm-version: 22
+            os: ubuntu-24.04
+            cuda: 12.6.0
 
-      - name: Setup env
-        run: |
-          sudo ln -f -s /usr/bin/clang-${{ matrix.platform.llvm-version }} /usr/bin/clang
-          sudo ln -f -s /usr/bin/clang++-${{ matrix.platform.llvm-version }} /usr/bin/clang++
-          echo "LLVM_CMAKE_DIR=/usr/lib/llvm-${{ matrix.platform.llvm-version }}/cmake" >> $GITHUB_ENV
-          echo "EXTERNAL_LIT=/usr/lib/llvm-${{ matrix.platform.llvm-version >= 20 && 18 || matrix.platform.llvm-version }}/build/utils/lit/lit.py" >> $GITHUB_ENV
-          echo "TYPEART_TYPEGEN_IR=${{ matrix.platform.typeart-typegen-legacy }}" >> $GITHUB_ENV
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: Jimver/cuda-toolkit@v0.2.35
+        id: cuda-toolkit
+        with:
+          cuda: "${{ matrix.cuda }}"
+          method: network
+          sub-packages: '["nvcc", "cudart", "cudart-dev"]'
+          non-cuda-sub-packages: '["libcurand", "libcurand-dev"]'
+
+      - name: Setup TypeART Environment
+        uses: ./.github/actions/setup-typeart
+        with:
+          llvm-version: ${{ matrix.llvm-version }}
+          typeart-typegen-legacy: 0
+          install-lcov: 'true'
+          install-omp: 'true'
+          setup-mold: 'false'
 
       - name: Configure TypeART
-        run: cmake -B build --preset ${{ matrix.preset.name }} -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
+        run: cmake -B build --preset ci-cov-thread-safe -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
 
       - name: Build TypeART
         run: cmake --build build --parallel 2
 
       - name: Prepare TypeART coverage
-        if: matrix.preset.coverage
         run: cmake --build build --target typeart-lcov-clean
 
-      - name: Test TypeART lit-suite
-        if: matrix.preset.skip_test == false
-        run: cmake --build build --target check-typeart
+      - name: Test TypeART cuda-suite
+        run: cmake --build build --target check-typeart-cuda
 
       - name: Build coverage report
-        if: matrix.preset.coverage
         run: cmake --build build --target typeart-lcov-html
 
       - name: Coveralls (parallel)
-        if: matrix.preset.coverage
         uses: coverallsapp/github-action@v2.3.6
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           path-to-lcov: build/typeart.coverage
-          flag-name: ${{ matrix.preset.name }}-${{ matrix.platform.llvm-version }}-${{ matrix.platform.typeart-typegen-legacy }}
+          flag-name: cuda-suite-${{ matrix.llvm-version }}-${{ matrix.cuda }}
           parallel: true
 
   finish-coverage:
-    needs: lit-suite
+    if: ${{ always() }}
+    needs: [lit-suite, cuda-suite]
     runs-on: ubuntu-24.04
     steps:
       - name: Coveralls Finished
         uses: coverallsapp/github-action@v2.3.6
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
-          parallel-finished: true
\ No newline at end of file
+          parallel-finished: true

From 88918912c3284267a5f037f33cb3e5d8332658e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Thu, 30 Apr 2026 18:23:32 +0200
Subject: [PATCH 17/24] CI error fixes

---
 .github/workflows/basic-ci.yml                 |  3 ++-
 cmake/modules/coverage-lcov.cmake              | 18 ++++++++++++++++--
 lib/passes/Commandline.cpp                     | 12 ++++++------
 lib/passes/analysis/MemOpVisitor.cpp           |  2 +-
 .../configuration/EnvironmentConfiguration.cpp |  3 +--
 .../instrumentation/TypeARTFunctions.cpp       |  2 +-
 lib/passes/support/CudaUtil.h                  |  2 +-
 lib/passes/support/HipUtil.h                   |  2 +-
 lib/passes/typegen/dimeta/DimetaTypeGen.cpp    |  6 ++++--
 9 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index d7936d6a..77dd7684 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -162,7 +162,7 @@ jobs:
         run: cmake --build build --target typeart-lcov-clean
 
       - name: Test TypeART cuda-suite
-        run: cmake --build build --target check-typeart-cuda
+        run: cmake --build build --target check-typeart
 
       - name: Build coverage report
         run: cmake --build build --target typeart-lcov-html
@@ -181,6 +181,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Coveralls Finished
+
         uses: coverallsapp/github-action@v2.3.6
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/cmake/modules/coverage-lcov.cmake b/cmake/modules/coverage-lcov.cmake
index c97a985b..0b836576 100644
--- a/cmake/modules/coverage-lcov.cmake
+++ b/cmake/modules/coverage-lcov.cmake
@@ -5,6 +5,20 @@ if(TYPEART_LCOV_EXEC-NOTFOUND OR TYPEART_GENHTML_EXEC-NOTFOUND)
   message(WARNING "lcov and genhtml command needed for coverage.")
 endif()
 
+# Detect whether lcov supports: --ignore-errors unused
+set(TYPEART_LCOV_IGNORE_UNUSED)
+if(TYPEART_LCOV_EXEC)
+  execute_process(
+    COMMAND ${TYPEART_LCOV_EXEC} --ignore-errors unused --version
+    RESULT_VARIABLE TYPEART_LCOV_IGNORE_UNUSED_RESULT
+    OUTPUT_QUIET
+    ERROR_QUIET
+  )
+  if(TYPEART_LCOV_IGNORE_UNUSED_RESULT EQUAL 0)
+    set(TYPEART_LCOV_IGNORE_UNUSED --ignore-errors unused)
+  endif()
+endif()
+
 add_custom_target(
   typeart-lcov-clean
   COMMAND ${TYPEART_LCOV_EXEC} -d ${CMAKE_BINARY_DIR} -z
@@ -27,7 +41,7 @@ endif()
 
 add_custom_target(
   typeart-lcov-make
-  COMMAND ${TYPEART_LCOV_EXEC} ${GCOV_TOOL} ${GCOV_WORKAROUND}
+  COMMAND ${TYPEART_LCOV_EXEC} ${GCOV_TOOL} ${GCOV_WORKAROUND} ${TYPEART_LCOV_IGNORE_UNUSED}
           --no-external -c -d ${CMAKE_BINARY_DIR} -b ${CMAKE_SOURCE_DIR} -o typeart.coverage
   COMMAND ${TYPEART_LCOV_EXEC} --rc derive_function_end_line=0 --remove typeart.coverage '${CMAKE_BINARY_DIR}/*' -o typeart.coverage
 )
@@ -50,7 +64,7 @@ function(typeart_target_lcov target)
 
   add_custom_target(
     typeart-lcov-make-${target}
-    COMMAND ${TYPEART_LCOV_EXEC} ${GCOV_TOOL} ${GCOV_WORKAROUND}
+    COMMAND ${TYPEART_LCOV_EXEC} ${GCOV_TOOL} ${GCOV_WORKAROUND} ${TYPEART_LCOV_IGNORE_UNUSED}
             --no-external -c -d ${CMAKE_BINARY_DIR}
             -b ${LCOV_TARGET_SOURCE_DIR} -o counter-${target}.pro
     COMMAND ${TYPEART_LCOV_EXEC} --remove counter-${target}.pro '${CMAKE_BINARY_DIR}/*'
diff --git a/lib/passes/Commandline.cpp b/lib/passes/Commandline.cpp
index 8e4100ee..b4171625 100644
--- a/lib/passes/Commandline.cpp
+++ b/lib/passes/Commandline.cpp
@@ -70,14 +70,14 @@ static cl::opt<ConfigStdArgTypes::stats_ty> cl_typeart_stats(CommandlineStdArgs:
                                                              cl::cat(typeart_category));
 
 static cl::opt<ConfigStdArgTypes::heap_ty> cl_typeart_instrument_heap(CommandlineStdArgs::heap,
-                                                                       cl::desc(ConfigStdArgDescriptions::heap),
-                                                                       cl::init(ConfigStdArgValues::heap),
-                                                                       cl::cat(typeart_category));
+                                                                      cl::desc(ConfigStdArgDescriptions::heap),
+                                                                      cl::init(ConfigStdArgValues::heap),
+                                                                      cl::cat(typeart_category));
 
 static cl::opt<ConfigStdArgTypes::gpu_ty> cl_typeart_instrument_gpu(CommandlineStdArgs::gpu,
-                                                                     cl::desc(ConfigStdArgDescriptions::gpu),
-                                                                     cl::init(ConfigStdArgValues::gpu),
-                                                                     cl::cat(typeart_category));
+                                                                    cl::desc(ConfigStdArgDescriptions::gpu),
+                                                                    cl::init(ConfigStdArgValues::gpu),
+                                                                    cl::cat(typeart_category));
 
 static cl::opt<ConfigStdArgTypes::global_ty> cl_typeart_instrument_global(CommandlineStdArgs::global,
                                                                           cl::desc(ConfigStdArgDescriptions::global),
diff --git a/lib/passes/analysis/MemOpVisitor.cpp b/lib/passes/analysis/MemOpVisitor.cpp
index e77762cd..57a81d61 100644
--- a/lib/passes/analysis/MemOpVisitor.cpp
+++ b/lib/passes/analysis/MemOpVisitor.cpp
@@ -16,8 +16,8 @@
 #include "compat/CallSite.h"
 #include "configuration/Configuration.h"
 #include "support/ConfigurationBase.h"
-#include "support/GpuUtil.h"
 #include "support/Error.h"
+#include "support/GpuUtil.h"
 #include "support/Logger.h"
 #include "support/TypeUtil.h"
 #include "support/Util.h"
diff --git a/lib/passes/configuration/EnvironmentConfiguration.cpp b/lib/passes/configuration/EnvironmentConfiguration.cpp
index 6eec7254..c36eb9c4 100644
--- a/lib/passes/configuration/EnvironmentConfiguration.cpp
+++ b/lib/passes/configuration/EnvironmentConfiguration.cpp
@@ -110,8 +110,7 @@ EnvironmentFlagsOptions::EnvironmentFlagsOptions() {
                                               EnvironmentStdArgsValues::stats),
       make_entry<ConfigStdArgTypes::heap_ty>(ConfigStdArgs::heap, EnvironmentStdArgs::heap,
                                              EnvironmentStdArgsValues::heap),
-      make_entry<ConfigStdArgTypes::gpu_ty>(ConfigStdArgs::gpu, EnvironmentStdArgs::gpu,
-                                            EnvironmentStdArgsValues::gpu),
+      make_entry<ConfigStdArgTypes::gpu_ty>(ConfigStdArgs::gpu, EnvironmentStdArgs::gpu, EnvironmentStdArgsValues::gpu),
       make_entry<ConfigStdArgTypes::global_ty>(ConfigStdArgs::global, EnvironmentStdArgs::global,
                                                EnvironmentStdArgsValues::global),
       make_entry<ConfigStdArgTypes::stack_ty>(ConfigStdArgs::stack, EnvironmentStdArgs::stack,
diff --git a/lib/passes/instrumentation/TypeARTFunctions.cpp b/lib/passes/instrumentation/TypeARTFunctions.cpp
index b2bc1cea..0828ca00 100644
--- a/lib/passes/instrumentation/TypeARTFunctions.cpp
+++ b/lib/passes/instrumentation/TypeARTFunctions.cpp
@@ -100,7 +100,7 @@ IFunc ifunc_for_function(IFunc general_type, llvm::Value* value) {
   } else if (llvm::isa<GlobalVariable>(value)) {
     type = detail::ifunc_type_for(nullptr);
   } else if (auto callbase = llvm::dyn_cast<CallBase>(value)) {
-    type               = detail::ifunc_type_for(callbase->getFunction());
+    type                = detail::ifunc_type_for(callbase->getFunction());
     auto called_context = detail::ifunc_type_for(callbase->getCalledFunction());
     if (called_context == detail::IFuncType::cuda || called_context == detail::IFuncType::hip) {
       type = called_context;
diff --git a/lib/passes/support/CudaUtil.h b/lib/passes/support/CudaUtil.h
index af4add08..bc1ae58b 100644
--- a/lib/passes/support/CudaUtil.h
+++ b/lib/passes/support/CudaUtil.h
@@ -50,7 +50,7 @@ inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& cuda_
 }
 
 inline bool is_device_module(const llvm::Module& module) {
-#if LLVM_VERSION_MAJOR >= 20
+#if LLVM_VERSION_MAJOR >= 21
   const auto triple = module.getTargetTriple().str();
 #else
   const auto triple = module.getTargetTriple();
diff --git a/lib/passes/support/HipUtil.h b/lib/passes/support/HipUtil.h
index 360ad074..e22d967d 100644
--- a/lib/passes/support/HipUtil.h
+++ b/lib/passes/support/HipUtil.h
@@ -50,7 +50,7 @@ inline std::optional<llvm::BitCastInst*> bitcast_for(const llvm::CallBase& hip_c
 }
 
 inline bool is_device_module(const llvm::Module& module) {
-#if LLVM_VERSION_MAJOR >= 20
+#if LLVM_VERSION_MAJOR >= 21
   const auto triple = module.getTargetTriple().str();
 #else
   const auto triple = module.getTargetTriple();
diff --git a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
index 87b4002a..5c2feb32 100644
--- a/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
+++ b/lib/passes/typegen/dimeta/DimetaTypeGen.cpp
@@ -496,9 +496,11 @@ class DimetaTypeManager final : public TypeIDGenerator {
 
         const auto function_name = val->location.function;
         MemOps mem_operations;
-        auto kind = call->getCalledFunction() != nullptr ? mem_operations.kind(call->getCalledFunction()->getName()) : std::nullopt;
+        auto kind = call->getCalledFunction() != nullptr ? mem_operations.kind(call->getCalledFunction()->getName())
+                                                         : std::nullopt;
 
-        if (kind && is_kind(kind.value(), MemOpKind::GpuMallocLike) && gpu::is_templated_malloc_like(function_name, kind.value())) {
+        if (kind && is_kind(kind.value(), MemOpKind::GpuMallocLike) &&
+            gpu::is_templated_malloc_like(function_name, kind.value())) {
           LOG_DEBUG("Workaround for pointer level of call base " << function_name)
           workaround::remove_pointer_level(call, val.value());
         }

From f285f0e8f79c5b3cbc630b0ec42ad171b740304c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Fri, 1 May 2026 13:37:10 +0200
Subject: [PATCH 18/24] Fix lcov issue and remove CUDA linking

---
 cmake/modules/coverage-lcov.cmake  | 17 +++++++-----
 lib/runtime/CMakeLists.txt         | 21 ---------------
 lib/runtime/CudaRuntimeInterface.h | 36 -------------------------
 lib/runtime/CudaSupport.cpp        | 43 ------------------------------
 4 files changed, 10 insertions(+), 107 deletions(-)
 delete mode 100644 lib/runtime/CudaRuntimeInterface.h
 delete mode 100644 lib/runtime/CudaSupport.cpp

diff --git a/cmake/modules/coverage-lcov.cmake b/cmake/modules/coverage-lcov.cmake
index 0b836576..61893485 100644
--- a/cmake/modules/coverage-lcov.cmake
+++ b/cmake/modules/coverage-lcov.cmake
@@ -5,17 +5,20 @@ if(TYPEART_LCOV_EXEC-NOTFOUND OR TYPEART_GENHTML_EXEC-NOTFOUND)
   message(WARNING "lcov and genhtml command needed for coverage.")
 endif()
 
-# Detect whether lcov supports: --ignore-errors unused
+# Detect whether lcov supports: --ignore-errors unused:
+# - avoids CUDA error "geninfo: ERROR: 'exclude' pattern '*/Version.cpp' is unused"
 set(TYPEART_LCOV_IGNORE_UNUSED)
 if(TYPEART_LCOV_EXEC)
   execute_process(
-    COMMAND ${TYPEART_LCOV_EXEC} --ignore-errors unused --version
-    RESULT_VARIABLE TYPEART_LCOV_IGNORE_UNUSED_RESULT
-    OUTPUT_QUIET
-    ERROR_QUIET
+    COMMAND ${TYPEART_LCOV_EXEC} --version
+    OUTPUT_VARIABLE TYPEART_LCOV_VERSION_STRING
+    OUTPUT_STRIP_TRAILING_WHITESPACE
   )
-  if(TYPEART_LCOV_IGNORE_UNUSED_RESULT EQUAL 0)
-    set(TYPEART_LCOV_IGNORE_UNUSED --ignore-errors unused)
+  if(TYPEART_LCOV_VERSION_STRING MATCHES "LCOV version ([0-9]+)\\.([0-9]+)")
+    set(TYPEART_LCOV_VERSION_MAJOR ${CMAKE_MATCH_1})
+    if(TYPEART_LCOV_VERSION_MAJOR GREATER_EQUAL 2)
+      set(TYPEART_LCOV_IGNORE_UNUSED --ignore-errors unused)
+    endif()
   endif()
 endif()
 
diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt
index 9e2c0e46..c889e7d7 100644
--- a/lib/runtime/CMakeLists.txt
+++ b/lib/runtime/CMakeLists.txt
@@ -18,7 +18,6 @@ add_custom_command(
 set(RUNTIME_LIB_SOURCES
     AccessCounter.h
     CallbackInterface.h
-    CudaRuntimeInterface.h
     RuntimeData.h
     RuntimeInterface.h
     TypeResolution.cpp
@@ -100,10 +99,6 @@ target_include_directories(
 
 target_include_directories(${TYPEART_PREFIX}_Runtime SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS})
 
-if(CUDAToolkit_FOUND)
-  target_include_directories(${TYPEART_PREFIX}_Runtime SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
-endif()
-
 target_compile_definitions(
   ${TYPEART_PREFIX}_Runtime
   PRIVATE TYPEART_LOG_LEVEL=${TYPEART_LOG_LEVEL_RT}
@@ -113,18 +108,8 @@ target_compile_definitions(
           $<$<BOOL:${TYPEART_ABSEIL}>:TYPEART_ABSEIL>
           $<$<BOOL:${TYPEART_SAFEPTR}>:USE_SAFEPTR>
           $<$<BOOL:${TYPEART_DISABLE_THREAD_SAFETY}>:TYPEART_DISABLE_THREAD_SAFETY>
-          $<$<BOOL:${CUDAToolkit_FOUND}>:TYPEART_HAS_CUDA=1>
 )
 
-if(CUDAToolkit_FOUND)
-  if(TARGET CUDA::cudart)
-    target_link_libraries(${TYPEART_PREFIX}_Runtime PRIVATE CUDA::cudart)
-  endif()
-  if(TARGET CUDA::cuda_driver)
-    target_link_libraries(${TYPEART_PREFIX}_Runtime PRIVATE CUDA::cuda_driver)
-  endif()
-endif()
-
 typeart_target_compile_options(${TYPEART_PREFIX}_Runtime)
 typeart_target_define_file_basename(${TYPEART_PREFIX}_Runtime)
 typeart_target_coverage_options(${TYPEART_PREFIX}_Runtime)
@@ -152,12 +137,6 @@ install(FILES
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
 )
 
-if(CUDAToolkit_FOUND)
-  install(FILES CudaRuntimeInterface.h
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
-  )
-endif()
-
 install(
   TARGETS ${TYPEART_PREFIX}_Runtime
   EXPORT ${TARGETS_EXPORT_NAME}
diff --git a/lib/runtime/CudaRuntimeInterface.h b/lib/runtime/CudaRuntimeInterface.h
deleted file mode 100644
index 2b42ee96..00000000
--- a/lib/runtime/CudaRuntimeInterface.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// TypeART library
-//
-// Copyright (c) 2017-2026 TypeART Authors
-// Distributed under the BSD 3-Clause license.
-// (See accompanying file LICENSE.txt or copy at
-// https://opensource.org/licenses/BSD-3-Clause)
-//
-// Project home: https://github.com/tudasc/TypeART
-//
-// SPDX-License-Identifier: BSD-3-Clause
-//
-
-#ifndef TYPEART_CUDARUNTIMEINTERFACE_H
-#define TYPEART_CUDARUNTIMEINTERFACE_H
-
-#include "RuntimeExport.h"
-#include "RuntimeInterface.h"
-
-#ifdef __cplusplus
-#include <cstddef>
-#else
-#include <stdbool.h>
-#include <stddef.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-TYPEART_EXPORT typeart_status typeart_cuda_is_device_ptr(const void* addr, bool* is_device_ptr_flag);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // TYPEART_CUDARUNTIMEINTERFACE_H
diff --git a/lib/runtime/CudaSupport.cpp b/lib/runtime/CudaSupport.cpp
deleted file mode 100644
index 3f358bc5..00000000
--- a/lib/runtime/CudaSupport.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// TypeART library
-//
-// Copyright (c) 2017-2026 TypeART Authors
-// Distributed under the BSD 3-Clause license.
-// (See accompanying file LICENSE.txt or copy at
-// https://opensource.org/licenses/BSD-3-Clause)
-//
-// Project home: https://github.com/tudasc/TypeART
-//
-// SPDX-License-Identifier: BSD-3-Clause
-//
-
-#include "CudaRuntimeInterface.h"
-
-#ifdef TYPEART_HAS_CUDA
-#include <cuda.h>
-#endif
-
-typeart_status typeart_cuda_is_device_ptr(const void* addr, bool* is_device_ptr_flag) {
-  if (is_device_ptr_flag == nullptr) {
-    return TYPEART_ERROR;
-  }
-
-#ifdef TYPEART_HAS_CUDA
-  CUmemorytype mem_type;
-  CUresult status =
-      cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, reinterpret_cast<CUdeviceptr>(addr));
-  if (status != CUDA_SUCCESS) {
-    *is_device_ptr_flag = false;
-    if (status == CUDA_ERROR_INVALID_VALUE) {
-      return TYPEART_OK;
-    }
-    return TYPEART_ERROR;
-  }
-
-  *is_device_ptr_flag = (mem_type == CU_MEMORYTYPE_DEVICE);
-  return TYPEART_OK;
-#else
-  (void)addr;
-  *is_device_ptr_flag = false;
-  return TYPEART_OK;
-#endif
-}

From f3181ef51396506688dd25ccd8240785ea079584 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Fri, 1 May 2026 14:52:40 +0200
Subject: [PATCH 19/24] HIP testing

---
 .github/workflows/basic-ci.yml | 70 +++++++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index 77dd7684..2cff501c 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -150,7 +150,7 @@ jobs:
           typeart-typegen-legacy: 0
           install-lcov: 'true'
           install-omp: 'true'
-          setup-mold: 'false'
+          setup-mold: 'true'
 
       - name: Configure TypeART
         run: cmake -B build --preset ci-cov-thread-safe -DLLVM_DIR=${LLVM_CMAKE_DIR} -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
@@ -175,9 +175,75 @@ jobs:
           flag-name: cuda-suite-${{ matrix.llvm-version }}-${{ matrix.cuda }}
           parallel: true
 
+  hip-suite:
+    runs-on: ubuntu-24.04
+    
+    env:
+      ROCM_PATH: /opt/rocm
+      ROCM_VERSION: 6.4.4
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Setup ROCm Toolchain
+        run: |
+          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
+          wget -qO - https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/rocm.gpg
+          echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${ROCM_VERSION} noble main" | sudo tee /etc/apt/sources.list.d/rocm.list
+          echo -e "Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600" | sudo tee /etc/apt/preferences.d/rocm-pin-600
+          
+      - name: Install ROCm ${{ env.ROCM_VERSION }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            rocm-llvm \
+            rocm-llvm-dev \
+            rocm-dev
+          
+      - name: Setup ROCm Environment
+        run: |    
+          sudo ln -sfn /opt/rocm-${ROCM_VERSION} /opt/rocm
+          echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
+          echo "/opt/rocm/bin" >> $GITHUB_PATH
+
+      - name: Setup TypeART Environment
+        uses: ./.github/actions/setup-typeart
+        with:
+          llvm-version: 18
+          typeart-typegen-legacy: 0
+          install-lcov: 'true'
+          install-omp: 'true'
+          setup-mold: 'true'
+
+      - name: Configure TypeART
+        run: |
+         cmake -B build --preset ci-cov-thread-safe \
+           -DCMAKE_C_COMPILER=amdclang -DCMAKE_CXX_COMPILER=amdclang++ \
+           -DLLVM_DIR=${ROCM_PATH}/llvm/lib/cmake/llvm -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
+
+      - name: Build TypeART
+        run: cmake --build build --parallel 2
+
+      - name: Prepare TypeART coverage
+        run: cmake --build build --target typeart-lcov-clean
+
+      - name: Test TypeART hip-suite
+        run: cmake --build build --target check-typeart
+
+      - name: Build coverage report
+        run: cmake --build build --target typeart-lcov-html
+
+      - name: Coveralls (parallel)
+        uses: coverallsapp/github-action@v2.3.6
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          path-to-lcov: build/typeart.coverage
+          flag-name: hip-suite-rocm-${{ env.ROCM_VERSION }}
+          parallel: true
+
   finish-coverage:
     if: ${{ always() }}
-    needs: [lit-suite, cuda-suite]
+    needs: [lit-suite, cuda-suite, hip-suite]
     runs-on: ubuntu-24.04
     steps:
       - name: Coveralls Finished

From 673e5f9e554efe346bf1c89a73070a4510ba51f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Fri, 1 May 2026 15:16:44 +0200
Subject: [PATCH 20/24] Try fix HIP omp issue

---
 .github/workflows/basic-ci.yml     | 1 +
 test/runtime/21_runtime_manual.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index 2cff501c..b2e96cc0 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -205,6 +205,7 @@ jobs:
           sudo ln -sfn /opt/rocm-${ROCM_VERSION} /opt/rocm
           echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
           echo "/opt/rocm/bin" >> $GITHUB_PATH
+          echo "LD_LIBRARY_PATH=/opt/rocm/llvm/lib:/opt/rocm/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
 
       - name: Setup TypeART Environment
         uses: ./.github/actions/setup-typeart
diff --git a/test/runtime/21_runtime_manual.cpp b/test/runtime/21_runtime_manual.cpp
index 5b81bb9a..784f1e55 100644
--- a/test/runtime/21_runtime_manual.cpp
+++ b/test/runtime/21_runtime_manual.cpp
@@ -1,5 +1,5 @@
 // clang-format off
-// RUN: clang++ -std=c++17 -I%S/../../ -I%S/../../lib/typelib -I%S/../../lib  %s -o %s.exe
+// RUN: %clang-cpp -std=c++17 -I%S/../../ -I%S/../../lib/typelib -I%S/../../lib  %s -o %s.exe
 // RUN: %s.exe 2>&1 | %filecheck %s
 // clang-format on
 // FIXME this test doesn't add to the coverage data.

From c465c8fb5947aead5b441a515df0a4f8c40f86b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Fri, 1 May 2026 16:00:30 +0200
Subject: [PATCH 21/24] Set vars explicitly

---
 .github/workflows/basic-ci.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index b2e96cc0..822146f1 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -213,14 +213,20 @@ jobs:
           llvm-version: 18
           typeart-typegen-legacy: 0
           install-lcov: 'true'
-          install-omp: 'true'
+          install-omp: 'false'
           setup-mold: 'true'
 
       - name: Configure TypeART
         run: |
          cmake -B build --preset ci-cov-thread-safe \
-           -DCMAKE_C_COMPILER=amdclang -DCMAKE_CXX_COMPILER=amdclang++ \
-           -DLLVM_DIR=${ROCM_PATH}/llvm/lib/cmake/llvm -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT}
+            -DCMAKE_C_COMPILER=/opt/rocm/bin/amdclang \
+            -DCMAKE_CXX_COMPILER=/opt/rocm/bin/amdclang++ \
+            -DLLVM_DIR=${ROCM_PATH}/llvm/lib/cmake/llvm -DLLVM_EXTERNAL_LIT=${EXTERNAL_LIT} \
+            -DTYPEART_CLANG_EXEC=/opt/rocm/bin/amdclang \
+            -DTYPEART_CLANGCXX_EXEC=/opt/rocm/bin/amdclang++ \
+            -DTYPEART_OPT_EXEC=/opt/rocm/llvm/bin/opt \
+            -DTYPEART_LLC_EXEC=/opt/rocm/llvm/bin/llc \
+            -DTYPEART_LLVMCONFIG_COMMAND=/opt/rocm/llvm/bin/llvm-config
 
       - name: Build TypeART
         run: cmake --build build --parallel 2

From 54cee42456e417584969764479de55973c1b12d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Fri, 1 May 2026 16:13:36 +0200
Subject: [PATCH 22/24] Disable OpenMP in HIP tests

---
 .github/workflows/basic-ci.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/basic-ci.yml b/.github/workflows/basic-ci.yml
index 822146f1..5f511500 100644
--- a/.github/workflows/basic-ci.yml
+++ b/.github/workflows/basic-ci.yml
@@ -226,7 +226,8 @@ jobs:
             -DTYPEART_CLANGCXX_EXEC=/opt/rocm/bin/amdclang++ \
             -DTYPEART_OPT_EXEC=/opt/rocm/llvm/bin/opt \
             -DTYPEART_LLC_EXEC=/opt/rocm/llvm/bin/llc \
-            -DTYPEART_LLVMCONFIG_COMMAND=/opt/rocm/llvm/bin/llvm-config
+            -DTYPEART_LLVMCONFIG_COMMAND=/opt/rocm/llvm/bin/llvm-config \
+            -DCMAKE_DISABLE_FIND_PACKAGE_OpenMP=ON
 
       - name: Build TypeART
         run: cmake --build build --parallel 2

From 9ca0e8ae2f6808af8db98a7968aec6739694f8a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 5 May 2026 15:17:12 +0200
Subject: [PATCH 23/24] Wrapper parses typeart-related flags

---
 scripts/typeart-wrapperv2.in                  | 74 ++++++++++++++++---
 .../cuda/pass/11_wrapper_gpu_wrapper_option.c | 32 ++++++++
 test/hip/pass/11_wrapper_gpu_wrapper_option.c | 34 +++++++++
 test/script/16_wrapper_parse_options.c        | 21 ++++++
 4 files changed, 149 insertions(+), 12 deletions(-)
 create mode 100644 test/cuda/pass/11_wrapper_gpu_wrapper_option.c
 create mode 100644 test/hip/pass/11_wrapper_gpu_wrapper_option.c
 create mode 100644 test/script/16_wrapper_parse_options.c

diff --git a/scripts/typeart-wrapperv2.in b/scripts/typeart-wrapperv2.in
index 494cb4b9..dfea2317 100644
--- a/scripts/typeart-wrapperv2.in
+++ b/scripts/typeart-wrapperv2.in
@@ -69,15 +69,64 @@ function typeart_is_linking_fn() {
   return 1
 }
 
+function typeart_parse_typeart_option_fn() {
+  local typeart_option="$1"
+  local option_payload="${typeart_option#--typeart-}"
+  local option_name="${option_payload%%=*}"
+  local option_value="${option_payload#*=}"
+
+  if [ -z "$option_name" ] || [ "$option_payload" = "$option_value" ]; then
+    echo "TypeART wrapper option requires --typeart-<option>=<value>: ${typeart_option}" >&2
+    return 1
+  fi
+
+  case "$option_name" in
+  config)
+    export TYPEART_CONFIG_FILE="$option_value"
+    ;;
+  *)
+    local typeart_env_name="${option_name//-/_}"
+    typeart_env_name="${typeart_env_name^^}"
+    export "TYPEART_${typeart_env_name}=${option_value}"
+    ;;
+  esac
+  return 0
+}
+
 function typeart_parse_cmd_line_fn() {
+  local typeart_parse_mode="$1"
+  shift
   local typeart_has_debug=0
-  for arg in "$@"; do
-    if [ "$arg" = "-g" ]; then
+  typeart_forwarded_args=()
+
+  while (($#)); do
+    arg="$1"
+
+    case "$arg" in
+    -g)
       typeart_has_debug=1
-      break
-    fi
+      typeart_forwarded_args+=("$arg")
+      shift
+      ;;
+    --typeart-*=*)
+      case "$typeart_parse_mode" in
+      ON)
+        typeart_parse_typeart_option_fn "$arg" || return 1
+        ;;
+      esac
+      shift
+      ;;
+    --typeart-*)
+      shift
+      ;;
+    *)
+      typeart_forwarded_args+=("$arg")
+      shift
+      ;;
+    esac
   done
-  if [ $typeart_has_debug == 0 ]; then
+
+  if [ "$typeart_parse_mode" = "ON" ] && [ "$typeart_has_debug" == 0 ]; then
     typeart_more_flags+=" -g"
   fi
 }
@@ -87,21 +136,22 @@ function typeart_main_driver_fn() {
 
   typeart_is_wrapper_disabled_fn
   if [ "$?" == 1 ]; then
-    # shellcheck disable=SC2068
-    $typeart_compiler $@
+    typeart_parse_cmd_line_fn OFF "$@"
+    $typeart_compiler "${typeart_forwarded_args[@]}"
     return 0
   fi
   
-  typeart_parse_cmd_line_fn "$@"
+  typeart_parse_cmd_line_fn ON "$@"
+  if [ "$?" != 0 ]; then
+    return 1
+  fi
 
-  typeart_is_linking_fn "$@"
+  typeart_is_linking_fn "${typeart_forwarded_args[@]}"
   if [ "$?" == 1 ]; then
     typeart_more_flags+=" ${typeart_ldflags}"
   fi
   
-  # export TYPEART_TYPE_SERIALIZATION="${TYPEART_TYPE_SERIALIZATION:-hybrid}"
-  
-  $typeart_compiler ${typeart_plugin} ${typeart_includes} ${typeart_more_flags} ${typeart_san_flags} $@
+  $typeart_compiler ${typeart_plugin} ${typeart_includes} ${typeart_more_flags} ${typeart_san_flags} "${typeart_forwarded_args[@]}"
 }
 
 typeart_main_driver_fn "$@"
diff --git a/test/cuda/pass/11_wrapper_gpu_wrapper_option.c b/test/cuda/pass/11_wrapper_gpu_wrapper_option.c
new file mode 100644
index 00000000..48ab89dd
--- /dev/null
+++ b/test/cuda/pass/11_wrapper_gpu_wrapper_option.c
@@ -0,0 +1,32 @@
+// RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
+// RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=false %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
+// RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
+// RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
+// RUN: TYPEART_WRAPPER=OFF %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=true --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=WRAPPER-OFF
+
+// REQUIRES: cuda && !llvm-14
+
+// GPU-ON: call i32 @cudaMalloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// GPU-ON: call void @__typeart_alloc_gpu(
+// GPU-ON: call i32 @cudaFree(ptr {{.*}}[[CU_PTR:%[_0-9a-z]+]])
+// GPU-ON: call void @__typeart_free_gpu(
+
+// GPU-OFF: call i32 @cudaMalloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// GPU-OFF: call i32 @cudaFree(ptr {{.*}}[[CU_PTR:%[_0-9a-z]+]])
+// GPU-OFF-NOT: __typeart_alloc_gpu(
+// GPU-OFF-NOT: __typeart_free_gpu(
+
+// WRAPPER-OFF: call i32 @cudaMalloc(ptr {{.*}}[[CU_POINTER:%[_0-9a-z]+]],
+// WRAPPER-OFF: call i32 @cudaFree(ptr {{.*}}[[CU_PTR:%[_0-9a-z]+]])
+// WRAPPER-OFF-NOT: __typeart_alloc_gpu(
+// WRAPPER-OFF-NOT: __typeart_free_gpu(
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  cudaMalloc((void**)&d_x, N * sizeof(float));
+  cudaFree(d_x);
+
+  return 0;
+}
diff --git a/test/hip/pass/11_wrapper_gpu_wrapper_option.c b/test/hip/pass/11_wrapper_gpu_wrapper_option.c
new file mode 100644
index 00000000..c22c7f16
--- /dev/null
+++ b/test/hip/pass/11_wrapper_gpu_wrapper_option.c
@@ -0,0 +1,34 @@
+// RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
+// RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=false %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
+// RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
+// RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
+// RUN: TYPEART_WRAPPER=OFF %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=true --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=WRAPPER-OFF
+
+// REQUIRES: hip && !llvm-14
+
+// GPU-ON: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// GPU-ON: call void @__typeart_alloc_gpu(
+// GPU-ON: call i32 @hipFree(ptr {{.*}}[[HIP_PTR:%[_0-9a-z]+]])
+// GPU-ON: call void @__typeart_free_gpu(
+
+// GPU-OFF: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// GPU-OFF: call i32 @hipFree(ptr {{.*}}[[HIP_PTR:%[_0-9a-z]+]])
+// GPU-OFF-NOT: __typeart_alloc_gpu(
+// GPU-OFF-NOT: __typeart_free_gpu(
+
+// WRAPPER-OFF: call i32 @hipMalloc(ptr {{.*}}[[HIP_POINTER:%[_0-9a-z]+]],
+// WRAPPER-OFF: call i32 @hipFree(ptr {{.*}}[[HIP_PTR:%[_0-9a-z]+]])
+// WRAPPER-OFF-NOT: __typeart_alloc_gpu(
+// WRAPPER-OFF-NOT: __typeart_free_gpu(
+
+#include <hip/hip_runtime.h>
+
+int main() {
+  const int N = 20;
+  float* d_x;
+
+  hipMalloc((void**)&d_x, N * sizeof(float));
+  hipFree(d_x);
+
+  return 0;
+}
diff --git a/test/script/16_wrapper_parse_options.c b/test/script/16_wrapper_parse_options.c
new file mode 100644
index 00000000..dfad9886
--- /dev/null
+++ b/test/script/16_wrapper_parse_options.c
@@ -0,0 +1,21 @@
+// clang-format off
+// RUN: %wrapper-cc -S -emit-llvm -O1 --typeart-stack=false --typeart-global=true --typeart-stack-lifetime %s -o - 2>&1 | %filecheck %s --check-prefix=ON
+// RUN: TYPEART_WRAPPER=OFF %wrapper-cc -S -emit-llvm -O1 --typeart-stack=false --typeart-global=true --typeart-stack-lifetime %s -o - 2>&1 | %filecheck %s --check-prefix=OFF
+// clang-format on
+
+// REQUIRES: !legacywrapper
+
+#include <stddef.h>
+#include <stdlib.h>
+
+int main(int argc, char** argv) {
+  volatile int local[4] = {0, 0, 0, 0};
+  int* p                = (int*)malloc((size_t)argc * sizeof(int));
+  local[0]              = argc;
+  free(p);
+  return 0;
+}
+
+// ON: __typeart_alloc
+// ON-NOT: __typeart_alloc_stack
+// OFF-NOT: __typeart_alloc

From 0ea8e839e756f4b15e77e81c7b82f071b8ff5f80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexander=20H=C3=BCck?= <hueck.alexander@gmail.com>
Date: Tue, 5 May 2026 16:34:27 +0200
Subject: [PATCH 24/24] Format

---
 test/cuda/pass/11_wrapper_gpu_wrapper_option.c | 2 ++
 test/hip/pass/11_wrapper_gpu_wrapper_option.c  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/test/cuda/pass/11_wrapper_gpu_wrapper_option.c b/test/cuda/pass/11_wrapper_gpu_wrapper_option.c
index 48ab89dd..8f970aa2 100644
--- a/test/cuda/pass/11_wrapper_gpu_wrapper_option.c
+++ b/test/cuda/pass/11_wrapper_gpu_wrapper_option.c
@@ -1,8 +1,10 @@
+// clang-format off
 // RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
 // RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=false %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
 // RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
 // RUN: %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
 // RUN: TYPEART_WRAPPER=OFF %wrapper-cc -x cuda --cuda-host-only -nocudalib -S -emit-llvm -O1 --typeart-gpu=true --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=WRAPPER-OFF
+// clang-format on
 
 // REQUIRES: cuda && !llvm-14
 
diff --git a/test/hip/pass/11_wrapper_gpu_wrapper_option.c b/test/hip/pass/11_wrapper_gpu_wrapper_option.c
index c22c7f16..57b8cdbf 100644
--- a/test/hip/pass/11_wrapper_gpu_wrapper_option.c
+++ b/test/hip/pass/11_wrapper_gpu_wrapper_option.c
@@ -1,8 +1,10 @@
+// clang-format off
 // RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
 // RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=false %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
 // RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu --typeart-gpu=true %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-ON
 // RUN: %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=GPU-OFF
 // RUN: TYPEART_WRAPPER=OFF %wrapper-cc -x hip --offload-host-only -S -emit-llvm -O1 --typeart-gpu=true --typeart-gpu %s -o - 2>&1 | %filecheck %s --check-prefix=WRAPPER-OFF
+// clang-format on
 
 // REQUIRES: hip && !llvm-14