Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ctest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ jobs:
uses: threeal/ctest-action@v1.1.0
with:
test-dir: build-cuda
args: --verbose --exclude-regex cufile-pointer-detection
args: --verbose --exclude-regex "cufile-pointer-detection|cuda-memory-resource"
14 changes: 13 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp")
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp;dalotia_cuda_memory_resource.hpp")
# have one dalotia library target that can be used in C++ and Fortran
add_library(dalotia INTERFACE)
add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
Expand Down Expand Up @@ -41,7 +41,19 @@ if (DALOTIA_WITH_CUFILE)
find_library(CUFILE_LIBRARY cufile
PATHS ${CUDAToolkit_LIBRARY_DIR}
REQUIRED)
get_filename_component(_CUFILE_LIB_DIR "${CUFILE_LIBRARY}" DIRECTORY)
find_path(CUFILE_INCLUDE_DIR cufile.h
HINTS ${_CUFILE_LIB_DIR}/../include
${_CUFILE_LIB_DIR}/../../include
${CUDAToolkit_INCLUDE_DIRS}
PATHS /usr/local/cuda/include
/usr/include)
target_link_libraries(dalotia_cpp PUBLIC ${CUFILE_LIBRARY})
if (CUFILE_INCLUDE_DIR)
target_include_directories(dalotia_cpp PUBLIC ${CUFILE_INCLUDE_DIR})
else()
message(WARNING "cufile.h not found — set CUFILE_INCLUDE_DIR manually")
endif()
target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUFILE")
target_sources(dalotia_cpp PRIVATE dalotia_cufile.cpp)
endif (DALOTIA_WITH_CUFILE)
Expand Down
3 changes: 3 additions & 0 deletions src/dalotia.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#include <memory>
#ifdef DALOTIA_WITH_CPP_PMR
#include <memory_resource>
#ifdef DALOTIA_WITH_CUDA
#include "dalotia_cuda_memory_resource.hpp"
#endif // DALOTIA_WITH_CUDA
#endif // DALOTIA_WITH_CPP_PMR
#include <numeric>
#include <string>
Expand Down
80 changes: 29 additions & 51 deletions src/dalotia_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,46 @@

#include <cstddef>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
#include <memory_resource>

#include "dalotia_cuda_memory_resource.hpp"

namespace dalotia {

// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
// Returns false for host pointers (including cudaMallocHost pinned memory).
bool is_device_pointer(const void* ptr) noexcept;

// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
// Move-only owning byte buffer backed by a std::pmr::memory_resource.
//
// The default resource is `cuda_device_resource()` (sync cudaMalloc/cudaFree).
class CudaBuffer {
public:
CudaBuffer() = default;

explicit CudaBuffer(size_t nbytes) : size_(nbytes) {
if (nbytes > 0) {
cudaError_t err = cudaMalloc(&ptr_, nbytes);
if (err != cudaSuccess) {
throw std::runtime_error(
std::string("CudaBuffer: cudaMalloc failed: ") +
cudaGetErrorString(err));
}
}
}
CudaBuffer() noexcept = default;

CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) {
explicit CudaBuffer(size_t nbytes,
std::pmr::memory_resource* mr = cuda_device_resource())
: mr_(mr), size_(nbytes) {
if (nbytes > 0) {
cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream);
if (err != cudaSuccess) {
throw std::runtime_error(
std::string("CudaBuffer: cudaMallocAsync failed: ") +
cudaGetErrorString(err));
}
stream_ = stream;
async_ = true;
ptr_ = std::pmr::polymorphic_allocator<std::byte>(mr_).allocate(
nbytes);
}
}

~CudaBuffer() {
if (ptr_) {
if (async_) {
cudaFreeAsync(ptr_, stream_);
} else {
cudaFree(ptr_);
}
}
}
~CudaBuffer() { reset(); }

CudaBuffer(CudaBuffer&& other) noexcept
: ptr_(other.ptr_)
, size_(other.size_)
, stream_(other.stream_)
, async_(other.async_) {
: mr_(other.mr_), ptr_(other.ptr_), size_(other.size_) {
other.ptr_ = nullptr;
other.size_ = 0;
}

CudaBuffer& operator=(CudaBuffer&& other) noexcept {
if (this != &other) {
if (ptr_) {
if (async_)
cudaFreeAsync(ptr_, stream_);
else
cudaFree(ptr_);
}
reset();
mr_ = other.mr_;
ptr_ = other.ptr_;
size_ = other.size_;
stream_ = other.stream_;
async_ = other.async_;
other.ptr_ = nullptr;
other.size_ = 0;
}
Expand All @@ -90,19 +60,27 @@ class CudaBuffer {

template <typename T>
T* as() noexcept {
return static_cast<T*>(ptr_);
return static_cast<T*>(static_cast<void*>(ptr_));
}

template <typename T>
const T* as() const noexcept {
return static_cast<const T*>(ptr_);
return static_cast<const T*>(static_cast<const void*>(ptr_));
}

private:
void* ptr_ = nullptr;
void reset() noexcept {
if (ptr_) {
std::pmr::polymorphic_allocator<std::byte>(mr_).deallocate(ptr_,
size_);
ptr_ = nullptr;
size_ = 0;
}
}

std::pmr::memory_resource* mr_ = cuda_device_resource();
std::byte* ptr_ = nullptr;
size_t size_ = 0;
cudaStream_t stream_ = 0;
bool async_ = false;
};

} // namespace dalotia
Expand Down
155 changes: 155 additions & 0 deletions src/dalotia_cuda_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA
#ifdef DALOTIA_WITH_CPP_PMR

#include <cuda_runtime.h>
#include <memory_resource>
#include <stdexcept>
#include <string>

namespace dalotia {

namespace detail {

// Raises std::runtime_error carrying `context` plus CUDA's human-readable
// description of `err`. Shared error-reporting path for the memory
// resources defined below.
[[noreturn]] inline void throw_cuda(const char* context, cudaError_t err) {
    std::string message{context};
    message += ": ";
    message += cudaGetErrorString(err);
    throw std::runtime_error(message);
}

} // namespace detail

// TODO: consider replacing these hand-rolled resources with an established
// library implementation (e.g. NVIDIA RMM's rmm::mr resources) once such a
// dependency is acceptable.

// std::pmr::memory_resource backed by synchronous cudaMalloc/cudaFree
// (device memory).
//
// cudaMalloc returns memory aligned to at least 256 bytes, so alignment
// requests up to that are honored implicitly; larger requests cannot be
// guaranteed and are rejected instead of silently returning under-aligned
// memory (the previous code ignored `alignment` entirely, violating the
// std::pmr::memory_resource contract).
class cuda_device_memory_resource : public std::pmr::memory_resource {
  protected:
    // Documented minimum alignment of cudaMalloc allocations.
    static constexpr size_t kMinCudaAlignment = 256;

    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > kMinCudaAlignment)
            throw std::runtime_error(
                "cuda_device_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMalloc(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_device_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); cudaFree errors are deliberately
    // ignored, as frees may run on shutdown paths.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFree(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_device_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by cudaMallocHost/cudaFreeHost
// (page-locked host memory, suitable for fast host<->device transfers).
//
// The CUDA host allocator is assumed to return memory aligned to at least
// 256 bytes; larger alignment requests are rejected instead of silently
// returning under-aligned memory (the previous code ignored `alignment`
// entirely, violating the std::pmr::memory_resource contract).
class cuda_pinned_memory_resource : public std::pmr::memory_resource {
  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_pinned_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocHost(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_pinned_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); cudaFreeHost errors are deliberately
    // ignored.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFreeHost(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_pinned_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by cudaMallocManaged/cudaFree
// (unified memory accessible from both host and device).
//
// Managed allocations are assumed to be aligned to at least 256 bytes;
// larger alignment requests are rejected instead of silently returning
// under-aligned memory (the previous code ignored `alignment` entirely,
// violating the std::pmr::memory_resource contract).
class cuda_managed_memory_resource : public std::pmr::memory_resource {
  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_managed_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocManaged(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_managed_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); managed memory pairs with cudaFree.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFree(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_managed_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by the stream-ordered allocator
// (cudaMallocAsync/cudaFreeAsync).
//
// The stream is fixed at construction; all allocations and deallocations
// are ordered on that stream. The resource must outlive any container
// using it, and the stream must remain valid for that lifetime.
class cuda_async_memory_resource : public std::pmr::memory_resource {
  public:
    explicit cuda_async_memory_resource(cudaStream_t stream)
        : stream_(stream) {}

    cudaStream_t stream() const noexcept { return stream_; }

  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        // cudaMallocAsync takes no alignment argument; the stream-ordered
        // allocator is assumed to return memory aligned to at least 256
        // bytes. Reject larger requests rather than silently violating the
        // std::pmr::memory_resource contract (the previous code ignored
        // `alignment` entirely).
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_async_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocAsync(&p, bytes, stream_);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_async_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); the free is ordered on stream_.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFreeAsync(p, stream_);
    }

    // Equal only to another async resource on the *same* stream — frees
    // must be ordered on the stream that performed the allocation.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        auto* o = dynamic_cast<const cuda_async_memory_resource*>(&other);
        return o != nullptr && o->stream_ == stream_;
    }

  private:
    cudaStream_t stream_;
};

// Process-wide singleton accessors. Construction is thread-safe via the
// C++11 magic-static guarantee. Each resource is allocated once and
// intentionally never freed so that it truly outlives all static-duration
// objects that may still deallocate through it during program teardown —
// matching the contract of std::pmr::new_delete_resource(). (A plain
// function-local static object would be *destroyed* at exit, contradicting
// the "never destroyed" intent and risking use-after-destruction.)
inline cuda_device_memory_resource* cuda_device_resource() noexcept {
    static cuda_device_memory_resource* const instance =
        new cuda_device_memory_resource;  // immortal by design
    return instance;
}

inline cuda_pinned_memory_resource* cuda_pinned_resource() noexcept {
    static cuda_pinned_memory_resource* const instance =
        new cuda_pinned_memory_resource;  // immortal by design
    return instance;
}

inline cuda_managed_memory_resource* cuda_managed_resource() noexcept {
    static cuda_managed_memory_resource* const instance =
        new cuda_managed_memory_resource;  // immortal by design
    return instance;
}

} // namespace dalotia

#endif // DALOTIA_WITH_CPP_PMR
#endif // DALOTIA_WITH_CUDA
3 changes: 2 additions & 1 deletion src/dalotia_tensor_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name,
size_t nbytes = total_elements * element_bytes;

// If permutation needed, load into a temp buffer then permute.
cuda_async_memory_resource async_mr(stream);
CudaBuffer d_tmp;
dalotia_byte* d_raw = tensor;
if (needs_permute) {
d_tmp = CudaBuffer(nbytes, stream);
d_tmp = CudaBuffer(nbytes, &async_mr);
d_raw = d_tmp.as<dalotia_byte>();
}

Expand Down
2 changes: 1 addition & 1 deletion src/dalotia_tensor_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class TensorFile {
this->load_tensor_dense(tensor_name, weight_format, ordering,
reinterpret_cast<dalotia_byte*>(tensor.data()),
permutation);
return std::make_pair(extents, tensor);
return {std::move(extents), std::move(tensor)};
}

template <typename value_type>
Expand Down
13 changes: 13 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ if (DALOTIA_WITH_CUDA)
target_link_libraries( test_cufile CUDA::cuda_driver )
endif()
add_test( cufile-pointer-detection test_cufile )

if (DALOTIA_WITH_CPP_PMR)
add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp )
target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart )
add_test( cuda-memory-resource test_cuda_memory_resource )

if (DALOTIA_WITH_SAFETENSORS_CPP)
add_executable( test_cuda_pmr_load test_cuda_pmr_load.cpp )
target_link_libraries( test_cuda_pmr_load dalotia_cpp CUDA::cudart )
target_include_directories( test_cuda_pmr_load PUBLIC ${SAFETENSORS_CPP_INCLUDE_DIR} ${safetensors-cpp_DIR})
add_test( cuda-pmr-load test_cuda_pmr_load )
endif (DALOTIA_WITH_SAFETENSORS_CPP)
endif (DALOTIA_WITH_CPP_PMR)
endif (DALOTIA_WITH_CUDA)

if (DALOTIA_WITH_TENSORFLOW)
Expand Down
Loading
Loading