From e589fdde5d7d9d208f9812692bc5eb2ac5318ee1 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:43:24 +0000 Subject: [PATCH 1/6] cuda: add different pmr allocators --- src/CMakeLists.txt | 2 +- src/dalotia.hpp | 3 + src/dalotia_cuda_memory_resource.hpp | 155 ++++++++++++++ src/dalotia_tensor_file.hpp | 2 +- test/CMakeLists.txt | 6 + test/test_cuda_memory_resource.cpp | 305 +++++++++++++++++++++++++++ 6 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 src/dalotia_cuda_memory_resource.hpp create mode 100644 test/test_cuda_memory_resource.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d804732..3db7bf0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp;dalotia_cuda_memory_resource.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) diff --git a/src/dalotia.hpp b/src/dalotia.hpp index 389124e..6dbaf1c 100644 --- a/src/dalotia.hpp +++ b/src/dalotia.hpp @@ -6,6 +6,9 @@ #include #ifdef DALOTIA_WITH_CPP_PMR #include +#ifdef DALOTIA_WITH_CUDA +#include "dalotia_cuda_memory_resource.hpp" +#endif // DALOTIA_WITH_CUDA #endif // DALOTIA_WITH_CPP_PMR #include #include diff --git 
a/src/dalotia_cuda_memory_resource.hpp b/src/dalotia_cuda_memory_resource.hpp new file mode 100644 index 0000000..3565b18 --- /dev/null +++ b/src/dalotia_cuda_memory_resource.hpp @@ -0,0 +1,155 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA +#ifdef DALOTIA_WITH_CPP_PMR + +#include <cuda_runtime.h> +#include <memory_resource> +#include <stdexcept> +#include <string> + +namespace dalotia { + +namespace detail { + +[[noreturn]] inline void throw_cuda(const char* context, cudaError_t err) { + throw std::runtime_error(std::string(context) + ": " + + cudaGetErrorString(err)); +} + +} // namespace detail + +//TODO if there is a good library to take these from, we should consider it + +class cuda_device_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMalloc(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_device_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFree(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_device_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_pinned_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMallocHost(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_pinned_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFreeHost(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_pinned_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_managed_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr;
+ if (bytes > 0) { + cudaError_t err = cudaMallocManaged(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_managed_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFree(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_managed_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_async_memory_resource : public std::pmr::memory_resource { +// The stream is fixed at construction; all allocations and deallocations are +// ordered on that stream. The resource must outlive any container using it, +// and the stream must remain valid for that lifetime. + public: + explicit cuda_async_memory_resource(cudaStream_t stream) + : stream_(stream) {} + + cudaStream_t stream() const noexcept { return stream_; } + + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMallocAsync(&p, bytes, stream_); + if (err != cudaSuccess) + detail::throw_cuda("cuda_async_memory_resource::allocate", err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFreeAsync(p, stream_); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + auto* o = dynamic_cast<const cuda_async_memory_resource*>(&other); + return o != nullptr && o->stream_ == stream_; + } + + private: + cudaStream_t stream_; +}; + +// Thread-safe (C++11 static-local guarantee). These resources are never +// destroyed, matching the contract of std::pmr::new_delete_resource().
+inline cuda_device_memory_resource* cuda_device_resource() noexcept { + static cuda_device_memory_resource r; + return &r; +} + +inline cuda_pinned_memory_resource* cuda_pinned_resource() noexcept { + static cuda_pinned_memory_resource r; + return &r; +} + +inline cuda_managed_memory_resource* cuda_managed_resource() noexcept { + static cuda_managed_memory_resource r; + return &r; +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CPP_PMR +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index 34c70ba..75f7198 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -149,7 +149,7 @@ class TensorFile { this->load_tensor_dense(tensor_name, weight_format, ordering, reinterpret_cast(tensor.data()), permutation); - return std::make_pair(extents, tensor); + return {std::move(extents), std::move(tensor)}; } template diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9187fca..9a4fc3c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,6 +42,12 @@ if (DALOTIA_WITH_CUDA) target_link_libraries( test_cufile CUDA::cuda_driver ) endif() add_test( cufile-pointer-detection test_cufile ) + + if (DALOTIA_WITH_CPP_PMR) + add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp ) + target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart ) + add_test( cuda-memory-resource test_cuda_memory_resource ) + endif (DALOTIA_WITH_CPP_PMR) endif (DALOTIA_WITH_CUDA) if (DALOTIA_WITH_TENSORFLOW) diff --git a/test/test_cuda_memory_resource.cpp b/test/test_cuda_memory_resource.cpp new file mode 100644 index 0000000..e356b95 --- /dev/null +++ b/test/test_cuda_memory_resource.cpp @@ -0,0 +1,305 @@ +// Tests for dalotia::cuda_{device,pinned,managed,async}_memory_resource. 
+ +#include +#include +#include +#include +#include + +#include + +#include "dalotia.hpp" +#include "dalotia_cuda_memory_resource.hpp" + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " : " << cudaGetErrorString(err) << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Query the exact CUDA memory type for a pointer. +static cudaMemoryType get_memory_type(const void* ptr) { + cudaPointerAttributes attrs{}; + CHECK_CUDA(cudaPointerGetAttributes(&attrs, ptr)); + return attrs.type; +} + +static const char* TEST_FILE = "../data/model.safetensors"; +static const char* TENSOR_NAME = "embedding"; +static constexpr int NUM_ELEMENTS = 3 * 4 * 5; // 60 +static constexpr dalotia_WeightFormat FORMAT = dalotia_float_64; + +void test_device_resource_basic() { + std::cout << "test_device_resource_basic... " << std::flush; + + auto* mr = dalotia::cuda_device_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + assert(dalotia::is_device_pointer(p)); + mr->deallocate(p, 1024); + + // zero-byte allocation returns nullptr and doesn't crash on dealloc + void* z = mr->allocate(0); + assert(z == nullptr); + mr->deallocate(z, 0); + + std::cout << "OK" << std::endl; +} + +void test_pinned_resource_basic() { + std::cout << "test_pinned_resource_basic... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + // pinned memory is only host-accessible — should not be detected as device + assert(!dalotia::is_device_pointer(p)); + + // verify it's actually writable from the host + std::memset(p, 0xAB, 1024); + + mr->deallocate(p, 1024); + + void* z = mr->allocate(0); + assert(z == nullptr); + mr->deallocate(z, 0); + + std::cout << "OK" << std::endl; +} + +void test_managed_resource_basic() { + std::cout << "test_managed_resource_basic... 
" << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + // managed memory is detected as device pointer + assert(dalotia::is_device_pointer(p)); + + // managed memory is also host-accessible + std::memset(p, 0xCD, 1024); + CHECK_CUDA(cudaDeviceSynchronize()); + + mr->deallocate(p, 1024); + + std::cout << "OK" << std::endl; +} + +void test_async_resource_basic() { + std::cout << "test_async_resource_basic... " << std::flush; + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + dalotia::cuda_async_memory_resource mr(stream); + assert(mr.stream() == stream); + + void* p = mr.allocate(1024); + assert(p != nullptr); + assert(dalotia::is_device_pointer(p)); + mr.deallocate(p, 1024); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(cudaStreamDestroy(stream)); + + std::cout << "OK" << std::endl; +} + +void test_is_equal() { + std::cout << "test_is_equal... " << std::flush; + + auto* dev = dalotia::cuda_device_resource(); + auto* pin = dalotia::cuda_pinned_resource(); + auto* mgd = dalotia::cuda_managed_resource(); + + // same-type singletons are equal + dalotia::cuda_device_memory_resource dev2; + assert(dev->is_equal(dev2)); + assert(dev2.is_equal(*dev)); + + // different types are not equal + assert(!dev->is_equal(*pin)); + assert(!dev->is_equal(*mgd)); + assert(!pin->is_equal(*mgd)); + + // not equal to the default resource + assert(!dev->is_equal(*std::pmr::get_default_resource())); + + // async resources: equal iff same stream + cudaStream_t s1, s2; + CHECK_CUDA(cudaStreamCreate(&s1)); + CHECK_CUDA(cudaStreamCreate(&s2)); + + dalotia::cuda_async_memory_resource a1(s1); + dalotia::cuda_async_memory_resource a1_copy(s1); + dalotia::cuda_async_memory_resource a2(s2); + + assert(a1.is_equal(a1_copy)); + assert(!a1.is_equal(a2)); + assert(!a1.is_equal(*dev)); + + CHECK_CUDA(cudaStreamDestroy(s1)); + CHECK_CUDA(cudaStreamDestroy(s2)); + + // calling the accessor twice returns the 
same pointer + assert(dalotia::cuda_device_resource() == dalotia::cuda_device_resource()); + assert(dalotia::cuda_pinned_resource() == dalotia::cuda_pinned_resource()); + assert(dalotia::cuda_managed_resource() == + dalotia::cuda_managed_resource()); + + std::cout << "OK" << std::endl; +} + +void test_pmr_vector_pinned() { + std::cout << "test_pmr_vector_pinned... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + std::pmr::vector<double> v(alloc); + + v.resize(100); + for (int i = 0; i < 100; i++) + v[i] = static_cast<double>(i); + + // verify contents (pinned memory is host-accessible) + for (int i = 0; i < 100; i++) + assert(v[i] == static_cast<double>(i)); + + // the underlying pointer should be pinned (DMA-capable), not device + assert(!dalotia::is_device_pointer(v.data())); + + std::cout << "OK" << std::endl; +} + +void test_pmr_vector_managed() { + std::cout << "test_pmr_vector_managed... " << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + std::pmr::vector<double> v(alloc); + + v.resize(100); + for (int i = 0; i < 100; i++) + v[i] = static_cast<double>(i); + + CHECK_CUDA(cudaDeviceSynchronize()); + + for (int i = 0; i < 100; i++) + assert(v[i] == static_cast<double>(i)); + + assert(dalotia::is_device_pointer(v.data())); + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_with_pinned_resource() { + std::cout << "test_load_tensor_with_pinned_resource... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + auto [extents, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + + assert(extents.size() == 3); + assert(extents[0] == 3 && extents[1] == 4 && extents[2] == 5); + assert(tensor.size() == NUM_ELEMENTS); + + // pinned memory is host-readable + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(tensor[i] == static_cast<double>(i)); + } + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_with_managed_resource() { + std::cout << "test_load_tensor_with_managed_resource... " << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + auto [extents, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + + assert(extents.size() == 3); + assert(tensor.size() == NUM_ELEMENTS); + + CHECK_CUDA(cudaDeviceSynchronize()); + + // managed memory is host-readable after sync + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(tensor[i] == static_cast<double>(i)); + } + + // and it's device-accessible (allocator preserved through move) + assert(dalotia::is_device_pointer(tensor.data())); + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_memory_types() { + std::cout << "test_load_tensor_memory_types... " << std::flush; + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + // default allocator → cudaMemoryTypeUnregistered (plain heap) + { + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeUnregistered); + } + + // pinned resource → cudaMemoryTypeHost + { + std::pmr::polymorphic_allocator<double> alloc( + dalotia::cuda_pinned_resource()); + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeHost); + for (int i = 0; i < NUM_ELEMENTS; i++) + assert(tensor[i] == static_cast<double>(i)); + } + + // managed resource → cudaMemoryTypeManaged + { + std::pmr::polymorphic_allocator<double> alloc( + dalotia::cuda_managed_resource()); + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeManaged); + CHECK_CUDA(cudaDeviceSynchronize()); + for (int i = 0; i < NUM_ELEMENTS; i++) + assert(tensor[i] == static_cast<double>(i)); + } + + std::cout << "OK" << std::endl; +} + +int main() { + test_device_resource_basic(); + test_pinned_resource_basic(); + test_managed_resource_basic(); + test_async_resource_basic(); + test_is_equal(); + test_singletons(); + test_pmr_vector_pinned(); + test_pmr_vector_managed(); + test_load_tensor_with_pinned_resource(); + test_load_tensor_with_managed_resource(); + test_load_tensor_memory_types(); + + std::cout << "test_cuda_memory_resource succeeded" << std::endl; + return 0; +} From e1a92f47168920c364237034f06ac1305c9d9db0 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:44:11 +0000 Subject: [PATCH 2/6] test cpmr: delete unused call --- test/test_cuda_memory_resource.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cuda_memory_resource.cpp b/test/test_cuda_memory_resource.cpp index e356b95..b1a830d 100644 --- 
a/test/test_cuda_memory_resource.cpp +++ b/test/test_cuda_memory_resource.cpp @@ -293,7 +293,6 @@ int main() { test_managed_resource_basic(); test_async_resource_basic(); test_is_equal(); - test_singletons(); test_pmr_vector_pinned(); test_pmr_vector_managed(); test_load_tensor_with_pinned_resource(); From 08566cc46bcb35bd3f76f2cea50f1bece8b5b606 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:54:04 +0000 Subject: [PATCH 3/6] cpmr: clang-format --- src/dalotia_cuda_memory_resource.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dalotia_cuda_memory_resource.hpp b/src/dalotia_cuda_memory_resource.hpp index 3565b18..ac272e4 100644 --- a/src/dalotia_cuda_memory_resource.hpp +++ b/src/dalotia_cuda_memory_resource.hpp @@ -19,7 +19,7 @@ namespace detail { } // namespace detail -//TODO if there is a good library to take these from, we should consider it +// TODO if there is a good library to take these from, we should consider it class cuda_device_memory_resource : public std::pmr::memory_resource { protected: @@ -97,9 +97,9 @@ class cuda_managed_memory_resource : public std::pmr::memory_resource { }; class cuda_async_memory_resource : public std::pmr::memory_resource { -// The stream is fixed at construction; all allocations and deallocations are -// ordered on that stream. The resource must outlive any container using it, -// and the stream must remain valid for that lifetime. + // The stream is fixed at construction; all allocations and deallocations + // are ordered on that stream. The resource must outlive any container + // using it, and the stream must remain valid for that lifetime. 
public: explicit cuda_async_memory_resource(cudaStream_t stream) : stream_(stream) {} From dafa7c602eb9c37b15bfe560e1e3f1a7760af68c Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:56:52 +0000 Subject: [PATCH 4/6] ci: don't run new cpmr tests --- .github/workflows/ctest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index 95318a2..3f2584a 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -83,4 +83,4 @@ jobs: uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --verbose --exclude-regex cufile-pointer-detection + args: --verbose --exclude-regex "cufile-pointer-detection|cuda-memory-resource" From 294c1ff51e619b176a1e58fbd7cf8f95b3b8d887 Mon Sep 17 00:00:00 2001 From: Theresa Date: Wed, 25 Mar 2026 01:20:51 +0000 Subject: [PATCH 5/6] cmake: cufile transient dependency when DALOTIA_WITH_CUFILE --- src/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3db7bf0..2db6c72 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,7 +41,19 @@ if (DALOTIA_WITH_CUFILE) find_library(CUFILE_LIBRARY cufile PATHS ${CUDAToolkit_LIBRARY_DIR} REQUIRED) + get_filename_component(_CUFILE_LIB_DIR "${CUFILE_LIBRARY}" DIRECTORY) + find_path(CUFILE_INCLUDE_DIR cufile.h + HINTS ${_CUFILE_LIB_DIR}/../include + ${_CUFILE_LIB_DIR}/../../include + ${CUDAToolkit_INCLUDE_DIRS} + PATHS /usr/local/cuda/include + /usr/include) target_link_libraries(dalotia_cpp PUBLIC ${CUFILE_LIBRARY}) + if (CUFILE_INCLUDE_DIR) + target_include_directories(dalotia_cpp PUBLIC ${CUFILE_INCLUDE_DIR}) + else() + message(WARNING "cufile.h not found — set CUFILE_INCLUDE_DIR manually") + endif() target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUFILE") target_sources(dalotia_cpp PRIVATE dalotia_cufile.cpp) endif (DALOTIA_WITH_CUFILE) From 791ab1afc6d378d1a35a696115b7c7b9571e9315 Mon Sep 17 
00:00:00 2001 From: Theresa Date: Mon, 13 Apr 2026 08:52:17 +0000 Subject: [PATCH 6/6] cudabuffer: use pmr resources + test --- src/dalotia_cuda.hpp | 80 ++++++++++++--------------------- src/dalotia_tensor_file.cpp | 3 +- test/CMakeLists.txt | 7 +++ test/test_cuda_pmr_load.cpp | 90 +++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 52 deletions(-) create mode 100644 test/test_cuda_pmr_load.cpp diff --git a/src/dalotia_cuda.hpp b/src/dalotia_cuda.hpp index 2fc7fb8..9f4096a 100644 --- a/src/dalotia_cuda.hpp +++ b/src/dalotia_cuda.hpp @@ -4,8 +4,9 @@ #include #include -#include -#include +#include + +#include "dalotia_cuda_memory_resource.hpp" namespace dalotia { @@ -13,67 +14,36 @@ namespace dalotia { // Returns false for host pointers (including cudaMallocHost pinned memory). bool is_device_pointer(const void* ptr) noexcept; -// RAII wrapper for a cudaMalloc'd device buffer. Move-only. -// Type-erased (stores void*); use as() for typed access. +// Move-only owning byte buffer backed by a std::pmr::memory_resource. +// +// The default resource is `cuda_device_resource()` (sync cudaMalloc/cudaFree). 
class CudaBuffer { public: - CudaBuffer() = default; - - explicit CudaBuffer(size_t nbytes) : size_(nbytes) { - if (nbytes > 0) { - cudaError_t err = cudaMalloc(&ptr_, nbytes); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("CudaBuffer: cudaMalloc failed: ") + - cudaGetErrorString(err)); - } - } - } + CudaBuffer() noexcept = default; - CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) { + explicit CudaBuffer(size_t nbytes, + std::pmr::memory_resource* mr = cuda_device_resource()) + : mr_(mr), size_(nbytes) { if (nbytes > 0) { - cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("CudaBuffer: cudaMallocAsync failed: ") + - cudaGetErrorString(err)); - } - stream_ = stream; - async_ = true; + ptr_ = std::pmr::polymorphic_allocator<std::byte>(mr_).allocate( + nbytes); } } - ~CudaBuffer() { - if (ptr_) { - if (async_) { - cudaFreeAsync(ptr_, stream_); - } else { - cudaFree(ptr_); - } - } - } + ~CudaBuffer() { reset(); } CudaBuffer(CudaBuffer&& other) noexcept - : ptr_(other.ptr_) - , size_(other.size_) - , stream_(other.stream_) - , async_(other.async_) { + : mr_(other.mr_), ptr_(other.ptr_), size_(other.size_) { other.ptr_ = nullptr; other.size_ = 0; } CudaBuffer& operator=(CudaBuffer&& other) noexcept { if (this != &other) { - if (ptr_) { - if (async_) - cudaFreeAsync(ptr_, stream_); - else - cudaFree(ptr_); - } + reset(); + mr_ = other.mr_; ptr_ = other.ptr_; size_ = other.size_; - stream_ = other.stream_; - async_ = other.async_; other.ptr_ = nullptr; other.size_ = 0; } @@ -90,19 +60,27 @@ class CudaBuffer { template <typename T> T* as() noexcept { - return static_cast<T*>(ptr_); + return static_cast<T*>(static_cast<void*>(ptr_)); } template <typename T> const T* as() const noexcept { - return static_cast<const T*>(ptr_); + return static_cast<const T*>(static_cast<const void*>(ptr_)); } private: - void* ptr_ = nullptr; + void reset() noexcept { + if (ptr_) { + std::pmr::polymorphic_allocator<std::byte>(mr_).deallocate(ptr_, + size_); + ptr_ = 
nullptr; + size_ = 0; + } + } + + std::pmr::memory_resource* mr_ = cuda_device_resource(); + std::byte* ptr_ = nullptr; size_t size_ = 0; - cudaStream_t stream_ = 0; - bool async_ = false; }; } // namespace dalotia diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 654a4b7..391196f 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -44,10 +44,11 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name, size_t nbytes = total_elements * element_bytes; // If permutation needed, load into a temp buffer then permute. + cuda_async_memory_resource async_mr(stream); CudaBuffer d_tmp; dalotia_byte* d_raw = tensor; if (needs_permute) { - d_tmp = CudaBuffer(nbytes, stream); + d_tmp = CudaBuffer(nbytes, &async_mr); d_raw = d_tmp.as<dalotia_byte>(); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9a4fc3c..bc32e9b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,6 +47,13 @@ if (DALOTIA_WITH_CUDA) add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp ) target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart ) add_test( cuda-memory-resource test_cuda_memory_resource ) + + if (DALOTIA_WITH_SAFETENSORS_CPP) + add_executable( test_cuda_pmr_load test_cuda_pmr_load.cpp ) + target_link_libraries( test_cuda_pmr_load dalotia_cpp CUDA::cudart ) + target_include_directories( test_cuda_pmr_load PUBLIC ${SAFETENSORS_CPP_INCLUDE_DIR} ${safetensors-cpp_DIR}) + add_test( cuda-pmr-load test_cuda_pmr_load ) + endif (DALOTIA_WITH_SAFETENSORS_CPP) endif (DALOTIA_WITH_CPP_PMR) endif (DALOTIA_WITH_CUDA) diff --git a/test/test_cuda_pmr_load.cpp b/test/test_cuda_pmr_load.cpp new file mode 100644 index 0000000..f9c27f4 --- /dev/null +++ b/test/test_cuda_pmr_load.cpp @@ -0,0 +1,90 @@ +// Combined test: load a permuted tensor into buffers backed by different +// dalotia CUDA memory resources (device + pinned host) and verify that the +// device-side bytes (after copy-back) match the pinned 
host-side bytes. + +#include +#include +#include +#include + +#include + +#include "dalotia.hpp" +#include "dalotia_cuda.hpp" +#include "dalotia_cuda_memory_resource.hpp" +#include "dalotia_safetensors_file.hpp" + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " : " << cudaGetErrorString(err) << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +static const char* TEST_FILE = "../data/model.safetensors"; +// Shape [4,3,5]; permutation [1,0,2] yields shape [3,4,5] with values 0..59. +static const char* PERM_TENSOR = "embedding_firstchanged"; +static constexpr int NUM_ELEMENTS = 3 * 4 * 5; +static constexpr dalotia_WeightFormat FORMAT = dalotia_float_64; + +int main() { + std::cout << "test_permuted_load_device_vs_pinned... " << std::flush; + + int device_count = 0; + if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) { + std::cout << "SKIP (no CUDA device)" << std::endl; + return 0; + } + + const std::vector perm = {1, 0, 2}; + const size_t nbytes = NUM_ELEMENTS * sizeof(double); + + // Non-default stream: the GPU permute kernel and its temp-buffer + // alloc/free should be ordered on this stream, not the default one. + cudaStream_t stream = nullptr; + CHECK_CUDA(cudaStreamCreate(&stream)); + + dalotia::SafetensorsFile file(TEST_FILE); + + // 1) Permuted load into a device-resident buffer (GPU permute kernel path). + // Pass `stream` so load + permute kernel run on it. + dalotia::CudaBuffer d_buf(nbytes, dalotia::cuda_device_resource()); + file.load_tensor_dense(PERM_TENSOR, FORMAT, dalotia_C_ordering, + d_buf.as(), perm, stream); + + // 2) Permuted load into a pinned host buffer (host permute path — + // pinned memory is host-accessible and not detected as a device ptr). + // The stream argument is ignored on the host path. 
+ dalotia::CudaBuffer h_pinned(nbytes, dalotia::cuda_pinned_resource()); + file.load_tensor_dense(PERM_TENSOR, FORMAT, dalotia_C_ordering, + h_pinned.as<dalotia_byte>(), perm, stream); + + // Sanity: pinned pointer must be host-addressable, not a device pointer. + assert(!dalotia::is_device_pointer(h_pinned.data())); + assert(dalotia::is_device_pointer(d_buf.data())); + + // Wait for all stream work before reading the device buffer back. + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // Copy device buffer back to host (also on the same stream) and compare. + std::vector<unsigned char> d_copy(nbytes); + CHECK_CUDA(cudaMemcpyAsync(d_copy.data(), d_buf.data(), nbytes, + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + assert(std::memcmp(d_copy.data(), h_pinned.data(), nbytes) == 0); + + // Also verify the values are the expected 0..59 sequence. + const double* pinned_vals = h_pinned.as<double>(); + for (int i = 0; i < NUM_ELEMENTS; ++i) { + assert(pinned_vals[i] == static_cast<double>(i)); + } + + CHECK_CUDA(cudaStreamDestroy(stream)); + + std::cout << "OK" << std::endl; + return 0; +}