From e589fdde5d7d9d208f9812692bc5eb2ac5318ee1 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:43:24 +0000 Subject: [PATCH 1/6] cuda: add different pmr allocators --- src/CMakeLists.txt | 2 +- src/dalotia.hpp | 3 + src/dalotia_cuda_memory_resource.hpp | 155 ++++++++++++++ src/dalotia_tensor_file.hpp | 2 +- test/CMakeLists.txt | 6 + test/test_cuda_memory_resource.cpp | 305 +++++++++++++++++++++++++++ 6 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 src/dalotia_cuda_memory_resource.hpp create mode 100644 test/test_cuda_memory_resource.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d804732..3db7bf0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp;dalotia_cuda_memory_resource.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) diff --git a/src/dalotia.hpp b/src/dalotia.hpp index 389124e..6dbaf1c 100644 --- a/src/dalotia.hpp +++ b/src/dalotia.hpp @@ -6,6 +6,9 @@ #include #ifdef DALOTIA_WITH_CPP_PMR #include +#ifdef DALOTIA_WITH_CUDA +#include "dalotia_cuda_memory_resource.hpp" +#endif // DALOTIA_WITH_CUDA #endif // DALOTIA_WITH_CPP_PMR #include #include diff --git 
a/src/dalotia_cuda_memory_resource.hpp b/src/dalotia_cuda_memory_resource.hpp new file mode 100644 index 0000000..3565b18 --- /dev/null +++ b/src/dalotia_cuda_memory_resource.hpp @@ -0,0 +1,155 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA +#ifdef DALOTIA_WITH_CPP_PMR + +#include <cuda_runtime.h> +#include <memory_resource> +#include <stdexcept> +#include <string> + +namespace dalotia { + +namespace detail { + +[[noreturn]] inline void throw_cuda(const char* context, cudaError_t err) { + throw std::runtime_error(std::string(context) + ": " + + cudaGetErrorString(err)); +} + +} // namespace detail + +//TODO if there is a good library to take these from, we should consider it + +class cuda_device_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMalloc(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_device_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFree(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_device_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_pinned_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMallocHost(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_pinned_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFreeHost(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_pinned_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_managed_memory_resource : public std::pmr::memory_resource { + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr;
+ if (bytes > 0) { + cudaError_t err = cudaMallocManaged(&p, bytes); + if (err != cudaSuccess) + detail::throw_cuda("cuda_managed_memory_resource::allocate", + err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFree(p); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + return dynamic_cast<const cuda_managed_memory_resource*>(&other) != + nullptr; + } +}; + +class cuda_async_memory_resource : public std::pmr::memory_resource { +// The stream is fixed at construction; all allocations and deallocations are +// ordered on that stream. The resource must outlive any container using it, +// and the stream must remain valid for that lifetime. + public: + explicit cuda_async_memory_resource(cudaStream_t stream) + : stream_(stream) {} + + cudaStream_t stream() const noexcept { return stream_; } + + protected: + void* do_allocate(size_t bytes, size_t /*alignment*/) override { + void* p = nullptr; + if (bytes > 0) { + cudaError_t err = cudaMallocAsync(&p, bytes, stream_); + if (err != cudaSuccess) + detail::throw_cuda("cuda_async_memory_resource::allocate", err); + } + return p; + } + + void do_deallocate(void* p, size_t /*bytes*/, + size_t /*alignment*/) override { + if (p) + cudaFreeAsync(p, stream_); + } + + bool do_is_equal(const memory_resource& other) const noexcept override { + auto* o = dynamic_cast<const cuda_async_memory_resource*>(&other); + return o != nullptr && o->stream_ == stream_; + } + + private: + cudaStream_t stream_; +}; + +// Thread-safe (C++11 static-local guarantee). These resources are never +// destroyed, matching the contract of std::pmr::new_delete_resource().
+inline cuda_device_memory_resource* cuda_device_resource() noexcept { + static cuda_device_memory_resource r; + return &r; +} + +inline cuda_pinned_memory_resource* cuda_pinned_resource() noexcept { + static cuda_pinned_memory_resource r; + return &r; +} + +inline cuda_managed_memory_resource* cuda_managed_resource() noexcept { + static cuda_managed_memory_resource r; + return &r; +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CPP_PMR +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index 34c70ba..75f7198 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -149,7 +149,7 @@ class TensorFile { this->load_tensor_dense(tensor_name, weight_format, ordering, reinterpret_cast(tensor.data()), permutation); - return std::make_pair(extents, tensor); + return {std::move(extents), std::move(tensor)}; } template diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9187fca..9a4fc3c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,6 +42,12 @@ if (DALOTIA_WITH_CUDA) target_link_libraries( test_cufile CUDA::cuda_driver ) endif() add_test( cufile-pointer-detection test_cufile ) + + if (DALOTIA_WITH_CPP_PMR) + add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp ) + target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart ) + add_test( cuda-memory-resource test_cuda_memory_resource ) + endif (DALOTIA_WITH_CPP_PMR) endif (DALOTIA_WITH_CUDA) if (DALOTIA_WITH_TENSORFLOW) diff --git a/test/test_cuda_memory_resource.cpp b/test/test_cuda_memory_resource.cpp new file mode 100644 index 0000000..e356b95 --- /dev/null +++ b/test/test_cuda_memory_resource.cpp @@ -0,0 +1,305 @@ +// Tests for dalotia::cuda_{device,pinned,managed,async}_memory_resource. 
+ +#include +#include +#include +#include +#include + +#include + +#include "dalotia.hpp" +#include "dalotia_cuda_memory_resource.hpp" + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " : " << cudaGetErrorString(err) << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Query the exact CUDA memory type for a pointer. +static cudaMemoryType get_memory_type(const void* ptr) { + cudaPointerAttributes attrs{}; + CHECK_CUDA(cudaPointerGetAttributes(&attrs, ptr)); + return attrs.type; +} + +static const char* TEST_FILE = "../data/model.safetensors"; +static const char* TENSOR_NAME = "embedding"; +static constexpr int NUM_ELEMENTS = 3 * 4 * 5; // 60 +static constexpr dalotia_WeightFormat FORMAT = dalotia_float_64; + +void test_device_resource_basic() { + std::cout << "test_device_resource_basic... " << std::flush; + + auto* mr = dalotia::cuda_device_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + assert(dalotia::is_device_pointer(p)); + mr->deallocate(p, 1024); + + // zero-byte allocation returns nullptr and doesn't crash on dealloc + void* z = mr->allocate(0); + assert(z == nullptr); + mr->deallocate(z, 0); + + std::cout << "OK" << std::endl; +} + +void test_pinned_resource_basic() { + std::cout << "test_pinned_resource_basic... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + // pinned memory is only host-accessible — should not be detected as device + assert(!dalotia::is_device_pointer(p)); + + // verify it's actually writable from the host + std::memset(p, 0xAB, 1024); + + mr->deallocate(p, 1024); + + void* z = mr->allocate(0); + assert(z == nullptr); + mr->deallocate(z, 0); + + std::cout << "OK" << std::endl; +} + +void test_managed_resource_basic() { + std::cout << "test_managed_resource_basic... 
" << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + void* p = mr->allocate(1024); + assert(p != nullptr); + // managed memory is detected as device pointer + assert(dalotia::is_device_pointer(p)); + + // managed memory is also host-accessible + std::memset(p, 0xCD, 1024); + CHECK_CUDA(cudaDeviceSynchronize()); + + mr->deallocate(p, 1024); + + std::cout << "OK" << std::endl; +} + +void test_async_resource_basic() { + std::cout << "test_async_resource_basic... " << std::flush; + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + dalotia::cuda_async_memory_resource mr(stream); + assert(mr.stream() == stream); + + void* p = mr.allocate(1024); + assert(p != nullptr); + assert(dalotia::is_device_pointer(p)); + mr.deallocate(p, 1024); + + CHECK_CUDA(cudaStreamSynchronize(stream)); + CHECK_CUDA(cudaStreamDestroy(stream)); + + std::cout << "OK" << std::endl; +} + +void test_is_equal() { + std::cout << "test_is_equal... " << std::flush; + + auto* dev = dalotia::cuda_device_resource(); + auto* pin = dalotia::cuda_pinned_resource(); + auto* mgd = dalotia::cuda_managed_resource(); + + // same-type singletons are equal + dalotia::cuda_device_memory_resource dev2; + assert(dev->is_equal(dev2)); + assert(dev2.is_equal(*dev)); + + // different types are not equal + assert(!dev->is_equal(*pin)); + assert(!dev->is_equal(*mgd)); + assert(!pin->is_equal(*mgd)); + + // not equal to the default resource + assert(!dev->is_equal(*std::pmr::get_default_resource())); + + // async resources: equal iff same stream + cudaStream_t s1, s2; + CHECK_CUDA(cudaStreamCreate(&s1)); + CHECK_CUDA(cudaStreamCreate(&s2)); + + dalotia::cuda_async_memory_resource a1(s1); + dalotia::cuda_async_memory_resource a1_copy(s1); + dalotia::cuda_async_memory_resource a2(s2); + + assert(a1.is_equal(a1_copy)); + assert(!a1.is_equal(a2)); + assert(!a1.is_equal(*dev)); + + CHECK_CUDA(cudaStreamDestroy(s1)); + CHECK_CUDA(cudaStreamDestroy(s2)); + + // calling the accessor twice returns the 
same pointer + assert(dalotia::cuda_device_resource() == dalotia::cuda_device_resource()); + assert(dalotia::cuda_pinned_resource() == dalotia::cuda_pinned_resource()); + assert(dalotia::cuda_managed_resource() == + dalotia::cuda_managed_resource()); + + std::cout << "OK" << std::endl; +} + +void test_pmr_vector_pinned() { + std::cout << "test_pmr_vector_pinned... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + std::pmr::vector<double> v(alloc); + + v.resize(100); + for (int i = 0; i < 100; i++) + v[i] = static_cast<double>(i); + + // verify contents (pinned memory is host-accessible) + for (int i = 0; i < 100; i++) + assert(v[i] == static_cast<double>(i)); + + // the underlying pointer should be pinned (DMA-capable), not device + assert(!dalotia::is_device_pointer(v.data())); + + std::cout << "OK" << std::endl; +} + +void test_pmr_vector_managed() { + std::cout << "test_pmr_vector_managed... " << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + std::pmr::vector<double> v(alloc); + + v.resize(100); + for (int i = 0; i < 100; i++) + v[i] = static_cast<double>(i); + + CHECK_CUDA(cudaDeviceSynchronize()); + + for (int i = 0; i < 100; i++) + assert(v[i] == static_cast<double>(i)); + + assert(dalotia::is_device_pointer(v.data())); + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_with_pinned_resource() { + std::cout << "test_load_tensor_with_pinned_resource... " << std::flush; + + auto* mr = dalotia::cuda_pinned_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + auto [extents, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + + assert(extents.size() == 3); + assert(extents[0] == 3 && extents[1] == 4 && extents[2] == 5); + assert(tensor.size() == NUM_ELEMENTS); + + // pinned memory is host-readable + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(tensor[i] == static_cast<double>(i)); + } + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_with_managed_resource() { + std::cout << "test_load_tensor_with_managed_resource... " << std::flush; + + auto* mr = dalotia::cuda_managed_resource(); + std::pmr::polymorphic_allocator<double> alloc(mr); + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + auto [extents, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + + assert(extents.size() == 3); + assert(tensor.size() == NUM_ELEMENTS); + + CHECK_CUDA(cudaDeviceSynchronize()); + + // managed memory is host-readable after sync + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(tensor[i] == static_cast<double>(i)); + } + + // and it's device-accessible (allocator preserved through move) + assert(dalotia::is_device_pointer(tensor.data())); + + std::cout << "OK" << std::endl; +} + +void test_load_tensor_memory_types() { + std::cout << "test_load_tensor_memory_types... " << std::flush; + + auto file = std::unique_ptr<dalotia::TensorFile>( + dalotia::make_tensor_file(TEST_FILE)); + + // default allocator → cudaMemoryTypeUnregistered (plain heap) + { + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeUnregistered); + } + + // pinned resource → cudaMemoryTypeHost + { + std::pmr::polymorphic_allocator<double> alloc( + dalotia::cuda_pinned_resource()); + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeHost); + for (int i = 0; i < NUM_ELEMENTS; i++) + assert(tensor[i] == static_cast<double>(i)); + } + + // managed resource → cudaMemoryTypeManaged + { + std::pmr::polymorphic_allocator<double> alloc( + dalotia::cuda_managed_resource()); + auto [ext, tensor] = file->load_tensor_dense<double>( + TENSOR_NAME, FORMAT, dalotia_C_ordering, {}, alloc); + assert(get_memory_type(tensor.data()) == cudaMemoryTypeManaged); + CHECK_CUDA(cudaDeviceSynchronize()); + for (int i = 0; i < NUM_ELEMENTS; i++) + assert(tensor[i] == static_cast<double>(i)); + } + + std::cout << "OK" << std::endl; +} + +int main() { + test_device_resource_basic(); + test_pinned_resource_basic(); + test_managed_resource_basic(); + test_async_resource_basic(); + test_is_equal(); + test_singletons(); + test_pmr_vector_pinned(); + test_pmr_vector_managed(); + test_load_tensor_with_pinned_resource(); + test_load_tensor_with_managed_resource(); + test_load_tensor_memory_types(); + + std::cout << "test_cuda_memory_resource succeeded" << std::endl; + return 0; +} From e1a92f47168920c364237034f06ac1305c9d9db0 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:44:11 +0000 Subject: [PATCH 2/6] test cpmr: delete unused call --- test/test_cuda_memory_resource.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_cuda_memory_resource.cpp b/test/test_cuda_memory_resource.cpp index e356b95..b1a830d 100644 --- 
a/test/test_cuda_memory_resource.cpp +++ b/test/test_cuda_memory_resource.cpp @@ -293,7 +293,6 @@ int main() { test_managed_resource_basic(); test_async_resource_basic(); test_is_equal(); - test_singletons(); test_pmr_vector_pinned(); test_pmr_vector_managed(); test_load_tensor_with_pinned_resource(); From 08566cc46bcb35bd3f76f2cea50f1bece8b5b606 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:54:04 +0000 Subject: [PATCH 3/6] cpmr: clang-format --- src/dalotia_cuda_memory_resource.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dalotia_cuda_memory_resource.hpp b/src/dalotia_cuda_memory_resource.hpp index 3565b18..ac272e4 100644 --- a/src/dalotia_cuda_memory_resource.hpp +++ b/src/dalotia_cuda_memory_resource.hpp @@ -19,7 +19,7 @@ namespace detail { } // namespace detail -//TODO if there is a good library to take these from, we should consider it +// TODO if there is a good library to take these from, we should consider it class cuda_device_memory_resource : public std::pmr::memory_resource { protected: @@ -97,9 +97,9 @@ class cuda_managed_memory_resource : public std::pmr::memory_resource { }; class cuda_async_memory_resource : public std::pmr::memory_resource { -// The stream is fixed at construction; all allocations and deallocations are -// ordered on that stream. The resource must outlive any container using it, -// and the stream must remain valid for that lifetime. + // The stream is fixed at construction; all allocations and deallocations + // are ordered on that stream. The resource must outlive any container + // using it, and the stream must remain valid for that lifetime. 
public: explicit cuda_async_memory_resource(cudaStream_t stream) : stream_(stream) {} From dafa7c602eb9c37b15bfe560e1e3f1a7760af68c Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 05:56:52 +0000 Subject: [PATCH 4/6] ci: don't run new cpmr tests --- .github/workflows/ctest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index 95318a2..3f2584a 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -83,4 +83,4 @@ jobs: uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --verbose --exclude-regex cufile-pointer-detection + args: --verbose --exclude-regex "cufile-pointer-detection|cuda-memory-resource" From 294c1ff51e619b176a1e58fbd7cf8f95b3b8d887 Mon Sep 17 00:00:00 2001 From: Theresa Date: Wed, 25 Mar 2026 01:20:51 +0000 Subject: [PATCH 5/6] cmake: cufile transient dependency when DALOTIA_WITH_CUFILE --- src/CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3db7bf0..2db6c72 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,7 +41,19 @@ if (DALOTIA_WITH_CUFILE) find_library(CUFILE_LIBRARY cufile PATHS ${CUDAToolkit_LIBRARY_DIR} REQUIRED) + get_filename_component(_CUFILE_LIB_DIR "${CUFILE_LIBRARY}" DIRECTORY) + find_path(CUFILE_INCLUDE_DIR cufile.h + HINTS ${_CUFILE_LIB_DIR}/../include + ${_CUFILE_LIB_DIR}/../../include + ${CUDAToolkit_INCLUDE_DIRS} + PATHS /usr/local/cuda/include + /usr/include) target_link_libraries(dalotia_cpp PUBLIC ${CUFILE_LIBRARY}) + if (CUFILE_INCLUDE_DIR) + target_include_directories(dalotia_cpp PUBLIC ${CUFILE_INCLUDE_DIR}) + else() + message(WARNING "cufile.h not found — set CUFILE_INCLUDE_DIR manually") + endif() target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUFILE") target_sources(dalotia_cpp PRIVATE dalotia_cufile.cpp) endif (DALOTIA_WITH_CUFILE) From 791ab1afc6d378d1a35a696115b7c7b9571e9315 Mon Sep 17 
00:00:00 2001 From: Theresa Date: Mon, 13 Apr 2026 08:52:17 +0000 Subject: [PATCH 6/6] cudabuffer: use pmr resources + test --- src/dalotia_cuda.hpp | 80 ++++++++++++--------------------- src/dalotia_tensor_file.cpp | 3 +- test/CMakeLists.txt | 7 +++ test/test_cuda_pmr_load.cpp | 90 +++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 52 deletions(-) create mode 100644 test/test_cuda_pmr_load.cpp diff --git a/src/dalotia_cuda.hpp b/src/dalotia_cuda.hpp index 2fc7fb8..9f4096a 100644 --- a/src/dalotia_cuda.hpp +++ b/src/dalotia_cuda.hpp @@ -4,8 +4,9 @@ #include #include -#include -#include +#include + +#include "dalotia_cuda_memory_resource.hpp" namespace dalotia { @@ -13,67 +14,36 @@ namespace dalotia { // Returns false for host pointers (including cudaMallocHost pinned memory). bool is_device_pointer(const void* ptr) noexcept; -// RAII wrapper for a cudaMalloc'd device buffer. Move-only. -// Type-erased (stores void*); use as() for typed access. +// Move-only owning byte buffer backed by a std::pmr::memory_resource. +// +// The default resource is `cuda_device_resource()` (sync cudaMalloc/cudaFree). 
class CudaBuffer { public: - CudaBuffer() = default; - - explicit CudaBuffer(size_t nbytes) : size_(nbytes) { - if (nbytes > 0) { - cudaError_t err = cudaMalloc(&ptr_, nbytes); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("CudaBuffer: cudaMalloc failed: ") + - cudaGetErrorString(err)); - } - } - } + CudaBuffer() noexcept = default; - CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) { + explicit CudaBuffer(size_t nbytes, + std::pmr::memory_resource* mr = cuda_device_resource()) + : mr_(mr), size_(nbytes) { if (nbytes > 0) { - cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("CudaBuffer: cudaMallocAsync failed: ") + - cudaGetErrorString(err)); - } - stream_ = stream; - async_ = true; + ptr_ = std::pmr::polymorphic_allocator<std::byte>(mr_).allocate( + nbytes); } } - ~CudaBuffer() { - if (ptr_) { - if (async_) { - cudaFreeAsync(ptr_, stream_); - } else { - cudaFree(ptr_); - } - } - } + ~CudaBuffer() { reset(); } CudaBuffer(CudaBuffer&& other) noexcept - : ptr_(other.ptr_) - , size_(other.size_) - , stream_(other.stream_) - , async_(other.async_) { + : mr_(other.mr_), ptr_(other.ptr_), size_(other.size_) { other.ptr_ = nullptr; other.size_ = 0; } CudaBuffer& operator=(CudaBuffer&& other) noexcept { if (this != &other) { - if (ptr_) { - if (async_) - cudaFreeAsync(ptr_, stream_); - else - cudaFree(ptr_); - } + reset(); + mr_ = other.mr_; ptr_ = other.ptr_; size_ = other.size_; - stream_ = other.stream_; - async_ = other.async_; other.ptr_ = nullptr; other.size_ = 0; } @@ -90,19 +60,27 @@ class CudaBuffer { template <typename T> T* as() noexcept { - return static_cast<T*>(ptr_); + return static_cast<T*>(static_cast<void*>(ptr_)); } template <typename T> const T* as() const noexcept { - return static_cast<const T*>(ptr_); + return static_cast<const T*>(static_cast<const void*>(ptr_)); } private: - void* ptr_ = nullptr; + void reset() noexcept { + if (ptr_) { + std::pmr::polymorphic_allocator<std::byte>(mr_).deallocate(ptr_, + size_); + ptr_ = 
nullptr; + size_ = 0; + } + } + + std::pmr::memory_resource* mr_ = cuda_device_resource(); + std::byte* ptr_ = nullptr; size_t size_ = 0; - cudaStream_t stream_ = 0; - bool async_ = false; }; } // namespace dalotia diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 654a4b7..391196f 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -44,10 +44,11 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name, size_t nbytes = total_elements * element_bytes; // If permutation needed, load into a temp buffer then permute. + cuda_async_memory_resource async_mr(stream); CudaBuffer d_tmp; dalotia_byte* d_raw = tensor; if (needs_permute) { - d_tmp = CudaBuffer(nbytes, stream); + d_tmp = CudaBuffer(nbytes, &async_mr); d_raw = d_tmp.as<dalotia_byte>(); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9a4fc3c..bc32e9b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,6 +47,13 @@ if (DALOTIA_WITH_CUDA) add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp ) target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart ) add_test( cuda-memory-resource test_cuda_memory_resource ) + + if (DALOTIA_WITH_SAFETENSORS_CPP) + add_executable( test_cuda_pmr_load test_cuda_pmr_load.cpp ) + target_link_libraries( test_cuda_pmr_load dalotia_cpp CUDA::cudart ) + target_include_directories( test_cuda_pmr_load PUBLIC ${SAFETENSORS_CPP_INCLUDE_DIR} ${safetensors-cpp_DIR}) + add_test( cuda-pmr-load test_cuda_pmr_load ) + endif (DALOTIA_WITH_SAFETENSORS_CPP) endif (DALOTIA_WITH_CPP_PMR) endif (DALOTIA_WITH_CUDA) diff --git a/test/test_cuda_pmr_load.cpp b/test/test_cuda_pmr_load.cpp new file mode 100644 index 0000000..f9c27f4 --- /dev/null +++ b/test/test_cuda_pmr_load.cpp @@ -0,0 +1,90 @@ +// Combined test: load a permuted tensor into buffers backed by different +// dalotia CUDA memory resources (device + pinned host) and verify that the +// device-side bytes (after copy-back) match the pinned 
host-side bytes. + +#include +#include +#include +#include + +#include + +#include "dalotia.hpp" +#include "dalotia_cuda.hpp" +#include "dalotia_cuda_memory_resource.hpp" +#include "dalotia_safetensors_file.hpp" + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) { \ + std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ \ + << " : " << cudaGetErrorString(err) << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +static const char* TEST_FILE = "../data/model.safetensors"; +// Shape [4,3,5]; permutation [1,0,2] yields shape [3,4,5] with values 0..59. +static const char* PERM_TENSOR = "embedding_firstchanged"; +static constexpr int NUM_ELEMENTS = 3 * 4 * 5; +static constexpr dalotia_WeightFormat FORMAT = dalotia_float_64; + +int main() { + std::cout << "test_permuted_load_device_vs_pinned... " << std::flush; + + int device_count = 0; + if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0) { + std::cout << "SKIP (no CUDA device)" << std::endl; + return 0; + } + + const std::vector perm = {1, 0, 2}; + const size_t nbytes = NUM_ELEMENTS * sizeof(double); + + // Non-default stream: the GPU permute kernel and its temp-buffer + // alloc/free should be ordered on this stream, not the default one. + cudaStream_t stream = nullptr; + CHECK_CUDA(cudaStreamCreate(&stream)); + + dalotia::SafetensorsFile file(TEST_FILE); + + // 1) Permuted load into a device-resident buffer (GPU permute kernel path). + // Pass `stream` so load + permute kernel run on it. + dalotia::CudaBuffer d_buf(nbytes, dalotia::cuda_device_resource()); + file.load_tensor_dense(PERM_TENSOR, FORMAT, dalotia_C_ordering, + d_buf.as(), perm, stream); + + // 2) Permuted load into a pinned host buffer (host permute path — + // pinned memory is host-accessible and not detected as a device ptr). + // The stream argument is ignored on the host path. 
+ dalotia::CudaBuffer h_pinned(nbytes, dalotia::cuda_pinned_resource()); + file.load_tensor_dense(PERM_TENSOR, FORMAT, dalotia_C_ordering, + h_pinned.as<dalotia_byte>(), perm, stream); + + // Sanity: pinned pointer must be host-addressable, not a device pointer. + assert(!dalotia::is_device_pointer(h_pinned.data())); + assert(dalotia::is_device_pointer(d_buf.data())); + + // Wait for all stream work before reading the device buffer back. + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // Copy device buffer back to host (also on the same stream) and compare. + std::vector<unsigned char> d_copy(nbytes); + CHECK_CUDA(cudaMemcpyAsync(d_copy.data(), d_buf.data(), nbytes, + cudaMemcpyDeviceToHost, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + assert(std::memcmp(d_copy.data(), h_pinned.data(), nbytes) == 0); + + // Also verify the values are the expected 0..59 sequence. + const double* pinned_vals = h_pinned.as<double>(); + for (int i = 0; i < NUM_ELEMENTS; ++i) { + assert(pinned_vals[i] == static_cast<double>(i)); + } + + CHECK_CUDA(cudaStreamDestroy(stream)); + + std::cout << "OK" << std::endl; + return 0; +}