Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ctest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,4 @@ jobs:
uses: threeal/ctest-action@v1.1.0
with:
test-dir: build-cuda
args: --verbose --exclude-regex cufile-pointer-detection
args: --verbose --exclude-regex "cufile-pointer-detection|cuda-memory-resource"
14 changes: 13 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp")
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp;dalotia_cuda_memory_resource.hpp")
# have one dalotia library target that can be used in C++ and Fortran
add_library(dalotia INTERFACE)
add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
Expand Down Expand Up @@ -41,7 +41,19 @@ if (DALOTIA_WITH_CUFILE)
find_library(CUFILE_LIBRARY cufile
PATHS ${CUDAToolkit_LIBRARY_DIR}
REQUIRED)
get_filename_component(_CUFILE_LIB_DIR "${CUFILE_LIBRARY}" DIRECTORY)
find_path(CUFILE_INCLUDE_DIR cufile.h
HINTS ${_CUFILE_LIB_DIR}/../include
${_CUFILE_LIB_DIR}/../../include
${CUDAToolkit_INCLUDE_DIRS}
PATHS /usr/local/cuda/include
/usr/include)
target_link_libraries(dalotia_cpp PUBLIC ${CUFILE_LIBRARY})
if (CUFILE_INCLUDE_DIR)
target_include_directories(dalotia_cpp PUBLIC ${CUFILE_INCLUDE_DIR})
else()
message(WARNING "cufile.h not found — set CUFILE_INCLUDE_DIR manually")
endif()
target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUFILE")
target_sources(dalotia_cpp PRIVATE dalotia_cufile.cpp)
endif (DALOTIA_WITH_CUFILE)
Expand Down
3 changes: 3 additions & 0 deletions src/dalotia.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
#include <memory>
#ifdef DALOTIA_WITH_CPP_PMR
#include <memory_resource>
#ifdef DALOTIA_WITH_CUDA
#include "dalotia_cuda_memory_resource.hpp"
#endif // DALOTIA_WITH_CUDA
#endif // DALOTIA_WITH_CPP_PMR
#include <numeric>
#include <string>
Expand Down
80 changes: 29 additions & 51 deletions src/dalotia_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,76 +4,46 @@

#include <cstddef>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
#include <memory_resource>

#include "dalotia_cuda_memory_resource.hpp"

namespace dalotia {

// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
// Returns false for host pointers (including cudaMallocHost pinned memory).
bool is_device_pointer(const void* ptr) noexcept;

// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
// Move-only owning byte buffer backed by a std::pmr::memory_resource.
//
// The default resource is `cuda_device_resource()` (sync cudaMalloc/cudaFree).
class CudaBuffer {
public:
CudaBuffer() = default;

explicit CudaBuffer(size_t nbytes) : size_(nbytes) {
if (nbytes > 0) {
cudaError_t err = cudaMalloc(&ptr_, nbytes);
if (err != cudaSuccess) {
throw std::runtime_error(
std::string("CudaBuffer: cudaMalloc failed: ") +
cudaGetErrorString(err));
}
}
}
CudaBuffer() noexcept = default;

CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) {
explicit CudaBuffer(size_t nbytes,
std::pmr::memory_resource* mr = cuda_device_resource())
: mr_(mr), size_(nbytes) {
if (nbytes > 0) {
cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream);
if (err != cudaSuccess) {
throw std::runtime_error(
std::string("CudaBuffer: cudaMallocAsync failed: ") +
cudaGetErrorString(err));
}
stream_ = stream;
async_ = true;
ptr_ = std::pmr::polymorphic_allocator<std::byte>(mr_).allocate(
nbytes);
}
}

~CudaBuffer() {
if (ptr_) {
if (async_) {
cudaFreeAsync(ptr_, stream_);
} else {
cudaFree(ptr_);
}
}
}
~CudaBuffer() { reset(); }

CudaBuffer(CudaBuffer&& other) noexcept
: ptr_(other.ptr_)
, size_(other.size_)
, stream_(other.stream_)
, async_(other.async_) {
: mr_(other.mr_), ptr_(other.ptr_), size_(other.size_) {
other.ptr_ = nullptr;
other.size_ = 0;
}

CudaBuffer& operator=(CudaBuffer&& other) noexcept {
if (this != &other) {
if (ptr_) {
if (async_)
cudaFreeAsync(ptr_, stream_);
else
cudaFree(ptr_);
}
reset();
mr_ = other.mr_;
ptr_ = other.ptr_;
size_ = other.size_;
stream_ = other.stream_;
async_ = other.async_;
other.ptr_ = nullptr;
other.size_ = 0;
}
Expand All @@ -90,19 +60,27 @@ class CudaBuffer {

template <typename T>
T* as() noexcept {
return static_cast<T*>(ptr_);
return static_cast<T*>(static_cast<void*>(ptr_));
}

template <typename T>
const T* as() const noexcept {
return static_cast<const T*>(ptr_);
return static_cast<const T*>(static_cast<const void*>(ptr_));
}

private:
void* ptr_ = nullptr;
void reset() noexcept {
if (ptr_) {
std::pmr::polymorphic_allocator<std::byte>(mr_).deallocate(ptr_,
size_);
ptr_ = nullptr;
size_ = 0;
}
}

std::pmr::memory_resource* mr_ = cuda_device_resource();
std::byte* ptr_ = nullptr;
size_t size_ = 0;
cudaStream_t stream_ = 0;
bool async_ = false;
};

} // namespace dalotia
Expand Down
155 changes: 155 additions & 0 deletions src/dalotia_cuda_memory_resource.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA
#ifdef DALOTIA_WITH_CPP_PMR

#include <cuda_runtime.h>
#include <memory_resource>
#include <stdexcept>
#include <string>

namespace dalotia {

namespace detail {

// Raises std::runtime_error carrying `context` plus CUDA's human-readable
// description of `err`. Shared error-reporting path for the memory
// resources defined below.
[[noreturn]] inline void throw_cuda(const char* context, cudaError_t err) {
    std::string message{context};
    message += ": ";
    message += cudaGetErrorString(err);
    throw std::runtime_error(message);
}

} // namespace detail

// TODO: consider replacing these hand-rolled resources with an established
// library implementation (e.g. NVIDIA RMM's rmm::mr resources) once such a
// dependency is acceptable.

// std::pmr::memory_resource backed by synchronous cudaMalloc/cudaFree
// (device memory).
//
// cudaMalloc returns memory aligned to at least 256 bytes, so alignment
// requests up to that are honored implicitly; larger requests cannot be
// guaranteed and are rejected instead of silently returning under-aligned
// memory (the previous code ignored `alignment` entirely, violating the
// std::pmr::memory_resource contract).
class cuda_device_memory_resource : public std::pmr::memory_resource {
  protected:
    // Documented minimum alignment of cudaMalloc allocations.
    static constexpr size_t kMinCudaAlignment = 256;

    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > kMinCudaAlignment)
            throw std::runtime_error(
                "cuda_device_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMalloc(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_device_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); cudaFree errors are deliberately
    // ignored, as frees may run on shutdown paths.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFree(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_device_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by cudaMallocHost/cudaFreeHost
// (page-locked host memory, suitable for fast host<->device transfers).
//
// The CUDA host allocator is assumed to return memory aligned to at least
// 256 bytes; larger alignment requests are rejected instead of silently
// returning under-aligned memory (the previous code ignored `alignment`
// entirely, violating the std::pmr::memory_resource contract).
class cuda_pinned_memory_resource : public std::pmr::memory_resource {
  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_pinned_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocHost(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_pinned_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); cudaFreeHost errors are deliberately
    // ignored.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFreeHost(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_pinned_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by cudaMallocManaged/cudaFree
// (unified memory accessible from both host and device).
//
// Managed allocations are assumed to be aligned to at least 256 bytes;
// larger alignment requests are rejected instead of silently returning
// under-aligned memory (the previous code ignored `alignment` entirely,
// violating the std::pmr::memory_resource contract).
class cuda_managed_memory_resource : public std::pmr::memory_resource {
  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_managed_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocManaged(&p, bytes);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_managed_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); managed memory pairs with cudaFree.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFree(p);
    }

    // The class is stateless, so any two instances are interchangeable.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        return dynamic_cast<const cuda_managed_memory_resource*>(&other) !=
               nullptr;
    }
};

// std::pmr::memory_resource backed by the stream-ordered allocator
// (cudaMallocAsync/cudaFreeAsync).
//
// The stream is fixed at construction; all allocations and deallocations
// are ordered on that stream. The resource must outlive any container
// using it, and the stream must remain valid for that lifetime.
class cuda_async_memory_resource : public std::pmr::memory_resource {
  public:
    explicit cuda_async_memory_resource(cudaStream_t stream)
        : stream_(stream) {}

    cudaStream_t stream() const noexcept { return stream_; }

  protected:
    void* do_allocate(size_t bytes, size_t alignment) override {
        // cudaMallocAsync takes no alignment argument; the stream-ordered
        // allocator is assumed to return memory aligned to at least 256
        // bytes. Reject larger requests rather than silently violating the
        // std::pmr::memory_resource contract (the previous code ignored
        // `alignment` entirely).
        if (alignment > 256)
            throw std::runtime_error(
                "cuda_async_memory_resource::allocate: unsupported "
                "alignment > 256 bytes");
        void* p = nullptr;
        if (bytes > 0) {
            cudaError_t err = cudaMallocAsync(&p, bytes, stream_);
            if (err != cudaSuccess)
                detail::throw_cuda("cuda_async_memory_resource::allocate",
                                   err);
        }
        return p;
    }

    // Must not throw (pmr contract); the free is ordered on stream_.
    void do_deallocate(void* p, size_t /*bytes*/,
                       size_t /*alignment*/) override {
        if (p)
            cudaFreeAsync(p, stream_);
    }

    // Equal only to another async resource on the *same* stream — frees
    // must be ordered on the stream that performed the allocation.
    bool do_is_equal(const memory_resource& other) const noexcept override {
        auto* o = dynamic_cast<const cuda_async_memory_resource*>(&other);
        return o != nullptr && o->stream_ == stream_;
    }

  private:
    cudaStream_t stream_;
};

// Process-wide singleton accessors. Construction is thread-safe via the
// C++11 magic-static guarantee. Each resource is allocated once and
// intentionally never freed so that it truly outlives all static-duration
// objects that may still deallocate through it during program teardown —
// matching the contract of std::pmr::new_delete_resource(). (A plain
// function-local static object would be *destroyed* at exit, contradicting
// the "never destroyed" intent and risking use-after-destruction.)
inline cuda_device_memory_resource* cuda_device_resource() noexcept {
    static cuda_device_memory_resource* const instance =
        new cuda_device_memory_resource;  // immortal by design
    return instance;
}

inline cuda_pinned_memory_resource* cuda_pinned_resource() noexcept {
    static cuda_pinned_memory_resource* const instance =
        new cuda_pinned_memory_resource;  // immortal by design
    return instance;
}

inline cuda_managed_memory_resource* cuda_managed_resource() noexcept {
    static cuda_managed_memory_resource* const instance =
        new cuda_managed_memory_resource;  // immortal by design
    return instance;
}

} // namespace dalotia

#endif // DALOTIA_WITH_CPP_PMR
#endif // DALOTIA_WITH_CUDA
3 changes: 2 additions & 1 deletion src/dalotia_tensor_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name,
size_t nbytes = total_elements * element_bytes;

// If permutation needed, load into a temp buffer then permute.
cuda_async_memory_resource async_mr(stream);
CudaBuffer d_tmp;
dalotia_byte* d_raw = tensor;
if (needs_permute) {
d_tmp = CudaBuffer(nbytes, stream);
d_tmp = CudaBuffer(nbytes, &async_mr);
d_raw = d_tmp.as<dalotia_byte>();
}

Expand Down
2 changes: 1 addition & 1 deletion src/dalotia_tensor_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ class TensorFile {
this->load_tensor_dense(tensor_name, weight_format, ordering,
reinterpret_cast<dalotia_byte*>(tensor.data()),
permutation);
return std::make_pair(extents, tensor);
return {std::move(extents), std::move(tensor)};
}

template <typename value_type>
Expand Down
13 changes: 13 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ if (DALOTIA_WITH_CUDA)
target_link_libraries( test_cufile CUDA::cuda_driver )
endif()
add_test( cufile-pointer-detection test_cufile )

if (DALOTIA_WITH_CPP_PMR)
add_executable( test_cuda_memory_resource test_cuda_memory_resource.cpp )
target_link_libraries( test_cuda_memory_resource dalotia_cpp CUDA::cudart )
add_test( cuda-memory-resource test_cuda_memory_resource )

if (DALOTIA_WITH_SAFETENSORS_CPP)
add_executable( test_cuda_pmr_load test_cuda_pmr_load.cpp )
target_link_libraries( test_cuda_pmr_load dalotia_cpp CUDA::cudart )
target_include_directories( test_cuda_pmr_load PUBLIC ${SAFETENSORS_CPP_INCLUDE_DIR} ${safetensors-cpp_DIR})
add_test( cuda-pmr-load test_cuda_pmr_load )
endif (DALOTIA_WITH_SAFETENSORS_CPP)
endif (DALOTIA_WITH_CPP_PMR)
endif (DALOTIA_WITH_CUDA)

if (DALOTIA_WITH_TENSORFLOW)
Expand Down
Loading
Loading