Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ctest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ jobs:
options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF
build-dir: build-cuda

- name: Run non-GPU tests
- name: Run tests
uses: threeal/ctest-action@v1.1.0
with:
test-dir: build-cuda
args: --output-on-failure --exclude-regex cufile-pointer-detection
args: --verbose --exclude-regex cufile-pointer-detection
4 changes: 3 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp")
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp")
# have one dalotia library target that can be used in C++ and Fortran
add_library(dalotia INTERFACE)
add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
Expand Down Expand Up @@ -32,6 +32,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP)
if (DALOTIA_WITH_CUDA)
target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart)
target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA")
target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu)
set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA)
endif (DALOTIA_WITH_CUDA)

if (DALOTIA_WITH_CUFILE)
Expand Down
22 changes: 22 additions & 0 deletions src/dalotia_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifdef DALOTIA_WITH_CUDA

#include "dalotia_cuda.hpp"

namespace dalotia {

bool is_device_pointer(const void* ptr) noexcept {
if (!ptr)
return false;
cudaPointerAttributes attrs{};
cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
if (err != cudaSuccess) {
cudaGetLastError();
return false;
}
return attrs.type == cudaMemoryTypeDevice ||
attrs.type == cudaMemoryTypeManaged;
}

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
110 changes: 110 additions & 0 deletions src/dalotia_cuda.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA

#include <cstddef>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

namespace dalotia {

// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
// Returns false for host pointers (including cudaMallocHost pinned memory).
bool is_device_pointer(const void* ptr) noexcept;

// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
//
// Fix over the previous revision: moving from a CudaBuffer now resets ALL of
// the moved-from object's fields (previously `stream_`/`async_` were left
// stale), and the free logic is shared between the destructor and move
// assignment instead of being duplicated.
class CudaBuffer {
public:
    CudaBuffer() = default;

    // Synchronous allocation via cudaMalloc.
    // Throws std::runtime_error if the allocation fails; nbytes == 0 yields
    // an empty buffer with no allocation.
    explicit CudaBuffer(size_t nbytes) : size_(nbytes) {
        if (nbytes > 0) {
            cudaError_t err = cudaMalloc(&ptr_, nbytes);
            if (err != cudaSuccess) {
                throw std::runtime_error(
                    std::string("CudaBuffer: cudaMalloc failed: ") +
                    cudaGetErrorString(err));
            }
        }
    }

    // Stream-ordered allocation via cudaMallocAsync; the buffer is released
    // with cudaFreeAsync on the same stream. Throws std::runtime_error if the
    // allocation fails. NOTE(review): the stored stream must outlive this
    // buffer so the deferred free can be enqueued — confirm at call sites.
    CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) {
        if (nbytes > 0) {
            cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream);
            if (err != cudaSuccess) {
                throw std::runtime_error(
                    std::string("CudaBuffer: cudaMallocAsync failed: ") +
                    cudaGetErrorString(err));
            }
            stream_ = stream;
            async_ = true;
        }
    }

    ~CudaBuffer() { free_buffer(); }

    CudaBuffer(CudaBuffer&& other) noexcept
        : ptr_(other.ptr_)
        , size_(other.size_)
        , stream_(other.stream_)
        , async_(other.async_) {
        other.reset_fields();
    }

    CudaBuffer& operator=(CudaBuffer&& other) noexcept {
        if (this != &other) {
            free_buffer();
            ptr_ = other.ptr_;
            size_ = other.size_;
            stream_ = other.stream_;
            async_ = other.async_;
            other.reset_fields();
        }
        return *this;
    }

    CudaBuffer(const CudaBuffer&) = delete;
    CudaBuffer& operator=(const CudaBuffer&) = delete;

    void* data() noexcept { return ptr_; }
    const void* data() const noexcept { return ptr_; }
    size_t size() const noexcept { return size_; }
    bool empty() const noexcept { return ptr_ == nullptr; }

    template <typename T>
    T* as() noexcept {
        return static_cast<T*>(ptr_);
    }

    template <typename T>
    const T* as() const noexcept {
        return static_cast<const T*>(ptr_);
    }

private:
    // Releases the owned allocation (if any) with the matching deallocator.
    // Errors from cudaFree/cudaFreeAsync are intentionally ignored: this runs
    // from the (noexcept) destructor and move assignment.
    void free_buffer() noexcept {
        if (ptr_) {
            if (async_) {
                cudaFreeAsync(ptr_, stream_);
            } else {
                cudaFree(ptr_);
            }
        }
    }

    // Returns *this to the default-constructed (empty) state WITHOUT freeing;
    // used on the moved-from object.
    void reset_fields() noexcept {
        ptr_ = nullptr;
        size_ = 0;
        stream_ = 0;
        async_ = false;
    }

    void* ptr_ = nullptr;
    size_t size_ = 0;
    cudaStream_t stream_ = 0;
    bool async_ = false;
};

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
88 changes: 88 additions & 0 deletions src/dalotia_permute_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#ifdef DALOTIA_WITH_CUDA

#include "dalotia_assignment.hpp"
#include "dalotia_permute_gpu.cuh"

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

namespace dalotia {

// Fixed-size stride arrays passed by value as kernel arguments.
// At most 8 dimensions × 8 bytes = 64 bytes per array — well within
// the 4 KB kernel argument limit, and avoids device memory allocation.
struct PermuteStrides {
    // Divisors used by permute_kernel to decode a linear source index into
    // per-dimension coordinates; only the first `ndims` entries are valid.
    size_t input[kMaxPermuteDims];
    // Multipliers mapping those coordinates to the permuted linear index.
    size_t permuted[kMaxPermuteDims];
};

// One thread per element: thread i reads element i of the linear source,
// decodes it into per-dimension coordinates with `strides.input`, and writes
// it at the linear offset accumulated from `strides.permuted`.
// Launch shape: 1D grid of 1D blocks covering `total_elements` threads.
__global__ void permute_kernel(const char* __restrict__ src,
                               char* __restrict__ dest, size_t total_elements,
                               size_t element_bytes, int ndims,
                               PermuteStrides strides) {
    // 64-bit index: the multiply is done in size_t to avoid 32-bit overflow.
    const size_t src_index =
        static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
    if (src_index >= total_elements)
        return;

    size_t dst_index = 0;
    size_t rest = src_index;
    for (int dim = 0; dim < ndims; ++dim) {
        const size_t coord = rest / strides.input[dim];
        rest -= coord * strides.input[dim];
        dst_index += coord * strides.permuted[dim];
    }

    // Element size is only known at runtime, so move the element byte-wise.
    const char* from = src + src_index * element_bytes;
    char* to = dest + dst_index * element_bytes;
    for (size_t byte = 0; byte < element_bytes; ++byte)
        to[byte] = from[byte];
}

// Launches permute_kernel on `stream` to rearrange `d_src` (device pointer,
// C-order) into `d_dest` (device pointer, permuted layout). No stream
// synchronization is performed; see the header for the full contract.
//
// Throws std::runtime_error on invalid ndims, mismatched vector sizes, or a
// `total_elements` that disagrees with the product of `input_shape`.
//
// Fixes over the previous revision: the element count returned by
// compute_permute_strides was discarded (a caller mismatch would silently
// read/write out of bounds on the device), and total_elements == 0 produced
// a zero-block launch, which fails with cudaErrorInvalidConfiguration.
void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                    size_t element_bytes, int ndims,
                    const std::vector<int>& input_shape,
                    const std::vector<int>& permutation, cudaStream_t stream) {
    if (ndims <= 0 || ndims > kMaxPermuteDims) {
        throw std::runtime_error(
            "permute_on_gpu: unsupported number of dimensions: " +
            std::to_string(ndims));
    }
    if (static_cast<int>(input_shape.size()) != ndims ||
        static_cast<int>(permutation.size()) != ndims) {
        throw std::runtime_error(
            "permute_on_gpu: input_shape/permutation size mismatch");
    }

    auto [input_strides_vec, permuted_strides_vec, total_size] =
        compute_permute_strides(ndims, input_shape.data(), permutation.data());

    // Cross-check the caller-supplied count against the shape product so a
    // mismatch fails loudly on the host instead of corrupting device memory.
    if (static_cast<size_t>(total_size) != total_elements) {
        throw std::runtime_error(
            "permute_on_gpu: total_elements does not match input_shape");
    }

    // Empty tensor: nothing to permute, and a grid of 0 blocks is an invalid
    // launch configuration.
    if (total_elements == 0) {
        return;
    }

    // Copy into fixed-size struct for pass-by-value kernel argument.
    PermuteStrides strides{};
    for (int d = 0; d < ndims; ++d) {
        strides.input[d] = input_strides_vec[d];
        strides.permuted[d] = permuted_strides_vec[d];
    }

    // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
    // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
    // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
    constexpr int block_size = 256;
    int grid_size =
        static_cast<int>((total_elements + block_size - 1) / block_size);
    permute_kernel<<<grid_size, block_size, 0, stream>>>(
        static_cast<const char*>(d_src), static_cast<char*>(d_dest),
        total_elements, element_bytes, ndims, strides);

    // Launch-configuration errors surface here; asynchronous execution errors
    // surface at the caller's next synchronizing CUDA call.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(
            std::string("permute_on_gpu: kernel launch failed: ") +
            cudaGetErrorString(err));
    }
}

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
32 changes: 32 additions & 0 deletions src/dalotia_permute_gpu.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA

#include <cstddef>
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

namespace dalotia {

// Maximum tensor rank accepted by permute_on_gpu; bounds the fixed-size
// stride arrays passed by value as kernel arguments.
static constexpr int kMaxPermuteDims = 8;

// Permute tensor data on device. `d_src` and `d_dest` are device pointers.
// `d_src` contains elements in the original (C-order) layout.
// `d_dest` will contain elements in the permuted layout.
// `input_shape` and `permutation` are host vectors with `ndims` entries.
// `element_bytes` is the size of one element (e.g. 4 for float32).
//
// Throws std::runtime_error if `ndims` is not in (0, kMaxPermuteDims] or if
// the vector sizes do not match `ndims`.
//
// The kernel is launched on `stream` (default: 0). No synchronization is
// performed — the caller is responsible for synchronizing the stream if
// needed before reading from `d_dest`.
void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                    size_t element_bytes, int ndims,
                    const std::vector<int>& input_shape,
                    const std::vector<int>& permutation,
                    cudaStream_t stream = 0);

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
Loading
Loading