From 51464b689130eeff8b91aeb9fde7f47a6f447de4 Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 06:58:44 +0000 Subject: [PATCH 01/11] permute gpu: first version (no type conversion!) --- src/CMakeLists.txt | 2 + src/dalotia_permute_gpu.cu | 88 +++++++++++++++++++++++++++++++++++++ src/dalotia_permute_gpu.cuh | 32 ++++++++++++++ src/dalotia_tensor_file.cpp | 85 +++++++++++++++++++++++++---------- src/dalotia_tensor_file.hpp | 17 +++++-- test/test_cufile.cpp | 40 +++++++++++++++++ 6 files changed, 238 insertions(+), 26 deletions(-) create mode 100644 src/dalotia_permute_gpu.cu create mode 100644 src/dalotia_permute_gpu.cuh diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1d6e1cd..326c3de 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,6 +29,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP) if (DALOTIA_WITH_CUDA) target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart) target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA") + target_sources(dalotia_cpp PRIVATE dalotia_permute_gpu.cu) + set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA) endif (DALOTIA_WITH_CUDA) if (DALOTIA_WITH_CUFILE) diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu new file mode 100644 index 0000000..3d1d8d2 --- /dev/null +++ b/src/dalotia_permute_gpu.cu @@ -0,0 +1,88 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_permute_gpu.cuh" +#include "dalotia_assignment.hpp" + +#include +#include +#include + +namespace dalotia { + +// Fixed-size stride arrays passed by value as kernel arguments. +// At most 8 dimensions × 8 bytes = 64 bytes per array — well within +// the 4 KB kernel argument limit, and avoids device memory allocation. +struct PermuteStrides { + size_t input[kMaxPermuteDims]; + size_t permuted[kMaxPermuteDims]; +}; + +__global__ void permute_kernel(const char *__restrict__ src, + char *__restrict__ dest, size_t total_elements, + size_t element_bytes, int ndims, + PermuteStrides strides) { + size_t idx = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (idx >= total_elements) return; + + size_t dest_idx = 0; + size_t remaining = idx; + for (int d = 0; d < ndims; ++d) { + size_t coord = remaining / strides.input[d]; + remaining -= coord * strides.input[d]; + dest_idx += coord * strides.permuted[d]; + } + + const char *src_ptr = src + idx * element_bytes; + char *dest_ptr = dest + dest_idx * element_bytes; + for (size_t b = 0; b < element_bytes; ++b) { + dest_ptr[b] = src_ptr[b]; + } +} + +void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector &input_shape, + const std::vector &permutation, + cudaStream_t stream) { + if (ndims <= 0 || ndims > kMaxPermuteDims) { + throw std::runtime_error( + "permute_on_gpu: unsupported number of dimensions: " + + std::to_string(ndims)); + } + if (static_cast(input_shape.size()) != ndims || + static_cast(permutation.size()) != ndims) { + throw std::runtime_error( + "permute_on_gpu: input_shape/permutation size mismatch"); + } + + auto [input_strides_vec, permuted_strides_vec, total_size] = + compute_permute_strides(ndims, input_shape.data(), permutation.data()); + + // Copy into fixed-size struct for pass-by-value kernel argument. + PermuteStrides strides{}; + for (int d = 0; d < ndims; ++d) { + strides.input[d] = input_strides_vec[d]; + strides.permuted[d] = permuted_strides_vec[d]; + } + + // TODO: for the common case of 2D float32 transpose (permutation [1,0]), + // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic + // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. + constexpr int block_size = 256; + int grid_size = + static_cast((total_elements + block_size - 1) / block_size); + permute_kernel<<>>( + static_cast(d_src), static_cast(d_dest), + total_elements, element_bytes, ndims, strides); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("permute_on_gpu: kernel launch failed: ") + + cudaGetErrorString(err)); + } +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh new file mode 100644 index 0000000..5560982 --- /dev/null +++ b/src/dalotia_permute_gpu.cuh @@ -0,0 +1,32 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include +#include +#include + +#include + +namespace dalotia { + +static constexpr int kMaxPermuteDims = 8; + +// Permute tensor data on device. `d_src` and `d_dest` are device pointers. +// `d_src` contains elements in the original (C-order) layout. +// `d_dest` will contain elements in the permuted layout. +// `input_shape` and `permutation` are host vectors with `ndims` entries. +// `element_bytes` is the size of one element (e.g. 4 for float32). +// +// The kernel is launched on `stream` (default: 0). No synchronization is +// performed — the caller is responsible for synchronizing the stream if +// needed before reading from `d_dest`. +void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector &input_shape, + const std::vector &permutation, + cudaStream_t stream = 0); + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 234deb8..2ea5a2b 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -2,6 +2,7 @@ #ifdef DALOTIA_WITH_CUDA #include +#include "dalotia_permute_gpu.cuh" #endif #ifdef DALOTIA_WITH_CUFILE #include "dalotia_cufile.hpp" @@ -28,49 +29,87 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name, dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector& permutation) { - // Normalize the permutation once: convert from user-supplied - // (possibly 1-indexed, possibly F-ordering) to 0-indexed C-order. + const std::vector& permutation +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream +#endif +) { auto final_perm = final_c_permutation_from_permutation_and_order( permutation, ordering, this->get_num_dimensions(tensor_name)); #ifdef DALOTIA_WITH_CUDA if (is_device_pointer(tensor)) { - if (!final_perm.empty()) { + auto info = this->get_tensor_info(tensor_name); + if (info.format != weightFormat) { throw std::runtime_error( - "load_tensor_dense: permutation to device memory is not " - "supported; transpose on-device after loading instead."); + "load_tensor_dense: format conversion to device memory is " + "not yet supported (file format " + + std::to_string(info.format) + " != requested " + + std::to_string(weightFormat) + + "); convert on-device after loading instead."); } - auto extents = this->get_tensor_extents(tensor_name); + bool needs_permute = !final_perm.empty(); + + auto input_extents = this->get_tensor_extents(tensor_name); auto total_elements = - std::accumulate(extents.begin(), extents.end(), size_t{1}, - std::multiplies()); - size_t nbytes = total_elements * sizeof_weight_format(weightFormat); + std::accumulate(input_extents.begin(), input_extents.end(), + size_t{1}, std::multiplies()); + size_t element_bytes = sizeof_weight_format(weightFormat); + size_t nbytes = total_elements * element_bytes; + dalotia_byte* d_raw = tensor; + dalotia_byte* d_tmp = nullptr; + if (needs_permute) { + cudaError_t alloc_err = cudaMallocAsync(&d_tmp, nbytes, stream); + if (alloc_err != cudaSuccess) { + throw std::runtime_error( + std::string("load_tensor_dense: cudaMallocAsync for " + "permutation temp buffer failed: ") + + cudaGetErrorString(alloc_err)); + } + d_raw = d_tmp; + } + + bool loaded = false; #ifdef DALOTIA_WITH_CUFILE if (gpu_data_source_ && data_source_ && data_source_->host_data(0)) { - // GDS path: read directly from file to device memory. auto ptrs = this->get_mmap_tensor_pointers(tensor_name); if (!ptrs.empty()) { size_t offset = reinterpret_cast(ptrs[0]) - data_source_->host_data(0); - gpu_data_source_->read_into(offset, nbytes, tensor); - return; + gpu_data_source_->read_into(offset, nbytes, d_raw); + loaded = true; } } #endif // DALOTIA_WITH_CUFILE + if (!loaded) { + // Fallback: load to host buffer, then cudaMemcpyAsync. + std::vector host_buf(nbytes); + load_tensor_dense_impl(tensor_name, weightFormat, + dalotia_C_ordering, host_buf.data(), {}); + cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes, + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + if (d_tmp) + cudaFreeAsync(d_tmp, stream); + throw std::runtime_error( + std::string("load_tensor_dense: cudaMemcpy failed: ") + + cudaGetErrorString(err)); + } + // Must synchronize before host_buf goes out of scope, + // since cudaMemcpyAsync reads from it asynchronously. + cudaStreamSynchronize(stream); + } - // Fallback: load to a temporary host buffer, then cudaMemcpy. - std::vector host_buf(nbytes); - load_tensor_dense_impl(tensor_name, weightFormat, ordering, - host_buf.data(), {}); - cudaError_t err = - cudaMemcpy(tensor, host_buf.data(), nbytes, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("load_tensor_dense: cudaMemcpy failed: ") + - cudaGetErrorString(err)); + if (needs_permute) { + permute_on_gpu(d_tmp, tensor, total_elements, element_bytes, + static_cast(input_extents.size()), + input_extents, final_perm, stream); + // Synchronize before freeing the temp buffer. + cudaStreamSynchronize(stream); + cudaFreeAsync(d_tmp, stream); } return; } diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index 39dad90..d1ebc3b 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -16,6 +16,10 @@ #include "dalotia_assignment.hpp" #include "dalotia_datasource.hpp" +#ifdef DALOTIA_WITH_CUDA +#include +#endif + namespace dalotia { #ifdef DALOTIA_WITH_CUDA @@ -84,7 +88,8 @@ class TensorFile { std::multiplies()); } - [[nodiscard]] virtual size_t get_nnz(const std::string &/* tensor_name*/) const { + [[nodiscard]] virtual size_t get_nnz( + const std::string& /* tensor_name*/) const { // This function will read the file and return the number of non-zero // elements ? may take a while for dense tensors, only allow for sparse? throw std::runtime_error( @@ -93,7 +98,8 @@ class TensorFile { } [[nodiscard]] virtual std::vector get_sparse_tensor_extents( - const std::string &/*tensor_name*/, dalotia_SparseFormat /*format*/) const { + const std::string& /*tensor_name*/, + dalotia_SparseFormat /*format*/) const { // This function will (lazily) read the file and return the tensor // extents throw std::runtime_error( @@ -105,7 +111,12 @@ class TensorFile { dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector& permutation = {}); + const std::vector& permutation = {} +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream = 0 +#endif + ); template //? or have no defaults? [[nodiscard]] std::pair, dalotia::vector> diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp index 58a0605..3f108c7 100644 --- a/test/test_cufile.cpp +++ b/test/test_cufile.cpp @@ -211,6 +211,45 @@ void test_same_file_host_and_gpu() { std::cout << "OK" << std::endl; } +void test_permuted_load_to_gpu() { + std::cout << "test_permuted_load_to_gpu... " << std::flush; + + // The test model has "embedding_firstchanged" with shape [4,3,5]. + // Permutation [1,0,2] gives shape [3,4,5] with values 0..59. + const char* perm_tensor = "embedding_firstchanged"; + std::vector perm = {1, 0, 2}; + + // Load with permutation on CPU as reference + auto file = std::unique_ptr( + dalotia::make_tensor_file(TEST_FILE)); + auto [extents_ref, h_ref] = file->load_tensor_dense( + perm_tensor, FORMAT, dalotia_C_ordering, perm); + assert(extents_ref == std::vector({3, 4, 5})); + assert(h_ref.size() == NUM_ELEMENTS); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_ref[i] == static_cast(i)); + } + + // Now load with permutation directly to GPU + const size_t nbytes = NUM_ELEMENTS * sizeof(double); + double* d_tensor = nullptr; + CHECK_CUDA(cudaMalloc(&d_tensor, nbytes)); + + file->load_tensor_dense(perm_tensor, FORMAT, dalotia_C_ordering, + reinterpret_cast(d_tensor), perm); + + // Copy back and verify + std::vector h_result(NUM_ELEMENTS); + CHECK_CUDA( + cudaMemcpy(h_result.data(), d_tensor, nbytes, cudaMemcpyDeviceToHost)); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_result[i] == h_ref[i]); + } + + CHECK_CUDA(cudaFree(d_tensor)); + std::cout << "OK" << std::endl; +} + int main() { test_is_device_pointer(); #ifdef DALOTIA_WITH_CUFILE @@ -222,6 +261,7 @@ int main() { test_load_to_gpu_with_driver(); #endif test_same_file_host_and_gpu(); + test_permuted_load_to_gpu(); std::cout << "test_cufile succeeded" << std::endl; return 0; } From 667b7e88c90947d1dbeda8fff215385a935627f4 Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 07:51:10 +0000 Subject: [PATCH 02/11] permute gpu: clang-format --- src/dalotia_permute_gpu.cu | 108 ++++++++++++++++++------------------ src/dalotia_permute_gpu.cuh | 4 +- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu index 3d1d8d2..a132a95 100644 --- a/src/dalotia_permute_gpu.cu +++ b/src/dalotia_permute_gpu.cu @@ -1,7 +1,7 @@ #ifdef DALOTIA_WITH_CUDA -#include "dalotia_permute_gpu.cuh" #include "dalotia_assignment.hpp" +#include "dalotia_permute_gpu.cuh" #include #include @@ -13,76 +13,76 @@ namespace dalotia { // At most 8 dimensions × 8 bytes = 64 bytes per array — well within // the 4 KB kernel argument limit, and avoids device memory allocation. struct PermuteStrides { - size_t input[kMaxPermuteDims]; - size_t permuted[kMaxPermuteDims]; + size_t input[kMaxPermuteDims]; + size_t permuted[kMaxPermuteDims]; }; __global__ void permute_kernel(const char *__restrict__ src, char *__restrict__ dest, size_t total_elements, size_t element_bytes, int ndims, PermuteStrides strides) { - size_t idx = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; - if (idx >= total_elements) return; + size_t idx = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (idx >= total_elements) + return; - size_t dest_idx = 0; - size_t remaining = idx; - for (int d = 0; d < ndims; ++d) { - size_t coord = remaining / strides.input[d]; - remaining -= coord * strides.input[d]; - dest_idx += coord * strides.permuted[d]; - } + size_t dest_idx = 0; + size_t remaining = idx; + for (int d = 0; d < ndims; ++d) { + size_t coord = remaining / strides.input[d]; + remaining -= coord * strides.input[d]; + dest_idx += coord * strides.permuted[d]; + } - const char *src_ptr = src + idx * element_bytes; - char *dest_ptr = dest + dest_idx * element_bytes; - for (size_t b = 0; b < element_bytes; ++b) { - dest_ptr[b] = src_ptr[b]; - } + const char *src_ptr = src + idx * element_bytes; + char *dest_ptr = dest + dest_idx * element_bytes; + for (size_t b = 0; b < element_bytes; ++b) { + dest_ptr[b] = src_ptr[b]; + } } void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, size_t element_bytes, int ndims, const std::vector &input_shape, - const std::vector &permutation, - cudaStream_t stream) { - if (ndims <= 0 || ndims > kMaxPermuteDims) { - throw std::runtime_error( - "permute_on_gpu: unsupported number of dimensions: " + - std::to_string(ndims)); - } - if (static_cast(input_shape.size()) != ndims || - static_cast(permutation.size()) != ndims) { - throw std::runtime_error( - "permute_on_gpu: input_shape/permutation size mismatch"); - } + const std::vector &permutation, cudaStream_t stream) { + if (ndims <= 0 || ndims > kMaxPermuteDims) { + throw std::runtime_error( + "permute_on_gpu: unsupported number of dimensions: " + + std::to_string(ndims)); + } + if (static_cast(input_shape.size()) != ndims || + static_cast(permutation.size()) != ndims) { + throw std::runtime_error( + "permute_on_gpu: input_shape/permutation size mismatch"); + } - auto [input_strides_vec, permuted_strides_vec, total_size] = - compute_permute_strides(ndims, input_shape.data(), permutation.data()); + auto [input_strides_vec, permuted_strides_vec, total_size] = + compute_permute_strides(ndims, input_shape.data(), permutation.data()); - // Copy into fixed-size struct for pass-by-value kernel argument. - PermuteStrides strides{}; - for (int d = 0; d < ndims; ++d) { - strides.input[d] = input_strides_vec[d]; - strides.permuted[d] = permuted_strides_vec[d]; - } + // Copy into fixed-size struct for pass-by-value kernel argument. + PermuteStrides strides{}; + for (int d = 0; d < ndims; ++d) { + strides.input[d] = input_strides_vec[d]; + strides.permuted[d] = permuted_strides_vec[d]; + } - // TODO: for the common case of 2D float32 transpose (permutation [1,0]), - // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic - // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. - constexpr int block_size = 256; - int grid_size = - static_cast((total_elements + block_size - 1) / block_size); - permute_kernel<<>>( - static_cast(d_src), static_cast(d_dest), - total_elements, element_bytes, ndims, strides); + // TODO: for the common case of 2D float32 transpose (permutation [1,0]), + // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic + // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. + constexpr int block_size = 256; + int grid_size = + static_cast((total_elements + block_size - 1) / block_size); + permute_kernel<<>>( + static_cast(d_src), static_cast(d_dest), + total_elements, element_bytes, ndims, strides); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("permute_on_gpu: kernel launch failed: ") + - cudaGetErrorString(err)); - } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("permute_on_gpu: kernel launch failed: ") + + cudaGetErrorString(err)); + } } -} // namespace dalotia +} // namespace dalotia -#endif // DALOTIA_WITH_CUDA +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh index 5560982..a2b336f 100644 --- a/src/dalotia_permute_gpu.cuh +++ b/src/dalotia_permute_gpu.cuh @@ -27,6 +27,6 @@ void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, const std::vector &permutation, cudaStream_t stream = 0); -} // namespace dalotia +} // namespace dalotia -#endif // DALOTIA_WITH_CUDA +#endif // DALOTIA_WITH_CUDA From 1759cc77c50a828089a17f090ccf95562c7ac2e8 Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 07:57:13 +0000 Subject: [PATCH 03/11] permute gpu: clang-format --- src/dalotia_permute_gpu.cu | 114 ++++++++++++++++++------------------ src/dalotia_permute_gpu.cuh | 10 ++-- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu index a132a95..886a489 100644 --- a/src/dalotia_permute_gpu.cu +++ b/src/dalotia_permute_gpu.cu @@ -13,76 +13,76 @@ namespace dalotia { // At most 8 dimensions × 8 bytes = 64 bytes per array — well within // the 4 KB kernel argument limit, and avoids device memory allocation. struct PermuteStrides { - size_t input[kMaxPermuteDims]; - size_t permuted[kMaxPermuteDims]; + size_t input[kMaxPermuteDims]; + size_t permuted[kMaxPermuteDims]; }; -__global__ void permute_kernel(const char *__restrict__ src, - char *__restrict__ dest, size_t total_elements, +__global__ void permute_kernel(const char* __restrict__ src, + char* __restrict__ dest, size_t total_elements, size_t element_bytes, int ndims, PermuteStrides strides) { - size_t idx = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; - if (idx >= total_elements) - return; + size_t idx = blockIdx.x * static_cast(blockDim.x) + threadIdx.x; + if (idx >= total_elements) + return; - size_t dest_idx = 0; - size_t remaining = idx; - for (int d = 0; d < ndims; ++d) { - size_t coord = remaining / strides.input[d]; - remaining -= coord * strides.input[d]; - dest_idx += coord * strides.permuted[d]; - } + size_t dest_idx = 0; + size_t remaining = idx; + for (int d = 0; d < ndims; ++d) { + size_t coord = remaining / strides.input[d]; + remaining -= coord * strides.input[d]; + dest_idx += coord * strides.permuted[d]; + } - const char *src_ptr = src + idx * element_bytes; - char *dest_ptr = dest + dest_idx * element_bytes; - for (size_t b = 0; b < element_bytes; ++b) { - dest_ptr[b] = src_ptr[b]; - } + const char* src_ptr = src + idx * element_bytes; + char* dest_ptr = dest + dest_idx * element_bytes; + for (size_t b = 0; b < element_bytes; ++b) { + dest_ptr[b] = src_ptr[b]; + } } -void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, +void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, size_t element_bytes, int ndims, - const std::vector &input_shape, - const std::vector &permutation, cudaStream_t stream) { - if (ndims <= 0 || ndims > kMaxPermuteDims) { - throw std::runtime_error( - "permute_on_gpu: unsupported number of dimensions: " + - std::to_string(ndims)); - } - if (static_cast(input_shape.size()) != ndims || - static_cast(permutation.size()) != ndims) { - throw std::runtime_error( - "permute_on_gpu: input_shape/permutation size mismatch"); - } + const std::vector& input_shape, + const std::vector& permutation, cudaStream_t stream) { + if (ndims <= 0 || ndims > kMaxPermuteDims) { + throw std::runtime_error( + "permute_on_gpu: unsupported number of dimensions: " + + std::to_string(ndims)); + } + if (static_cast(input_shape.size()) != ndims || + static_cast(permutation.size()) != ndims) { + throw std::runtime_error( + "permute_on_gpu: input_shape/permutation size mismatch"); + } - auto [input_strides_vec, permuted_strides_vec, total_size] = - compute_permute_strides(ndims, input_shape.data(), permutation.data()); + auto [input_strides_vec, permuted_strides_vec, total_size] = + compute_permute_strides(ndims, input_shape.data(), permutation.data()); - // Copy into fixed-size struct for pass-by-value kernel argument. - PermuteStrides strides{}; - for (int d = 0; d < ndims; ++d) { - strides.input[d] = input_strides_vec[d]; - strides.permuted[d] = permuted_strides_vec[d]; - } + // Copy into fixed-size struct for pass-by-value kernel argument. + PermuteStrides strides{}; + for (int d = 0; d < ndims; ++d) { + strides.input[d] = input_strides_vec[d]; + strides.permuted[d] = permuted_strides_vec[d]; + } - // TODO: for the common case of 2D float32 transpose (permutation [1,0]), - // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic - // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. - constexpr int block_size = 256; - int grid_size = - static_cast((total_elements + block_size - 1) / block_size); - permute_kernel<<>>( - static_cast(d_src), static_cast(d_dest), - total_elements, element_bytes, ndims, strides); + // TODO: for the common case of 2D float32 transpose (permutation [1,0]), + // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic + // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. + constexpr int block_size = 256; + int grid_size = + static_cast((total_elements + block_size - 1) / block_size); + permute_kernel<<>>( + static_cast(d_src), static_cast(d_dest), + total_elements, element_bytes, ndims, strides); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("permute_on_gpu: kernel launch failed: ") + - cudaGetErrorString(err)); - } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("permute_on_gpu: kernel launch failed: ") + + cudaGetErrorString(err)); + } } -} // namespace dalotia +} // namespace dalotia -#endif // DALOTIA_WITH_CUDA +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh index a2b336f..fdaa4f4 100644 --- a/src/dalotia_permute_gpu.cuh +++ b/src/dalotia_permute_gpu.cuh @@ -21,12 +21,12 @@ static constexpr int kMaxPermuteDims = 8; // The kernel is launched on `stream` (default: 0). No synchronization is // performed — the caller is responsible for synchronizing the stream if // needed before reading from `d_dest`. -void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements, +void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, size_t element_bytes, int ndims, - const std::vector &input_shape, - const std::vector &permutation, + const std::vector& input_shape, + const std::vector& permutation, cudaStream_t stream = 0); -} // namespace dalotia +} // namespace dalotia -#endif // DALOTIA_WITH_CUDA +#endif // DALOTIA_WITH_CUDA From 240068c192b9889b8af638571294a16b27bf6697 Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 08:01:43 +0000 Subject: [PATCH 04/11] gds: RAII CudaBuffer --- src/CMakeLists.txt | 2 +- src/dalotia_cuda_buffer.hpp | 98 +++++++++++++++++++++++++++++++++++++ src/dalotia_tensor_file.cpp | 24 +++------ 3 files changed, 107 insertions(+), 17 deletions(-) create mode 100644 src/dalotia_cuda_buffer.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1dce087..c57dfc9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda_buffer.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) diff --git a/src/dalotia_cuda_buffer.hpp b/src/dalotia_cuda_buffer.hpp new file mode 100644 index 0000000..21ea324 --- /dev/null +++ b/src/dalotia_cuda_buffer.hpp @@ -0,0 +1,98 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include +#include +#include +#include + +namespace dalotia { + +// RAII wrapper for a cudaMalloc'd device buffer. Move-only. +// Type-erased (stores void*); use as() for typed access. +class CudaBuffer { + public: + CudaBuffer() = default; + + explicit CudaBuffer(size_t nbytes) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMalloc(&ptr_, nbytes); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMalloc failed: ") + + cudaGetErrorString(err)); + } + } + } + + CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMallocAsync failed: ") + + cudaGetErrorString(err)); + } + stream_ = stream; + async_ = true; + } + } + + ~CudaBuffer() { + if (ptr_) { + if (async_) { + cudaFreeAsync(ptr_, stream_); + } else { + cudaFree(ptr_); + } + } + } + + // Move + CudaBuffer(CudaBuffer&& other) noexcept + : ptr_(other.ptr_) + , size_(other.size_) + , stream_(other.stream_) + , async_(other.async_) { + other.ptr_ = nullptr; + other.size_ = 0; + } + + CudaBuffer& operator=(CudaBuffer&& other) noexcept { + if (this != &other) { + if (ptr_) { + if (async_) + cudaFreeAsync(ptr_, stream_); + else + cudaFree(ptr_); + } + ptr_ = other.ptr_; + size_ = other.size_; + stream_ = other.stream_; + async_ = other.async_; + other.ptr_ = nullptr; + other.size_ = 0; + } + return *this; + } + + // No copy + CudaBuffer(const CudaBuffer&) = delete; + CudaBuffer& operator=(const CudaBuffer&) = delete; + + void* data() noexcept { return ptr_; } + const void* data() const noexcept { return ptr_; } + size_t size() const noexcept { return size_; } + bool empty() const noexcept { return ptr_ == nullptr; } + + private: + void* ptr_ = nullptr; + size_t size_ = 0; + cudaStream_t stream_ = 0; + bool async_ = false; +}; + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 2ea5a2b..f5afffa 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -2,6 +2,7 @@ #ifdef DALOTIA_WITH_CUDA #include +#include "dalotia_cuda_buffer.hpp" #include "dalotia_permute_gpu.cuh" #endif #ifdef DALOTIA_WITH_CUFILE @@ -59,17 +60,12 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name, size_t element_bytes = sizeof_weight_format(weightFormat); size_t nbytes = total_elements * element_bytes; + // If permutation needed, load into a temp buffer then permute. + CudaBuffer d_tmp; dalotia_byte* d_raw = tensor; - dalotia_byte* d_tmp = nullptr; if (needs_permute) { - cudaError_t alloc_err = cudaMallocAsync(&d_tmp, nbytes, stream); - if (alloc_err != cudaSuccess) { - throw std::runtime_error( - std::string("load_tensor_dense: cudaMallocAsync for " - "permutation temp buffer failed: ") + - cudaGetErrorString(alloc_err)); - } - d_raw = d_tmp; + d_tmp = CudaBuffer(nbytes, stream); + d_raw = d_tmp.as(); } bool loaded = false; @@ -92,24 +88,20 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name, cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes, cudaMemcpyHostToDevice, stream); if (err != cudaSuccess) { - if (d_tmp) - cudaFreeAsync(d_tmp, stream); throw std::runtime_error( std::string("load_tensor_dense: cudaMemcpy failed: ") + cudaGetErrorString(err)); } - // Must synchronize before host_buf goes out of scope, - // since cudaMemcpyAsync reads from it asynchronously. + // Must synchronize before host_buf goes out of scope. cudaStreamSynchronize(stream); } if (needs_permute) { - permute_on_gpu(d_tmp, tensor, total_elements, element_bytes, + permute_on_gpu(d_raw, tensor, total_elements, element_bytes, static_cast(input_extents.size()), input_extents, final_perm, stream); - // Synchronize before freeing the temp buffer. + // Synchronize before d_tmp is freed (RAII destructor). cudaStreamSynchronize(stream); - cudaFreeAsync(d_tmp, stream); } return; } From 06623860f86231a55d332f2bff525b355b97fb99 Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 08:10:03 +0000 Subject: [PATCH 05/11] cuda: collect cuda things (buffer, is_device_ptr) --- src/CMakeLists.txt | 4 ++-- src/dalotia_cuda.cpp | 22 +++++++++++++++++++ ...lotia_cuda_buffer.hpp => dalotia_cuda.hpp} | 16 ++++++++++++-- src/dalotia_tensor_file.cpp | 17 -------------- src/dalotia_tensor_file.hpp | 8 +------ 5 files changed, 39 insertions(+), 28 deletions(-) create mode 100644 src/dalotia_cuda.cpp rename src/{dalotia_cuda_buffer.hpp => dalotia_cuda.hpp} (86%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c57dfc9..d804732 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda_buffer.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) @@ -32,7 +32,7 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP) if (DALOTIA_WITH_CUDA) target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart) target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA") - target_sources(dalotia_cpp PRIVATE dalotia_permute_gpu.cu) + target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu) set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA) endif (DALOTIA_WITH_CUDA) diff --git a/src/dalotia_cuda.cpp b/src/dalotia_cuda.cpp new file mode 100644 index 0000000..bc99f8c --- /dev/null +++ b/src/dalotia_cuda.cpp @@ -0,0 +1,22 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_cuda.hpp" + +namespace dalotia { + +bool is_device_pointer(const void* ptr) noexcept { + if (!ptr) + return false; + cudaPointerAttributes attrs{}; + cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); + if (err != cudaSuccess) { + cudaGetLastError(); + return false; + } + return attrs.type == cudaMemoryTypeDevice || + attrs.type == cudaMemoryTypeManaged; +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_cuda_buffer.hpp b/src/dalotia_cuda.hpp similarity index 86% rename from src/dalotia_cuda_buffer.hpp rename to src/dalotia_cuda.hpp index 21ea324..2fc7fb8 100644 --- a/src/dalotia_cuda_buffer.hpp +++ b/src/dalotia_cuda.hpp @@ -9,6 +9,10 @@ namespace dalotia { +// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). +// Returns false for host pointers (including cudaMallocHost pinned memory). +bool is_device_pointer(const void* ptr) noexcept; + // RAII wrapper for a cudaMalloc'd device buffer. Move-only. // Type-erased (stores void*); use as() for typed access. class CudaBuffer { @@ -49,7 +53,6 @@ class CudaBuffer { } } - // Move CudaBuffer(CudaBuffer&& other) noexcept : ptr_(other.ptr_) , size_(other.size_) @@ -77,7 +80,6 @@ class CudaBuffer { return *this; } - // No copy CudaBuffer(const CudaBuffer&) = delete; CudaBuffer& operator=(const CudaBuffer&) = delete; @@ -86,6 +88,16 @@ class CudaBuffer { size_t size() const noexcept { return size_; } bool empty() const noexcept { return ptr_ == nullptr; } + template + T* as() noexcept { + return static_cast(ptr_); + } + + template + const T* as() const noexcept { + return static_cast(ptr_); + } + private: void* ptr_ = nullptr; size_t size_ = 0; diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index f5afffa..654a4b7 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -1,8 +1,6 @@ #include "dalotia_tensor_file.hpp" #ifdef DALOTIA_WITH_CUDA -#include -#include "dalotia_cuda_buffer.hpp" #include "dalotia_permute_gpu.cuh" #endif #ifdef DALOTIA_WITH_CUFILE @@ -11,21 +9,6 @@ namespace dalotia { -#ifdef DALOTIA_WITH_CUDA -bool is_device_pointer(const void* ptr) noexcept { - if (!ptr) - return false; - cudaPointerAttributes attrs{}; - cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); - if (err != cudaSuccess) { - cudaGetLastError(); - return false; - } - return attrs.type == cudaMemoryTypeDevice || - attrs.type == cudaMemoryTypeManaged; -} -#endif // DALOTIA_WITH_CUDA - void TensorFile::load_tensor_dense(const std::string& tensor_name, dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index 0e5fc7b..34c70ba 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -17,17 +17,11 @@ #include "dalotia_datasource.hpp" #ifdef DALOTIA_WITH_CUDA -#include +#include "dalotia_cuda.hpp" #endif namespace dalotia { -#ifdef DALOTIA_WITH_CUDA -// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). -// Returns false for host pointers (including cudaMallocHost pinned memory). -bool is_device_pointer(const void* ptr) noexcept; -#endif // DALOTIA_WITH_CUDA - class TensorFile { public: explicit TensorFile(const std::string& /* filename */) { From b11bce45039c357ff15669972eb8ab97ab9f1c5e Mon Sep 17 00:00:00 2001 From: Theresa Date: Mon, 23 Mar 2026 23:49:22 +0000 Subject: [PATCH 06/11] test cufile: more with open driver --- test/test_cufile.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp index 3f108c7..2f8163a 100644 --- a/test/test_cufile.cpp +++ b/test/test_cufile.cpp @@ -144,7 +144,6 @@ void test_load_to_gpu() { CHECK_CUDA(cudaFree(d_tensor)); std::cout << "OK (via fallback)" << std::endl; } - #ifdef DALOTIA_WITH_CUFILE void test_load_to_gpu_with_driver() { std::cout << "test_load_to_gpu_with_driver... " << std::flush; @@ -181,6 +180,7 @@ void test_same_file_host_and_gpu() { // a host pointer and a device pointer, and verify both match. std::cout << "test_same_file_host_and_gpu... " << std::flush; + auto driver = try_open_driver(); dalotia::SafetensorsFile file(TEST_FILE); // Load to host @@ -214,6 +214,7 @@ void test_same_file_host_and_gpu() { void test_permuted_load_to_gpu() { std::cout << "test_permuted_load_to_gpu... " << std::flush; + auto driver = try_open_driver(); // The test model has "embedding_firstchanged" with shape [4,3,5]. // Permutation [1,0,2] gives shape [3,4,5] with values 0..59. const char* perm_tensor = "embedding_firstchanged"; From aca23fa1d4044ff716535e55ba811bd4afb4ee6d Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 00:12:50 +0000 Subject: [PATCH 07/11] ci: verbose output on cufile tests --- .github/workflows/ctest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index ef6d8c3..95318a2 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -79,8 +79,8 @@ jobs: options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF build-dir: build-cuda - - name: Run non-GPU tests + - name: Run tests uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --output-on-failure --exclude-regex cufile-pointer-detection + args: --verbose --exclude-regex cufile-pointer-detection From e4e0d485bbb80b9eb4476be02c2030e08fe759d7 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 00:16:28 +0000 Subject: [PATCH 08/11] cmake: output on tests --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4fbd1b..039c38d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") +list(APPEND CMAKE_CTEST_ARGUMENTS "--verbose") # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest set(MEMORYCHECK_COMMAND_OPTIONS From a39f53f10cd9f84b616d7d912a1fb66a69af2a74 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 00:16:59 +0000 Subject: [PATCH 09/11] cmake: output on tests (amends last commit) --- test/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 039c38d..9187fca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,3 @@ -list(APPEND CMAKE_CTEST_ARGUMENTS "--verbose") - # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest set(MEMORYCHECK_COMMAND_OPTIONS "--gen-suppressions=all --suppressions=${CMAKE_CURRENT_SOURCE_DIR}/tensorflow.supp --leak-check=full" From 5dee1306a6a5b81a523cf82eecd7d7714abfc2e0 Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 00:20:31 +0000 Subject: [PATCH 10/11] ci: verbose output on cufile tests --- .github/workflows/ctest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index 95318a2..78f3b7a 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -83,4 +83,4 @@ jobs: uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --verbose --exclude-regex cufile-pointer-detection + args: --verbose From af13240e5254a7315d5d6261da18d49c351fb30b Mon Sep 17 00:00:00 2001 From: Theresa Date: Tue, 24 Mar 2026 00:23:13 +0000 Subject: [PATCH 11/11] Revert "ci: verbose output on cufile tests" This reverts commit 5dee1306a6a5b81a523cf82eecd7d7714abfc2e0. --- .github/workflows/ctest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index 78f3b7a..95318a2 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -83,4 +83,4 @@ jobs: uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --verbose + args: --verbose --exclude-regex cufile-pointer-detection