diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index ef6d8c3..95318a2 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -79,8 +79,8 @@ jobs: options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF build-dir: build-cuda - - name: Run non-GPU tests + - name: Run tests uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --output-on-failure --exclude-regex cufile-pointer-detection + args: --verbose --exclude-regex cufile-pointer-detection diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4db3b2a..d804732 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) @@ -32,6 +32,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP) if (DALOTIA_WITH_CUDA) target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart) target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA") + target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu) + set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA) endif (DALOTIA_WITH_CUDA) if (DALOTIA_WITH_CUFILE) diff --git a/src/dalotia_cuda.cpp b/src/dalotia_cuda.cpp new 
file mode 100644 index 0000000..bc99f8c --- /dev/null +++ b/src/dalotia_cuda.cpp @@ -0,0 +1,22 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_cuda.hpp" + +namespace dalotia { + +bool is_device_pointer(const void* ptr) noexcept { + if (!ptr) + return false; + cudaPointerAttributes attrs{}; + cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); + if (err != cudaSuccess) { + cudaGetLastError(); + return false; + } + return attrs.type == cudaMemoryTypeDevice || + attrs.type == cudaMemoryTypeManaged; +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_cuda.hpp b/src/dalotia_cuda.hpp new file mode 100644 index 0000000..2fc7fb8 --- /dev/null +++ b/src/dalotia_cuda.hpp @@ -0,0 +1,110 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include <cuda_runtime.h> +#include <cstddef> +#include <stdexcept> +#include <string> + +namespace dalotia { + +// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). +// Returns false for host pointers (including cudaMallocHost pinned memory). +bool is_device_pointer(const void* ptr) noexcept; + +// RAII wrapper for a cudaMalloc'd device buffer. Move-only. +// Type-erased (stores void*); use as<T>() for typed access. 
+class CudaBuffer { + public: + CudaBuffer() = default; + + explicit CudaBuffer(size_t nbytes) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMalloc(&ptr_, nbytes); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMalloc failed: ") + + cudaGetErrorString(err)); + } + } + } + + CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMallocAsync failed: ") + + cudaGetErrorString(err)); + } + stream_ = stream; + async_ = true; + } + } + + ~CudaBuffer() { + if (ptr_) { + if (async_) { + cudaFreeAsync(ptr_, stream_); + } else { + cudaFree(ptr_); + } + } + } + + CudaBuffer(CudaBuffer&& other) noexcept + : ptr_(other.ptr_) + , size_(other.size_) + , stream_(other.stream_) + , async_(other.async_) { + other.ptr_ = nullptr; + other.size_ = 0; + } + + CudaBuffer& operator=(CudaBuffer&& other) noexcept { + if (this != &other) { + if (ptr_) { + if (async_) + cudaFreeAsync(ptr_, stream_); + else + cudaFree(ptr_); + } + ptr_ = other.ptr_; + size_ = other.size_; + stream_ = other.stream_; + async_ = other.async_; + other.ptr_ = nullptr; + other.size_ = 0; + } + return *this; + } + + CudaBuffer(const CudaBuffer&) = delete; + CudaBuffer& operator=(const CudaBuffer&) = delete; + + void* data() noexcept { return ptr_; } + const void* data() const noexcept { return ptr_; } + size_t size() const noexcept { return size_; } + bool empty() const noexcept { return ptr_ == nullptr; } + + template <typename T> + T* as() noexcept { + return static_cast<T*>(ptr_); + } + + template <typename T> + const T* as() const noexcept { + return static_cast<const T*>(ptr_); + } + + private: + void* ptr_ = nullptr; + size_t size_ = 0; + cudaStream_t stream_ = 0; + bool async_ = false; +}; + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu new 
file mode 100644 index 0000000..886a489 --- /dev/null +++ b/src/dalotia_permute_gpu.cu @@ -0,0 +1,88 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_assignment.hpp" +#include "dalotia_permute_gpu.cuh" + +#include <cstddef> +#include <stdexcept> +#include <string> + +namespace dalotia { + +// Fixed-size stride arrays passed by value as kernel arguments. +// At most 8 dimensions × 8 bytes = 64 bytes per array — well within +// the 4 KB kernel argument limit, and avoids device memory allocation. +struct PermuteStrides { + size_t input[kMaxPermuteDims]; + size_t permuted[kMaxPermuteDims]; +}; + +__global__ void permute_kernel(const char* __restrict__ src, + char* __restrict__ dest, size_t total_elements, + size_t element_bytes, int ndims, + PermuteStrides strides) { + size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x; + if (idx >= total_elements) + return; + + size_t dest_idx = 0; + size_t remaining = idx; + for (int d = 0; d < ndims; ++d) { + size_t coord = remaining / strides.input[d]; + remaining -= coord * strides.input[d]; + dest_idx += coord * strides.permuted[d]; + } + + const char* src_ptr = src + idx * element_bytes; + char* dest_ptr = dest + dest_idx * element_bytes; + for (size_t b = 0; b < element_bytes; ++b) { + dest_ptr[b] = src_ptr[b]; + } +} + +void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector<size_t>& input_shape, + const std::vector<int>& permutation, cudaStream_t stream) { + if (ndims <= 0 || ndims > kMaxPermuteDims) { + throw std::runtime_error( + "permute_on_gpu: unsupported number of dimensions: " + + std::to_string(ndims)); + } + if (static_cast<int>(input_shape.size()) != ndims || + static_cast<int>(permutation.size()) != ndims) { + throw std::runtime_error( + "permute_on_gpu: input_shape/permutation size mismatch"); + } + + auto [input_strides_vec, permuted_strides_vec, total_size] = + compute_permute_strides(ndims, input_shape.data(), permutation.data()); + + // Copy into fixed-size struct for 
pass-by-value kernel argument. + PermuteStrides strides{}; + for (int d = 0; d < ndims; ++d) { + strides.input[d] = input_strides_vec[d]; + strides.permuted[d] = permuted_strides_vec[d]; + } + + // TODO: for the common case of 2D float32 transpose (permutation [1,0]), + // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic + // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. + constexpr int block_size = 256; + int grid_size = + static_cast<int>((total_elements + block_size - 1) / block_size); + permute_kernel<<<grid_size, block_size, 0, stream>>>( + static_cast<const char*>(d_src), static_cast<char*>(d_dest), + total_elements, element_bytes, ndims, strides); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("permute_on_gpu: kernel launch failed: ") + + cudaGetErrorString(err)); + } +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh new file mode 100644 index 0000000..fdaa4f4 --- /dev/null +++ b/src/dalotia_permute_gpu.cuh @@ -0,0 +1,32 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include <cstddef> +#include <cstdint> +#include <vector> + +#include <cuda_runtime.h> + +namespace dalotia { + +static constexpr int kMaxPermuteDims = 8; + +// Permute tensor data on device. `d_src` and `d_dest` are device pointers. +// `d_src` contains elements in the original (C-order) layout. +// `d_dest` will contain elements in the permuted layout. +// `input_shape` and `permutation` are host vectors with `ndims` entries. +// `element_bytes` is the size of one element (e.g. 4 for float32). +// +// The kernel is launched on `stream` (default: 0). No synchronization is +// performed — the caller is responsible for synchronizing the stream if +// needed before reading from `d_dest`. 
+void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector<size_t>& input_shape, + const std::vector<int>& permutation, + cudaStream_t stream = 0); + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 234deb8..654a4b7 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -1,7 +1,7 @@ #include "dalotia_tensor_file.hpp" #ifdef DALOTIA_WITH_CUDA -#include <cuda_runtime.h> +#include "dalotia_permute_gpu.cuh" #endif #ifdef DALOTIA_WITH_CUFILE #include "dalotia_cufile.hpp" @@ -9,68 +9,82 @@ namespace dalotia { -#ifdef DALOTIA_WITH_CUDA -bool is_device_pointer(const void* ptr) noexcept { - if (!ptr) - return false; - cudaPointerAttributes attrs{}; - cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); - if (err != cudaSuccess) { - cudaGetLastError(); - return false; - } - return attrs.type == cudaMemoryTypeDevice || - attrs.type == cudaMemoryTypeManaged; -} -#endif // DALOTIA_WITH_CUDA - void TensorFile::load_tensor_dense(const std::string& tensor_name, dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector<int>& permutation) { - // Normalize the permutation once: convert from user-supplied - // (possibly 1-indexed, possibly F-ordering) to 0-indexed C-order. 
+ const std::vector<int>& permutation +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream +#endif +) { auto final_perm = final_c_permutation_from_permutation_and_order( permutation, ordering, this->get_num_dimensions(tensor_name)); #ifdef DALOTIA_WITH_CUDA if (is_device_pointer(tensor)) { - if (!final_perm.empty()) { + auto info = this->get_tensor_info(tensor_name); + if (info.format != weightFormat) { throw std::runtime_error( - "load_tensor_dense: permutation to device memory is not " - "supported; transpose on-device after loading instead."); + "load_tensor_dense: format conversion to device memory is " + "not yet supported (file format " + + std::to_string(info.format) + " != requested " + + std::to_string(weightFormat) + + "); convert on-device after loading instead."); } - auto extents = this->get_tensor_extents(tensor_name); + bool needs_permute = !final_perm.empty(); + + auto input_extents = this->get_tensor_extents(tensor_name); auto total_elements = - std::accumulate(extents.begin(), extents.end(), size_t{1}, - std::multiplies<size_t>()); - size_t nbytes = total_elements * sizeof_weight_format(weightFormat); + std::accumulate(input_extents.begin(), input_extents.end(), + size_t{1}, std::multiplies<size_t>()); + size_t element_bytes = sizeof_weight_format(weightFormat); + size_t nbytes = total_elements * element_bytes; + + // If permutation needed, load into a temp buffer then permute. + CudaBuffer d_tmp; + dalotia_byte* d_raw = tensor; + if (needs_permute) { + d_tmp = CudaBuffer(nbytes, stream); + d_raw = d_tmp.as<dalotia_byte>(); + } + bool loaded = false; #ifdef DALOTIA_WITH_CUFILE if (gpu_data_source_ && data_source_ && data_source_->host_data(0)) { - // GDS path: read directly from file to device memory. 
auto ptrs = this->get_mmap_tensor_pointers(tensor_name); if (!ptrs.empty()) { size_t offset = reinterpret_cast<const dalotia_byte*>(ptrs[0]) - data_source_->host_data(0); - gpu_data_source_->read_into(offset, nbytes, tensor); - return; + gpu_data_source_->read_into(offset, nbytes, d_raw); + loaded = true; } } #endif // DALOTIA_WITH_CUFILE + if (!loaded) { + // Fallback: load to host buffer, then cudaMemcpyAsync. + std::vector<dalotia_byte> host_buf(nbytes); + load_tensor_dense_impl(tensor_name, weightFormat, + dalotia_C_ordering, host_buf.data(), {}); + cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes, + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("load_tensor_dense: cudaMemcpy failed: ") + + cudaGetErrorString(err)); + } + // Must synchronize before host_buf goes out of scope. + cudaStreamSynchronize(stream); + } - // Fallback: load to a temporary host buffer, then cudaMemcpy. - std::vector<dalotia_byte> host_buf(nbytes); - load_tensor_dense_impl(tensor_name, weightFormat, ordering, - host_buf.data(), {}); - cudaError_t err = - cudaMemcpy(tensor, host_buf.data(), nbytes, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("load_tensor_dense: cudaMemcpy failed: ") + - cudaGetErrorString(err)); + if (needs_permute) { + permute_on_gpu(d_raw, tensor, total_elements, element_bytes, + static_cast<int>(input_extents.size()), + input_extents, final_perm, stream); + // Synchronize before d_tmp is freed (RAII destructor). + cudaStreamSynchronize(stream); } return; } diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index ccb7583..34c70ba 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -16,13 +16,11 @@ #include "dalotia_assignment.hpp" #include "dalotia_datasource.hpp" -namespace dalotia { - #ifdef DALOTIA_WITH_CUDA -// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). 
-// Returns false for host pointers (including cudaMallocHost pinned memory). -bool is_device_pointer(const void* ptr) noexcept; -#endif // DALOTIA_WITH_CUDA +#include "dalotia_cuda.hpp" +#endif + +namespace dalotia { class TensorFile { public: @@ -109,7 +107,12 @@ class TensorFile { dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector<int>& permutation = {}); + const std::vector<int>& permutation = {} +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream = 0 +#endif + ); template <typename T> //? or have no defaults? [[nodiscard]] std::pair<std::vector<size_t>, dalotia::vector<T>> diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4fbd1b..9187fca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,3 @@ -list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") - # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest set(MEMORYCHECK_COMMAND_OPTIONS "--gen-suppressions=all --suppressions=${CMAKE_CURRENT_SOURCE_DIR}/tensorflow.supp --leak-check=full" diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp index 58a0605..2f8163a 100644 --- a/test/test_cufile.cpp +++ b/test/test_cufile.cpp @@ -144,7 +144,6 @@ void test_load_to_gpu() { CHECK_CUDA(cudaFree(d_tensor)); std::cout << "OK (via fallback)" << std::endl; } - #ifdef DALOTIA_WITH_CUFILE void test_load_to_gpu_with_driver() { std::cout << "test_load_to_gpu_with_driver... " << std::flush; @@ -181,6 +180,7 @@ void test_same_file_host_and_gpu() { // a host pointer and a device pointer, and verify both match. std::cout << "test_same_file_host_and_gpu... " << std::flush; + auto driver = try_open_driver(); dalotia::SafetensorsFile file(TEST_FILE); // Load to host @@ -211,6 +211,46 @@ void test_same_file_host_and_gpu() { std::cout << "OK" << std::endl; } +void test_permuted_load_to_gpu() { + std::cout << "test_permuted_load_to_gpu... 
" << std::flush; + + auto driver = try_open_driver(); + // The test model has "embedding_firstchanged" with shape [4,3,5]. + // Permutation [1,0,2] gives shape [3,4,5] with values 0..59. + const char* perm_tensor = "embedding_firstchanged"; + std::vector perm = {1, 0, 2}; + + // Load with permutation on CPU as reference + auto file = std::unique_ptr( + dalotia::make_tensor_file(TEST_FILE)); + auto [extents_ref, h_ref] = file->load_tensor_dense( + perm_tensor, FORMAT, dalotia_C_ordering, perm); + assert(extents_ref == std::vector({3, 4, 5})); + assert(h_ref.size() == NUM_ELEMENTS); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_ref[i] == static_cast(i)); + } + + // Now load with permutation directly to GPU + const size_t nbytes = NUM_ELEMENTS * sizeof(double); + double* d_tensor = nullptr; + CHECK_CUDA(cudaMalloc(&d_tensor, nbytes)); + + file->load_tensor_dense(perm_tensor, FORMAT, dalotia_C_ordering, + reinterpret_cast(d_tensor), perm); + + // Copy back and verify + std::vector h_result(NUM_ELEMENTS); + CHECK_CUDA( + cudaMemcpy(h_result.data(), d_tensor, nbytes, cudaMemcpyDeviceToHost)); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_result[i] == h_ref[i]); + } + + CHECK_CUDA(cudaFree(d_tensor)); + std::cout << "OK" << std::endl; +} + int main() { test_is_device_pointer(); #ifdef DALOTIA_WITH_CUFILE @@ -222,6 +262,7 @@ int main() { test_load_to_gpu_with_driver(); #endif test_same_file_host_and_gpu(); + test_permuted_load_to_gpu(); std::cout << "test_cufile succeeded" << std::endl; return 0; }