Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ctest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ jobs:
options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF
build-dir: build-cuda

- name: Run non-GPU tests
- name: Run tests
uses: threeal/ctest-action@v1.1.0
with:
test-dir: build-cuda
args: --output-on-failure --exclude-regex cufile-pointer-detection
args: --verbose --exclude-regex cufile-pointer-detection
4 changes: 3 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp")
"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp")
# have one dalotia library target that can be used in C++ and Fortran
add_library(dalotia INTERFACE)
add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
Expand Down Expand Up @@ -32,6 +32,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP)
if (DALOTIA_WITH_CUDA)
target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart)
target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA")
target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu)
set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA)
endif (DALOTIA_WITH_CUDA)

if (DALOTIA_WITH_CUFILE)
Expand Down
22 changes: 22 additions & 0 deletions src/dalotia_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifdef DALOTIA_WITH_CUDA

#include "dalotia_cuda.hpp"

namespace dalotia {

bool is_device_pointer(const void* ptr) noexcept {
if (!ptr)
return false;
cudaPointerAttributes attrs{};
cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
if (err != cudaSuccess) {
cudaGetLastError();
return false;
}
return attrs.type == cudaMemoryTypeDevice ||
attrs.type == cudaMemoryTypeManaged;
}

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
110 changes: 110 additions & 0 deletions src/dalotia_cuda.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA

#include <cstddef>
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

namespace dalotia {

// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
// Returns false for host pointers (including cudaMallocHost pinned memory).
bool is_device_pointer(const void* ptr) noexcept;

// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
// Type-erased (stores void*); use as<T>() for typed access.
//
// Fix over the previous revision: moving from a CudaBuffer now resets ALL of
// the moved-from object's fields (previously `stream_`/`async_` were left
// stale), and the free logic is shared between the destructor and move
// assignment instead of being duplicated.
class CudaBuffer {
public:
    CudaBuffer() = default;

    // Synchronous allocation via cudaMalloc.
    // Throws std::runtime_error if the allocation fails; nbytes == 0 yields
    // an empty buffer with no allocation.
    explicit CudaBuffer(size_t nbytes) : size_(nbytes) {
        if (nbytes > 0) {
            cudaError_t err = cudaMalloc(&ptr_, nbytes);
            if (err != cudaSuccess) {
                throw std::runtime_error(
                    std::string("CudaBuffer: cudaMalloc failed: ") +
                    cudaGetErrorString(err));
            }
        }
    }

    // Stream-ordered allocation via cudaMallocAsync; the buffer is released
    // with cudaFreeAsync on the same stream. Throws std::runtime_error if the
    // allocation fails. NOTE(review): the stored stream must outlive this
    // buffer so the deferred free can be enqueued — confirm at call sites.
    CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) {
        if (nbytes > 0) {
            cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream);
            if (err != cudaSuccess) {
                throw std::runtime_error(
                    std::string("CudaBuffer: cudaMallocAsync failed: ") +
                    cudaGetErrorString(err));
            }
            stream_ = stream;
            async_ = true;
        }
    }

    ~CudaBuffer() { free_buffer(); }

    CudaBuffer(CudaBuffer&& other) noexcept
        : ptr_(other.ptr_)
        , size_(other.size_)
        , stream_(other.stream_)
        , async_(other.async_) {
        other.reset_fields();
    }

    CudaBuffer& operator=(CudaBuffer&& other) noexcept {
        if (this != &other) {
            free_buffer();
            ptr_ = other.ptr_;
            size_ = other.size_;
            stream_ = other.stream_;
            async_ = other.async_;
            other.reset_fields();
        }
        return *this;
    }

    CudaBuffer(const CudaBuffer&) = delete;
    CudaBuffer& operator=(const CudaBuffer&) = delete;

    void* data() noexcept { return ptr_; }
    const void* data() const noexcept { return ptr_; }
    size_t size() const noexcept { return size_; }
    bool empty() const noexcept { return ptr_ == nullptr; }

    template <typename T>
    T* as() noexcept {
        return static_cast<T*>(ptr_);
    }

    template <typename T>
    const T* as() const noexcept {
        return static_cast<const T*>(ptr_);
    }

private:
    // Releases the owned allocation (if any) with the matching deallocator.
    // Errors from cudaFree/cudaFreeAsync are intentionally ignored: this runs
    // from the (noexcept) destructor and move assignment.
    void free_buffer() noexcept {
        if (ptr_) {
            if (async_) {
                cudaFreeAsync(ptr_, stream_);
            } else {
                cudaFree(ptr_);
            }
        }
    }

    // Returns *this to the default-constructed (empty) state WITHOUT freeing;
    // used on the moved-from object.
    void reset_fields() noexcept {
        ptr_ = nullptr;
        size_ = 0;
        stream_ = 0;
        async_ = false;
    }

    void* ptr_ = nullptr;
    size_t size_ = 0;
    cudaStream_t stream_ = 0;
    bool async_ = false;
};

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
88 changes: 88 additions & 0 deletions src/dalotia_permute_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#ifdef DALOTIA_WITH_CUDA

#include "dalotia_assignment.hpp"
#include "dalotia_permute_gpu.cuh"

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

namespace dalotia {

// Fixed-size stride arrays passed by value as kernel arguments.
// At most 8 dimensions × 8 bytes = 64 bytes per array — well within
// the 4 KB kernel argument limit, and avoids device memory allocation.
struct PermuteStrides {
    // Divisors used by permute_kernel to decode a linear source index into
    // per-dimension coordinates; only the first `ndims` entries are valid.
    size_t input[kMaxPermuteDims];
    // Multipliers mapping those coordinates to the permuted linear index.
    size_t permuted[kMaxPermuteDims];
};

// One thread per element: thread i reads element i of the linear source,
// decodes it into per-dimension coordinates with `strides.input`, and writes
// it at the linear offset accumulated from `strides.permuted`.
// Launch shape: 1D grid of 1D blocks covering `total_elements` threads.
__global__ void permute_kernel(const char* __restrict__ src,
                               char* __restrict__ dest, size_t total_elements,
                               size_t element_bytes, int ndims,
                               PermuteStrides strides) {
    // 64-bit index: the multiply is done in size_t to avoid 32-bit overflow.
    const size_t src_index =
        static_cast<size_t>(blockDim.x) * blockIdx.x + threadIdx.x;
    if (src_index >= total_elements)
        return;

    size_t dst_index = 0;
    size_t rest = src_index;
    for (int dim = 0; dim < ndims; ++dim) {
        const size_t coord = rest / strides.input[dim];
        rest -= coord * strides.input[dim];
        dst_index += coord * strides.permuted[dim];
    }

    // Element size is only known at runtime, so move the element byte-wise.
    const char* from = src + src_index * element_bytes;
    char* to = dest + dst_index * element_bytes;
    for (size_t byte = 0; byte < element_bytes; ++byte)
        to[byte] = from[byte];
}

// Launches permute_kernel on `stream` to rearrange `d_src` (device pointer,
// C-order) into `d_dest` (device pointer, permuted layout). No stream
// synchronization is performed; see the header for the full contract.
//
// Throws std::runtime_error on invalid ndims, mismatched vector sizes, or a
// `total_elements` that disagrees with the product of `input_shape`.
//
// Fixes over the previous revision: the element count returned by
// compute_permute_strides was discarded (a caller mismatch would silently
// read/write out of bounds on the device), and total_elements == 0 produced
// a zero-block launch, which fails with cudaErrorInvalidConfiguration.
void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                    size_t element_bytes, int ndims,
                    const std::vector<int>& input_shape,
                    const std::vector<int>& permutation, cudaStream_t stream) {
    if (ndims <= 0 || ndims > kMaxPermuteDims) {
        throw std::runtime_error(
            "permute_on_gpu: unsupported number of dimensions: " +
            std::to_string(ndims));
    }
    if (static_cast<int>(input_shape.size()) != ndims ||
        static_cast<int>(permutation.size()) != ndims) {
        throw std::runtime_error(
            "permute_on_gpu: input_shape/permutation size mismatch");
    }

    auto [input_strides_vec, permuted_strides_vec, total_size] =
        compute_permute_strides(ndims, input_shape.data(), permutation.data());

    // Cross-check the caller-supplied count against the shape product so a
    // mismatch fails loudly on the host instead of corrupting device memory.
    if (static_cast<size_t>(total_size) != total_elements) {
        throw std::runtime_error(
            "permute_on_gpu: total_elements does not match input_shape");
    }

    // Empty tensor: nothing to permute, and a grid of 0 blocks is an invalid
    // launch configuration.
    if (total_elements == 0) {
        return;
    }

    // Copy into fixed-size struct for pass-by-value kernel argument.
    PermuteStrides strides{};
    for (int d = 0; d < ndims; ++d) {
        strides.input[d] = input_strides_vec[d];
        strides.permuted[d] = permuted_strides_vec[d];
    }

    // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
    // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
    // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
    constexpr int block_size = 256;
    int grid_size =
        static_cast<int>((total_elements + block_size - 1) / block_size);
    permute_kernel<<<grid_size, block_size, 0, stream>>>(
        static_cast<const char*>(d_src), static_cast<char*>(d_dest),
        total_elements, element_bytes, ndims, strides);

    // Launch-configuration errors surface here; asynchronous execution errors
    // surface at the caller's next synchronizing CUDA call.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        throw std::runtime_error(
            std::string("permute_on_gpu: kernel launch failed: ") +
            cudaGetErrorString(err));
    }
}

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
32 changes: 32 additions & 0 deletions src/dalotia_permute_gpu.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#pragma once

#ifdef DALOTIA_WITH_CUDA

#include <cstddef>
#include <cstdint>
#include <vector>

#include <cuda_runtime.h>

namespace dalotia {

// Maximum tensor rank accepted by permute_on_gpu; bounds the fixed-size
// stride arrays passed by value as kernel arguments.
static constexpr int kMaxPermuteDims = 8;

// Permute tensor data on device. `d_src` and `d_dest` are device pointers.
// `d_src` contains elements in the original (C-order) layout.
// `d_dest` will contain elements in the permuted layout.
// `input_shape` and `permutation` are host vectors with `ndims` entries.
// `element_bytes` is the size of one element (e.g. 4 for float32).
//
// Throws std::runtime_error if `ndims` is not in (0, kMaxPermuteDims] or if
// the vector sizes do not match `ndims`.
//
// The kernel is launched on `stream` (default: 0). No synchronization is
// performed — the caller is responsible for synchronizing the stream if
// needed before reading from `d_dest`.
void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                    size_t element_bytes, int ndims,
                    const std::vector<int>& input_shape,
                    const std::vector<int>& permutation,
                    cudaStream_t stream = 0);

} // namespace dalotia

#endif // DALOTIA_WITH_CUDA
Loading
Loading