From 51464b689130eeff8b91aeb9fde7f47a6f447de4 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 06:58:44 +0000
Subject: [PATCH 01/11] permute gpu: first version (no type conversion!)

---
 src/CMakeLists.txt          |  2 +
 src/dalotia_permute_gpu.cu  | 88 +++++++++++++++++++++++++++++++++++++
 src/dalotia_permute_gpu.cuh | 32 ++++++++++++++
 src/dalotia_tensor_file.cpp | 85 +++++++++++++++++++++++++----------
 src/dalotia_tensor_file.hpp | 17 +++++--
 test/test_cufile.cpp        | 40 +++++++++++++++++
 6 files changed, 238 insertions(+), 26 deletions(-)
 create mode 100644 src/dalotia_permute_gpu.cu
 create mode 100644 src/dalotia_permute_gpu.cuh
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1d6e1cd..326c3de 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -29,6 +29,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP)
 if (DALOTIA_WITH_CUDA)
     target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart)
     target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA")
+    target_sources(dalotia_cpp PRIVATE dalotia_permute_gpu.cu)
+    set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA)
 endif (DALOTIA_WITH_CUDA)
 
 if (DALOTIA_WITH_CUFILE)
diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu
new file mode 100644
index 0000000..3d1d8d2
--- /dev/null
+++ b/src/dalotia_permute_gpu.cu
@@ -0,0 +1,88 @@
+#ifdef DALOTIA_WITH_CUDA
+
+#include "dalotia_permute_gpu.cuh"
+#include "dalotia_assignment.hpp"
+
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <string>
+
+namespace dalotia {
+
+// Fixed-size stride arrays passed by value as kernel arguments.
+// At most 8 dimensions × 8 bytes = 64 bytes per array — well within
+// the 4 KB kernel argument limit, and avoids device memory allocation.
+struct PermuteStrides {
+    size_t input[kMaxPermuteDims];
+    size_t permuted[kMaxPermuteDims];
+};
+
+__global__ void permute_kernel(const char *__restrict__ src,
+                               char *__restrict__ dest, size_t total_elements,
+                               size_t element_bytes, int ndims,
+                               PermuteStrides strides) {
+    size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
+    if (idx >= total_elements) return;
+
+    size_t dest_idx = 0;
+    size_t remaining = idx;
+    for (int d = 0; d < ndims; ++d) {
+        size_t coord = remaining / strides.input[d];
+        remaining -= coord * strides.input[d];
+        dest_idx += coord * strides.permuted[d];
+    }
+
+    const char *src_ptr = src + idx * element_bytes;
+    char *dest_ptr = dest + dest_idx * element_bytes;
+    for (size_t b = 0; b < element_bytes; ++b) {
+        dest_ptr[b] = src_ptr[b];
+    }
+}
+
+void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
+                    size_t element_bytes, int ndims,
+                    const std::vector<int> &input_shape,
+                    const std::vector<int> &permutation,
+                    cudaStream_t stream) {
+    if (ndims <= 0 || ndims > kMaxPermuteDims) {
+        throw std::runtime_error(
+            "permute_on_gpu: unsupported number of dimensions: " +
+            std::to_string(ndims));
+    }
+    if (static_cast<int>(input_shape.size()) != ndims ||
+        static_cast<int>(permutation.size()) != ndims) {
+        throw std::runtime_error(
+            "permute_on_gpu: input_shape/permutation size mismatch");
+    }
+
+    auto [input_strides_vec, permuted_strides_vec, total_size] =
+        compute_permute_strides(ndims, input_shape.data(), permutation.data());
+
+    // Copy into fixed-size struct for pass-by-value kernel argument.
+    PermuteStrides strides{};
+    for (int d = 0; d < ndims; ++d) {
+        strides.input[d] = input_strides_vec[d];
+        strides.permuted[d] = permuted_strides_vec[d];
+    }
+
+    // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
+    // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
+    // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
+    constexpr int block_size = 256;
+    int grid_size =
+        static_cast<int>((total_elements + block_size - 1) / block_size);
+    permute_kernel<<<grid_size, block_size, 0, stream>>>(
+        static_cast<const char *>(d_src), static_cast<char *>(d_dest),
+        total_elements, element_bytes, ndims, strides);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(
+            std::string("permute_on_gpu: kernel launch failed: ") +
+            cudaGetErrorString(err));
+    }
+}
+
+}  // namespace dalotia
+
+#endif  // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh
new file mode 100644
index 0000000..5560982
--- /dev/null
+++ b/src/dalotia_permute_gpu.cuh
@@ -0,0 +1,32 @@
+#pragma once
+
+#ifdef DALOTIA_WITH_CUDA
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+namespace dalotia {
+
+static constexpr int kMaxPermuteDims = 8;
+
+// Permute tensor data on device. `d_src` and `d_dest` are device pointers.
+// `d_src` contains elements in the original (C-order) layout.
+// `d_dest` will contain elements in the permuted layout.
+// `input_shape` and `permutation` are host vectors with `ndims` entries.
+// `element_bytes` is the size of one element (e.g. 4 for float32).
+//
+// The kernel is launched on `stream` (default: 0). No synchronization is
+// performed — the caller is responsible for synchronizing the stream if
+// needed before reading from `d_dest`.
+void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
+                    size_t element_bytes, int ndims,
+                    const std::vector<int> &input_shape,
+                    const std::vector<int> &permutation,
+                    cudaStream_t stream = 0);
+
+}  // namespace dalotia
+
+#endif  // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp
index 234deb8..2ea5a2b 100644
--- a/src/dalotia_tensor_file.cpp
+++ b/src/dalotia_tensor_file.cpp
@@ -2,6 +2,7 @@
 
 #ifdef DALOTIA_WITH_CUDA
 #include <cuda_runtime.h>
+#include "dalotia_permute_gpu.cuh"
 #endif
 #ifdef DALOTIA_WITH_CUFILE
 #include "dalotia_cufile.hpp"
@@ -28,49 +29,87 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name,
                                    dalotia_WeightFormat weightFormat,
                                    dalotia_Ordering ordering,
                                    dalotia_byte* __restrict__ tensor,
-                                   const std::vector<int>& permutation) {
-    // Normalize the permutation once: convert from user-supplied
-    // (possibly 1-indexed, possibly F-ordering) to 0-indexed C-order.
+                                   const std::vector<int>& permutation
+#ifdef DALOTIA_WITH_CUDA
+                                   ,
+                                   cudaStream_t stream
+#endif
+) {
     auto final_perm = final_c_permutation_from_permutation_and_order(
         permutation, ordering, this->get_num_dimensions(tensor_name));
 
 #ifdef DALOTIA_WITH_CUDA
     if (is_device_pointer(tensor)) {
-        if (!final_perm.empty()) {
+        auto info = this->get_tensor_info(tensor_name);
+        if (info.format != weightFormat) {
             throw std::runtime_error(
-                "load_tensor_dense: permutation to device memory is not "
-                "supported; transpose on-device after loading instead.");
+                "load_tensor_dense: format conversion to device memory is "
+                "not yet supported (file format " +
+                std::to_string(info.format) + " != requested " +
+                std::to_string(weightFormat) +
+                "); convert on-device after loading instead.");
         }
 
-        auto extents = this->get_tensor_extents(tensor_name);
+        bool needs_permute = !final_perm.empty();
+
+        auto input_extents = this->get_tensor_extents(tensor_name);
         auto total_elements =
-            std::accumulate(extents.begin(), extents.end(), size_t{1},
-                            std::multiplies<size_t>());
-        size_t nbytes = total_elements * sizeof_weight_format(weightFormat);
+            std::accumulate(input_extents.begin(), input_extents.end(),
+                            size_t{1}, std::multiplies<size_t>());
+        size_t element_bytes = sizeof_weight_format(weightFormat);
+        size_t nbytes = total_elements * element_bytes;
 
+        dalotia_byte* d_raw = tensor;
+        dalotia_byte* d_tmp = nullptr;
+        if (needs_permute) {
+            cudaError_t alloc_err = cudaMallocAsync(&d_tmp, nbytes, stream);
+            if (alloc_err != cudaSuccess) {
+                throw std::runtime_error(
+                    std::string("load_tensor_dense: cudaMallocAsync for "
+                                "permutation temp buffer failed: ") +
+                    cudaGetErrorString(alloc_err));
+            }
+            d_raw = d_tmp;
+        }
+
+        bool loaded = false;
 #ifdef DALOTIA_WITH_CUFILE
         if (gpu_data_source_ && data_source_ && data_source_->host_data(0)) {
-            // GDS path: read directly from file to device memory.
             auto ptrs = this->get_mmap_tensor_pointers(tensor_name);
             if (!ptrs.empty()) {
                 size_t offset = reinterpret_cast<const uint8_t*>(ptrs[0]) -
                                 data_source_->host_data(0);
-                gpu_data_source_->read_into(offset, nbytes, tensor);
-                return;
+                gpu_data_source_->read_into(offset, nbytes, d_raw);
+                loaded = true;
             }
         }
 #endif  // DALOTIA_WITH_CUFILE
+        if (!loaded) {
+            // Fallback: load to host buffer, then cudaMemcpyAsync.
+            std::vector<dalotia_byte> host_buf(nbytes);
+            load_tensor_dense_impl(tensor_name, weightFormat,
+                                   dalotia_C_ordering, host_buf.data(), {});
+            cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes,
+                                              cudaMemcpyHostToDevice, stream);
+            if (err != cudaSuccess) {
+                if (d_tmp)
+                    cudaFreeAsync(d_tmp, stream);
+                throw std::runtime_error(
+                    std::string("load_tensor_dense: cudaMemcpy failed: ") +
+                    cudaGetErrorString(err));
+            }
+            // Must synchronize before host_buf goes out of scope,
+            // since cudaMemcpyAsync reads from it asynchronously.
+            cudaStreamSynchronize(stream);
+        }
 
-        // Fallback: load to a temporary host buffer, then cudaMemcpy.
-        std::vector<dalotia_byte> host_buf(nbytes);
-        load_tensor_dense_impl(tensor_name, weightFormat, ordering,
-                               host_buf.data(), {});
-        cudaError_t err =
-            cudaMemcpy(tensor, host_buf.data(), nbytes, cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            throw std::runtime_error(
-                std::string("load_tensor_dense: cudaMemcpy failed: ") +
-                cudaGetErrorString(err));
+        if (needs_permute) {
+            permute_on_gpu(d_tmp, tensor, total_elements, element_bytes,
+                           static_cast<int>(input_extents.size()),
+                           input_extents, final_perm, stream);
+            // Synchronize before freeing the temp buffer.
+            cudaStreamSynchronize(stream);
+            cudaFreeAsync(d_tmp, stream);
         }
         return;
     }
diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp
index 39dad90..d1ebc3b 100644
--- a/src/dalotia_tensor_file.hpp
+++ b/src/dalotia_tensor_file.hpp
@@ -16,6 +16,10 @@
 #include "dalotia_assignment.hpp"
 #include "dalotia_datasource.hpp"
 
+#ifdef DALOTIA_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+
 namespace dalotia {
 
 #ifdef DALOTIA_WITH_CUDA
@@ -84,7 +88,8 @@ class TensorFile {
                                std::multiplies<size_t>());
     }
 
-    [[nodiscard]] virtual size_t get_nnz(const std::string &/* tensor_name*/) const {
+    [[nodiscard]] virtual size_t get_nnz(
+        const std::string& /* tensor_name*/) const {
         // This function will read the file and return the number of non-zero
         // elements ? may take a while for dense tensors, only allow for sparse?
         throw std::runtime_error(
@@ -93,7 +98,8 @@ class TensorFile {
     }
 
     [[nodiscard]] virtual std::vector<int> get_sparse_tensor_extents(
-        const std::string &/*tensor_name*/, dalotia_SparseFormat /*format*/) const {
+        const std::string& /*tensor_name*/,
+        dalotia_SparseFormat /*format*/) const {
         // This function will (lazily) read the file and return the tensor
         // extents
         throw std::runtime_error(
@@ -105,7 +111,12 @@ class TensorFile {
                            dalotia_WeightFormat weightFormat,
                            dalotia_Ordering ordering,
                            dalotia_byte* __restrict__ tensor,
-                           const std::vector<int>& permutation = {});
+                           const std::vector<int>& permutation = {}
+#ifdef DALOTIA_WITH_CUDA
+                           ,
+                           cudaStream_t stream = 0
+#endif
+    );
 
     template <typename value_type = dalotia_byte>  //? or have no defaults?
     [[nodiscard]] std::pair<std::vector<int>, dalotia::vector<value_type>>
diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp
index 58a0605..3f108c7 100644
--- a/test/test_cufile.cpp
+++ b/test/test_cufile.cpp
@@ -211,6 +211,45 @@ void test_same_file_host_and_gpu() {
     std::cout << "OK" << std::endl;
 }
 
+void test_permuted_load_to_gpu() {
+    std::cout << "test_permuted_load_to_gpu... " << std::flush;
+
+    // The test model has "embedding_firstchanged" with shape [4,3,5].
+    // Permutation [1,0,2] gives shape [3,4,5] with values 0..59.
+    const char* perm_tensor = "embedding_firstchanged";
+    std::vector<int> perm = {1, 0, 2};
+
+    // Load with permutation on CPU as reference
+    auto file = std::unique_ptr<dalotia::TensorFile>(
+        dalotia::make_tensor_file(TEST_FILE));
+    auto [extents_ref, h_ref] = file->load_tensor_dense<double>(
+        perm_tensor, FORMAT, dalotia_C_ordering, perm);
+    assert(extents_ref == std::vector<int>({3, 4, 5}));
+    assert(h_ref.size() == NUM_ELEMENTS);
+    for (int i = 0; i < NUM_ELEMENTS; i++) {
+        assert(h_ref[i] == static_cast<double>(i));
+    }
+
+    // Now load with permutation directly to GPU
+    const size_t nbytes = NUM_ELEMENTS * sizeof(double);
+    double* d_tensor = nullptr;
+    CHECK_CUDA(cudaMalloc(&d_tensor, nbytes));
+
+    file->load_tensor_dense(perm_tensor, FORMAT, dalotia_C_ordering,
+                            reinterpret_cast<dalotia_byte*>(d_tensor), perm);
+
+    // Copy back and verify
+    std::vector<double> h_result(NUM_ELEMENTS);
+    CHECK_CUDA(
+        cudaMemcpy(h_result.data(), d_tensor, nbytes, cudaMemcpyDeviceToHost));
+    for (int i = 0; i < NUM_ELEMENTS; i++) {
+        assert(h_result[i] == h_ref[i]);
+    }
+
+    CHECK_CUDA(cudaFree(d_tensor));
+    std::cout << "OK" << std::endl;
+}
+
 int main() {
     test_is_device_pointer();
 #ifdef DALOTIA_WITH_CUFILE
@@ -222,6 +261,7 @@ int main() {
     test_load_to_gpu_with_driver();
 #endif
     test_same_file_host_and_gpu();
+    test_permuted_load_to_gpu();
     std::cout << "test_cufile succeeded" << std::endl;
     return 0;
 }

From 667b7e88c90947d1dbeda8fff215385a935627f4 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 07:51:10 +0000
Subject: [PATCH 02/11] permute gpu: clang-format

---
 src/dalotia_permute_gpu.cu  | 108 ++++++++++++++++++------------------
 src/dalotia_permute_gpu.cuh |   4 +-
 2 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu
index 3d1d8d2..a132a95 100644
--- a/src/dalotia_permute_gpu.cu
+++ b/src/dalotia_permute_gpu.cu
@@ -1,7 +1,7 @@
 #ifdef DALOTIA_WITH_CUDA
 
-#include "dalotia_permute_gpu.cuh"
 #include "dalotia_assignment.hpp"
+#include "dalotia_permute_gpu.cuh"
 
 #include <cuda_runtime.h>
 #include <stdexcept>
@@ -13,76 +13,76 @@ namespace dalotia {
 // At most 8 dimensions × 8 bytes = 64 bytes per array — well within
 // the 4 KB kernel argument limit, and avoids device memory allocation.
 struct PermuteStrides {
-    size_t input[kMaxPermuteDims];
-    size_t permuted[kMaxPermuteDims];
+  size_t input[kMaxPermuteDims];
+  size_t permuted[kMaxPermuteDims];
 };
 
 __global__ void permute_kernel(const char *__restrict__ src,
                                char *__restrict__ dest, size_t total_elements,
                                size_t element_bytes, int ndims,
                                PermuteStrides strides) {
-    size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
-    if (idx >= total_elements) return;
+  size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
+  if (idx >= total_elements)
+    return;
 
-    size_t dest_idx = 0;
-    size_t remaining = idx;
-    for (int d = 0; d < ndims; ++d) {
-        size_t coord = remaining / strides.input[d];
-        remaining -= coord * strides.input[d];
-        dest_idx += coord * strides.permuted[d];
-    }
+  size_t dest_idx = 0;
+  size_t remaining = idx;
+  for (int d = 0; d < ndims; ++d) {
+    size_t coord = remaining / strides.input[d];
+    remaining -= coord * strides.input[d];
+    dest_idx += coord * strides.permuted[d];
+  }
 
-    const char *src_ptr = src + idx * element_bytes;
-    char *dest_ptr = dest + dest_idx * element_bytes;
-    for (size_t b = 0; b < element_bytes; ++b) {
-        dest_ptr[b] = src_ptr[b];
-    }
+  const char *src_ptr = src + idx * element_bytes;
+  char *dest_ptr = dest + dest_idx * element_bytes;
+  for (size_t b = 0; b < element_bytes; ++b) {
+    dest_ptr[b] = src_ptr[b];
+  }
 }
 
 void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
                     size_t element_bytes, int ndims,
                     const std::vector<int> &input_shape,
-                    const std::vector<int> &permutation,
-                    cudaStream_t stream) {
-    if (ndims <= 0 || ndims > kMaxPermuteDims) {
-        throw std::runtime_error(
-            "permute_on_gpu: unsupported number of dimensions: " +
-            std::to_string(ndims));
-    }
-    if (static_cast<int>(input_shape.size()) != ndims ||
-        static_cast<int>(permutation.size()) != ndims) {
-        throw std::runtime_error(
-            "permute_on_gpu: input_shape/permutation size mismatch");
-    }
+                    const std::vector<int> &permutation, cudaStream_t stream) {
+  if (ndims <= 0 || ndims > kMaxPermuteDims) {
+    throw std::runtime_error(
+        "permute_on_gpu: unsupported number of dimensions: " +
+        std::to_string(ndims));
+  }
+  if (static_cast<int>(input_shape.size()) != ndims ||
+      static_cast<int>(permutation.size()) != ndims) {
+    throw std::runtime_error(
+        "permute_on_gpu: input_shape/permutation size mismatch");
+  }
 
-    auto [input_strides_vec, permuted_strides_vec, total_size] =
-        compute_permute_strides(ndims, input_shape.data(), permutation.data());
+  auto [input_strides_vec, permuted_strides_vec, total_size] =
+      compute_permute_strides(ndims, input_shape.data(), permutation.data());
 
-    // Copy into fixed-size struct for pass-by-value kernel argument.
-    PermuteStrides strides{};
-    for (int d = 0; d < ndims; ++d) {
-        strides.input[d] = input_strides_vec[d];
-        strides.permuted[d] = permuted_strides_vec[d];
-    }
+  // Copy into fixed-size struct for pass-by-value kernel argument.
+  PermuteStrides strides{};
+  for (int d = 0; d < ndims; ++d) {
+    strides.input[d] = input_strides_vec[d];
+    strides.permuted[d] = permuted_strides_vec[d];
+  }
 
-    // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
-    // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
-    // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
-    constexpr int block_size = 256;
-    int grid_size =
-        static_cast<int>((total_elements + block_size - 1) / block_size);
-    permute_kernel<<<grid_size, block_size, 0, stream>>>(
-        static_cast<const char *>(d_src), static_cast<char *>(d_dest),
-        total_elements, element_bytes, ndims, strides);
+  // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
+  // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
+  // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
+  constexpr int block_size = 256;
+  int grid_size =
+      static_cast<int>((total_elements + block_size - 1) / block_size);
+  permute_kernel<<<grid_size, block_size, 0, stream>>>(
+      static_cast<const char *>(d_src), static_cast<char *>(d_dest),
+      total_elements, element_bytes, ndims, strides);
 
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        throw std::runtime_error(
-            std::string("permute_on_gpu: kernel launch failed: ") +
-            cudaGetErrorString(err));
-    }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    throw std::runtime_error(
+        std::string("permute_on_gpu: kernel launch failed: ") +
+        cudaGetErrorString(err));
+  }
 }
 
-}  // namespace dalotia
+} // namespace dalotia
 
-#endif  // DALOTIA_WITH_CUDA
+#endif // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh
index 5560982..a2b336f 100644
--- a/src/dalotia_permute_gpu.cuh
+++ b/src/dalotia_permute_gpu.cuh
@@ -27,6 +27,6 @@ void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
                     const std::vector<int> &permutation,
                     cudaStream_t stream = 0);
 
-}  // namespace dalotia
+} // namespace dalotia
 
-#endif  // DALOTIA_WITH_CUDA
+#endif // DALOTIA_WITH_CUDA

From 1759cc77c50a828089a17f090ccf95562c7ac2e8 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 07:57:13 +0000
Subject: [PATCH 03/11] permute gpu: clang-format

---
 src/dalotia_permute_gpu.cu  | 114 ++++++++++++++++++------------------
 src/dalotia_permute_gpu.cuh |  10 ++--
 2 files changed, 62 insertions(+), 62 deletions(-)

diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu
index a132a95..886a489 100644
--- a/src/dalotia_permute_gpu.cu
+++ b/src/dalotia_permute_gpu.cu
@@ -13,76 +13,76 @@ namespace dalotia {
 // At most 8 dimensions × 8 bytes = 64 bytes per array — well within
 // the 4 KB kernel argument limit, and avoids device memory allocation.
 struct PermuteStrides {
-  size_t input[kMaxPermuteDims];
-  size_t permuted[kMaxPermuteDims];
+    size_t input[kMaxPermuteDims];
+    size_t permuted[kMaxPermuteDims];
 };
 
-__global__ void permute_kernel(const char *__restrict__ src,
-                               char *__restrict__ dest, size_t total_elements,
+__global__ void permute_kernel(const char* __restrict__ src,
+                               char* __restrict__ dest, size_t total_elements,
                                size_t element_bytes, int ndims,
                                PermuteStrides strides) {
-  size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
-  if (idx >= total_elements)
-    return;
+    size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
+    if (idx >= total_elements)
+        return;
 
-  size_t dest_idx = 0;
-  size_t remaining = idx;
-  for (int d = 0; d < ndims; ++d) {
-    size_t coord = remaining / strides.input[d];
-    remaining -= coord * strides.input[d];
-    dest_idx += coord * strides.permuted[d];
-  }
+    size_t dest_idx = 0;
+    size_t remaining = idx;
+    for (int d = 0; d < ndims; ++d) {
+        size_t coord = remaining / strides.input[d];
+        remaining -= coord * strides.input[d];
+        dest_idx += coord * strides.permuted[d];
+    }
 
-  const char *src_ptr = src + idx * element_bytes;
-  char *dest_ptr = dest + dest_idx * element_bytes;
-  for (size_t b = 0; b < element_bytes; ++b) {
-    dest_ptr[b] = src_ptr[b];
-  }
+    const char* src_ptr = src + idx * element_bytes;
+    char* dest_ptr = dest + dest_idx * element_bytes;
+    for (size_t b = 0; b < element_bytes; ++b) {
+        dest_ptr[b] = src_ptr[b];
+    }
 }
 
-void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
+void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                     size_t element_bytes, int ndims,
-                    const std::vector<int> &input_shape,
-                    const std::vector<int> &permutation, cudaStream_t stream) {
-  if (ndims <= 0 || ndims > kMaxPermuteDims) {
-    throw std::runtime_error(
-        "permute_on_gpu: unsupported number of dimensions: " +
-        std::to_string(ndims));
-  }
-  if (static_cast<int>(input_shape.size()) != ndims ||
-      static_cast<int>(permutation.size()) != ndims) {
-    throw std::runtime_error(
-        "permute_on_gpu: input_shape/permutation size mismatch");
-  }
+                    const std::vector<int>& input_shape,
+                    const std::vector<int>& permutation, cudaStream_t stream) {
+    if (ndims <= 0 || ndims > kMaxPermuteDims) {
+        throw std::runtime_error(
+            "permute_on_gpu: unsupported number of dimensions: " +
+            std::to_string(ndims));
+    }
+    if (static_cast<int>(input_shape.size()) != ndims ||
+        static_cast<int>(permutation.size()) != ndims) {
+        throw std::runtime_error(
+            "permute_on_gpu: input_shape/permutation size mismatch");
+    }
 
-  auto [input_strides_vec, permuted_strides_vec, total_size] =
-      compute_permute_strides(ndims, input_shape.data(), permutation.data());
+    auto [input_strides_vec, permuted_strides_vec, total_size] =
+        compute_permute_strides(ndims, input_shape.data(), permutation.data());
 
-  // Copy into fixed-size struct for pass-by-value kernel argument.
-  PermuteStrides strides{};
-  for (int d = 0; d < ndims; ++d) {
-    strides.input[d] = input_strides_vec[d];
-    strides.permuted[d] = permuted_strides_vec[d];
-  }
+    // Copy into fixed-size struct for pass-by-value kernel argument.
+    PermuteStrides strides{};
+    for (int d = 0; d < ndims; ++d) {
+        strides.input[d] = input_strides_vec[d];
+        strides.permuted[d] = permuted_strides_vec[d];
+    }
 
-  // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
-  // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
-  // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
-  constexpr int block_size = 256;
-  int grid_size =
-      static_cast<int>((total_elements + block_size - 1) / block_size);
-  permute_kernel<<<grid_size, block_size, 0, stream>>>(
-      static_cast<const char *>(d_src), static_cast<char *>(d_dest),
-      total_elements, element_bytes, ndims, strides);
+    // TODO: for the common case of 2D float32 transpose (permutation [1,0]),
+    // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic
+    // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp.
+    constexpr int block_size = 256;
+    int grid_size =
+        static_cast<int>((total_elements + block_size - 1) / block_size);
+    permute_kernel<<<grid_size, block_size, 0, stream>>>(
+        static_cast<const char*>(d_src), static_cast<char*>(d_dest),
+        total_elements, element_bytes, ndims, strides);
 
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    throw std::runtime_error(
-        std::string("permute_on_gpu: kernel launch failed: ") +
-        cudaGetErrorString(err));
-  }
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        throw std::runtime_error(
+            std::string("permute_on_gpu: kernel launch failed: ") +
+            cudaGetErrorString(err));
+    }
 }
 
-} // namespace dalotia
+}  // namespace dalotia
 
-#endif // DALOTIA_WITH_CUDA
+#endif  // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh
index a2b336f..fdaa4f4 100644
--- a/src/dalotia_permute_gpu.cuh
+++ b/src/dalotia_permute_gpu.cuh
@@ -21,12 +21,12 @@ static constexpr int kMaxPermuteDims = 8;
 // The kernel is launched on `stream` (default: 0). No synchronization is
 // performed — the caller is responsible for synchronizing the stream if
 // needed before reading from `d_dest`.
-void permute_on_gpu(const void *d_src, void *d_dest, size_t total_elements,
+void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements,
                     size_t element_bytes, int ndims,
-                    const std::vector<int> &input_shape,
-                    const std::vector<int> &permutation,
+                    const std::vector<int>& input_shape,
+                    const std::vector<int>& permutation,
                     cudaStream_t stream = 0);
 
-} // namespace dalotia
+}  // namespace dalotia
 
-#endif // DALOTIA_WITH_CUDA
+#endif  // DALOTIA_WITH_CUDA

From 240068c192b9889b8af638571294a16b27bf6697 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 08:01:43 +0000
Subject: [PATCH 04/11] gds: RAII CudaBuffer

---
 src/CMakeLists.txt          |  2 +-
 src/dalotia_cuda_buffer.hpp | 98 +++++++++++++++++++++++++++++++++++++
 src/dalotia_tensor_file.cpp | 24 +++------
 3 files changed, 107 insertions(+), 17 deletions(-)
 create mode 100644 src/dalotia_cuda_buffer.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1dce087..c57dfc9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
 target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
 set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
-	"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp")
+	"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda_buffer.hpp")
 # have one dalotia library target that can be used in C++ and Fortran
 add_library(dalotia INTERFACE)
 add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
diff --git a/src/dalotia_cuda_buffer.hpp b/src/dalotia_cuda_buffer.hpp
new file mode 100644
index 0000000..21ea324
--- /dev/null
+++ b/src/dalotia_cuda_buffer.hpp
@@ -0,0 +1,98 @@
+#pragma once
+
+#ifdef DALOTIA_WITH_CUDA
+
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <stdexcept>
+#include <string>
+
+namespace dalotia {
+
+// RAII wrapper for a cudaMalloc'd device buffer. Move-only.
+// Type-erased (stores void*); use as<T>() for typed access.
+class CudaBuffer {
+   public:
+    CudaBuffer() = default;
+
+    explicit CudaBuffer(size_t nbytes) : size_(nbytes) {
+        if (nbytes > 0) {
+            cudaError_t err = cudaMalloc(&ptr_, nbytes);
+            if (err != cudaSuccess) {
+                throw std::runtime_error(
+                    std::string("CudaBuffer: cudaMalloc failed: ") +
+                    cudaGetErrorString(err));
+            }
+        }
+    }
+
+    CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) {
+        if (nbytes > 0) {
+            cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream);
+            if (err != cudaSuccess) {
+                throw std::runtime_error(
+                    std::string("CudaBuffer: cudaMallocAsync failed: ") +
+                    cudaGetErrorString(err));
+            }
+            stream_ = stream;
+            async_ = true;
+        }
+    }
+
+    ~CudaBuffer() {
+        if (ptr_) {
+            if (async_) {
+                cudaFreeAsync(ptr_, stream_);
+            } else {
+                cudaFree(ptr_);
+            }
+        }
+    }
+
+    // Move
+    CudaBuffer(CudaBuffer&& other) noexcept
+        : ptr_(other.ptr_)
+        , size_(other.size_)
+        , stream_(other.stream_)
+        , async_(other.async_) {
+        other.ptr_ = nullptr;
+        other.size_ = 0;
+    }
+
+    CudaBuffer& operator=(CudaBuffer&& other) noexcept {
+        if (this != &other) {
+            if (ptr_) {
+                if (async_)
+                    cudaFreeAsync(ptr_, stream_);
+                else
+                    cudaFree(ptr_);
+            }
+            ptr_ = other.ptr_;
+            size_ = other.size_;
+            stream_ = other.stream_;
+            async_ = other.async_;
+            other.ptr_ = nullptr;
+            other.size_ = 0;
+        }
+        return *this;
+    }
+
+    // No copy
+    CudaBuffer(const CudaBuffer&) = delete;
+    CudaBuffer& operator=(const CudaBuffer&) = delete;
+
+    void* data() noexcept { return ptr_; }
+    const void* data() const noexcept { return ptr_; }
+    size_t size() const noexcept { return size_; }
+    bool empty() const noexcept { return ptr_ == nullptr; }
+
+   private:
+    void* ptr_ = nullptr;
+    size_t size_ = 0;
+    cudaStream_t stream_ = 0;
+    bool async_ = false;
+};
+
+}  // namespace dalotia
+
+#endif  // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp
index 2ea5a2b..f5afffa 100644
--- a/src/dalotia_tensor_file.cpp
+++ b/src/dalotia_tensor_file.cpp
@@ -2,6 +2,7 @@
 
 #ifdef DALOTIA_WITH_CUDA
 #include <cuda_runtime.h>
+#include "dalotia_cuda_buffer.hpp"
 #include "dalotia_permute_gpu.cuh"
 #endif
 #ifdef DALOTIA_WITH_CUFILE
@@ -59,17 +60,12 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name,
         size_t element_bytes = sizeof_weight_format(weightFormat);
         size_t nbytes = total_elements * element_bytes;
 
+        // If permutation needed, load into a temp buffer then permute.
+        CudaBuffer d_tmp;
         dalotia_byte* d_raw = tensor;
-        dalotia_byte* d_tmp = nullptr;
         if (needs_permute) {
-            cudaError_t alloc_err = cudaMallocAsync(&d_tmp, nbytes, stream);
-            if (alloc_err != cudaSuccess) {
-                throw std::runtime_error(
-                    std::string("load_tensor_dense: cudaMallocAsync for "
-                                "permutation temp buffer failed: ") +
-                    cudaGetErrorString(alloc_err));
-            }
-            d_raw = d_tmp;
+            d_tmp = CudaBuffer(nbytes, stream);
+            d_raw = d_tmp.as<dalotia_byte>();
         }
 
         bool loaded = false;
@@ -92,24 +88,20 @@ void TensorFile::load_tensor_dense(const std::string& tensor_name,
             cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes,
                                               cudaMemcpyHostToDevice, stream);
             if (err != cudaSuccess) {
-                if (d_tmp)
-                    cudaFreeAsync(d_tmp, stream);
                 throw std::runtime_error(
                     std::string("load_tensor_dense: cudaMemcpy failed: ") +
                     cudaGetErrorString(err));
             }
-            // Must synchronize before host_buf goes out of scope,
-            // since cudaMemcpyAsync reads from it asynchronously.
+            // Must synchronize before host_buf goes out of scope.
             cudaStreamSynchronize(stream);
         }
 
         if (needs_permute) {
-            permute_on_gpu(d_tmp, tensor, total_elements, element_bytes,
+            permute_on_gpu(d_raw, tensor, total_elements, element_bytes,
                            static_cast<int>(input_extents.size()),
                            input_extents, final_perm, stream);
-            // Synchronize before freeing the temp buffer.
+            // Synchronize before d_tmp is freed (RAII destructor).
             cudaStreamSynchronize(stream);
-            cudaFreeAsync(d_tmp, stream);
         }
         return;
     }

From 06623860f86231a55d332f2bff525b355b97fb99 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 08:10:03 +0000
Subject: [PATCH 05/11] cuda: collect cuda things (buffer, is_device_ptr)

---
 src/CMakeLists.txt                            |  4 ++--
 src/dalotia_cuda.cpp                          | 22 +++++++++++++++++++
 ...lotia_cuda_buffer.hpp => dalotia_cuda.hpp} | 16 ++++++++++++--
 src/dalotia_tensor_file.cpp                   | 17 --------------
 src/dalotia_tensor_file.hpp                   |  8 +------
 5 files changed, 39 insertions(+), 28 deletions(-)
 create mode 100644 src/dalotia_cuda.cpp
 rename src/{dalotia_cuda_buffer.hpp => dalotia_cuda.hpp} (86%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c57dfc9..d804732 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,7 @@
 add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables
 target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp)
 set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER
-	"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda_buffer.hpp")
+	"dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp")
 # have one dalotia library target that can be used in C++ and Fortran
 add_library(dalotia INTERFACE)
 add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp)
@@ -32,7 +32,7 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP)
 if (DALOTIA_WITH_CUDA)
     target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart)
     target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA")
-    target_sources(dalotia_cpp PRIVATE dalotia_permute_gpu.cu)
+    target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu)
     set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA)
 endif (DALOTIA_WITH_CUDA)
 
diff --git a/src/dalotia_cuda.cpp b/src/dalotia_cuda.cpp
new file mode 100644
index 0000000..bc99f8c
--- /dev/null
+++ b/src/dalotia_cuda.cpp
@@ -0,0 +1,22 @@
+#ifdef DALOTIA_WITH_CUDA
+
+#include "dalotia_cuda.hpp"
+
+namespace dalotia {
+
+bool is_device_pointer(const void* ptr) noexcept {
+    if (!ptr)
+        return false;
+    cudaPointerAttributes attrs{};
+    cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
+    if (err != cudaSuccess) {
+        cudaGetLastError();
+        return false;
+    }
+    return attrs.type == cudaMemoryTypeDevice ||
+           attrs.type == cudaMemoryTypeManaged;
+}
+
+}  // namespace dalotia
+
+#endif  // DALOTIA_WITH_CUDA
diff --git a/src/dalotia_cuda_buffer.hpp b/src/dalotia_cuda.hpp
similarity index 86%
rename from src/dalotia_cuda_buffer.hpp
rename to src/dalotia_cuda.hpp
index 21ea324..2fc7fb8 100644
--- a/src/dalotia_cuda_buffer.hpp
+++ b/src/dalotia_cuda.hpp
@@ -9,6 +9,10 @@
 
 namespace dalotia {
 
+// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
+// Returns false for host pointers (including cudaMallocHost pinned memory).
+bool is_device_pointer(const void* ptr) noexcept;
+
 // RAII wrapper for a cudaMalloc'd device buffer. Move-only.
 // Type-erased (stores void*); use as<T>() for typed access.
 class CudaBuffer {
@@ -49,7 +53,6 @@ class CudaBuffer {
         }
     }
 
-    // Move
     CudaBuffer(CudaBuffer&& other) noexcept
         : ptr_(other.ptr_)
         , size_(other.size_)
@@ -77,7 +80,6 @@ class CudaBuffer {
         return *this;
     }
 
-    // No copy
     CudaBuffer(const CudaBuffer&) = delete;
     CudaBuffer& operator=(const CudaBuffer&) = delete;
 
@@ -86,6 +88,16 @@ class CudaBuffer {
     size_t size() const noexcept { return size_; }
     bool empty() const noexcept { return ptr_ == nullptr; }
 
+    template <typename T>
+    T* as() noexcept {
+        return static_cast<T*>(ptr_);
+    }
+
+    template <typename T>
+    const T* as() const noexcept {
+        return static_cast<const T*>(ptr_);
+    }
+
    private:
     void* ptr_ = nullptr;
     size_t size_ = 0;
diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp
index f5afffa..654a4b7 100644
--- a/src/dalotia_tensor_file.cpp
+++ b/src/dalotia_tensor_file.cpp
@@ -1,8 +1,6 @@
 #include "dalotia_tensor_file.hpp"
 
 #ifdef DALOTIA_WITH_CUDA
-#include <cuda_runtime.h>
-#include "dalotia_cuda_buffer.hpp"
 #include "dalotia_permute_gpu.cuh"
 #endif
 #ifdef DALOTIA_WITH_CUFILE
@@ -11,21 +9,6 @@
 
 namespace dalotia {
 
-#ifdef DALOTIA_WITH_CUDA
-bool is_device_pointer(const void* ptr) noexcept {
-    if (!ptr)
-        return false;
-    cudaPointerAttributes attrs{};
-    cudaError_t err = cudaPointerGetAttributes(&attrs, ptr);
-    if (err != cudaSuccess) {
-        cudaGetLastError();
-        return false;
-    }
-    return attrs.type == cudaMemoryTypeDevice ||
-           attrs.type == cudaMemoryTypeManaged;
-}
-#endif  // DALOTIA_WITH_CUDA
-
 void TensorFile::load_tensor_dense(const std::string& tensor_name,
                                    dalotia_WeightFormat weightFormat,
                                    dalotia_Ordering ordering,
diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp
index 0e5fc7b..34c70ba 100644
--- a/src/dalotia_tensor_file.hpp
+++ b/src/dalotia_tensor_file.hpp
@@ -17,17 +17,11 @@
 #include "dalotia_datasource.hpp"
 
 #ifdef DALOTIA_WITH_CUDA
-#include <cuda_runtime.h>
+#include "dalotia_cuda.hpp"
 #endif
 
 namespace dalotia {
 
-#ifdef DALOTIA_WITH_CUDA
-// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed).
-// Returns false for host pointers (including cudaMallocHost pinned memory).
-bool is_device_pointer(const void* ptr) noexcept;
-#endif  // DALOTIA_WITH_CUDA
-
 class TensorFile {
    public:
     explicit TensorFile(const std::string& /* filename */) {

From b11bce45039c357ff15669972eb8ab97ab9f1c5e Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Mon, 23 Mar 2026 23:49:22 +0000
Subject: [PATCH 06/11] test cufile: more with open driver

---
 test/test_cufile.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp
index 3f108c7..2f8163a 100644
--- a/test/test_cufile.cpp
+++ b/test/test_cufile.cpp
@@ -144,7 +144,6 @@ void test_load_to_gpu() {
     CHECK_CUDA(cudaFree(d_tensor));
     std::cout << "OK (via fallback)" << std::endl;
 }
-
 #ifdef DALOTIA_WITH_CUFILE
 void test_load_to_gpu_with_driver() {
     std::cout << "test_load_to_gpu_with_driver... " << std::flush;
@@ -181,6 +180,7 @@ void test_same_file_host_and_gpu() {
     // a host pointer and a device pointer, and verify both match.
     std::cout << "test_same_file_host_and_gpu... " << std::flush;
 
+    auto driver = try_open_driver();
     dalotia::SafetensorsFile file(TEST_FILE);
 
     // Load to host
@@ -214,6 +214,7 @@ void test_same_file_host_and_gpu() {
 void test_permuted_load_to_gpu() {
     std::cout << "test_permuted_load_to_gpu... " << std::flush;
 
+    auto driver = try_open_driver();
     // The test model has "embedding_firstchanged" with shape [4,3,5].
     // Permutation [1,0,2] gives shape [3,4,5] with values 0..59.
     const char* perm_tensor = "embedding_firstchanged";

From aca23fa1d4044ff716535e55ba811bd4afb4ee6d Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Tue, 24 Mar 2026 00:12:50 +0000
Subject: [PATCH 07/11] ci: verbose output on cufile tests

---
 .github/workflows/ctest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml
index ef6d8c3..95318a2 100644
--- a/.github/workflows/ctest.yml
+++ b/.github/workflows/ctest.yml
@@ -79,8 +79,8 @@ jobs:
         options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF
         build-dir: build-cuda
 
-    - name: Run non-GPU tests
+    - name: Run tests
       uses: threeal/ctest-action@v1.1.0
       with:
         test-dir: build-cuda
-        args: --output-on-failure --exclude-regex cufile-pointer-detection
+        args: --verbose --exclude-regex cufile-pointer-detection

From e4e0d485bbb80b9eb4476be02c2030e08fe759d7 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Tue, 24 Mar 2026 00:16:28 +0000
Subject: [PATCH 08/11] cmake: output on tests

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c4fbd1b..039c38d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure")
+list(APPEND CMAKE_CTEST_ARGUMENTS "--verbose")
 
 # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest
 set(MEMORYCHECK_COMMAND_OPTIONS

From a39f53f10cd9f84b616d7d912a1fb66a69af2a74 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Tue, 24 Mar 2026 00:16:59 +0000
Subject: [PATCH 09/11] cmake: output on tests (amends last commit)

---
 test/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 039c38d..9187fca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,5 +1,3 @@
-list(APPEND CMAKE_CTEST_ARGUMENTS "--verbose")
-
 # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest
 set(MEMORYCHECK_COMMAND_OPTIONS
     "--gen-suppressions=all --suppressions=${CMAKE_CURRENT_SOURCE_DIR}/tensorflow.supp --leak-check=full"

From 5dee1306a6a5b81a523cf82eecd7d7714abfc2e0 Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Tue, 24 Mar 2026 00:20:31 +0000
Subject: [PATCH 10/11] ci: verbose output on cufile tests

---
 .github/workflows/ctest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml
index 95318a2..78f3b7a 100644
--- a/.github/workflows/ctest.yml
+++ b/.github/workflows/ctest.yml
@@ -83,4 +83,4 @@ jobs:
       uses: threeal/ctest-action@v1.1.0
       with:
         test-dir: build-cuda
-        args: --verbose --exclude-regex cufile-pointer-detection
+        args: --verbose

From af13240e5254a7315d5d6261da18d49c351fb30b Mon Sep 17 00:00:00 2001
From: Theresa <theresa.pollinger@riken.jp>
Date: Tue, 24 Mar 2026 00:23:13 +0000
Subject: [PATCH 11/11] Revert "ci: verbose output on cufile tests"

This reverts commit 5dee1306a6a5b81a523cf82eecd7d7714abfc2e0.
---
 .github/workflows/ctest.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml
index 78f3b7a..95318a2 100644
--- a/.github/workflows/ctest.yml
+++ b/.github/workflows/ctest.yml
@@ -83,4 +83,4 @@ jobs:
       uses: threeal/ctest-action@v1.1.0
       with:
         test-dir: build-cuda
-        args: --verbose
+        args: --verbose --exclude-regex cufile-pointer-detection