diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml index ef6d8c3..95318a2 100644 --- a/.github/workflows/ctest.yml +++ b/.github/workflows/ctest.yml @@ -79,8 +79,8 @@ jobs: options: DALOTIA_WITH_CUFILE=ON DALOTIA_BUILD_TESTS=ON DALOTIA_WITH_FORTRAN=OFF build-dir: build-cuda - - name: Run non-GPU tests + - name: Run tests uses: threeal/ctest-action@v1.1.0 with: test-dir: build-cuda - args: --output-on-failure --exclude-regex cufile-pointer-detection + args: --verbose --exclude-regex cufile-pointer-detection diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4db3b2a..d804732 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,7 @@ add_library(dalotia_cpp dalotia.cpp) # Daniel Pfeifer says: no variables target_sources(dalotia_cpp PRIVATE dalotia_assignment.cpp dalotia_formats.cpp dalotia_datasource.cpp dalotia_tensor_file.cpp) set_target_properties(dalotia_cpp PROPERTIES PUBLIC_HEADER - "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp") + "dalotia.h;dalotia_formats.h;dalotia.hpp;dalotia_formats.hpp;dalotia_assignment.hpp;dalotia_datasource.hpp;dalotia_tensor_file.hpp;dalotia_safetensors_file.hpp;dalotia_tensorflow_file.hpp;dalotia_cufile.hpp;dalotia_cuda.hpp") # have one dalotia library target that can be used in C++ and Fortran add_library(dalotia INTERFACE) add_library(dalotia::dalotia_cpp ALIAS dalotia_cpp) @@ -32,6 +32,8 @@ endif (DALOTIA_WITH_SAFETENSORS_CPP) if (DALOTIA_WITH_CUDA) target_link_libraries(dalotia_cpp PUBLIC CUDA::cudart) target_compile_definitions(dalotia_cpp PUBLIC "DALOTIA_WITH_CUDA") + target_sources(dalotia_cpp PRIVATE dalotia_cuda.cpp dalotia_permute_gpu.cu) + set_source_files_properties(dalotia_permute_gpu.cu PROPERTIES LANGUAGE CUDA) endif (DALOTIA_WITH_CUDA) if (DALOTIA_WITH_CUFILE) diff --git a/src/dalotia_cuda.cpp b/src/dalotia_cuda.cpp new 
file mode 100644 index 0000000..bc99f8c --- /dev/null +++ b/src/dalotia_cuda.cpp @@ -0,0 +1,22 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_cuda.hpp" + +namespace dalotia { + +bool is_device_pointer(const void* ptr) noexcept { + if (!ptr) + return false; + cudaPointerAttributes attrs{}; + cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); + if (err != cudaSuccess) { + cudaGetLastError(); + return false; + } + return attrs.type == cudaMemoryTypeDevice || + attrs.type == cudaMemoryTypeManaged; +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_cuda.hpp b/src/dalotia_cuda.hpp new file mode 100644 index 0000000..2fc7fb8 --- /dev/null +++ b/src/dalotia_cuda.hpp @@ -0,0 +1,110 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include <cuda_runtime.h> +#include <cstddef> +#include <stdexcept> +#include <string> + +namespace dalotia { + +// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). +// Returns false for host pointers (including cudaMallocHost pinned memory). +bool is_device_pointer(const void* ptr) noexcept; + +// RAII wrapper for a cudaMalloc'd device buffer. Move-only. +// Type-erased (stores void*); use as<T>() for typed access. 
+class CudaBuffer { + public: + CudaBuffer() = default; + + explicit CudaBuffer(size_t nbytes) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMalloc(&ptr_, nbytes); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMalloc failed: ") + + cudaGetErrorString(err)); + } + } + } + + CudaBuffer(size_t nbytes, cudaStream_t stream) : size_(nbytes) { + if (nbytes > 0) { + cudaError_t err = cudaMallocAsync(&ptr_, nbytes, stream); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("CudaBuffer: cudaMallocAsync failed: ") + + cudaGetErrorString(err)); + } + stream_ = stream; + async_ = true; + } + } + + ~CudaBuffer() { + if (ptr_) { + if (async_) { + cudaFreeAsync(ptr_, stream_); + } else { + cudaFree(ptr_); + } + } + } + + CudaBuffer(CudaBuffer&& other) noexcept + : ptr_(other.ptr_) + , size_(other.size_) + , stream_(other.stream_) + , async_(other.async_) { + other.ptr_ = nullptr; + other.size_ = 0; + } + + CudaBuffer& operator=(CudaBuffer&& other) noexcept { + if (this != &other) { + if (ptr_) { + if (async_) + cudaFreeAsync(ptr_, stream_); + else + cudaFree(ptr_); + } + ptr_ = other.ptr_; + size_ = other.size_; + stream_ = other.stream_; + async_ = other.async_; + other.ptr_ = nullptr; + other.size_ = 0; + } + return *this; + } + + CudaBuffer(const CudaBuffer&) = delete; + CudaBuffer& operator=(const CudaBuffer&) = delete; + + void* data() noexcept { return ptr_; } + const void* data() const noexcept { return ptr_; } + size_t size() const noexcept { return size_; } + bool empty() const noexcept { return ptr_ == nullptr; } + + template <typename T> + T* as() noexcept { + return static_cast<T*>(ptr_); + } + + template <typename T> + const T* as() const noexcept { + return static_cast<const T*>(ptr_); + } + + private: + void* ptr_ = nullptr; + size_t size_ = 0; + cudaStream_t stream_ = 0; + bool async_ = false; +}; + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cu b/src/dalotia_permute_gpu.cu new 
file mode 100644 index 0000000..886a489 --- /dev/null +++ b/src/dalotia_permute_gpu.cu @@ -0,0 +1,88 @@ +#ifdef DALOTIA_WITH_CUDA + +#include "dalotia_assignment.hpp" +#include "dalotia_permute_gpu.cuh" + +#include <cstddef> +#include <stdexcept> +#include <string> + +namespace dalotia { + +// Fixed-size stride arrays passed by value as kernel arguments. +// At most 8 dimensions × 8 bytes = 64 bytes per array — well within +// the 4 KB kernel argument limit, and avoids device memory allocation. +struct PermuteStrides { + size_t input[kMaxPermuteDims]; + size_t permuted[kMaxPermuteDims]; +}; + +__global__ void permute_kernel(const char* __restrict__ src, + char* __restrict__ dest, size_t total_elements, + size_t element_bytes, int ndims, + PermuteStrides strides) { + size_t idx = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x; + if (idx >= total_elements) + return; + + size_t dest_idx = 0; + size_t remaining = idx; + for (int d = 0; d < ndims; ++d) { + size_t coord = remaining / strides.input[d]; + remaining -= coord * strides.input[d]; + dest_idx += coord * strides.permuted[d]; + } + + const char* src_ptr = src + idx * element_bytes; + char* dest_ptr = dest + dest_idx * element_bytes; + for (size_t b = 0; b < element_bytes; ++b) { + dest_ptr[b] = src_ptr[b]; + } +} + +void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector<size_t>& input_shape, + const std::vector<int>& permutation, cudaStream_t stream) { + if (ndims <= 0 || ndims > kMaxPermuteDims) { + throw std::runtime_error( + "permute_on_gpu: unsupported number of dimensions: " + + std::to_string(ndims)); + } + if (static_cast<int>(input_shape.size()) != ndims || + static_cast<int>(permutation.size()) != ndims) { + throw std::runtime_error( + "permute_on_gpu: input_shape/permutation size mismatch"); + } + + auto [input_strides_vec, permuted_strides_vec, total_size] = + compute_permute_strides(ndims, input_shape.data(), permutation.data()); + + // Copy into fixed-size struct for 
pass-by-value kernel argument. + PermuteStrides strides{}; + for (int d = 0; d < ndims; ++d) { + strides.input[d] = input_strides_vec[d]; + strides.permuted[d] = permuted_strides_vec[d]; + } + + // TODO: for the common case of 2D float32 transpose (permutation [1,0]), + // cublasSgeam with CUBLAS_OP_T is significantly faster than this generic + // kernel. Would require adding cuBLAS as a dependency of dalotia_cpp. + constexpr int block_size = 256; + int grid_size = + static_cast<int>((total_elements + block_size - 1) / block_size); + permute_kernel<<<grid_size, block_size, 0, stream>>>( + static_cast<const char*>(d_src), static_cast<char*>(d_dest), + total_elements, element_bytes, ndims, strides); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("permute_on_gpu: kernel launch failed: ") + + cudaGetErrorString(err)); + } +} + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_permute_gpu.cuh b/src/dalotia_permute_gpu.cuh new file mode 100644 index 0000000..fdaa4f4 --- /dev/null +++ b/src/dalotia_permute_gpu.cuh @@ -0,0 +1,32 @@ +#pragma once + +#ifdef DALOTIA_WITH_CUDA + +#include <cstddef> +#include <cstdint> +#include <vector> + +#include <cuda_runtime.h> + +namespace dalotia { + +static constexpr int kMaxPermuteDims = 8; + +// Permute tensor data on device. `d_src` and `d_dest` are device pointers. +// `d_src` contains elements in the original (C-order) layout. +// `d_dest` will contain elements in the permuted layout. +// `input_shape` and `permutation` are host vectors with `ndims` entries. +// `element_bytes` is the size of one element (e.g. 4 for float32). +// +// The kernel is launched on `stream` (default: 0). No synchronization is +// performed — the caller is responsible for synchronizing the stream if +// needed before reading from `d_dest`. 
+void permute_on_gpu(const void* d_src, void* d_dest, size_t total_elements, + size_t element_bytes, int ndims, + const std::vector<size_t>& input_shape, + const std::vector<int>& permutation, + cudaStream_t stream = 0); + +} // namespace dalotia + +#endif // DALOTIA_WITH_CUDA diff --git a/src/dalotia_tensor_file.cpp b/src/dalotia_tensor_file.cpp index 234deb8..654a4b7 100644 --- a/src/dalotia_tensor_file.cpp +++ b/src/dalotia_tensor_file.cpp @@ -1,7 +1,7 @@ #include "dalotia_tensor_file.hpp" #ifdef DALOTIA_WITH_CUDA -#include <cuda_runtime.h> +#include "dalotia_permute_gpu.cuh" #endif #ifdef DALOTIA_WITH_CUFILE #include "dalotia_cufile.hpp" @@ -9,68 +9,82 @@ namespace dalotia { -#ifdef DALOTIA_WITH_CUDA -bool is_device_pointer(const void* ptr) noexcept { - if (!ptr) - return false; - cudaPointerAttributes attrs{}; - cudaError_t err = cudaPointerGetAttributes(&attrs, ptr); - if (err != cudaSuccess) { - cudaGetLastError(); - return false; - } - return attrs.type == cudaMemoryTypeDevice || - attrs.type == cudaMemoryTypeManaged; -} -#endif // DALOTIA_WITH_CUDA - void TensorFile::load_tensor_dense(const std::string& tensor_name, dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector<int>& permutation) { - // Normalize the permutation once: convert from user-supplied - // (possibly 1-indexed, possibly F-ordering) to 0-indexed C-order. 
+ const std::vector<int>& permutation +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream +#endif +) { auto final_perm = final_c_permutation_from_permutation_and_order( permutation, ordering, this->get_num_dimensions(tensor_name)); #ifdef DALOTIA_WITH_CUDA if (is_device_pointer(tensor)) { - if (!final_perm.empty()) { + auto info = this->get_tensor_info(tensor_name); + if (info.format != weightFormat) { throw std::runtime_error( - "load_tensor_dense: permutation to device memory is not " - "supported; transpose on-device after loading instead."); + "load_tensor_dense: format conversion to device memory is " + "not yet supported (file format " + + std::to_string(info.format) + " != requested " + + std::to_string(weightFormat) + + "); convert on-device after loading instead."); } - auto extents = this->get_tensor_extents(tensor_name); + bool needs_permute = !final_perm.empty(); + + auto input_extents = this->get_tensor_extents(tensor_name); auto total_elements = - std::accumulate(extents.begin(), extents.end(), size_t{1}, - std::multiplies<size_t>()); - size_t nbytes = total_elements * sizeof_weight_format(weightFormat); + std::accumulate(input_extents.begin(), input_extents.end(), + size_t{1}, std::multiplies<size_t>()); + size_t element_bytes = sizeof_weight_format(weightFormat); + size_t nbytes = total_elements * element_bytes; + + // If permutation needed, load into a temp buffer then permute. + CudaBuffer d_tmp; + dalotia_byte* d_raw = tensor; + if (needs_permute) { + d_tmp = CudaBuffer(nbytes, stream); + d_raw = d_tmp.as<dalotia_byte>(); + } + bool loaded = false; #ifdef DALOTIA_WITH_CUFILE if (gpu_data_source_ && data_source_ && data_source_->host_data(0)) { - // GDS path: read directly from file to device memory. 
auto ptrs = this->get_mmap_tensor_pointers(tensor_name); if (!ptrs.empty()) { size_t offset = reinterpret_cast<const dalotia_byte*>(ptrs[0]) - data_source_->host_data(0); - gpu_data_source_->read_into(offset, nbytes, tensor); - return; + gpu_data_source_->read_into(offset, nbytes, d_raw); + loaded = true; } } #endif // DALOTIA_WITH_CUFILE + if (!loaded) { + // Fallback: load to host buffer, then cudaMemcpyAsync. + std::vector<dalotia_byte> host_buf(nbytes); + load_tensor_dense_impl(tensor_name, weightFormat, + dalotia_C_ordering, host_buf.data(), {}); + cudaError_t err = cudaMemcpyAsync(d_raw, host_buf.data(), nbytes, + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + throw std::runtime_error( + std::string("load_tensor_dense: cudaMemcpy failed: ") + + cudaGetErrorString(err)); + } + // Must synchronize before host_buf goes out of scope. + cudaStreamSynchronize(stream); + } - // Fallback: load to a temporary host buffer, then cudaMemcpy. - std::vector<dalotia_byte> host_buf(nbytes); - load_tensor_dense_impl(tensor_name, weightFormat, ordering, - host_buf.data(), {}); - cudaError_t err = - cudaMemcpy(tensor, host_buf.data(), nbytes, cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - throw std::runtime_error( - std::string("load_tensor_dense: cudaMemcpy failed: ") + - cudaGetErrorString(err)); + if (needs_permute) { + permute_on_gpu(d_raw, tensor, total_elements, element_bytes, + static_cast<int>(input_extents.size()), + input_extents, final_perm, stream); + // Synchronize before d_tmp is freed (RAII destructor). + cudaStreamSynchronize(stream); } return; } diff --git a/src/dalotia_tensor_file.hpp b/src/dalotia_tensor_file.hpp index ccb7583..34c70ba 100644 --- a/src/dalotia_tensor_file.hpp +++ b/src/dalotia_tensor_file.hpp @@ -16,13 +16,11 @@ #include "dalotia_assignment.hpp" #include "dalotia_datasource.hpp" -namespace dalotia { - #ifdef DALOTIA_WITH_CUDA -// Returns true if `ptr` is a CUDA device pointer (cudaMalloc'd or managed). 
-// Returns false for host pointers (including cudaMallocHost pinned memory). -bool is_device_pointer(const void* ptr) noexcept; -#endif // DALOTIA_WITH_CUDA +#include "dalotia_cuda.hpp" +#endif + +namespace dalotia { class TensorFile { public: @@ -109,7 +107,12 @@ class TensorFile { dalotia_WeightFormat weightFormat, dalotia_Ordering ordering, dalotia_byte* __restrict__ tensor, - const std::vector<int>& permutation = {}); + const std::vector<int>& permutation = {} +#ifdef DALOTIA_WITH_CUDA + , + cudaStream_t stream = 0 +#endif + ); template <typename T> //? or have no defaults? [[nodiscard]] std::pair<std::vector<size_t>, dalotia::vector<T>> diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4fbd1b..9187fca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,5 +1,3 @@ -list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") - # cf. https://stackoverflow.com/questions/52730994/how-to-pass-arguments-to-memcheck-with-ctest set(MEMORYCHECK_COMMAND_OPTIONS "--gen-suppressions=all --suppressions=${CMAKE_CURRENT_SOURCE_DIR}/tensorflow.supp --leak-check=full" diff --git a/test/test_cufile.cpp b/test/test_cufile.cpp index 58a0605..2f8163a 100644 --- a/test/test_cufile.cpp +++ b/test/test_cufile.cpp @@ -144,7 +144,6 @@ void test_load_to_gpu() { CHECK_CUDA(cudaFree(d_tensor)); std::cout << "OK (via fallback)" << std::endl; } - #ifdef DALOTIA_WITH_CUFILE void test_load_to_gpu_with_driver() { std::cout << "test_load_to_gpu_with_driver... " << std::flush; @@ -181,6 +180,7 @@ void test_same_file_host_and_gpu() { // a host pointer and a device pointer, and verify both match. std::cout << "test_same_file_host_and_gpu... " << std::flush; + auto driver = try_open_driver(); dalotia::SafetensorsFile file(TEST_FILE); // Load to host @@ -211,6 +211,46 @@ void test_same_file_host_and_gpu() { std::cout << "OK" << std::endl; } +void test_permuted_load_to_gpu() { + std::cout << "test_permuted_load_to_gpu... 
" << std::flush; + + auto driver = try_open_driver(); + // The test model has "embedding_firstchanged" with shape [4,3,5]. + // Permutation [1,0,2] gives shape [3,4,5] with values 0..59. + const char* perm_tensor = "embedding_firstchanged"; + std::vector perm = {1, 0, 2}; + + // Load with permutation on CPU as reference + auto file = std::unique_ptr( + dalotia::make_tensor_file(TEST_FILE)); + auto [extents_ref, h_ref] = file->load_tensor_dense( + perm_tensor, FORMAT, dalotia_C_ordering, perm); + assert(extents_ref == std::vector({3, 4, 5})); + assert(h_ref.size() == NUM_ELEMENTS); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_ref[i] == static_cast(i)); + } + + // Now load with permutation directly to GPU + const size_t nbytes = NUM_ELEMENTS * sizeof(double); + double* d_tensor = nullptr; + CHECK_CUDA(cudaMalloc(&d_tensor, nbytes)); + + file->load_tensor_dense(perm_tensor, FORMAT, dalotia_C_ordering, + reinterpret_cast(d_tensor), perm); + + // Copy back and verify + std::vector h_result(NUM_ELEMENTS); + CHECK_CUDA( + cudaMemcpy(h_result.data(), d_tensor, nbytes, cudaMemcpyDeviceToHost)); + for (int i = 0; i < NUM_ELEMENTS; i++) { + assert(h_result[i] == h_ref[i]); + } + + CHECK_CUDA(cudaFree(d_tensor)); + std::cout << "OK" << std::endl; +} + int main() { test_is_device_pointer(); #ifdef DALOTIA_WITH_CUFILE @@ -222,6 +262,7 @@ int main() { test_load_to_gpu_with_driver(); #endif test_same_file_host_and_gpu(); + test_permuted_load_to_gpu(); std::cout << "test_cufile succeeded" << std::endl; return 0; }