From 15ad866a7c90a417bf7d0c9a008d659a175eec0f Mon Sep 17 00:00:00 2001
From: shijiashuai <shijiashuai@bgi.com>
Date: Fri, 22 May 2026 11:15:59 +0800
Subject: [PATCH] refactor(core): deepen internal execution seams

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 AGENTS.md                      |   3 +-
 CLAUDE.md                      |   5 +-
 CMakeLists.txt                 | 102 +++++++++++------
 include/spmv/common.h          |   2 +-
 include/spmv/cuda_buffer.h     |   2 +-
 include/spmv/cuda_compat.h     | 150 ++++++++++++++++++++++++
 include/spmv/spmv.h            |  13 +--
 src/bandwidth.cpp              |   3 +-
 src/csr_matrix.cpp             | 197 ++-----------------------------
 src/ell_matrix.cpp             | 153 ++-----------------------
 src/internal/csr_device.cpp    | 204 +++++++++++++++++++++++++++++++++
 src/internal/csr_device.h      |   5 +
 src/internal/ell_device.cpp    | 160 ++++++++++++++++++++++++++
 src/internal/ell_device.h      |   5 +
 src/internal/pagerank_common.h |  15 +++
 src/internal/texture_cache.h   |  13 +++
 src/no_cuda_stubs.cpp          | 170 +++++++++++++++++++++++++++
 src/pagerank.cu                |  71 +-----------
 src/pagerank_common.cpp        |  82 +++++++++++++
 src/spmv_context.cpp           |  58 ++++++++++
 src/spmv_kernels.cu            |  56 +--------
 tests/test_architecture.cpp    |  20 ++++
 tests/test_no_cuda.cpp         |  44 +++++++
 tests/test_pagerank_core.cpp   |  60 ++++++++++
 24 files changed, 1090 insertions(+), 503 deletions(-)
 create mode 100644 include/spmv/cuda_compat.h
 create mode 100644 src/internal/csr_device.cpp
 create mode 100644 src/internal/ell_device.cpp
 create mode 100644 src/internal/pagerank_common.h
 create mode 100644 src/internal/texture_cache.h
 create mode 100644 src/no_cuda_stubs.cpp
 create mode 100644 src/pagerank_common.cpp
 create mode 100644 src/spmv_context.cpp
 create mode 100644 tests/test_architecture.cpp
 create mode 100644 tests/test_no_cuda.cpp
 create mode 100644 tests/test_pagerank_core.cpp

diff --git a/AGENTS.md b/AGENTS.md
index d2b0c6c..60b708d 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -101,6 +101,7 @@ cmake --preset release && cmake --build --preset release
 
 # CPU-only（无 GPU 环境，CI 使用此配置）
 cmake -S . -B build-no-cuda -DSPMV_REQUIRE_CUDA=OFF && cmake --build build-no-cuda
+ctest --test-dir build-no-cuda --output-on-failure
 
 # 运行测试
 ctest --preset default
@@ -110,7 +111,7 @@ ctest --preset default
 find src include tests benchmarks -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.cu" \) | xargs clang-format -i
 ```
 
-> **CI 无 GPU**：需要 CUDA 设备的测试在 CI 中会跳过。`benchmarks/main.cu` 和 `pagerank.cu` 在无 GPU 时自动退出。
+> **CI 无 GPU**：CPU-only 配置会构建 core library + CPU 测试；需要 CUDA 设备的测试、基准程序和 PageRank CUDA 实现不会参与该配置。
 
 ---
 
diff --git a/CLAUDE.md b/CLAUDE.md
index 79ba41d..1be3379 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -35,7 +35,7 @@
           cmake
           - S.- B build - no - cuda - DSPMV_REQUIRE_CUDA =
     OFF && cmake-- build build - no -
-                   cuda
+                   cuda && ctest --test-dir build-no-cuda --output-on-failure
 
 #测试
                    ctest-- preset default
@@ -62,7 +62,8 @@
 
                          ##CI 特殊说明
 
-                   - CI 无 GPU：`benchmarks / main.cu`、`src / pagerank.cu` 无 GPU 时自动退出
+                   - CI 无 GPU：CPU-only 配置会构建 core library + CPU 测试；CUDA 测试 / benchmark /
+                   PageRank CUDA 路径不参与该配置
                    - CI 使用 clang - format - 18 检查格式 - CPU - only 构建：`cmake - S.- B build
                    - no - cuda - DSPMV_REQUIRE_CUDA = OFF`
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 273f09b..e20a614 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,41 +8,60 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-if(NOT SPMV_REQUIRE_CUDA)
-    message(STATUS "SPMV_REQUIRE_CUDA=OFF; configuring in no-CUDA mode. No build targets will be generated.")
-    enable_testing()
-    return()
-endif()
-
-check_language(CUDA)
-if(NOT CMAKE_CUDA_COMPILER)
-    message(FATAL_ERROR "CUDA toolkit with nvcc is required to build gpu-spmv. Set CUDAToolkit_ROOT or ensure nvcc is available on PATH, or configure with -DSPMV_REQUIRE_CUDA=OFF for a configure-only fallback.")
-endif()
-
-enable_language(CUDA)
-set(CMAKE_CUDA_STANDARD 17)
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-
-# CUDA 架构设置
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86 89 90)
+set(SPMV_WITH_CUDA OFF)
+
+if(SPMV_REQUIRE_CUDA)
+    check_language(CUDA)
+    if(NOT CMAKE_CUDA_COMPILER)
+        message(FATAL_ERROR "CUDA toolkit with nvcc is required to build gpu-spmv. Set CUDAToolkit_ROOT or ensure nvcc is available on PATH, or configure with -DSPMV_REQUIRE_CUDA=OFF for a CPU-only fallback.")
+    endif()
+
+    enable_language(CUDA)
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    set(SPMV_WITH_CUDA ON)
+
+    # CUDA 架构设置
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86 89 90)
+    endif()
+else()
+    message(STATUS "SPMV_REQUIRE_CUDA=OFF; configuring in no-CUDA mode. Building core library and CPU tests only.")
 endif()
 
 # ---------- 主库 ----------
 set(SPMV_SOURCES
     src/csr_matrix.cpp
     src/ell_matrix.cpp
+    src/internal/csr_device.cpp
+    src/internal/ell_device.cpp
     src/spmv_cpu.cpp
+    src/spmv_context.cpp
     src/internal/kernel_selector.cpp
-    src/spmv_kernels.cu
     src/bandwidth.cpp
-    src/benchmark.cu
-    src/pagerank.cu
+    src/pagerank_common.cpp
 )
 
+if(SPMV_WITH_CUDA)
+    list(APPEND SPMV_SOURCES
+        src/spmv_kernels.cu
+        src/benchmark.cu
+        src/pagerank.cu
+    )
+else()
+    list(APPEND SPMV_SOURCES
+        src/no_cuda_stubs.cpp
+    )
+endif()
+
 add_library(spmv STATIC ${SPMV_SOURCES})
 target_include_directories(spmv PUBLIC ${CMAKE_SOURCE_DIR}/include)
-set_target_properties(spmv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+if(SPMV_WITH_CUDA)
+    target_compile_definitions(spmv PUBLIC SPMV_WITH_CUDA=1)
+    set_target_properties(spmv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+else()
+    target_compile_definitions(spmv PUBLIC SPMV_WITH_CUDA=0)
+endif()
 
 # CUDA 编译选项
 target_compile_options(spmv PRIVATE
@@ -64,19 +83,32 @@ enable_testing()
 
 # ---------- 测试 ----------
 set(TEST_SOURCES
+    tests/test_architecture.cpp
     tests/test_common.cpp
     tests/test_csr.cpp
     tests/test_ell.cpp
     tests/test_kernel_selector.cpp
-    tests/test_spmv.cu
-    tests/test_bandwidth.cu
-    tests/test_benchmark.cu
-    tests/test_pagerank.cu
+    tests/test_no_cuda.cpp
+    tests/test_pagerank_core.cpp
 )
 
+if(SPMV_WITH_CUDA)
+    list(APPEND TEST_SOURCES
+        tests/test_spmv.cu
+        tests/test_bandwidth.cu
+        tests/test_benchmark.cu
+        tests/test_pagerank.cu
+    )
+endif()
+
 add_executable(spmv_tests ${TEST_SOURCES})
 target_link_libraries(spmv_tests PRIVATE spmv GTest::gtest_main)
-set_target_properties(spmv_tests PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+if(SPMV_WITH_CUDA)
+    target_compile_definitions(spmv_tests PRIVATE SPMV_WITH_CUDA=1)
+    set_target_properties(spmv_tests PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+else()
+    target_compile_definitions(spmv_tests PRIVATE SPMV_WITH_CUDA=0)
+endif()
 target_compile_options(spmv_tests PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>
 )
@@ -84,13 +116,15 @@ target_compile_options(spmv_tests PRIVATE
 include(GoogleTest)
 gtest_discover_tests(spmv_tests)
 
-# ---------- 基准测试 ----------
-add_executable(spmv_benchmark benchmarks/main.cu)
-target_link_libraries(spmv_benchmark PRIVATE spmv)
-set_target_properties(spmv_benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-target_compile_options(spmv_benchmark PRIVATE
-    $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>
-)
+if(SPMV_WITH_CUDA)
+    # ---------- 基准测试 ----------
+    add_executable(spmv_benchmark benchmarks/main.cu)
+    target_link_libraries(spmv_benchmark PRIVATE spmv)
+    set_target_properties(spmv_benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+    target_compile_options(spmv_benchmark PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>
+    )
+endif()
 
 # ---------- Installation ----------
 include(GNUInstallDirs)
diff --git a/include/spmv/common.h b/include/spmv/common.h
index 5785240..6920454 100644
--- a/include/spmv/common.h
+++ b/include/spmv/common.h
@@ -1,7 +1,7 @@
 #ifndef SPMV_COMMON_H
 #define SPMV_COMMON_H
 
-#include <cuda_runtime.h>
+#include "cuda_compat.h"
 
 #include <cstdint>
 #include <cstdio>
diff --git a/include/spmv/cuda_buffer.h b/include/spmv/cuda_buffer.h
index 36fec6a..38efb25 100644
--- a/include/spmv/cuda_buffer.h
+++ b/include/spmv/cuda_buffer.h
@@ -1,7 +1,7 @@
 #ifndef SPMV_CUDA_BUFFER_H
 #define SPMV_CUDA_BUFFER_H
 
-#include <cuda_runtime.h>
+#include "cuda_compat.h"
 
 #include <cstddef>
 #include <utility>
diff --git a/include/spmv/cuda_compat.h b/include/spmv/cuda_compat.h
new file mode 100644
index 0000000..d0d92ad
--- /dev/null
+++ b/include/spmv/cuda_compat.h
@@ -0,0 +1,150 @@
+#ifndef SPMV_CUDA_COMPAT_H
+#define SPMV_CUDA_COMPAT_H
+
+#if defined(SPMV_WITH_CUDA) && SPMV_WITH_CUDA
+
+#include <cuda_runtime.h>
+
+#else
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+using cudaError_t = int;
+using cudaTextureObject_t = std::uintptr_t;
+using cudaEvent_t = void*;
+
+constexpr cudaError_t cudaSuccess = 0;
+constexpr cudaError_t cudaErrorMemoryAllocation = 2;
+constexpr cudaError_t cudaErrorInvalidValue = 11;
+
+enum cudaMemcpyKind {
+    cudaMemcpyHostToHost = 0,
+    cudaMemcpyHostToDevice = 1,
+    cudaMemcpyDeviceToHost = 2,
+    cudaMemcpyDeviceToDevice = 3
+};
+
+enum {
+    cudaResourceTypeLinear = 0,
+    cudaAddressModeClamp = 0,
+    cudaFilterModePoint = 0,
+    cudaReadModeElementType = 0
+};
+
+struct cudaChannelFormatDesc {
+    int x = 0;
+    int y = 0;
+    int z = 0;
+    int w = 0;
+    int f = 0;
+};
+
+template <typename T>
+inline cudaChannelFormatDesc cudaCreateChannelDesc() {
+    return {};
+}
+
+struct cudaResourceDesc {
+    int resType = cudaResourceTypeLinear;
+    struct {
+        struct {
+            void* devPtr = nullptr;
+            cudaChannelFormatDesc desc{};
+            size_t sizeInBytes = 0;
+        } linear;
+    } res;
+};
+
+struct cudaTextureDesc {
+    int addressMode[3] = {cudaAddressModeClamp, cudaAddressModeClamp, cudaAddressModeClamp};
+    int filterMode = cudaFilterModePoint;
+    int readMode = cudaReadModeElementType;
+    int normalizedCoords = 0;
+};
+
+struct cudaDeviceProp {
+    int memoryClockRate = 0;
+    int memoryBusWidth = 0;
+};
+
+inline const char* cudaGetErrorString(cudaError_t err) {
+    switch (err) {
+        case cudaSuccess:
+            return "success";
+        case cudaErrorMemoryAllocation:
+            return "memory allocation failed";
+        case cudaErrorInvalidValue:
+            return "invalid value";
+        default:
+            return "cuda unavailable";
+    }
+}
+
+inline cudaError_t cudaMalloc(void** ptr, size_t size) {
+    if (!ptr) {
+        return cudaErrorInvalidValue;
+    }
+    *ptr = (size == 0) ? nullptr : std::malloc(size);
+    return (size == 0 || *ptr != nullptr) ? cudaSuccess : cudaErrorMemoryAllocation;
+}
+
+template <typename T>
+inline cudaError_t cudaMalloc(T** ptr, size_t size) {
+    return cudaMalloc(reinterpret_cast<void**>(ptr), size);
+}
+
+inline cudaError_t cudaFree(void* ptr) {
+    std::free(ptr);
+    return cudaSuccess;
+}
+
+inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind) {
+    if (count > 0 && (!dst || !src)) {
+        return cudaErrorInvalidValue;
+    }
+    if (count > 0) {
+        std::memcpy(dst, src, count);
+    }
+    return cudaSuccess;
+}
+
+inline cudaError_t cudaMemset(void* dst, int value, size_t count) {
+    if (count > 0 && !dst) {
+        return cudaErrorInvalidValue;
+    }
+    if (count > 0) {
+        std::memset(dst, value, count);
+    }
+    return cudaSuccess;
+}
+
+inline cudaError_t cudaCreateTextureObject(cudaTextureObject_t* tex,
+                                           const cudaResourceDesc*,
+                                           const cudaTextureDesc*,
+                                           const void*) {
+    static cudaTextureObject_t next_texture = 1;
+    if (!tex) {
+        return cudaErrorInvalidValue;
+    }
+    *tex = next_texture++;
+    return cudaSuccess;
+}
+
+inline cudaError_t cudaDestroyTextureObject(cudaTextureObject_t) {
+    return cudaSuccess;
+}
+
+inline cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int) {
+    if (!prop) {
+        return cudaErrorInvalidValue;
+    }
+    *prop = {};
+    return cudaSuccess;
+}
+
+#endif
+
+#endif  // SPMV_CUDA_COMPAT_H
diff --git a/include/spmv/spmv.h b/include/spmv/spmv.h
index 5cc4df1..627afb0 100644
--- a/include/spmv/spmv.h
+++ b/include/spmv/spmv.h
@@ -129,16 +129,11 @@ class SpMVExecutionContext {
     /** @brief Query whether a texture object is currently bound. */
     bool is_texture_bound() const { return tex_x_ != 0; }
 
-    /**
-     * @brief Prepare texture object for input vector x.
-     * @return 0 on success, negative error code on failure.
-     *
-     * Internal API; defined in spmv_kernels.cu.
-     */
-    int prepare_texture(const float* d_x, size_t x_length, bool requested,
-                        cudaTextureObject_t* tex_out, bool* use_texture_out);
-
    private:
+    friend int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length,
+                                    bool requested, cudaTextureObject_t* tex_out,
+                                    bool* use_texture_out);
+
     cudaTextureObject_t tex_x_ = 0;
     const float* cached_x_ = nullptr;
     size_t cached_x_length_ = 0;
diff --git a/src/bandwidth.cpp b/src/bandwidth.cpp
index b001050..51dc95d 100644
--- a/src/bandwidth.cpp
+++ b/src/bandwidth.cpp
@@ -1,6 +1,5 @@
 #include "spmv/bandwidth.h"
-
-#include <cuda_runtime.h>
+#include "spmv/cuda_compat.h"
 
 #include <algorithm>
 #include <mutex>
diff --git a/src/csr_matrix.cpp b/src/csr_matrix.cpp
index f692490..11b8a1b 100644
--- a/src/csr_matrix.cpp
+++ b/src/csr_matrix.cpp
@@ -1,3 +1,4 @@
+#include "internal/csr_device.h"
 #include "spmv/csr_matrix.h"
 
 #include <algorithm>
@@ -9,79 +10,6 @@
 
 namespace spmv {
 
-// Internal device state -------------------------------------------------------
-struct CSRMatrixInternal {
-    float* d_values = nullptr;
-    int* d_col_indices = nullptr;
-    int* d_row_ptrs = nullptr;
-};
-
-// Helpers --------------------------------------------------------------------
-static CSRMatrixInternal* get_internal(CSRMatrix* mat) {
-    return mat ? static_cast<CSRMatrixInternal*>(mat->internal) : nullptr;
-}
-
-static const CSRMatrixInternal* get_internal(const CSRMatrix* mat) {
-    return mat ? static_cast<const CSRMatrixInternal*>(mat->internal) : nullptr;
-}
-
-static void free_device(CSRMatrixInternal* internal) {
-    if (!internal)
-        return;
-    if (internal->d_values) {
-        cudaFree(internal->d_values);
-        internal->d_values = nullptr;
-    }
-    if (internal->d_col_indices) {
-        cudaFree(internal->d_col_indices);
-        internal->d_col_indices = nullptr;
-    }
-    if (internal->d_row_ptrs) {
-        cudaFree(internal->d_row_ptrs);
-        internal->d_row_ptrs = nullptr;
-    }
-}
-
-// Internal API (used by src/ only) -------------------------------------------
-float* csr_d_values(CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_values : nullptr;
-}
-
-const float* csr_d_values(const CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_values : nullptr;
-}
-
-int* csr_d_col_indices(CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_col_indices : nullptr;
-}
-
-const int* csr_d_col_indices(const CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_col_indices : nullptr;
-}
-
-int* csr_d_row_ptrs(CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_row_ptrs : nullptr;
-}
-
-const int* csr_d_row_ptrs(const CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_row_ptrs : nullptr;
-}
-
-bool csr_has_device_data(const CSRMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal && internal->d_row_ptrs != nullptr;
-}
-
-void csr_free_device_data(CSRMatrix* mat) {
-    free_device(get_internal(mat));
-}
-
 // Public API -----------------------------------------------------------------
 
 CSRMatrix* csr_create(int rows, int cols, int nnz) {
@@ -97,7 +25,7 @@ CSRMatrix* csr_create(int rows, int cols, int nnz) {
     mat->values = (nnz > 0) ? new float[nnz]() : nullptr;
     mat->col_indices = (nnz > 0) ? new int[nnz]() : nullptr;
     mat->row_ptrs = new int[rows + 1]();
-    mat->internal = new CSRMatrixInternal();
+    mat->internal = csr_create_device_state();
 
     return mat;
 }
@@ -110,11 +38,7 @@ void csr_destroy(CSRMatrix* mat) {
     delete[] mat->col_indices;
     delete[] mat->row_ptrs;
 
-    auto* internal = get_internal(mat);
-    if (internal) {
-        free_device(internal);
-        delete internal;
-    }
+    csr_destroy_device_state(mat);
 
     delete mat;
 }
@@ -129,7 +53,7 @@ int csr_from_dense(CSRMatrix* csr, const float* dense, int rows, int cols) {
     }
 
     // Host-side mutation invalidates the device mirror immediately.
-    free_device(get_internal(csr));
+    csr_free_device_data(csr);
 
     int nnz = 0;
     for (int i = 0; i < rows * cols; i++) {
@@ -208,116 +132,9 @@ float csr_get_element(const CSRMatrix* mat, int row, int col) {
     return 0.0f;
 }
 
-int csr_to_gpu(CSRMatrix* mat) {
-    if (!mat) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    if (!mat->row_ptrs || (mat->nnz > 0 && (!mat->values || !mat->col_indices))) {
-        return static_cast<int>(SpMVError::INVALID_FORMAT);
-    }
-
-    auto* internal = get_internal(mat);
-    if (!internal) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    // Free any existing device data before allocating new buffers.
-    free_device(internal);
-
-    float* new_d_values = nullptr;
-    int* new_d_col_indices = nullptr;
-    int* new_d_row_ptrs = nullptr;
+int csr_to_gpu(CSRMatrix* mat) { return csr_upload_device_data(mat); }
 
-    auto cleanup = [&]() {
-        if (new_d_values)
-            cudaFree(new_d_values);
-        if (new_d_col_indices)
-            cudaFree(new_d_col_indices);
-        if (new_d_row_ptrs)
-            cudaFree(new_d_row_ptrs);
-    };
-
-    if (mat->nnz > 0) {
-        cudaError_t err =
-            cudaMalloc(reinterpret_cast<void**>(&new_d_values), mat->nnz * sizeof(float));
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MALLOC);
-        }
-
-        err = cudaMalloc(reinterpret_cast<void**>(&new_d_col_indices), mat->nnz * sizeof(int));
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MALLOC);
-        }
-    }
-
-    cudaError_t err =
-        cudaMalloc(reinterpret_cast<void**>(&new_d_row_ptrs), (mat->num_rows + 1) * sizeof(int));
-    if (err != cudaSuccess) {
-        cleanup();
-        return static_cast<int>(SpMVError::CUDA_MALLOC);
-    }
-
-    if (mat->nnz > 0) {
-        err =
-            cudaMemcpy(new_d_values, mat->values, mat->nnz * sizeof(float), cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MEMCPY);
-        }
-
-        err = cudaMemcpy(new_d_col_indices, mat->col_indices, mat->nnz * sizeof(int),
-                         cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MEMCPY);
-        }
-    }
-
-    err = cudaMemcpy(new_d_row_ptrs, mat->row_ptrs, (mat->num_rows + 1) * sizeof(int),
-                     cudaMemcpyHostToDevice);
-    if (err != cudaSuccess) {
-        cleanup();
-        return static_cast<int>(SpMVError::CUDA_MEMCPY);
-    }
-
-    internal->d_values = new_d_values;
-    internal->d_col_indices = new_d_col_indices;
-    internal->d_row_ptrs = new_d_row_ptrs;
-
-    return static_cast<int>(SpMVError::SUCCESS);
-}
-
-int csr_from_gpu(CSRMatrix* mat) {
-    if (!mat) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    auto* internal = get_internal(mat);
-    if (!internal || !internal->d_row_ptrs) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    if (!mat->row_ptrs) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    if (mat->nnz > 0 && internal->d_values && internal->d_col_indices) {
-        if (!mat->values || !mat->col_indices) {
-            return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-        }
-        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->values, internal->d_values, mat->nnz * sizeof(float),
-                                       cudaMemcpyDeviceToHost));
-        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices,
-                                     mat->nnz * sizeof(int), cudaMemcpyDeviceToHost));
-    }
-    CUDA_CHECK_MEMCPY(cudaMemcpy(mat->row_ptrs, internal->d_row_ptrs,
-                                 (mat->num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
-
-    return static_cast<int>(SpMVError::SUCCESS);
-}
+int csr_from_gpu(CSRMatrix* mat) { return csr_download_device_data(mat); }
 
 int csr_serialize(const CSRMatrix* mat, const char* filename) {
     if (!mat || !filename) {
@@ -396,7 +213,7 @@ int csr_deserialize(CSRMatrix* mat, const char* filename) {
     }
 
     // Host-side mutation invalidates device mirror.
-    free_device(get_internal(mat));
+    csr_free_device_data(mat);
 
     delete[] mat->values;
     delete[] mat->col_indices;
diff --git a/src/ell_matrix.cpp b/src/ell_matrix.cpp
index 6c5f69f..58b9673 100644
--- a/src/ell_matrix.cpp
+++ b/src/ell_matrix.cpp
@@ -1,70 +1,14 @@
+#include "internal/ell_device.h"
 #include "spmv/ell_matrix.h"
 
 #include <algorithm>
+#include <climits>
 #include <cstring>
 #include <fstream>
 #include <new>
 
 namespace spmv {
 
-// Internal device state -------------------------------------------------------
-struct ELLMatrixInternal {
-    float* d_values = nullptr;
-    int* d_col_indices = nullptr;
-};
-
-// Helpers --------------------------------------------------------------------
-static ELLMatrixInternal* get_internal(ELLMatrix* mat) {
-    return mat ? static_cast<ELLMatrixInternal*>(mat->internal) : nullptr;
-}
-
-static const ELLMatrixInternal* get_internal(const ELLMatrix* mat) {
-    return mat ? static_cast<const ELLMatrixInternal*>(mat->internal) : nullptr;
-}
-
-static void free_device(ELLMatrixInternal* internal) {
-    if (!internal)
-        return;
-    if (internal->d_values) {
-        cudaFree(internal->d_values);
-        internal->d_values = nullptr;
-    }
-    if (internal->d_col_indices) {
-        cudaFree(internal->d_col_indices);
-        internal->d_col_indices = nullptr;
-    }
-}
-
-// Internal API (used by src/ only) -------------------------------------------
-float* ell_d_values(ELLMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_values : nullptr;
-}
-
-const float* ell_d_values(const ELLMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_values : nullptr;
-}
-
-int* ell_d_col_indices(ELLMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_col_indices : nullptr;
-}
-
-const int* ell_d_col_indices(const ELLMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal ? internal->d_col_indices : nullptr;
-}
-
-bool ell_has_device_data(const ELLMatrix* mat) {
-    auto* internal = get_internal(mat);
-    return internal && internal->d_values != nullptr;
-}
-
-void ell_free_device_data(ELLMatrix* mat) {
-    free_device(get_internal(mat));
-}
-
 // Public API -----------------------------------------------------------------
 
 ELLMatrix* ell_create(int rows, int cols, int max_nnz_per_row) {
@@ -88,7 +32,7 @@ ELLMatrix* ell_create(int rows, int cols, int max_nnz_per_row) {
         }
     }
 
-    mat->internal = new ELLMatrixInternal();
+    mat->internal = ell_create_device_state();
 
     return mat;
 }
@@ -100,11 +44,7 @@ void ell_destroy(ELLMatrix* mat) {
     delete[] mat->values;
     delete[] mat->col_indices;
 
-    auto* internal = get_internal(mat);
-    if (internal) {
-        free_device(internal);
-        delete internal;
-    }
+    ell_destroy_device_state(mat);
 
     delete mat;
 }
@@ -118,7 +58,7 @@ int ell_from_dense(ELLMatrix* ell, const float* dense, int rows, int cols) {
         return static_cast<int>(SpMVError::INVALID_ARGUMENT);
     }
 
-    free_device(get_internal(ell));
+    ell_free_device_data(ell);
 
     int max_nnz = 0;
     for (int i = 0; i < rows; i++) {
@@ -177,7 +117,7 @@ int ell_from_csr(ELLMatrix* ell, const CSRMatrix* csr) {
         return static_cast<int>(SpMVError::INVALID_FORMAT);
     }
 
-    free_device(get_internal(ell));
+    ell_free_device_data(ell);
 
     int max_nnz = 0;
     for (int i = 0; i < csr->num_rows; i++) {
@@ -260,84 +200,9 @@ float ell_get_element(const ELLMatrix* mat, int row, int col) {
     return 0.0f;
 }
 
-int ell_to_gpu(ELLMatrix* mat) {
-    if (!mat) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    size_t size = static_cast<size_t>(mat->num_rows) * mat->max_nnz_per_row;
-    if (size > 0 && (!mat->values || !mat->col_indices)) {
-        return static_cast<int>(SpMVError::INVALID_FORMAT);
-    }
-
-    auto* internal = get_internal(mat);
-    if (!internal) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    free_device(internal);
-
-    float* new_d_values = nullptr;
-    int* new_d_col_indices = nullptr;
-
-    auto cleanup = [&]() {
-        if (new_d_values)
-            cudaFree(new_d_values);
-        if (new_d_col_indices)
-            cudaFree(new_d_col_indices);
-    };
-
-    if (size > 0) {
-        cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&new_d_values), size * sizeof(float));
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MALLOC);
-        }
-
-        err = cudaMalloc(reinterpret_cast<void**>(&new_d_col_indices), size * sizeof(int));
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MALLOC);
-        }
-
-        err = cudaMemcpy(new_d_values, mat->values, size * sizeof(float), cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MEMCPY);
-        }
-
-        err = cudaMemcpy(new_d_col_indices, mat->col_indices, size * sizeof(int),
-                         cudaMemcpyHostToDevice);
-        if (err != cudaSuccess) {
-            cleanup();
-            return static_cast<int>(SpMVError::CUDA_MEMCPY);
-        }
-    }
-
-    internal->d_values = new_d_values;
-    internal->d_col_indices = new_d_col_indices;
-    return static_cast<int>(SpMVError::SUCCESS);
-}
-
-int ell_from_gpu(ELLMatrix* mat) {
-    if (!mat) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
+int ell_to_gpu(ELLMatrix* mat) { return ell_upload_device_data(mat); }
 
-    auto* internal = get_internal(mat);
-    size_t size = static_cast<size_t>(mat->num_rows) * mat->max_nnz_per_row;
-    if (size > 0 && internal && internal->d_values && internal->d_col_indices) {
-        if (!mat->values || !mat->col_indices) {
-            return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-        }
-        CUDA_CHECK_MEMCPY(
-            cudaMemcpy(mat->values, internal->d_values, size * sizeof(float), cudaMemcpyDeviceToHost));
-        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, size * sizeof(int),
-                                     cudaMemcpyDeviceToHost));
-    }
-
-    return static_cast<int>(SpMVError::SUCCESS);
-}
+int ell_from_gpu(ELLMatrix* mat) { return ell_download_device_data(mat); }
 
 int ell_serialize(const ELLMatrix* mat, const char* filename) {
     if (!mat || !filename) {
@@ -412,7 +277,7 @@ int ell_deserialize(ELLMatrix* mat, const char* filename) {
         return static_cast<int>(SpMVError::FILE_IO);
     }
 
-    free_device(get_internal(mat));
+    ell_free_device_data(mat);
 
     delete[] mat->values;
     delete[] mat->col_indices;
diff --git a/src/internal/csr_device.cpp b/src/internal/csr_device.cpp
new file mode 100644
index 0000000..f92e5d2
--- /dev/null
+++ b/src/internal/csr_device.cpp
@@ -0,0 +1,204 @@
+#include "csr_device.h"
+
+namespace spmv {
+
+namespace {
+
+struct CSRMatrixInternal {
+    float* d_values = nullptr;
+    int* d_col_indices = nullptr;
+    int* d_row_ptrs = nullptr;
+};
+
+CSRMatrixInternal* get_internal(CSRMatrix* mat) {
+    return mat ? static_cast<CSRMatrixInternal*>(mat->internal) : nullptr;
+}
+
+const CSRMatrixInternal* get_internal(const CSRMatrix* mat) {
+    return mat ? static_cast<const CSRMatrixInternal*>(mat->internal) : nullptr;
+}
+
+void free_device(CSRMatrixInternal* internal) {
+    if (!internal) {
+        return;
+    }
+    if (internal->d_values) {
+        cudaFree(internal->d_values);
+        internal->d_values = nullptr;
+    }
+    if (internal->d_col_indices) {
+        cudaFree(internal->d_col_indices);
+        internal->d_col_indices = nullptr;
+    }
+    if (internal->d_row_ptrs) {
+        cudaFree(internal->d_row_ptrs);
+        internal->d_row_ptrs = nullptr;
+    }
+}
+
+}  // namespace
+
+void* csr_create_device_state() {
+    return new CSRMatrixInternal();
+}
+
+void csr_destroy_device_state(CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    if (!internal) {
+        return;
+    }
+    free_device(internal);
+    delete internal;
+    mat->internal = nullptr;
+}
+
+float* csr_d_values(CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_values : nullptr;
+}
+
+const float* csr_d_values(const CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_values : nullptr;
+}
+
+int* csr_d_col_indices(CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_col_indices : nullptr;
+}
+
+const int* csr_d_col_indices(const CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_col_indices : nullptr;
+}
+
+int* csr_d_row_ptrs(CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_row_ptrs : nullptr;
+}
+
+const int* csr_d_row_ptrs(const CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_row_ptrs : nullptr;
+}
+
+bool csr_has_device_data(const CSRMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal && internal->d_row_ptrs != nullptr;
+}
+
+void csr_free_device_data(CSRMatrix* mat) {
+    free_device(get_internal(mat));
+}
+
+int csr_upload_device_data(CSRMatrix* mat) {
+    if (!mat) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    if (!mat->row_ptrs || (mat->nnz > 0 && (!mat->values || !mat->col_indices))) {
+        return static_cast<int>(SpMVError::INVALID_FORMAT);
+    }
+
+    auto* internal = get_internal(mat);
+    if (!internal) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    free_device(internal);
+
+    float* new_d_values = nullptr;
+    int* new_d_col_indices = nullptr;
+    int* new_d_row_ptrs = nullptr;
+
+    auto cleanup = [&]() {
+        if (new_d_values) {
+            cudaFree(new_d_values);
+        }
+        if (new_d_col_indices) {
+            cudaFree(new_d_col_indices);
+        }
+        if (new_d_row_ptrs) {
+            cudaFree(new_d_row_ptrs);
+        }
+    };
+
+    if (mat->nnz > 0) {
+        cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&new_d_values), mat->nnz * sizeof(float));
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MALLOC);
+        }
+
+        err = cudaMalloc(reinterpret_cast<void**>(&new_d_col_indices), mat->nnz * sizeof(int));
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MALLOC);
+        }
+    }
+
+    cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&new_d_row_ptrs),
+                                 (mat->num_rows + 1) * sizeof(int));
+    if (err != cudaSuccess) {
+        cleanup();
+        return static_cast<int>(SpMVError::CUDA_MALLOC);
+    }
+
+    if (mat->nnz > 0) {
+        err = cudaMemcpy(new_d_values, mat->values, mat->nnz * sizeof(float), cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MEMCPY);
+        }
+
+        err = cudaMemcpy(new_d_col_indices, mat->col_indices, mat->nnz * sizeof(int),
+                         cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MEMCPY);
+        }
+    }
+
+    err = cudaMemcpy(new_d_row_ptrs, mat->row_ptrs, (mat->num_rows + 1) * sizeof(int),
+                     cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        cleanup();
+        return static_cast<int>(SpMVError::CUDA_MEMCPY);
+    }
+
+    internal->d_values = new_d_values;
+    internal->d_col_indices = new_d_col_indices;
+    internal->d_row_ptrs = new_d_row_ptrs;
+    return static_cast<int>(SpMVError::SUCCESS);
+}
+
+int csr_download_device_data(CSRMatrix* mat) {
+    if (!mat) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    auto* internal = get_internal(mat);
+    if (!internal || !internal->d_row_ptrs) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    if (!mat->row_ptrs) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    if (mat->nnz > 0 && internal->d_values && internal->d_col_indices) {
+        if (!mat->values || !mat->col_indices) {
+            return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+        }
+        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->values, internal->d_values, mat->nnz * sizeof(float),
+                                     cudaMemcpyDeviceToHost));
+        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices,
+                                     mat->nnz * sizeof(int), cudaMemcpyDeviceToHost));
+    }
+    CUDA_CHECK_MEMCPY(cudaMemcpy(mat->row_ptrs, internal->d_row_ptrs,
+                                 (mat->num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost));
+
+    return static_cast<int>(SpMVError::SUCCESS);
+}
+
+}  // namespace spmv
diff --git a/src/internal/csr_device.h b/src/internal/csr_device.h
index 7b1f4cb..3c60994 100644
--- a/src/internal/csr_device.h
+++ b/src/internal/csr_device.h
@@ -9,6 +9,9 @@ namespace spmv {
 // These are NOT part of the public API; they are exposed only to
 // compilation units inside src/ that need to launch kernels.
 
+void* csr_create_device_state();
+void csr_destroy_device_state(CSRMatrix* mat);
+
 float* csr_d_values(CSRMatrix* mat);
 const float* csr_d_values(const CSRMatrix* mat);
 int* csr_d_col_indices(CSRMatrix* mat);
@@ -20,6 +23,8 @@ bool csr_has_device_data(const CSRMatrix* mat);
 
 // Free only device memory (called internally by to_gpu / destroy).
 void csr_free_device_data(CSRMatrix* mat);
+int csr_upload_device_data(CSRMatrix* mat);
+int csr_download_device_data(CSRMatrix* mat);
 
 }  // namespace spmv
 
diff --git a/src/internal/ell_device.cpp b/src/internal/ell_device.cpp
new file mode 100644
index 0000000..1ce7aa6
--- /dev/null
+++ b/src/internal/ell_device.cpp
@@ -0,0 +1,160 @@
+#include "ell_device.h"
+
+namespace spmv {
+
+namespace {
+
+struct ELLMatrixInternal {
+    float* d_values = nullptr;
+    int* d_col_indices = nullptr;
+};
+
+ELLMatrixInternal* get_internal(ELLMatrix* mat) {
+    return mat ? static_cast<ELLMatrixInternal*>(mat->internal) : nullptr;
+}
+
+const ELLMatrixInternal* get_internal(const ELLMatrix* mat) {
+    return mat ? static_cast<const ELLMatrixInternal*>(mat->internal) : nullptr;
+}
+
+void free_device(ELLMatrixInternal* internal) {
+    if (!internal) {
+        return;
+    }
+    if (internal->d_values) {
+        cudaFree(internal->d_values);
+        internal->d_values = nullptr;
+    }
+    if (internal->d_col_indices) {
+        cudaFree(internal->d_col_indices);
+        internal->d_col_indices = nullptr;
+    }
+}
+
+}  // namespace
+
+void* ell_create_device_state() {
+    return new ELLMatrixInternal();
+}
+
+void ell_destroy_device_state(ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    if (!internal) {
+        return;
+    }
+    free_device(internal);
+    delete internal;
+    mat->internal = nullptr;
+}
+
+float* ell_d_values(ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_values : nullptr;
+}
+
+const float* ell_d_values(const ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_values : nullptr;
+}
+
+int* ell_d_col_indices(ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_col_indices : nullptr;
+}
+
+const int* ell_d_col_indices(const ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal ? internal->d_col_indices : nullptr;
+}
+
+bool ell_has_device_data(const ELLMatrix* mat) {
+    auto* internal = get_internal(mat);
+    return internal && internal->d_values != nullptr;
+}
+
+void ell_free_device_data(ELLMatrix* mat) {
+    free_device(get_internal(mat));
+}
+
+int ell_upload_device_data(ELLMatrix* mat) {
+    if (!mat) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    size_t size = static_cast<size_t>(mat->num_rows) * mat->max_nnz_per_row;
+    if (size > 0 && (!mat->values || !mat->col_indices)) {
+        return static_cast<int>(SpMVError::INVALID_FORMAT);
+    }
+
+    auto* internal = get_internal(mat);
+    if (!internal) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    free_device(internal);
+
+    float* new_d_values = nullptr;
+    int* new_d_col_indices = nullptr;
+
+    auto cleanup = [&]() {
+        if (new_d_values) {
+            cudaFree(new_d_values);
+        }
+        if (new_d_col_indices) {
+            cudaFree(new_d_col_indices);
+        }
+    };
+
+    if (size > 0) {
+        cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&new_d_values), size * sizeof(float));
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MALLOC);
+        }
+
+        err = cudaMalloc(reinterpret_cast<void**>(&new_d_col_indices), size * sizeof(int));
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MALLOC);
+        }
+
+        err = cudaMemcpy(new_d_values, mat->values, size * sizeof(float), cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MEMCPY);
+        }
+
+        err = cudaMemcpy(new_d_col_indices, mat->col_indices, size * sizeof(int),
+                         cudaMemcpyHostToDevice);
+        if (err != cudaSuccess) {
+            cleanup();
+            return static_cast<int>(SpMVError::CUDA_MEMCPY);
+        }
+    }
+
+    internal->d_values = new_d_values;
+    internal->d_col_indices = new_d_col_indices;
+    return static_cast<int>(SpMVError::SUCCESS);
+}
+
+int ell_download_device_data(ELLMatrix* mat) {
+    if (!mat) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    auto* internal = get_internal(mat);
+    size_t size = static_cast<size_t>(mat->num_rows) * mat->max_nnz_per_row;
+    if (size > 0 && internal && internal->d_values && internal->d_col_indices) {
+        if (!mat->values || !mat->col_indices) {
+            return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+        }
+        CUDA_CHECK_MEMCPY(
+            cudaMemcpy(mat->values, internal->d_values, size * sizeof(float), cudaMemcpyDeviceToHost));
+        CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, size * sizeof(int),
+                                     cudaMemcpyDeviceToHost));
+    }
+
+    return static_cast<int>(SpMVError::SUCCESS);
+}
+
+}  // namespace spmv
diff --git a/src/internal/ell_device.h b/src/internal/ell_device.h
index 37e21fb..538c451 100644
--- a/src/internal/ell_device.h
+++ b/src/internal/ell_device.h
@@ -8,6 +8,9 @@ namespace spmv {
 // Internal device-state accessors for ELLMatrix.
 // These are NOT part of the public API.
 
+void* ell_create_device_state();
+void ell_destroy_device_state(ELLMatrix* mat);
+
 float* ell_d_values(ELLMatrix* mat);
 const float* ell_d_values(const ELLMatrix* mat);
 int* ell_d_col_indices(ELLMatrix* mat);
@@ -17,6 +20,8 @@ bool ell_has_device_data(const ELLMatrix* mat);
 
 // Free only device memory (called internally by to_gpu / destroy).
 void ell_free_device_data(ELLMatrix* mat);
+int ell_upload_device_data(ELLMatrix* mat);
+int ell_download_device_data(ELLMatrix* mat);
 
 }  // namespace spmv
 
diff --git a/src/internal/pagerank_common.h b/src/internal/pagerank_common.h
new file mode 100644
index 0000000..f60d94b
--- /dev/null
+++ b/src/internal/pagerank_common.h
@@ -0,0 +1,15 @@
+#ifndef SPMV_INTERNAL_PAGERANK_COMMON_H
+#define SPMV_INTERNAL_PAGERANK_COMMON_H
+
+#include "spmv/pagerank.h"
+
+#include <vector>
+
+namespace spmv {
+
+std::vector<int> pagerank_find_dangling_nodes(const CSRMatrix* adj_matrix);
+void pagerank_normalize(float* ranks, int n);
+
+}  // namespace spmv
+
+#endif  // SPMV_INTERNAL_PAGERANK_COMMON_H
diff --git a/src/internal/texture_cache.h b/src/internal/texture_cache.h
new file mode 100644
index 0000000..5f8ffa2
--- /dev/null
+++ b/src/internal/texture_cache.h
@@ -0,0 +1,13 @@
+#ifndef SPMV_INTERNAL_TEXTURE_CACHE_H
+#define SPMV_INTERNAL_TEXTURE_CACHE_H
+
+#include "spmv/spmv.h"
+
+namespace spmv {
+
+int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length,
+                         bool requested, cudaTextureObject_t* tex_out, bool* use_texture_out);
+
+}  // namespace spmv
+
+#endif  // SPMV_INTERNAL_TEXTURE_CACHE_H
diff --git a/src/no_cuda_stubs.cpp b/src/no_cuda_stubs.cpp
new file mode 100644
index 0000000..985dcfd
--- /dev/null
+++ b/src/no_cuda_stubs.cpp
@@ -0,0 +1,170 @@
+#include "spmv/benchmark.h"
+#include "spmv/pagerank.h"
+#include "spmv/spmv.h"
+#include "internal/pagerank_common.h"
+
+#include <cmath>
+#include <new>
+#include <sstream>
+#include <vector>
+
+namespace spmv {
+
+namespace {
+
+int no_cuda_error() {
+    return static_cast<int>(SpMVError::KERNEL_LAUNCH);
+}
+
+}  // namespace
+
+SpMVResult spmv_csr(const CSRMatrix*, const float*, float* d_y, const SpMVConfig*, int,
+                    SpMVExecutionContext*) {
+    SpMVResult result;
+    result.y = d_y;
+    result.error_code = no_cuda_error();
+    return result;
+}
+
+SpMVResult spmv_ell(const ELLMatrix*, const float*, float* d_y, const SpMVConfig*, int,
+                    SpMVExecutionContext*) {
+    SpMVResult result;
+    result.y = d_y;
+    result.error_code = no_cuda_error();
+    return result;
+}
+
+BenchmarkResult benchmark_csr(const CSRMatrix*, const float*, const SpMVConfig*,
+                              const BenchmarkConfig*) {
+    BenchmarkResult result;
+    result.error_code = no_cuda_error();
+    return result;
+}
+
+BenchmarkResult benchmark_ell(const ELLMatrix*, const float*, const BenchmarkConfig*) {
+    BenchmarkResult result;
+    result.error_code = no_cuda_error();
+    return result;
+}
+
+ComparisonResult compare_gpu_cpu_csr(const CSRMatrix*, const float*, const SpMVConfig*,
+                                     const BenchmarkConfig*) {
+    ComparisonResult result;
+    result.error_code = no_cuda_error();
+    result.gpu_result.error_code = no_cuda_error();
+    result.cpu_result.error_code = no_cuda_error();
+    return result;
+}
+
+std::string benchmark_to_json(const BenchmarkResult& result) {
+    std::ostringstream json;
+    json << "{\"name\":\"" << result.name << "\",\"execution_time_ms\":" << result.execution_time_ms
+         << ",\"gflops\":" << result.gflops << ",\"bandwidth_gb_s\":" << result.bandwidth_gb_s
+         << ",\"avg_time_ms\":" << result.avg_time_ms << ",\"min_time_ms\":" << result.min_time_ms
+         << ",\"max_time_ms\":" << result.max_time_ms << ",\"stddev_time_ms\":"
+         << result.stddev_time_ms << ",\"num_runs\":" << result.num_runs << ",\"error_code\":"
+         << result.error_code << "}";
+    return json.str();
+}
+
+std::string comparison_to_json(const ComparisonResult& result) {
+    std::ostringstream json;
+    json << "{\"speedup\":" << result.speedup << ",\"error_code\":" << result.error_code << "}";
+    return json.str();
+}
+
+BenchmarkResult benchmark_from_json(const std::string&) {
+    BenchmarkResult result;
+    result.error_code = no_cuda_error();
+    return result;
+}
+
+PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* config) {
+    PageRankResult result;
+
+    if (!adj_matrix) {
+        result.error_code = static_cast<int>(SpMVError::INVALID_ARGUMENT);
+        return result;
+    }
+    if (adj_matrix->num_rows < 0 || adj_matrix->num_cols < 0 || adj_matrix->nnz < 0) {
+        result.error_code = static_cast<int>(SpMVError::INVALID_ARGUMENT);
+        return result;
+    }
+    if (adj_matrix->num_rows != adj_matrix->num_cols) {
+        result.error_code = static_cast<int>(SpMVError::INVALID_DIMENSION);
+        return result;
+    }
+    if (!adj_matrix->row_ptrs ||
+        (adj_matrix->nnz > 0 && (!adj_matrix->values || !adj_matrix->col_indices))) {
+        result.error_code = static_cast<int>(SpMVError::INVALID_FORMAT);
+        return result;
+    }
+
+    PageRankConfig default_config;
+    if (!config) {
+        config = &default_config;
+    }
+    if (config->max_iterations < 0 || config->tolerance < 0.0f ||
+        config->damping_factor < 0.0f || config->damping_factor > 1.0f) {
+        result.error_code = static_cast<int>(SpMVError::INVALID_ARGUMENT);
+        return result;
+    }
+
+    int n = adj_matrix->num_rows;
+    if (n == 0) {
+        result.converged = true;
+        result.error_code = static_cast<int>(SpMVError::SUCCESS);
+        return result;
+    }
+
+    result.ranks = new (std::nothrow) float[n];
+    if (!result.ranks) {
+        result.error_code = static_cast<int>(SpMVError::OUT_OF_MEMORY);
+        return result;
+    }
+
+    float init_rank = 1.0f / static_cast<float>(n);
+    std::vector<float> next_ranks(n, 0.0f);
+    for (int i = 0; i < n; i++) {
+        result.ranks[i] = init_rank;
+    }
+
+    std::vector<int> dangling_nodes = pagerank_find_dangling_nodes(adj_matrix);
+    float damping = config->damping_factor;
+    float teleport = (1.0f - damping) / static_cast<float>(n);
+
+    for (int iter = 0; iter < config->max_iterations; iter++) {
+        float dangling_sum = 0.0f;
+        for (int node : dangling_nodes) {
+            dangling_sum += result.ranks[node];
+        }
+
+        spmv_cpu_csr(adj_matrix, result.ranks, next_ranks.data());
+
+        float dangling_contrib = damping * dangling_sum / static_cast<float>(n);
+        float residual_sq = 0.0f;
+        for (int i = 0; i < n; i++) {
+            next_ranks[i] = damping * next_ranks[i] + dangling_contrib + teleport;
+            float diff = next_ranks[i] - result.ranks[i];
+            residual_sq += diff * diff;
+        }
+
+        result.iterations = iter + 1;
+        result.final_residual = std::sqrt(residual_sq);
+
+        for (int i = 0; i < n; i++) {
+            result.ranks[i] = next_ranks[i];
+        }
+
+        if (result.final_residual < config->tolerance) {
+            result.converged = true;
+            break;
+        }
+    }
+
+    pagerank_normalize(result.ranks, n);
+    result.error_code = static_cast<int>(SpMVError::SUCCESS);
+    return result;
+}
+
+}  // namespace spmv
diff --git a/src/pagerank.cu b/src/pagerank.cu
index 66d2371..810d2e2 100644
--- a/src/pagerank.cu
+++ b/src/pagerank.cu
@@ -1,4 +1,5 @@
 #include "internal/csr_device.h"
+#include "internal/pagerank_common.h"
 #include "spmv/cuda_buffer.h"
 #include "spmv/pagerank.h"
 #include "spmv/spmv.h"
@@ -39,36 +40,6 @@ static int map_cuda_exception_to_spmv_error(const CudaException& e) {
                                                     : static_cast<int>(SpMVError::CUDA_MEMCPY);
 }
 
-static std::vector<int> find_dangling_nodes(const CSRMatrix* adj_matrix) {
-    std::vector<int> dangling;
-    if (!adj_matrix || adj_matrix->num_cols <= 0 || adj_matrix->num_rows <= 0) {
-        return dangling;
-    }
-    if (!adj_matrix->values || !adj_matrix->col_indices || !adj_matrix->row_ptrs) {
-        return dangling;
-    }
-
-    int num_cols = adj_matrix->num_cols;
-    std::vector<float> col_sums(num_cols, 0.0f);
-    for (int row = 0; row < adj_matrix->num_rows; row++) {
-        int start = adj_matrix->row_ptrs[row];
-        int end = adj_matrix->row_ptrs[row + 1];
-        for (int idx = start; idx < end; idx++) {
-            int col = adj_matrix->col_indices[idx];
-            if (col >= 0 && col < num_cols) {
-                col_sums[col] += adj_matrix->values[idx];
-            }
-        }
-    }
-
-    for (int col = 0; col < num_cols; col++) {
-        if (col_sums[col] == 0.0f) {
-            dangling.push_back(col);
-        }
-    }
-    return dangling;
-}
-
 PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* config) {
     PageRankResult result;
 
@@ -135,7 +106,7 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi
 
         d_ranks_old.copyFromHost(result.ranks, n);
 
-        std::vector<int> dangling_nodes = find_dangling_nodes(adj_matrix);
+        std::vector<int> dangling_nodes = pagerank_find_dangling_nodes(adj_matrix);
         CudaBuffer<int> d_dangling_nodes(dangling_nodes.size());
         if (!dangling_nodes.empty()) {
             d_dangling_nodes.copyFromHost(dangling_nodes.data(), dangling_nodes.size());
@@ -213,15 +184,7 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi
             d_ranks_old.copyToHost(result.ranks, n);
         }
 
-        float sum = 0.0f;
-        for (int i = 0; i < n; i++) {
-            sum += result.ranks[i];
-        }
-        if (sum > 0.0f) {
-            for (int i = 0; i < n; i++) {
-                result.ranks[i] /= sum;
-            }
-        }
+        pagerank_normalize(result.ranks, n);
 
         result.error_code = static_cast<int>(SpMVError::SUCCESS);
         return result;
@@ -232,32 +195,4 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi
     }
 }
 
-void pagerank_free(PageRankResult* result) {
-    if (result && result->ranks) {
-        delete[] result->ranks;
-        result->ranks = nullptr;
-    }
-}
-
-void pagerank_top_k(const PageRankResult* result, int num_nodes, int k, TopKNode* top_k) {
-    if (!result || !result->ranks || !top_k || k <= 0 || num_nodes <= 0 ||
-        result->error_code != static_cast<int>(SpMVError::SUCCESS)) {
-        return;
-    }
-
-    std::vector<TopKNode> nodes(num_nodes);
-    for (int i = 0; i < num_nodes; i++) {
-        nodes[i].node_id = i;
-        nodes[i].rank = result->ranks[i];
-    }
-
-    int actual_k = std::min(k, num_nodes);
-    std::partial_sort(nodes.begin(), nodes.begin() + actual_k, nodes.end(),
-                      [](const TopKNode& a, const TopKNode& b) { return a.rank > b.rank; });
-
-    for (int i = 0; i < actual_k; i++) {
-        top_k[i] = nodes[i];
-    }
-}
-
 }  // namespace spmv
diff --git a/src/pagerank_common.cpp b/src/pagerank_common.cpp
new file mode 100644
index 0000000..e55edda
--- /dev/null
+++ b/src/pagerank_common.cpp
@@ -0,0 +1,82 @@
+#include "internal/pagerank_common.h"
+
+#include <algorithm>
+
+namespace spmv {
+
+std::vector<int> pagerank_find_dangling_nodes(const CSRMatrix* adj_matrix) {
+    std::vector<int> dangling;
+    if (!adj_matrix || adj_matrix->num_cols <= 0 || adj_matrix->num_rows <= 0) {
+        return dangling;
+    }
+    if (!adj_matrix->values || !adj_matrix->col_indices || !adj_matrix->row_ptrs) {
+        return dangling;
+    }
+
+    int num_cols = adj_matrix->num_cols;
+    std::vector<float> col_sums(num_cols, 0.0f);
+    for (int row = 0; row < adj_matrix->num_rows; row++) {
+        int start = adj_matrix->row_ptrs[row];
+        int end = adj_matrix->row_ptrs[row + 1];
+        for (int idx = start; idx < end; idx++) {
+            int col = adj_matrix->col_indices[idx];
+            if (col >= 0 && col < num_cols) {
+                col_sums[col] += adj_matrix->values[idx];
+            }
+        }
+    }
+
+    for (int col = 0; col < num_cols; col++) {
+        if (col_sums[col] == 0.0f) {
+            dangling.push_back(col);
+        }
+    }
+    return dangling;
+}
+
+void pagerank_normalize(float* ranks, int n) {
+    if (!ranks || n <= 0) {
+        return;
+    }
+
+    float sum = 0.0f;
+    for (int i = 0; i < n; i++) {
+        sum += ranks[i];
+    }
+    if (sum <= 0.0f) {
+        return;
+    }
+    for (int i = 0; i < n; i++) {
+        ranks[i] /= sum;
+    }
+}
+
+void pagerank_free(PageRankResult* result) {
+    if (result && result->ranks) {
+        delete[] result->ranks;
+        result->ranks = nullptr;
+    }
+}
+
+void pagerank_top_k(const PageRankResult* result, int num_nodes, int k, TopKNode* top_k) {
+    if (!result || !result->ranks || !top_k || k <= 0 || num_nodes <= 0 ||
+        result->error_code != static_cast<int>(SpMVError::SUCCESS)) {
+        return;
+    }
+
+    std::vector<TopKNode> nodes(num_nodes);
+    for (int i = 0; i < num_nodes; i++) {
+        nodes[i].node_id = i;
+        nodes[i].rank = result->ranks[i];
+    }
+
+    int actual_k = std::min(k, num_nodes);
+    std::partial_sort(nodes.begin(), nodes.begin() + actual_k, nodes.end(),
+                      [](const TopKNode& a, const TopKNode& b) { return a.rank > b.rank; });
+
+    for (int i = 0; i < actual_k; i++) {
+        top_k[i] = nodes[i];
+    }
+}
+
+}  // namespace spmv
diff --git a/src/spmv_context.cpp b/src/spmv_context.cpp
new file mode 100644
index 0000000..fee90d6
--- /dev/null
+++ b/src/spmv_context.cpp
@@ -0,0 +1,58 @@
+#include "internal/texture_cache.h"
+
+namespace spmv {
+
+int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length,
+                         bool requested, cudaTextureObject_t* tex_out, bool* use_texture_out) {
+    if (!tex_out || !use_texture_out) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    *tex_out = 0;
+    *use_texture_out = false;
+
+    if (!requested || !d_x || x_length == 0) {
+        if (context) {
+            context->reset();
+        }
+        return static_cast<int>(SpMVError::SUCCESS);
+    }
+
+    if (!context) {
+        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
+    }
+
+    bool needs_rebuild = !context->texture_enabled_ || context->tex_x_ == 0 || context->cached_x_ != d_x ||
+                         context->cached_x_length_ != x_length;
+    if (needs_rebuild) {
+        context->reset();
+
+        cudaResourceDesc res_desc{};
+        res_desc.resType = cudaResourceTypeLinear;
+        res_desc.res.linear.devPtr = const_cast<float*>(d_x);
+        res_desc.res.linear.desc = cudaCreateChannelDesc<float>();
+        res_desc.res.linear.sizeInBytes = x_length * sizeof(float);
+
+        cudaTextureDesc tex_desc{};
+        tex_desc.addressMode[0] = cudaAddressModeClamp;
+        tex_desc.filterMode = cudaFilterModePoint;
+        tex_desc.readMode = cudaReadModeElementType;
+        tex_desc.normalizedCoords = 0;
+
+        cudaError_t err = cudaCreateTextureObject(&context->tex_x_, &res_desc, &tex_desc, nullptr);
+        if (err != cudaSuccess) {
+            context->reset();
+            return static_cast<int>(SpMVError::CUDA_MALLOC);
+        }
+
+        context->cached_x_ = d_x;
+        context->cached_x_length_ = x_length;
+        context->texture_enabled_ = true;
+    }
+
+    *tex_out = context->tex_x_;
+    *use_texture_out = true;
+    return static_cast<int>(SpMVError::SUCCESS);
+}
+
+}  // namespace spmv
diff --git a/src/spmv_kernels.cu b/src/spmv_kernels.cu
index bb2e21d..0164e34 100644
--- a/src/spmv_kernels.cu
+++ b/src/spmv_kernels.cu
@@ -1,5 +1,6 @@
 #include "internal/csr_device.h"
 #include "internal/ell_device.h"
+#include "internal/texture_cache.h"
 #include "spmv/bandwidth.h"
 #include "spmv/spmv.h"
 
@@ -125,53 +126,6 @@ struct ScopedTexture {
     }
 };
 
-int SpMVExecutionContext::prepare_texture(const float* d_x, size_t x_length, bool requested,
-                                          cudaTextureObject_t* tex_out, bool* use_texture_out) {
-    if (!tex_out || !use_texture_out) {
-        return static_cast<int>(SpMVError::INVALID_ARGUMENT);
-    }
-
-    *tex_out = 0;
-    *use_texture_out = false;
-
-    if (!requested || !d_x || x_length == 0) {
-        reset();
-        return static_cast<int>(SpMVError::SUCCESS);
-    }
-
-    bool needs_rebuild = !texture_enabled_ || tex_x_ == 0 || cached_x_ != d_x ||
-                         cached_x_length_ != x_length;
-    if (needs_rebuild) {
-        reset();
-
-        cudaResourceDesc res_desc{};
-        res_desc.resType = cudaResourceTypeLinear;
-        res_desc.res.linear.devPtr = const_cast<float*>(d_x);
-        res_desc.res.linear.desc = cudaCreateChannelDesc<float>();
-        res_desc.res.linear.sizeInBytes = x_length * sizeof(float);
-
-        cudaTextureDesc tex_desc{};
-        tex_desc.addressMode[0] = cudaAddressModeClamp;
-        tex_desc.filterMode = cudaFilterModePoint;
-        tex_desc.readMode = cudaReadModeElementType;
-        tex_desc.normalizedCoords = 0;
-
-        cudaError_t err = cudaCreateTextureObject(&tex_x_, &res_desc, &tex_desc, nullptr);
-        if (err != cudaSuccess) {
-            reset();
-            return static_cast<int>(SpMVError::CUDA_MALLOC);
-        }
-
-        cached_x_ = d_x;
-        cached_x_length_ = x_length;
-        texture_enabled_ = true;
-    }
-
-    *tex_out = tex_x_;
-    *use_texture_out = true;
-    return static_cast<int>(SpMVError::SUCCESS);
-}
-
 __device__ __forceinline__ float fetch_x(const float* x, cudaTextureObject_t tex_x,
                                          bool use_texture, int idx) {
     return use_texture ? tex1Dfetch<float>(tex_x, idx) : x[idx];
@@ -430,8 +384,8 @@ SpMVResult spmv_csr(const CSRMatrix* A, const float* d_x, float* d_y, const SpMV
 
     if (use_texture && texture_length > 0) {
         if (context) {
-            int tex_status = context->prepare_texture(d_x, texture_length, use_texture,
-                                                       &tex_x, &use_texture);
+            int tex_status = spmv_prepare_texture(context, d_x, texture_length, use_texture, &tex_x,
+                                                  &use_texture);
             if (tex_status != static_cast<int>(SpMVError::SUCCESS)) {
                 result.error_code = tex_status;
                 return result;
@@ -565,8 +519,8 @@ SpMVResult spmv_ell(const ELLMatrix* A, const float* d_x, float* d_y, const SpMV
 
     if (use_texture && texture_length > 0) {
         if (context) {
-            int tex_status = context->prepare_texture(d_x, texture_length, use_texture,
-                                                       &tex_x, &use_texture);
+            int tex_status = spmv_prepare_texture(context, d_x, texture_length, use_texture, &tex_x,
+                                                  &use_texture);
             if (tex_status != static_cast<int>(SpMVError::SUCCESS)) {
                 result.error_code = tex_status;
                 return result;
diff --git a/tests/test_architecture.cpp b/tests/test_architecture.cpp
new file mode 100644
index 0000000..1e571c8
--- /dev/null
+++ b/tests/test_architecture.cpp
@@ -0,0 +1,20 @@
+#include "spmv/spmv.h"
+
+#include <gtest/gtest.h>
+#include <type_traits>
+
+using namespace spmv;
+
+namespace {
+
+template <typename T, typename = void>
+struct HasPublicPrepareTexture : std::false_type {};
+
+template <typename T>
+struct HasPublicPrepareTexture<T, std::void_t<decltype(&T::prepare_texture)>> : std::true_type {};
+
+}  // namespace
+
+TEST(SpMVExecutionContextArchitectureTest, PrepareTextureStaysInternal) {
+    EXPECT_FALSE(HasPublicPrepareTexture<SpMVExecutionContext>::value);
+}
diff --git a/tests/test_no_cuda.cpp b/tests/test_no_cuda.cpp
new file mode 100644
index 0000000..45d8167
--- /dev/null
+++ b/tests/test_no_cuda.cpp
@@ -0,0 +1,44 @@
+#include "spmv/benchmark.h"
+#include "spmv/csr_matrix.h"
+#include "spmv/pagerank.h"
+#include "spmv/spmv.h"
+
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace spmv;
+
+#if !SPMV_WITH_CUDA
+
+TEST(NoCudaModeTest, SpMVCsrFailsGracefullyWithoutCudaBackend) {
+    std::vector<float> dense = {1.0f, 0.0f, 0.0f, 2.0f};
+    std::vector<float> x = {3.0f, 4.0f};
+    std::vector<float> y(2, 0.0f);
+
+    CSRMatrix* csr = csr_create(0, 0, 0);
+    ASSERT_NE(csr, nullptr);
+    ASSERT_EQ(csr_from_dense(csr, dense.data(), 2, 2), static_cast<int>(SpMVError::SUCCESS));
+
+    SpMVResult result = spmv_csr(csr, x.data(), y.data(), nullptr, 2);
+
+    EXPECT_EQ(result.error_code, static_cast<int>(SpMVError::KERNEL_LAUNCH));
+    EXPECT_EQ(result.y, y.data());
+
+    csr_destroy(csr);
+}
+
+TEST(NoCudaModeTest, BenchmarkFailsGracefullyWithoutCudaBackend) {
+    std::vector<float> dense = {0.0f, 1.0f, 1.0f, 0.0f};
+    std::vector<float> x = {1.0f, 1.0f};
+
+    CSRMatrix* csr = csr_create(0, 0, 0);
+    ASSERT_NE(csr, nullptr);
+    ASSERT_EQ(csr_from_dense(csr, dense.data(), 2, 2), static_cast<int>(SpMVError::SUCCESS));
+
+    BenchmarkResult benchmark_result = benchmark_csr(csr, x.data(), nullptr, nullptr);
+    EXPECT_EQ(benchmark_result.error_code, static_cast<int>(SpMVError::KERNEL_LAUNCH));
+
+    csr_destroy(csr);
+}
+
+#endif
diff --git a/tests/test_pagerank_core.cpp b/tests/test_pagerank_core.cpp
new file mode 100644
index 0000000..e8e2d01
--- /dev/null
+++ b/tests/test_pagerank_core.cpp
@@ -0,0 +1,60 @@
+#include "spmv/csr_matrix.h"
+#include "spmv/pagerank.h"
+
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace spmv;
+
+#if !SPMV_WITH_CUDA
+
+TEST(PageRankCoreTest, NoCudaBuildUsesWorkingBackendForSimpleCycle) {
+    std::vector<float> adj = {0.0f, 0.0f, 1.0f,
+                              1.0f, 0.0f, 0.0f,
+                              0.0f, 1.0f, 0.0f};
+
+    CSRMatrix* csr = csr_create(0, 0, 0);
+    ASSERT_NE(csr, nullptr);
+    ASSERT_EQ(csr_from_dense(csr, adj.data(), 3, 3), static_cast<int>(SpMVError::SUCCESS));
+
+    PageRankResult result = pagerank(csr, nullptr);
+
+    ASSERT_EQ(result.error_code, static_cast<int>(SpMVError::SUCCESS));
+    ASSERT_NE(result.ranks, nullptr);
+    EXPECT_TRUE(result.converged);
+    EXPECT_NEAR(result.ranks[0], result.ranks[1], 1e-4f);
+    EXPECT_NEAR(result.ranks[1], result.ranks[2], 1e-4f);
+
+    pagerank_free(&result);
+    csr_destroy(csr);
+}
+
+TEST(PageRankCoreTest, NoCudaBuildKeepsDanglingGraphNormalized) {
+    std::vector<float> adj = {0.0f, 0.0f, 0.0f,
+                              1.0f, 0.0f, 0.0f,
+                              0.0f, 1.0f, 0.0f};
+
+    CSRMatrix* csr = csr_create(0, 0, 0);
+    ASSERT_NE(csr, nullptr);
+    ASSERT_EQ(csr_from_dense(csr, adj.data(), 3, 3), static_cast<int>(SpMVError::SUCCESS));
+
+    PageRankConfig config;
+    config.max_iterations = 100;
+    config.tolerance = 1e-6f;
+
+    PageRankResult result = pagerank(csr, &config);
+
+    ASSERT_EQ(result.error_code, static_cast<int>(SpMVError::SUCCESS));
+    ASSERT_NE(result.ranks, nullptr);
+
+    float sum = result.ranks[0] + result.ranks[1] + result.ranks[2];
+    EXPECT_NEAR(sum, 1.0f, 1e-4f);
+    EXPECT_GE(result.ranks[0], 0.0f);
+    EXPECT_GE(result.ranks[1], 0.0f);
+    EXPECT_GE(result.ranks[2], 0.0f);
+
+    pagerank_free(&result);
+    csr_destroy(csr);
+}
+
+#endif