From 15ad866a7c90a417bf7d0c9a008d659a175eec0f Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Fri, 22 May 2026 11:15:59 +0800 Subject: [PATCH] refactor(core): deepen internal execution seams Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- AGENTS.md | 3 +- CLAUDE.md | 5 +- CMakeLists.txt | 102 +++++++++++------ include/spmv/common.h | 2 +- include/spmv/cuda_buffer.h | 2 +- include/spmv/cuda_compat.h | 150 ++++++++++++++++++++++++ include/spmv/spmv.h | 13 +-- src/bandwidth.cpp | 3 +- src/csr_matrix.cpp | 197 ++----------------------------- src/ell_matrix.cpp | 153 ++----------------------- src/internal/csr_device.cpp | 204 +++++++++++++++++++++++++++++++++ src/internal/csr_device.h | 5 + src/internal/ell_device.cpp | 160 ++++++++++++++++++++++++++ src/internal/ell_device.h | 5 + src/internal/pagerank_common.h | 15 +++ src/internal/texture_cache.h | 13 +++ src/no_cuda_stubs.cpp | 170 +++++++++++++++++++++++++++ src/pagerank.cu | 71 +----------- src/pagerank_common.cpp | 82 +++++++++++++ src/spmv_context.cpp | 58 ++++++++++ src/spmv_kernels.cu | 56 +-------- tests/test_architecture.cpp | 20 ++++ tests/test_no_cuda.cpp | 44 +++++++ tests/test_pagerank_core.cpp | 60 ++++++++++ 24 files changed, 1090 insertions(+), 503 deletions(-) create mode 100644 include/spmv/cuda_compat.h create mode 100644 src/internal/csr_device.cpp create mode 100644 src/internal/ell_device.cpp create mode 100644 src/internal/pagerank_common.h create mode 100644 src/internal/texture_cache.h create mode 100644 src/no_cuda_stubs.cpp create mode 100644 src/pagerank_common.cpp create mode 100644 src/spmv_context.cpp create mode 100644 tests/test_architecture.cpp create mode 100644 tests/test_no_cuda.cpp create mode 100644 tests/test_pagerank_core.cpp diff --git a/AGENTS.md b/AGENTS.md index d2b0c6c..60b708d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,6 +101,7 @@ cmake --preset release && cmake --build --preset release # CPU-only(无 GPU 环境,CI 使用此配置) cmake -S . -B build-no-cuda -DSPMV_REQUIRE_CUDA=OFF && cmake --build build-no-cuda +ctest --test-dir build-no-cuda --output-on-failure # 运行测试 ctest --preset default @@ -110,7 +111,7 @@ ctest --preset default find src include tests benchmarks -type f \( -name "*.cpp" -o -name "*.h" -o -name "*.cu" \) | xargs clang-format -i ``` -> **CI 无 GPU**:需要 CUDA 设备的测试在 CI 中会跳过。`benchmarks/main.cu` 和 `pagerank.cu` 在无 GPU 时自动退出。 +> **CI 无 GPU**:CPU-only 配置会构建 core library + CPU 测试;需要 CUDA 设备的测试、基准程序和 PageRank CUDA 实现不会参与该配置。 --- diff --git a/CLAUDE.md b/CLAUDE.md index 79ba41d..1be3379 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,7 +35,7 @@ cmake - S.- B build - no - cuda - DSPMV_REQUIRE_CUDA = OFF && cmake-- build build - no - - cuda + cuda && ctest --test-dir build-no-cuda --output-on-failure #测试 ctest-- preset default @@ -62,7 +62,8 @@ ##CI 特殊说明 - - CI 无 GPU:`benchmarks / main.cu`、`src / pagerank.cu` 无 GPU 时自动退出 + - CI 无 GPU:CPU-only 配置会构建 core library + CPU 测试;CUDA 测试 / benchmark / + PageRank CUDA 路径不参与该配置 - CI 使用 clang - format - 18 检查格式 - CPU - only 构建:`cmake - S.- B build - no - cuda - DSPMV_REQUIRE_CUDA = OFF` diff --git a/CMakeLists.txt b/CMakeLists.txt index 273f09b..e20a614 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,41 +8,60 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -if(NOT SPMV_REQUIRE_CUDA) - message(STATUS "SPMV_REQUIRE_CUDA=OFF; configuring in no-CUDA mode. No build targets will be generated.") - enable_testing() - return() -endif() - -check_language(CUDA) -if(NOT CMAKE_CUDA_COMPILER) - message(FATAL_ERROR "CUDA toolkit with nvcc is required to build gpu-spmv. Set CUDAToolkit_ROOT or ensure nvcc is available on PATH, or configure with -DSPMV_REQUIRE_CUDA=OFF for a configure-only fallback.") -endif() - -enable_language(CUDA) -set(CMAKE_CUDA_STANDARD 17) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) - -# CUDA 架构设置 -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86 89 90) +set(SPMV_WITH_CUDA OFF) + +if(SPMV_REQUIRE_CUDA) + check_language(CUDA) + if(NOT CMAKE_CUDA_COMPILER) + message(FATAL_ERROR "CUDA toolkit with nvcc is required to build gpu-spmv. Set CUDAToolkit_ROOT or ensure nvcc is available on PATH, or configure with -DSPMV_REQUIRE_CUDA=OFF for a CPU-only fallback.") + endif() + + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(SPMV_WITH_CUDA ON) + + # CUDA 架构设置 + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 70 75 80 86 89 90) + endif() +else() + message(STATUS "SPMV_REQUIRE_CUDA=OFF; configuring in no-CUDA mode. Building core library and CPU tests only.") endif() # ---------- 主库 ---------- set(SPMV_SOURCES src/csr_matrix.cpp src/ell_matrix.cpp + src/internal/csr_device.cpp + src/internal/ell_device.cpp src/spmv_cpu.cpp + src/spmv_context.cpp src/internal/kernel_selector.cpp - src/spmv_kernels.cu src/bandwidth.cpp - src/benchmark.cu - src/pagerank.cu + src/pagerank_common.cpp ) +if(SPMV_WITH_CUDA) + list(APPEND SPMV_SOURCES + src/spmv_kernels.cu + src/benchmark.cu + src/pagerank.cu + ) +else() + list(APPEND SPMV_SOURCES + src/no_cuda_stubs.cpp + ) +endif() + add_library(spmv STATIC ${SPMV_SOURCES}) target_include_directories(spmv PUBLIC ${CMAKE_SOURCE_DIR}/include) -set_target_properties(spmv PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +if(SPMV_WITH_CUDA) + target_compile_definitions(spmv PUBLIC SPMV_WITH_CUDA=1) + set_target_properties(spmv PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +else() + target_compile_definitions(spmv PUBLIC SPMV_WITH_CUDA=0) +endif() # CUDA 编译选项 target_compile_options(spmv PRIVATE @@ -64,19 +83,32 @@ enable_testing() # ---------- 测试 ---------- set(TEST_SOURCES + tests/test_architecture.cpp tests/test_common.cpp tests/test_csr.cpp tests/test_ell.cpp tests/test_kernel_selector.cpp - tests/test_spmv.cu - tests/test_bandwidth.cu - tests/test_benchmark.cu - tests/test_pagerank.cu + tests/test_no_cuda.cpp + tests/test_pagerank_core.cpp ) +if(SPMV_WITH_CUDA) + list(APPEND TEST_SOURCES + tests/test_spmv.cu + tests/test_bandwidth.cu + tests/test_benchmark.cu + tests/test_pagerank.cu + ) +endif() + add_executable(spmv_tests ${TEST_SOURCES}) target_link_libraries(spmv_tests PRIVATE spmv GTest::gtest_main) -set_target_properties(spmv_tests PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +if(SPMV_WITH_CUDA) + target_compile_definitions(spmv_tests PRIVATE SPMV_WITH_CUDA=1) + set_target_properties(spmv_tests PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +else() + target_compile_definitions(spmv_tests PRIVATE SPMV_WITH_CUDA=0) +endif() target_compile_options(spmv_tests PRIVATE $<$:-lineinfo> ) @@ -84,13 +116,15 @@ target_compile_options(spmv_tests PRIVATE include(GoogleTest) gtest_discover_tests(spmv_tests) -# ---------- 基准测试 ---------- -add_executable(spmv_benchmark benchmarks/main.cu) -target_link_libraries(spmv_benchmark PRIVATE spmv) -set_target_properties(spmv_benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON) -target_compile_options(spmv_benchmark PRIVATE - $<$:-lineinfo> -) +if(SPMV_WITH_CUDA) + # ---------- 基准测试 ---------- + add_executable(spmv_benchmark benchmarks/main.cu) + target_link_libraries(spmv_benchmark PRIVATE spmv) + set_target_properties(spmv_benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + target_compile_options(spmv_benchmark PRIVATE + $<$:-lineinfo> + ) +endif() # ---------- Installation ---------- include(GNUInstallDirs) diff --git a/include/spmv/common.h b/include/spmv/common.h index 5785240..6920454 100644 --- a/include/spmv/common.h +++ b/include/spmv/common.h @@ -1,7 +1,7 @@ #ifndef SPMV_COMMON_H #define SPMV_COMMON_H -#include +#include "cuda_compat.h" #include #include diff --git a/include/spmv/cuda_buffer.h b/include/spmv/cuda_buffer.h index 36fec6a..38efb25 100644 --- a/include/spmv/cuda_buffer.h +++ b/include/spmv/cuda_buffer.h @@ -1,7 +1,7 @@ #ifndef SPMV_CUDA_BUFFER_H #define SPMV_CUDA_BUFFER_H -#include +#include "cuda_compat.h" #include #include diff --git a/include/spmv/cuda_compat.h b/include/spmv/cuda_compat.h new file mode 100644 index 0000000..d0d92ad --- /dev/null +++ b/include/spmv/cuda_compat.h @@ -0,0 +1,150 @@ +#ifndef SPMV_CUDA_COMPAT_H +#define SPMV_CUDA_COMPAT_H + +#if defined(SPMV_WITH_CUDA) && SPMV_WITH_CUDA + +#include + +#else + +#include +#include +#include +#include + +using cudaError_t = int; +using cudaTextureObject_t = std::uintptr_t; +using cudaEvent_t = void*; + +constexpr cudaError_t cudaSuccess = 0; +constexpr cudaError_t cudaErrorMemoryAllocation = 2; +constexpr cudaError_t cudaErrorInvalidValue = 11; + +enum cudaMemcpyKind { + cudaMemcpyHostToHost = 0, + cudaMemcpyHostToDevice = 1, + cudaMemcpyDeviceToHost = 2, + cudaMemcpyDeviceToDevice = 3 +}; + +enum { + cudaResourceTypeLinear = 0, + cudaAddressModeClamp = 0, + cudaFilterModePoint = 0, + cudaReadModeElementType = 0 +}; + +struct cudaChannelFormatDesc { + int x = 0; + int y = 0; + int z = 0; + int w = 0; + int f = 0; +}; + +template +inline cudaChannelFormatDesc cudaCreateChannelDesc() { + return {}; +} + +struct cudaResourceDesc { + int resType = cudaResourceTypeLinear; + struct { + struct { + void* devPtr = nullptr; + cudaChannelFormatDesc desc{}; + size_t sizeInBytes = 0; + } linear; + } res; +}; + +struct cudaTextureDesc { + int addressMode[3] = {cudaAddressModeClamp, cudaAddressModeClamp, cudaAddressModeClamp}; + int filterMode = cudaFilterModePoint; + int readMode = cudaReadModeElementType; + int normalizedCoords = 0; +}; + +struct cudaDeviceProp { + int memoryClockRate = 0; + int memoryBusWidth = 0; +}; + +inline const char* cudaGetErrorString(cudaError_t err) { + switch (err) { + case cudaSuccess: + return "success"; + case cudaErrorMemoryAllocation: + return "memory allocation failed"; + case cudaErrorInvalidValue: + return "invalid value"; + default: + return "cuda unavailable"; + } +} + +inline cudaError_t cudaMalloc(void** ptr, size_t size) { + if (!ptr) { + return cudaErrorInvalidValue; + } + *ptr = (size == 0) ? nullptr : std::malloc(size); + return (size == 0 || *ptr != nullptr) ? cudaSuccess : cudaErrorMemoryAllocation; +} + +template +inline cudaError_t cudaMalloc(T** ptr, size_t size) { + return cudaMalloc(reinterpret_cast(ptr), size); +} + +inline cudaError_t cudaFree(void* ptr) { + std::free(ptr); + return cudaSuccess; +} + +inline cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind) { + if (count > 0 && (!dst || !src)) { + return cudaErrorInvalidValue; + } + if (count > 0) { + std::memcpy(dst, src, count); + } + return cudaSuccess; +} + +inline cudaError_t cudaMemset(void* dst, int value, size_t count) { + if (count > 0 && !dst) { + return cudaErrorInvalidValue; + } + if (count > 0) { + std::memset(dst, value, count); + } + return cudaSuccess; +} + +inline cudaError_t cudaCreateTextureObject(cudaTextureObject_t* tex, + const cudaResourceDesc*, + const cudaTextureDesc*, + const void*) { + static cudaTextureObject_t next_texture = 1; + if (!tex) { + return cudaErrorInvalidValue; + } + *tex = next_texture++; + return cudaSuccess; +} + +inline cudaError_t cudaDestroyTextureObject(cudaTextureObject_t) { + return cudaSuccess; +} + +inline cudaError_t cudaGetDeviceProperties(cudaDeviceProp* prop, int) { + if (!prop) { + return cudaErrorInvalidValue; + } + *prop = {}; + return cudaSuccess; +} + +#endif + +#endif // SPMV_CUDA_COMPAT_H diff --git a/include/spmv/spmv.h b/include/spmv/spmv.h index 5cc4df1..627afb0 100644 --- a/include/spmv/spmv.h +++ b/include/spmv/spmv.h @@ -129,16 +129,11 @@ class SpMVExecutionContext { /** @brief Query whether a texture object is currently bound. */ bool is_texture_bound() const { return tex_x_ != 0; } - /** - * @brief Prepare texture object for input vector x. - * @return 0 on success, negative error code on failure. - * - * Internal API; defined in spmv_kernels.cu. - */ - int prepare_texture(const float* d_x, size_t x_length, bool requested, - cudaTextureObject_t* tex_out, bool* use_texture_out); - private: + friend int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length, + bool requested, cudaTextureObject_t* tex_out, + bool* use_texture_out); + cudaTextureObject_t tex_x_ = 0; const float* cached_x_ = nullptr; size_t cached_x_length_ = 0; diff --git a/src/bandwidth.cpp b/src/bandwidth.cpp index b001050..51dc95d 100644 --- a/src/bandwidth.cpp +++ b/src/bandwidth.cpp @@ -1,6 +1,5 @@ #include "spmv/bandwidth.h" - -#include +#include "spmv/cuda_compat.h" #include #include diff --git a/src/csr_matrix.cpp b/src/csr_matrix.cpp index f692490..11b8a1b 100644 --- a/src/csr_matrix.cpp +++ b/src/csr_matrix.cpp @@ -1,3 +1,4 @@ +#include "internal/csr_device.h" #include "spmv/csr_matrix.h" #include @@ -9,79 +10,6 @@ namespace spmv { -// Internal device state ------------------------------------------------------- -struct CSRMatrixInternal { - float* d_values = nullptr; - int* d_col_indices = nullptr; - int* d_row_ptrs = nullptr; -}; - -// Helpers -------------------------------------------------------------------- -static CSRMatrixInternal* get_internal(CSRMatrix* mat) { - return mat ? static_cast(mat->internal) : nullptr; -} - -static const CSRMatrixInternal* get_internal(const CSRMatrix* mat) { - return mat ? static_cast(mat->internal) : nullptr; -} - -static void free_device(CSRMatrixInternal* internal) { - if (!internal) - return; - if (internal->d_values) { - cudaFree(internal->d_values); - internal->d_values = nullptr; - } - if (internal->d_col_indices) { - cudaFree(internal->d_col_indices); - internal->d_col_indices = nullptr; - } - if (internal->d_row_ptrs) { - cudaFree(internal->d_row_ptrs); - internal->d_row_ptrs = nullptr; - } -} - -// Internal API (used by src/ only) ------------------------------------------- -float* csr_d_values(CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_values : nullptr; -} - -const float* csr_d_values(const CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_values : nullptr; -} - -int* csr_d_col_indices(CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_col_indices : nullptr; -} - -const int* csr_d_col_indices(const CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_col_indices : nullptr; -} - -int* csr_d_row_ptrs(CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_row_ptrs : nullptr; -} - -const int* csr_d_row_ptrs(const CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_row_ptrs : nullptr; -} - -bool csr_has_device_data(const CSRMatrix* mat) { - auto* internal = get_internal(mat); - return internal && internal->d_row_ptrs != nullptr; -} - -void csr_free_device_data(CSRMatrix* mat) { - free_device(get_internal(mat)); -} - // Public API ----------------------------------------------------------------- CSRMatrix* csr_create(int rows, int cols, int nnz) { @@ -97,7 +25,7 @@ CSRMatrix* csr_create(int rows, int cols, int nnz) { mat->values = (nnz > 0) ? new float[nnz]() : nullptr; mat->col_indices = (nnz > 0) ? new int[nnz]() : nullptr; mat->row_ptrs = new int[rows + 1](); - mat->internal = new CSRMatrixInternal(); + mat->internal = csr_create_device_state(); return mat; } @@ -110,11 +38,7 @@ void csr_destroy(CSRMatrix* mat) { delete[] mat->col_indices; delete[] mat->row_ptrs; - auto* internal = get_internal(mat); - if (internal) { - free_device(internal); - delete internal; - } + csr_destroy_device_state(mat); delete mat; } @@ -129,7 +53,7 @@ int csr_from_dense(CSRMatrix* csr, const float* dense, int rows, int cols) { } // Host-side mutation invalidates the device mirror immediately. - free_device(get_internal(csr)); + csr_free_device_data(csr); int nnz = 0; for (int i = 0; i < rows * cols; i++) { @@ -208,116 +132,9 @@ float csr_get_element(const CSRMatrix* mat, int row, int col) { return 0.0f; } -int csr_to_gpu(CSRMatrix* mat) { - if (!mat) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - if (!mat->row_ptrs || (mat->nnz > 0 && (!mat->values || !mat->col_indices))) { - return static_cast(SpMVError::INVALID_FORMAT); - } - - auto* internal = get_internal(mat); - if (!internal) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - // Free any existing device data before allocating new buffers. - free_device(internal); - - float* new_d_values = nullptr; - int* new_d_col_indices = nullptr; - int* new_d_row_ptrs = nullptr; +int csr_to_gpu(CSRMatrix* mat) { return csr_upload_device_data(mat); } - auto cleanup = [&]() { - if (new_d_values) - cudaFree(new_d_values); - if (new_d_col_indices) - cudaFree(new_d_col_indices); - if (new_d_row_ptrs) - cudaFree(new_d_row_ptrs); - }; - - if (mat->nnz > 0) { - cudaError_t err = - cudaMalloc(reinterpret_cast(&new_d_values), mat->nnz * sizeof(float)); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MALLOC); - } - - err = cudaMalloc(reinterpret_cast(&new_d_col_indices), mat->nnz * sizeof(int)); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MALLOC); - } - } - - cudaError_t err = - cudaMalloc(reinterpret_cast(&new_d_row_ptrs), (mat->num_rows + 1) * sizeof(int)); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MALLOC); - } - - if (mat->nnz > 0) { - err = - cudaMemcpy(new_d_values, mat->values, mat->nnz * sizeof(float), cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MEMCPY); - } - - err = cudaMemcpy(new_d_col_indices, mat->col_indices, mat->nnz * sizeof(int), - cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MEMCPY); - } - } - - err = cudaMemcpy(new_d_row_ptrs, mat->row_ptrs, (mat->num_rows + 1) * sizeof(int), - cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MEMCPY); - } - - internal->d_values = new_d_values; - internal->d_col_indices = new_d_col_indices; - internal->d_row_ptrs = new_d_row_ptrs; - - return static_cast(SpMVError::SUCCESS); -} - -int csr_from_gpu(CSRMatrix* mat) { - if (!mat) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - auto* internal = get_internal(mat); - if (!internal || !internal->d_row_ptrs) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - if (!mat->row_ptrs) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - if (mat->nnz > 0 && internal->d_values && internal->d_col_indices) { - if (!mat->values || !mat->col_indices) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - CUDA_CHECK_MEMCPY(cudaMemcpy(mat->values, internal->d_values, mat->nnz * sizeof(float), - cudaMemcpyDeviceToHost)); - CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, - mat->nnz * sizeof(int), cudaMemcpyDeviceToHost)); - } - CUDA_CHECK_MEMCPY(cudaMemcpy(mat->row_ptrs, internal->d_row_ptrs, - (mat->num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); - - return static_cast(SpMVError::SUCCESS); -} +int csr_from_gpu(CSRMatrix* mat) { return csr_download_device_data(mat); } int csr_serialize(const CSRMatrix* mat, const char* filename) { if (!mat || !filename) { @@ -396,7 +213,7 @@ int csr_deserialize(CSRMatrix* mat, const char* filename) { } // Host-side mutation invalidates device mirror. - free_device(get_internal(mat)); + csr_free_device_data(mat); delete[] mat->values; delete[] mat->col_indices; diff --git a/src/ell_matrix.cpp b/src/ell_matrix.cpp index 6c5f69f..58b9673 100644 --- a/src/ell_matrix.cpp +++ b/src/ell_matrix.cpp @@ -1,70 +1,14 @@ +#include "internal/ell_device.h" #include "spmv/ell_matrix.h" #include +#include #include #include #include namespace spmv { -// Internal device state ------------------------------------------------------- -struct ELLMatrixInternal { - float* d_values = nullptr; - int* d_col_indices = nullptr; -}; - -// Helpers -------------------------------------------------------------------- -static ELLMatrixInternal* get_internal(ELLMatrix* mat) { - return mat ? static_cast(mat->internal) : nullptr; -} - -static const ELLMatrixInternal* get_internal(const ELLMatrix* mat) { - return mat ? static_cast(mat->internal) : nullptr; -} - -static void free_device(ELLMatrixInternal* internal) { - if (!internal) - return; - if (internal->d_values) { - cudaFree(internal->d_values); - internal->d_values = nullptr; - } - if (internal->d_col_indices) { - cudaFree(internal->d_col_indices); - internal->d_col_indices = nullptr; - } -} - -// Internal API (used by src/ only) ------------------------------------------- -float* ell_d_values(ELLMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_values : nullptr; -} - -const float* ell_d_values(const ELLMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_values : nullptr; -} - -int* ell_d_col_indices(ELLMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_col_indices : nullptr; -} - -const int* ell_d_col_indices(const ELLMatrix* mat) { - auto* internal = get_internal(mat); - return internal ? internal->d_col_indices : nullptr; -} - -bool ell_has_device_data(const ELLMatrix* mat) { - auto* internal = get_internal(mat); - return internal && internal->d_values != nullptr; -} - -void ell_free_device_data(ELLMatrix* mat) { - free_device(get_internal(mat)); -} - // Public API ----------------------------------------------------------------- ELLMatrix* ell_create(int rows, int cols, int max_nnz_per_row) { @@ -88,7 +32,7 @@ ELLMatrix* ell_create(int rows, int cols, int max_nnz_per_row) { } } - mat->internal = new ELLMatrixInternal(); + mat->internal = ell_create_device_state(); return mat; } @@ -100,11 +44,7 @@ void ell_destroy(ELLMatrix* mat) { delete[] mat->values; delete[] mat->col_indices; - auto* internal = get_internal(mat); - if (internal) { - free_device(internal); - delete internal; - } + ell_destroy_device_state(mat); delete mat; } @@ -118,7 +58,7 @@ int ell_from_dense(ELLMatrix* ell, const float* dense, int rows, int cols) { return static_cast(SpMVError::INVALID_ARGUMENT); } - free_device(get_internal(ell)); + ell_free_device_data(ell); int max_nnz = 0; for (int i = 0; i < rows; i++) { @@ -177,7 +117,7 @@ int ell_from_csr(ELLMatrix* ell, const CSRMatrix* csr) { return static_cast(SpMVError::INVALID_FORMAT); } - free_device(get_internal(ell)); + ell_free_device_data(ell); int max_nnz = 0; for (int i = 0; i < csr->num_rows; i++) { @@ -260,84 +200,9 @@ float ell_get_element(const ELLMatrix* mat, int row, int col) { return 0.0f; } -int ell_to_gpu(ELLMatrix* mat) { - if (!mat) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - size_t size = static_cast(mat->num_rows) * mat->max_nnz_per_row; - if (size > 0 && (!mat->values || !mat->col_indices)) { - return static_cast(SpMVError::INVALID_FORMAT); - } - - auto* internal = get_internal(mat); - if (!internal) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - free_device(internal); - - float* new_d_values = nullptr; - int* new_d_col_indices = nullptr; - - auto cleanup = [&]() { - if (new_d_values) - cudaFree(new_d_values); - if (new_d_col_indices) - cudaFree(new_d_col_indices); - }; - - if (size > 0) { - cudaError_t err = cudaMalloc(reinterpret_cast(&new_d_values), size * sizeof(float)); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MALLOC); - } - - err = cudaMalloc(reinterpret_cast(&new_d_col_indices), size * sizeof(int)); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MALLOC); - } - - err = cudaMemcpy(new_d_values, mat->values, size * sizeof(float), cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MEMCPY); - } - - err = cudaMemcpy(new_d_col_indices, mat->col_indices, size * sizeof(int), - cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - cleanup(); - return static_cast(SpMVError::CUDA_MEMCPY); - } - } - - internal->d_values = new_d_values; - internal->d_col_indices = new_d_col_indices; - return static_cast(SpMVError::SUCCESS); -} - -int ell_from_gpu(ELLMatrix* mat) { - if (!mat) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } +int ell_to_gpu(ELLMatrix* mat) { return ell_upload_device_data(mat); } - auto* internal = get_internal(mat); - size_t size = static_cast(mat->num_rows) * mat->max_nnz_per_row; - if (size > 0 && internal && internal->d_values && internal->d_col_indices) { - if (!mat->values || !mat->col_indices) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - CUDA_CHECK_MEMCPY( - cudaMemcpy(mat->values, internal->d_values, size * sizeof(float), cudaMemcpyDeviceToHost)); - CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, size * sizeof(int), - cudaMemcpyDeviceToHost)); - } - - return static_cast(SpMVError::SUCCESS); -} +int ell_from_gpu(ELLMatrix* mat) { return ell_download_device_data(mat); } int ell_serialize(const ELLMatrix* mat, const char* filename) { if (!mat || !filename) { @@ -412,7 +277,7 @@ int ell_deserialize(ELLMatrix* mat, const char* filename) { return static_cast(SpMVError::FILE_IO); } - free_device(get_internal(mat)); + ell_free_device_data(mat); delete[] mat->values; delete[] mat->col_indices; diff --git a/src/internal/csr_device.cpp b/src/internal/csr_device.cpp new file mode 100644 index 0000000..f92e5d2 --- /dev/null +++ b/src/internal/csr_device.cpp @@ -0,0 +1,204 @@ +#include "csr_device.h" + +namespace spmv { + +namespace { + +struct CSRMatrixInternal { + float* d_values = nullptr; + int* d_col_indices = nullptr; + int* d_row_ptrs = nullptr; +}; + +CSRMatrixInternal* get_internal(CSRMatrix* mat) { + return mat ? static_cast(mat->internal) : nullptr; +} + +const CSRMatrixInternal* get_internal(const CSRMatrix* mat) { + return mat ? static_cast(mat->internal) : nullptr; +} + +void free_device(CSRMatrixInternal* internal) { + if (!internal) { + return; + } + if (internal->d_values) { + cudaFree(internal->d_values); + internal->d_values = nullptr; + } + if (internal->d_col_indices) { + cudaFree(internal->d_col_indices); + internal->d_col_indices = nullptr; + } + if (internal->d_row_ptrs) { + cudaFree(internal->d_row_ptrs); + internal->d_row_ptrs = nullptr; + } +} + +} // namespace + +void* csr_create_device_state() { + return new CSRMatrixInternal(); +} + +void csr_destroy_device_state(CSRMatrix* mat) { + auto* internal = get_internal(mat); + if (!internal) { + return; + } + free_device(internal); + delete internal; + mat->internal = nullptr; +} + +float* csr_d_values(CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_values : nullptr; +} + +const float* csr_d_values(const CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_values : nullptr; +} + +int* csr_d_col_indices(CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_col_indices : nullptr; +} + +const int* csr_d_col_indices(const CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_col_indices : nullptr; +} + +int* csr_d_row_ptrs(CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_row_ptrs : nullptr; +} + +const int* csr_d_row_ptrs(const CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_row_ptrs : nullptr; +} + +bool csr_has_device_data(const CSRMatrix* mat) { + auto* internal = get_internal(mat); + return internal && internal->d_row_ptrs != nullptr; +} + +void csr_free_device_data(CSRMatrix* mat) { + free_device(get_internal(mat)); +} + +int csr_upload_device_data(CSRMatrix* mat) { + if (!mat) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + if (!mat->row_ptrs || (mat->nnz > 0 && (!mat->values || !mat->col_indices))) { + return static_cast(SpMVError::INVALID_FORMAT); + } + + auto* internal = get_internal(mat); + if (!internal) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + free_device(internal); + + float* new_d_values = nullptr; + int* new_d_col_indices = nullptr; + int* new_d_row_ptrs = nullptr; + + auto cleanup = [&]() { + if (new_d_values) { + cudaFree(new_d_values); + } + if (new_d_col_indices) { + cudaFree(new_d_col_indices); + } + if (new_d_row_ptrs) { + cudaFree(new_d_row_ptrs); + } + }; + + if (mat->nnz > 0) { + cudaError_t err = cudaMalloc(reinterpret_cast(&new_d_values), mat->nnz * sizeof(float)); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MALLOC); + } + + err = cudaMalloc(reinterpret_cast(&new_d_col_indices), mat->nnz * sizeof(int)); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MALLOC); + } + } + + cudaError_t err = cudaMalloc(reinterpret_cast(&new_d_row_ptrs), + (mat->num_rows + 1) * sizeof(int)); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MALLOC); + } + + if (mat->nnz > 0) { + err = cudaMemcpy(new_d_values, mat->values, mat->nnz * sizeof(float), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MEMCPY); + } + + err = cudaMemcpy(new_d_col_indices, mat->col_indices, mat->nnz * sizeof(int), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MEMCPY); + } + } + + err = cudaMemcpy(new_d_row_ptrs, mat->row_ptrs, (mat->num_rows + 1) * sizeof(int), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MEMCPY); + } + + internal->d_values = new_d_values; + internal->d_col_indices = new_d_col_indices; + internal->d_row_ptrs = new_d_row_ptrs; + return static_cast(SpMVError::SUCCESS); +} + +int csr_download_device_data(CSRMatrix* mat) { + if (!mat) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + auto* internal = get_internal(mat); + if (!internal || !internal->d_row_ptrs) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + if (!mat->row_ptrs) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + if (mat->nnz > 0 && internal->d_values && internal->d_col_indices) { + if (!mat->values || !mat->col_indices) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + CUDA_CHECK_MEMCPY(cudaMemcpy(mat->values, internal->d_values, mat->nnz * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, + mat->nnz * sizeof(int), cudaMemcpyDeviceToHost)); + } + CUDA_CHECK_MEMCPY(cudaMemcpy(mat->row_ptrs, internal->d_row_ptrs, + (mat->num_rows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); + + return static_cast(SpMVError::SUCCESS); +} + +} // namespace spmv diff --git a/src/internal/csr_device.h b/src/internal/csr_device.h index 7b1f4cb..3c60994 100644 --- a/src/internal/csr_device.h +++ b/src/internal/csr_device.h @@ -9,6 +9,9 @@ namespace spmv { // These are NOT part of the public API; they are exposed only to // compilation units inside src/ that need to launch kernels. +void* csr_create_device_state(); +void csr_destroy_device_state(CSRMatrix* mat); + float* csr_d_values(CSRMatrix* mat); const float* csr_d_values(const CSRMatrix* mat); int* csr_d_col_indices(CSRMatrix* mat); @@ -20,6 +23,8 @@ bool csr_has_device_data(const CSRMatrix* mat); // Free only device memory (called internally by to_gpu / destroy). void csr_free_device_data(CSRMatrix* mat); +int csr_upload_device_data(CSRMatrix* mat); +int csr_download_device_data(CSRMatrix* mat); } // namespace spmv diff --git a/src/internal/ell_device.cpp b/src/internal/ell_device.cpp new file mode 100644 index 0000000..1ce7aa6 --- /dev/null +++ b/src/internal/ell_device.cpp @@ -0,0 +1,160 @@ +#include "ell_device.h" + +namespace spmv { + +namespace { + +struct ELLMatrixInternal { + float* d_values = nullptr; + int* d_col_indices = nullptr; +}; + +ELLMatrixInternal* get_internal(ELLMatrix* mat) { + return mat ? static_cast(mat->internal) : nullptr; +} + +const ELLMatrixInternal* get_internal(const ELLMatrix* mat) { + return mat ? static_cast(mat->internal) : nullptr; +} + +void free_device(ELLMatrixInternal* internal) { + if (!internal) { + return; + } + if (internal->d_values) { + cudaFree(internal->d_values); + internal->d_values = nullptr; + } + if (internal->d_col_indices) { + cudaFree(internal->d_col_indices); + internal->d_col_indices = nullptr; + } +} + +} // namespace + +void* ell_create_device_state() { + return new ELLMatrixInternal(); +} + +void ell_destroy_device_state(ELLMatrix* mat) { + auto* internal = get_internal(mat); + if (!internal) { + return; + } + free_device(internal); + delete internal; + mat->internal = nullptr; +} + +float* ell_d_values(ELLMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_values : nullptr; +} + +const float* ell_d_values(const ELLMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_values : nullptr; +} + +int* ell_d_col_indices(ELLMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_col_indices : nullptr; +} + +const int* ell_d_col_indices(const ELLMatrix* mat) { + auto* internal = get_internal(mat); + return internal ? internal->d_col_indices : nullptr; +} + +bool ell_has_device_data(const ELLMatrix* mat) { + auto* internal = get_internal(mat); + return internal && internal->d_values != nullptr; +} + +void ell_free_device_data(ELLMatrix* mat) { + free_device(get_internal(mat)); +} + +int ell_upload_device_data(ELLMatrix* mat) { + if (!mat) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + size_t size = static_cast(mat->num_rows) * mat->max_nnz_per_row; + if (size > 0 && (!mat->values || !mat->col_indices)) { + return static_cast(SpMVError::INVALID_FORMAT); + } + + auto* internal = get_internal(mat); + if (!internal) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + free_device(internal); + + float* new_d_values = nullptr; + int* new_d_col_indices = nullptr; + + auto cleanup = [&]() { + if (new_d_values) { + cudaFree(new_d_values); + } + if (new_d_col_indices) { + cudaFree(new_d_col_indices); + } + }; + + if (size > 0) { + cudaError_t err = cudaMalloc(reinterpret_cast(&new_d_values), size * sizeof(float)); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MALLOC); + } + + err = cudaMalloc(reinterpret_cast(&new_d_col_indices), size * sizeof(int)); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MALLOC); + } + + err = cudaMemcpy(new_d_values, mat->values, size * sizeof(float), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MEMCPY); + } + + err = cudaMemcpy(new_d_col_indices, mat->col_indices, size * sizeof(int), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + cleanup(); + return static_cast(SpMVError::CUDA_MEMCPY); + } + } + + internal->d_values = new_d_values; + internal->d_col_indices = new_d_col_indices; + return static_cast(SpMVError::SUCCESS); +} + +int ell_download_device_data(ELLMatrix* mat) { + if (!mat) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + auto* internal = get_internal(mat); + size_t size = static_cast(mat->num_rows) * mat->max_nnz_per_row; + if (size > 0 && internal && internal->d_values && internal->d_col_indices) { + if (!mat->values || !mat->col_indices) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + CUDA_CHECK_MEMCPY( + cudaMemcpy(mat->values, internal->d_values, size * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK_MEMCPY(cudaMemcpy(mat->col_indices, internal->d_col_indices, size * sizeof(int), + cudaMemcpyDeviceToHost)); + } + + return static_cast(SpMVError::SUCCESS); +} + +} // namespace spmv diff --git a/src/internal/ell_device.h b/src/internal/ell_device.h index 37e21fb..538c451 100644 --- a/src/internal/ell_device.h +++ b/src/internal/ell_device.h @@ -8,6 +8,9 @@ namespace spmv { // Internal device-state accessors for ELLMatrix. // These are NOT part of the public API. +void* ell_create_device_state(); +void ell_destroy_device_state(ELLMatrix* mat); + float* ell_d_values(ELLMatrix* mat); const float* ell_d_values(const ELLMatrix* mat); int* ell_d_col_indices(ELLMatrix* mat); @@ -17,6 +20,8 @@ bool ell_has_device_data(const ELLMatrix* mat); // Free only device memory (called internally by to_gpu / destroy). void ell_free_device_data(ELLMatrix* mat); +int ell_upload_device_data(ELLMatrix* mat); +int ell_download_device_data(ELLMatrix* mat); } // namespace spmv diff --git a/src/internal/pagerank_common.h b/src/internal/pagerank_common.h new file mode 100644 index 0000000..f60d94b --- /dev/null +++ b/src/internal/pagerank_common.h @@ -0,0 +1,15 @@ +#ifndef SPMV_INTERNAL_PAGERANK_COMMON_H +#define SPMV_INTERNAL_PAGERANK_COMMON_H + +#include "spmv/pagerank.h" + +#include + +namespace spmv { + +std::vector pagerank_find_dangling_nodes(const CSRMatrix* adj_matrix); +void pagerank_normalize(float* ranks, int n); + +} // namespace spmv + +#endif // SPMV_INTERNAL_PAGERANK_COMMON_H diff --git a/src/internal/texture_cache.h b/src/internal/texture_cache.h new file mode 100644 index 0000000..5f8ffa2 --- /dev/null +++ b/src/internal/texture_cache.h @@ -0,0 +1,13 @@ +#ifndef SPMV_INTERNAL_TEXTURE_CACHE_H +#define SPMV_INTERNAL_TEXTURE_CACHE_H + +#include "spmv/spmv.h" + +namespace spmv { + +int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length, + bool requested, cudaTextureObject_t* tex_out, bool* use_texture_out); + +} // namespace spmv + +#endif // SPMV_INTERNAL_TEXTURE_CACHE_H diff --git a/src/no_cuda_stubs.cpp b/src/no_cuda_stubs.cpp new file mode 100644 index 0000000..985dcfd --- /dev/null +++ b/src/no_cuda_stubs.cpp @@ -0,0 +1,170 @@ +#include "spmv/benchmark.h" +#include "spmv/pagerank.h" +#include "spmv/spmv.h" +#include "internal/pagerank_common.h" + +#include +#include +#include +#include + +namespace spmv { + +namespace { + +int no_cuda_error() { + return static_cast(SpMVError::KERNEL_LAUNCH); +} + +} // namespace + +SpMVResult spmv_csr(const CSRMatrix*, const float*, float* d_y, const SpMVConfig*, int, + SpMVExecutionContext*) { + SpMVResult result; + result.y = d_y; + result.error_code = no_cuda_error(); + return result; +} + +SpMVResult spmv_ell(const ELLMatrix*, const float*, float* d_y, const SpMVConfig*, int, + SpMVExecutionContext*) { + SpMVResult result; + result.y = d_y; + result.error_code = no_cuda_error(); + return result; +} + +BenchmarkResult benchmark_csr(const CSRMatrix*, const float*, const SpMVConfig*, + const BenchmarkConfig*) { + BenchmarkResult result; + result.error_code = no_cuda_error(); + return result; +} + +BenchmarkResult benchmark_ell(const ELLMatrix*, const float*, const BenchmarkConfig*) { + BenchmarkResult result; + result.error_code = no_cuda_error(); + return result; +} + +ComparisonResult compare_gpu_cpu_csr(const CSRMatrix*, const float*, const SpMVConfig*, + const BenchmarkConfig*) { + ComparisonResult result; + result.error_code = no_cuda_error(); + result.gpu_result.error_code = no_cuda_error(); + result.cpu_result.error_code = no_cuda_error(); + return result; +} + +std::string benchmark_to_json(const BenchmarkResult& result) { + std::ostringstream json; + json << "{\"name\":\"" << result.name << "\",\"execution_time_ms\":" << result.execution_time_ms + << ",\"gflops\":" << result.gflops << ",\"bandwidth_gb_s\":" << result.bandwidth_gb_s + << ",\"avg_time_ms\":" << result.avg_time_ms << ",\"min_time_ms\":" << result.min_time_ms + << ",\"max_time_ms\":" << result.max_time_ms << ",\"stddev_time_ms\":" + << result.stddev_time_ms << ",\"num_runs\":" << result.num_runs << ",\"error_code\":" + << result.error_code << "}"; + return json.str(); +} + +std::string comparison_to_json(const ComparisonResult& result) { + std::ostringstream json; + json << "{\"speedup\":" << result.speedup << ",\"error_code\":" << result.error_code << "}"; + return json.str(); +} + +BenchmarkResult benchmark_from_json(const std::string&) { + BenchmarkResult result; + result.error_code = no_cuda_error(); + return result; +} + +PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* config) { + PageRankResult result; + + if (!adj_matrix) { + result.error_code = static_cast(SpMVError::INVALID_ARGUMENT); + return result; + } + if (adj_matrix->num_rows < 0 || adj_matrix->num_cols < 0 || adj_matrix->nnz < 0) { + result.error_code = static_cast(SpMVError::INVALID_ARGUMENT); + return result; + } + if (adj_matrix->num_rows != adj_matrix->num_cols) { + result.error_code = static_cast(SpMVError::INVALID_DIMENSION); + return result; + } + if (!adj_matrix->row_ptrs || + (adj_matrix->nnz > 0 && (!adj_matrix->values || !adj_matrix->col_indices))) { + result.error_code = static_cast(SpMVError::INVALID_FORMAT); + return result; + } + + PageRankConfig default_config; + if (!config) { + config = &default_config; + } + if (config->max_iterations < 0 || config->tolerance < 0.0f || + config->damping_factor < 0.0f || config->damping_factor > 1.0f) { + result.error_code = static_cast(SpMVError::INVALID_ARGUMENT); + return result; + } + + int n = adj_matrix->num_rows; + if (n == 0) { + result.converged = true; + result.error_code = static_cast(SpMVError::SUCCESS); + return result; + } + + result.ranks = new (std::nothrow) float[n]; + if (!result.ranks) { + result.error_code = static_cast(SpMVError::OUT_OF_MEMORY); + return result; + } + + float init_rank = 1.0f / static_cast(n); + std::vector next_ranks(n, 0.0f); + for (int i = 0; i < n; i++) { + result.ranks[i] = init_rank; + } + + std::vector dangling_nodes = pagerank_find_dangling_nodes(adj_matrix); + float damping = config->damping_factor; + float teleport = (1.0f - damping) / static_cast(n); + + for (int iter = 0; iter < config->max_iterations; iter++) { + float dangling_sum = 0.0f; + for (int node : dangling_nodes) { + dangling_sum += result.ranks[node]; + } + + spmv_cpu_csr(adj_matrix, result.ranks, next_ranks.data()); + + float dangling_contrib = damping * dangling_sum / static_cast(n); + float residual_sq = 0.0f; + for (int i = 0; i < n; i++) { + next_ranks[i] = damping * next_ranks[i] + dangling_contrib + teleport; + float diff = next_ranks[i] - result.ranks[i]; + residual_sq += diff * diff; + } + + result.iterations = iter + 1; + result.final_residual = std::sqrt(residual_sq); + + for (int i = 0; i < n; i++) { + result.ranks[i] = next_ranks[i]; + } + + if (result.final_residual < config->tolerance) { + result.converged = true; + break; + } + } + + pagerank_normalize(result.ranks, n); + result.error_code = static_cast(SpMVError::SUCCESS); + return result; +} + +} // namespace spmv diff --git a/src/pagerank.cu b/src/pagerank.cu index 66d2371..810d2e2 100644 --- a/src/pagerank.cu +++ b/src/pagerank.cu @@ -1,4 +1,5 @@ #include "internal/csr_device.h" +#include "internal/pagerank_common.h" #include "spmv/cuda_buffer.h" #include "spmv/pagerank.h" #include "spmv/spmv.h" @@ -39,36 +40,6 @@ static int map_cuda_exception_to_spmv_error(const CudaException& e) { : static_cast(SpMVError::CUDA_MEMCPY); } -static std::vector find_dangling_nodes(const CSRMatrix* adj_matrix) { - std::vector dangling; - if (!adj_matrix || adj_matrix->num_cols <= 0 || adj_matrix->num_rows <= 0) { - return dangling; - } - if (!adj_matrix->values || !adj_matrix->col_indices || !adj_matrix->row_ptrs) { - return dangling; - } - - int num_cols = adj_matrix->num_cols; - std::vector col_sums(num_cols, 0.0f); - for (int row = 0; row < adj_matrix->num_rows; row++) { - int start = adj_matrix->row_ptrs[row]; - int end = adj_matrix->row_ptrs[row + 1]; - for (int idx = start; idx < end; idx++) { - int col = adj_matrix->col_indices[idx]; - if (col >= 0 && col < num_cols) { - col_sums[col] += adj_matrix->values[idx]; - } - } - } - - for (int col = 0; col < num_cols; col++) { - if (col_sums[col] == 0.0f) { - dangling.push_back(col); - } - } - return dangling; -} - PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* config) { PageRankResult result; @@ -135,7 +106,7 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi d_ranks_old.copyFromHost(result.ranks, n); - std::vector dangling_nodes = find_dangling_nodes(adj_matrix); + std::vector dangling_nodes = pagerank_find_dangling_nodes(adj_matrix); CudaBuffer d_dangling_nodes(dangling_nodes.size()); if (!dangling_nodes.empty()) { d_dangling_nodes.copyFromHost(dangling_nodes.data(), dangling_nodes.size()); @@ -213,15 +184,7 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi d_ranks_old.copyToHost(result.ranks, n); } - float sum = 0.0f; - for (int i = 0; i < n; i++) { - sum += result.ranks[i]; - } - if (sum > 0.0f) { - for (int i = 0; i < n; i++) { - result.ranks[i] /= sum; - } - } + pagerank_normalize(result.ranks, n); result.error_code = static_cast(SpMVError::SUCCESS); return result; @@ -232,32 +195,4 @@ PageRankResult pagerank(const CSRMatrix* adj_matrix, const PageRankConfig* confi } } -void pagerank_free(PageRankResult* result) { - if (result && result->ranks) { - delete[] result->ranks; - result->ranks = nullptr; - } -} - -void pagerank_top_k(const PageRankResult* result, int num_nodes, int k, TopKNode* top_k) { - if (!result || !result->ranks || !top_k || k <= 0 || num_nodes <= 0 || - result->error_code != static_cast(SpMVError::SUCCESS)) { - return; - } - - std::vector nodes(num_nodes); - for (int i = 0; i < num_nodes; i++) { - nodes[i].node_id = i; - nodes[i].rank = result->ranks[i]; - } - - int actual_k = std::min(k, num_nodes); - std::partial_sort(nodes.begin(), nodes.begin() + actual_k, nodes.end(), - [](const TopKNode& a, const TopKNode& b) { return a.rank > b.rank; }); - - for (int i = 0; i < actual_k; i++) { - top_k[i] = nodes[i]; - } -} - } // namespace spmv diff --git a/src/pagerank_common.cpp b/src/pagerank_common.cpp new file mode 100644 index 0000000..e55edda --- /dev/null +++ b/src/pagerank_common.cpp @@ -0,0 +1,82 @@ +#include "internal/pagerank_common.h" + +#include + +namespace spmv { + +std::vector pagerank_find_dangling_nodes(const CSRMatrix* adj_matrix) { + std::vector dangling; + if (!adj_matrix || adj_matrix->num_cols <= 0 || adj_matrix->num_rows <= 0) { + return dangling; + } + if (!adj_matrix->values || !adj_matrix->col_indices || !adj_matrix->row_ptrs) { + return dangling; + } + + int num_cols = adj_matrix->num_cols; + std::vector col_sums(num_cols, 0.0f); + for (int row = 0; row < adj_matrix->num_rows; row++) { + int start = adj_matrix->row_ptrs[row]; + int end = adj_matrix->row_ptrs[row + 1]; + for (int idx = start; idx < end; idx++) { + int col = adj_matrix->col_indices[idx]; + if (col >= 0 && col < num_cols) { + col_sums[col] += adj_matrix->values[idx]; + } + } + } + + for (int col = 0; col < num_cols; col++) { + if (col_sums[col] == 0.0f) { + dangling.push_back(col); + } + } + return dangling; +} + +void pagerank_normalize(float* ranks, int n) { + if (!ranks || n <= 0) { + return; + } + + float sum = 0.0f; + for (int i = 0; i < n; i++) { + sum += ranks[i]; + } + if (sum <= 0.0f) { + return; + } + for (int i = 0; i < n; i++) { + ranks[i] /= sum; + } +} + +void pagerank_free(PageRankResult* result) { + if (result && result->ranks) { + delete[] result->ranks; + result->ranks = nullptr; + } +} + +void pagerank_top_k(const PageRankResult* result, int num_nodes, int k, TopKNode* top_k) { + if (!result || !result->ranks || !top_k || k <= 0 || num_nodes <= 0 || + result->error_code != static_cast(SpMVError::SUCCESS)) { + return; + } + + std::vector nodes(num_nodes); + for (int i = 0; i < num_nodes; i++) { + nodes[i].node_id = i; + nodes[i].rank = result->ranks[i]; + } + + int actual_k = std::min(k, num_nodes); + std::partial_sort(nodes.begin(), nodes.begin() + actual_k, nodes.end(), + [](const TopKNode& a, const TopKNode& b) { return a.rank > b.rank; }); + + for (int i = 0; i < actual_k; i++) { + top_k[i] = nodes[i]; + } +} + +} // namespace spmv diff --git a/src/spmv_context.cpp b/src/spmv_context.cpp new file mode 100644 index 0000000..fee90d6 --- /dev/null +++ b/src/spmv_context.cpp @@ -0,0 +1,58 @@ +#include "internal/texture_cache.h" + +namespace spmv { + +int spmv_prepare_texture(SpMVExecutionContext* context, const float* d_x, size_t x_length, + bool requested, cudaTextureObject_t* tex_out, bool* use_texture_out) { + if (!tex_out || !use_texture_out) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + *tex_out = 0; + *use_texture_out = false; + + if (!requested || !d_x || x_length == 0) { + if (context) { + context->reset(); + } + return static_cast(SpMVError::SUCCESS); + } + + if (!context) { + return static_cast(SpMVError::INVALID_ARGUMENT); + } + + bool needs_rebuild = !context->texture_enabled_ || context->tex_x_ == 0 || context->cached_x_ != d_x || + context->cached_x_length_ != x_length; + if (needs_rebuild) { + context->reset(); + + cudaResourceDesc res_desc{}; + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = const_cast(d_x); + res_desc.res.linear.desc = cudaCreateChannelDesc(); + res_desc.res.linear.sizeInBytes = x_length * sizeof(float); + + cudaTextureDesc tex_desc{}; + tex_desc.addressMode[0] = cudaAddressModeClamp; + tex_desc.filterMode = cudaFilterModePoint; + tex_desc.readMode = cudaReadModeElementType; + tex_desc.normalizedCoords = 0; + + cudaError_t err = cudaCreateTextureObject(&context->tex_x_, &res_desc, &tex_desc, nullptr); + if (err != cudaSuccess) { + context->reset(); + return static_cast(SpMVError::CUDA_MALLOC); + } + + context->cached_x_ = d_x; + context->cached_x_length_ = x_length; + context->texture_enabled_ = true; + } + + *tex_out = context->tex_x_; + *use_texture_out = true; + return static_cast(SpMVError::SUCCESS); +} + +} // namespace spmv diff --git a/src/spmv_kernels.cu b/src/spmv_kernels.cu index bb2e21d..0164e34 100644 --- a/src/spmv_kernels.cu +++ b/src/spmv_kernels.cu @@ -1,5 +1,6 @@ #include "internal/csr_device.h" #include "internal/ell_device.h" +#include "internal/texture_cache.h" #include "spmv/bandwidth.h" #include "spmv/spmv.h" @@ -125,53 +126,6 @@ struct ScopedTexture { } }; -int SpMVExecutionContext::prepare_texture(const float* d_x, size_t x_length, bool requested, - cudaTextureObject_t* tex_out, bool* use_texture_out) { - if (!tex_out || !use_texture_out) { - return static_cast(SpMVError::INVALID_ARGUMENT); - } - - *tex_out = 0; - *use_texture_out = false; - - if (!requested || !d_x || x_length == 0) { - reset(); - return static_cast(SpMVError::SUCCESS); - } - - bool needs_rebuild = !texture_enabled_ || tex_x_ == 0 || cached_x_ != d_x || - cached_x_length_ != x_length; - if (needs_rebuild) { - reset(); - - cudaResourceDesc res_desc{}; - res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = const_cast(d_x); - res_desc.res.linear.desc = cudaCreateChannelDesc(); - res_desc.res.linear.sizeInBytes = x_length * sizeof(float); - - cudaTextureDesc tex_desc{}; - tex_desc.addressMode[0] = cudaAddressModeClamp; - tex_desc.filterMode = cudaFilterModePoint; - tex_desc.readMode = cudaReadModeElementType; - tex_desc.normalizedCoords = 0; - - cudaError_t err = cudaCreateTextureObject(&tex_x_, &res_desc, &tex_desc, nullptr); - if (err != cudaSuccess) { - reset(); - return static_cast(SpMVError::CUDA_MALLOC); - } - - cached_x_ = d_x; - cached_x_length_ = x_length; - texture_enabled_ = true; - } - - *tex_out = tex_x_; - *use_texture_out = true; - return static_cast(SpMVError::SUCCESS); -} - __device__ __forceinline__ float fetch_x(const float* x, cudaTextureObject_t tex_x, bool use_texture, int idx) { return use_texture ? tex1Dfetch(tex_x, idx) : x[idx]; @@ -430,8 +384,8 @@ SpMVResult spmv_csr(const CSRMatrix* A, const float* d_x, float* d_y, const SpMV if (use_texture && texture_length > 0) { if (context) { - int tex_status = context->prepare_texture(d_x, texture_length, use_texture, - &tex_x, &use_texture); + int tex_status = spmv_prepare_texture(context, d_x, texture_length, use_texture, &tex_x, + &use_texture); if (tex_status != static_cast(SpMVError::SUCCESS)) { result.error_code = tex_status; return result; @@ -565,8 +519,8 @@ SpMVResult spmv_ell(const ELLMatrix* A, const float* d_x, float* d_y, const SpMV if (use_texture && texture_length > 0) { if (context) { - int tex_status = context->prepare_texture(d_x, texture_length, use_texture, - &tex_x, &use_texture); + int tex_status = spmv_prepare_texture(context, d_x, texture_length, use_texture, &tex_x, + &use_texture); if (tex_status != static_cast(SpMVError::SUCCESS)) { result.error_code = tex_status; return result; diff --git a/tests/test_architecture.cpp b/tests/test_architecture.cpp new file mode 100644 index 0000000..1e571c8 --- /dev/null +++ b/tests/test_architecture.cpp @@ -0,0 +1,20 @@ +#include "spmv/spmv.h" + +#include +#include + +using namespace spmv; + +namespace { + +template +struct HasPublicPrepareTexture : std::false_type {}; + +template +struct HasPublicPrepareTexture> : std::true_type {}; + +} // namespace + +TEST(SpMVExecutionContextArchitectureTest, PrepareTextureStaysInternal) { + EXPECT_FALSE(HasPublicPrepareTexture::value); +} diff --git a/tests/test_no_cuda.cpp b/tests/test_no_cuda.cpp new file mode 100644 index 0000000..45d8167 --- /dev/null +++ b/tests/test_no_cuda.cpp @@ -0,0 +1,44 @@ +#include "spmv/benchmark.h" +#include "spmv/csr_matrix.h" +#include "spmv/pagerank.h" +#include "spmv/spmv.h" + +#include +#include + +using namespace spmv; + +#if !SPMV_WITH_CUDA + +TEST(NoCudaModeTest, SpMVCsrFailsGracefullyWithoutCudaBackend) { + std::vector dense = {1.0f, 0.0f, 0.0f, 2.0f}; + std::vector x = {3.0f, 4.0f}; + std::vector y(2, 0.0f); + + CSRMatrix* csr = csr_create(0, 0, 0); + ASSERT_NE(csr, nullptr); + ASSERT_EQ(csr_from_dense(csr, dense.data(), 2, 2), static_cast(SpMVError::SUCCESS)); + + SpMVResult result = spmv_csr(csr, x.data(), y.data(), nullptr, 2); + + EXPECT_EQ(result.error_code, static_cast(SpMVError::KERNEL_LAUNCH)); + EXPECT_EQ(result.y, y.data()); + + csr_destroy(csr); +} + +TEST(NoCudaModeTest, BenchmarkFailsGracefullyWithoutCudaBackend) { + std::vector dense = {0.0f, 1.0f, 1.0f, 0.0f}; + std::vector x = {1.0f, 1.0f}; + + CSRMatrix* csr = csr_create(0, 0, 0); + ASSERT_NE(csr, nullptr); + ASSERT_EQ(csr_from_dense(csr, dense.data(), 2, 2), static_cast(SpMVError::SUCCESS)); + + BenchmarkResult benchmark_result = benchmark_csr(csr, x.data(), nullptr, nullptr); + EXPECT_EQ(benchmark_result.error_code, static_cast(SpMVError::KERNEL_LAUNCH)); + + csr_destroy(csr); +} + +#endif diff --git a/tests/test_pagerank_core.cpp b/tests/test_pagerank_core.cpp new file mode 100644 index 0000000..e8e2d01 --- /dev/null +++ b/tests/test_pagerank_core.cpp @@ -0,0 +1,60 @@ +#include "spmv/csr_matrix.h" +#include "spmv/pagerank.h" + +#include +#include + +using namespace spmv; + +#if !SPMV_WITH_CUDA + +TEST(PageRankCoreTest, NoCudaBuildUsesWorkingBackendForSimpleCycle) { + std::vector adj = {0.0f, 0.0f, 1.0f, + 1.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f}; + + CSRMatrix* csr = csr_create(0, 0, 0); + ASSERT_NE(csr, nullptr); + ASSERT_EQ(csr_from_dense(csr, adj.data(), 3, 3), static_cast(SpMVError::SUCCESS)); + + PageRankResult result = pagerank(csr, nullptr); + + ASSERT_EQ(result.error_code, static_cast(SpMVError::SUCCESS)); + ASSERT_NE(result.ranks, nullptr); + EXPECT_TRUE(result.converged); + EXPECT_NEAR(result.ranks[0], result.ranks[1], 1e-4f); + EXPECT_NEAR(result.ranks[1], result.ranks[2], 1e-4f); + + pagerank_free(&result); + csr_destroy(csr); +} + +TEST(PageRankCoreTest, NoCudaBuildKeepsDanglingGraphNormalized) { + std::vector adj = {0.0f, 0.0f, 0.0f, + 1.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f}; + + CSRMatrix* csr = csr_create(0, 0, 0); + ASSERT_NE(csr, nullptr); + ASSERT_EQ(csr_from_dense(csr, adj.data(), 3, 3), static_cast(SpMVError::SUCCESS)); + + PageRankConfig config; + config.max_iterations = 100; + config.tolerance = 1e-6f; + + PageRankResult result = pagerank(csr, &config); + + ASSERT_EQ(result.error_code, static_cast(SpMVError::SUCCESS)); + ASSERT_NE(result.ranks, nullptr); + + float sum = result.ranks[0] + result.ranks[1] + result.ranks[2]; + EXPECT_NEAR(sum, 1.0f, 1e-4f); + EXPECT_GE(result.ranks[0], 0.0f); + EXPECT_GE(result.ranks[1], 0.0f); + EXPECT_GE(result.ranks[2], 0.0f); + + pagerank_free(&result); + csr_destroy(csr); +} + +#endif