diff --git a/tasks/example_threads/stl/report.md b/tasks/example_threads/stl/report.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp new file mode 100644 index 000000000..41a24cc06 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "task/include/task.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +class KurpiakovACRSMatMulALL : public BaseTask { + public: + static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() { + return ppc::task::TypeOfTask::kALL; + } + explicit KurpiakovACRSMatMulALL(const InType &in); + + private: + bool ValidationImpl() override; + bool PreProcessingImpl() override; + bool RunImpl() override; + bool PostProcessingImpl() override; +}; + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp new file mode 100644 index 000000000..2fedadf40 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp @@ -0,0 +1,248 @@ +#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "util/include/util.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +namespace { + +bool ValidateCSR(const SparseMatrix &m) { + if (m.rows <= 0 || m.cols <= 0) { + return false; + } + if (static_cast(m.row_ptr.size()) != m.rows + 1) { + return false; + } + if (m.row_ptr[0] != 0) { + return false; + } + if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) { + return false; + } + if (m.col_indices.size() != m.values.size()) { + return false; + } + for (int i = 0; i < m.rows; ++i) { + for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) { + if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) { + return false; + } + } + } + return true; +} + +std::pair GetRowRange(int total_rows, int rank, int size) { + const int begin = (total_rows * rank) / size; + const int end = (total_rows * (rank + 1)) / size; + return {begin, end}; +} + +void MultiplySingleRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, std::vector &row_acc, + std::vector &row_used, std::vector &used_cols, std::vector &out_values, + std::vector &out_cols) { + used_cols.clear(); + + for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) { + const int ka = a.col_indices[ja]; + const ComplexD &a_val = a.values[ja]; + + for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) { + const int cb = b.col_indices[jb]; + const ComplexD &b_val = b.values[jb]; + + if (row_used[cb] == 0) { + row_used[cb] = 1; + row_acc[cb] = ComplexD(); + used_cols.push_back(cb); + } + + row_acc[cb] += a_val * b_val; + } + } + + std::ranges::sort(used_cols); + + out_values.clear(); + out_cols.clear(); + out_values.reserve(used_cols.size()); + out_cols.reserve(used_cols.size()); + + for (int col : used_cols) { + out_values.push_back(row_acc[col]); + out_cols.push_back(col); + row_used[col] = 0; + } +} + +void ComputeLocalRowsThreads(const SparseMatrix &a, const SparseMatrix &b, int row_begin, int row_end, + std::vector> &local_values, + std::vector> &local_cols) { + const int local_rows = row_end - row_begin; + const int requested_threads = ppc::util::GetNumThreads(); + const int max_threads = std::max(1, local_rows); + const int num_threads = std::max(1, std::min(requested_threads, max_threads)); + + std::atomic next_row(row_begin); + std::vector workers; + workers.reserve(num_threads); + + for (int tid = 0; tid < num_threads; ++tid) { + workers.emplace_back([&]() { + std::vector row_acc(b.cols); + std::vector row_used(b.cols, 0); + std::vector used_cols; + + while (true) { + const int row = next_row.fetch_add(1, std::memory_order_relaxed); + if (row >= row_end) { + break; + } + + const int local_idx = row - row_begin; + MultiplySingleRow(a, b, row, row_acc, row_used, used_cols, local_values[local_idx], local_cols[local_idx]); + } + }); + } + + for (auto &worker : workers) { + worker.join(); + } +} + +} // namespace + +KurpiakovACRSMatMulALL::KurpiakovACRSMatMulALL(const InType &in) { + SetTypeOfTask(GetStaticTypeOfTask()); + GetInput() = in; + GetOutput() = SparseMatrix(); +} + +bool KurpiakovACRSMatMulALL::ValidationImpl() { + const auto &[a, b] = GetInput(); + + if (!ValidateCSR(a) || !ValidateCSR(b)) { + return false; + } + + return a.cols == b.rows; +} + +bool KurpiakovACRSMatMulALL::PreProcessingImpl() { + return true; +} + +bool KurpiakovACRSMatMulALL::RunImpl() { + const auto &[a, b] = GetInput(); + const int rows = a.rows; + const int cols = b.cols; + + int rank = 0; + int world_size = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + const auto [row_begin, row_end] = GetRowRange(rows, rank, world_size); + const int local_rows = row_end - row_begin; + + std::vector> local_values(local_rows); + std::vector> local_cols(local_rows); + + ComputeLocalRowsThreads(a, b, row_begin, row_end, local_values, local_cols); + + std::vector local_row_nnz(rows, 0); + for (int local_i = 0; local_i < local_rows; ++local_i) { + local_row_nnz[row_begin + local_i] = static_cast(local_values[local_i].size()); + } + + std::vector global_row_nnz(rows, 0); + MPI_Allreduce(local_row_nnz.data(), global_row_nnz.data(), rows, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + + std::vector global_row_ptr(rows + 1, 0); + for (int i = 0; i < rows; ++i) { + global_row_ptr[i + 1] = global_row_ptr[i] + global_row_nnz[i]; + } + + const int total_nnz = global_row_ptr[rows]; + const int local_nnz = global_row_ptr[row_end] - global_row_ptr[row_begin]; + + std::vector local_re(local_nnz); + std::vector local_im(local_nnz); + std::vector local_col_indices(local_nnz); + + int pos = 0; + for (int local_i = 0; local_i < local_rows; ++local_i) { + const auto &vals = local_values[local_i]; + const auto &cols_row = local_cols[local_i]; + + for (size_t j = 0; j < vals.size(); ++j) { + local_re[pos] = vals[j].re; + local_im[pos] = vals[j].im; + local_col_indices[pos] = cols_row[j]; + ++pos; + } + } + + std::vector recv_counts(world_size, 0); + MPI_Allgather(&local_nnz, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + std::vector recv_displs(world_size, 0); + for (int rec = 1; rec < world_size; ++rec) { + recv_displs[rec] = recv_displs[rec - 1] + recv_counts[rec - 1]; + } + + std::vector global_re; + std::vector global_im; + std::vector global_col_indices; + + if (rank == 0) { + global_re.resize(total_nnz); + global_im.resize(total_nnz); + global_col_indices.resize(total_nnz); + } + + MPI_Gatherv(local_re.data(), local_nnz, MPI_DOUBLE, global_re.data(), recv_counts.data(), recv_displs.data(), + MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Gatherv(local_im.data(), local_nnz, MPI_DOUBLE, global_im.data(), recv_counts.data(), recv_displs.data(), + MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Gatherv(local_col_indices.data(), local_nnz, MPI_INT, global_col_indices.data(), recv_counts.data(), + recv_displs.data(), MPI_INT, 0, MPI_COMM_WORLD); + + if (rank != 0) { + global_re.resize(total_nnz); + global_im.resize(total_nnz); + global_col_indices.resize(total_nnz); + } + + MPI_Bcast(global_re.data(), total_nnz, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(global_im.data(), total_nnz, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(global_col_indices.data(), total_nnz, MPI_INT, 0, MPI_COMM_WORLD); + + SparseMatrix result(rows, cols); + result.row_ptr = std::move(global_row_ptr); + result.col_indices = std::move(global_col_indices); + result.values.resize(static_cast(total_nnz)); + + for (int i = 0; i < total_nnz; ++i) { + result.values[static_cast(i)] = ComplexD(global_re[i], global_im[i]); + } + + GetOutput() = std::move(result); + return true; +} + +bool KurpiakovACRSMatMulALL::PostProcessingImpl() { + return true; +} + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp index 9b407ba0c..3dc8f9511 100644 --- a/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp @@ -131,6 +131,88 @@ class CSRMatrix { return result; } + private: + static void ProcessRow(int i, const CSRMatrix &self, const CSRMatrix &other, std::vector &acc_re, + std::vector &acc_im, std::vector &local_used, + std::vector>> &row_values, + std::vector> &row_col_indices) { + std::vector used_cols; + used_cols.reserve(other.cols); + + for (int ja = self.row_ptr[i]; ja < self.row_ptr[i + 1]; ++ja) { + const int ka = self.col_indices[ja]; + const T a_re = self.values[ja].re; + const T a_im = self.values[ja].im; + const int jb_start = other.row_ptr[ka]; + const int jb_end = other.row_ptr[ka + 1]; + + for (int jb = jb_start; jb < jb_end; ++jb) { + const int cb = other.col_indices[jb]; + if (!local_used[cb]) { + local_used[cb] = true; + acc_re[cb] = T(0); + acc_im[cb] = T(0); + used_cols.push_back(cb); + } + acc_re[cb] += (a_re * other.values[jb].re) - (a_im * other.values[jb].im); + acc_im[cb] += (a_re * other.values[jb].im) + (a_im * other.values[jb].re); + } + } + + std::ranges::sort(used_cols); + row_values[i].reserve(used_cols.size()); + row_col_indices[i].reserve(used_cols.size()); + + for (const int c : used_cols) { + row_values[i].emplace_back(acc_re[c], acc_im[c]); + row_col_indices[i].push_back(c); + local_used[c] = false; + } + } + + public: + [[nodiscard]] CSRMatrix OMPMultiply(const CSRMatrix &other) const { + if (cols != other.rows) { + return {}; + } + + CSRMatrix result(rows, other.cols); + std::vector>> row_values(rows); + std::vector> row_col_indices(rows); + + const CSRMatrix &self = *this; + const int nrows = rows; + const int ncols = other.cols; + +#pragma omp parallel default(none) shared(self, other, row_values, row_col_indices, nrows, ncols) + { + std::vector acc_re(static_cast(ncols)); + std::vector acc_im(static_cast(ncols)); + std::vector local_used(static_cast(ncols), false); + +#pragma omp for schedule(dynamic) + for (int i = 0; i < nrows; ++i) { + ProcessRow(i, self, other, acc_re, acc_im, local_used, row_values, row_col_indices); + } + } + + int total_nnz = 0; + for (int i = 0; i < rows; ++i) { + total_nnz += static_cast(row_values[i].size()); + } + + result.values.reserve(static_cast(total_nnz)); + result.col_indices.reserve(static_cast(total_nnz)); + + for (int i = 0; i < rows; ++i) { + result.values.insert(result.values.end(), row_values[i].begin(), row_values[i].end()); + result.col_indices.insert(result.col_indices.end(), row_col_indices[i].begin(), row_col_indices[i].end()); + result.row_ptr[i + 1] = static_cast(result.values.size()); + } + + return result; + } + [[nodiscard]] std::vector> ToDense() const { std::vector> dense(rows * cols); for (int i = 0; i < rows; ++i) { diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp new file mode 100644 index 000000000..a4aa74df7 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "task/include/task.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +class KurpiakovACRSMatMulOMP : public BaseTask { + public: + static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() { + return ppc::task::TypeOfTask::kOMP; + } + explicit KurpiakovACRSMatMulOMP(const InType &in); + + private: + bool ValidationImpl() override; + bool PreProcessingImpl() override; + bool RunImpl() override; + bool PostProcessingImpl() override; +}; + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp new file mode 100644 index 000000000..49b72b9f4 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp @@ -0,0 +1,70 @@ +#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp" + +#include +#include + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +namespace { + +bool ValidateCSR(const SparseMatrix &m) { + if (m.rows <= 0 || m.cols <= 0) { + return false; + } + if (static_cast(m.row_ptr.size()) != m.rows + 1) { + return false; + } + if (m.row_ptr[0] != 0) { + return false; + } + if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) { + return false; + } + if (m.col_indices.size() != m.values.size()) { + return false; + } + for (int i = 0; i < m.rows; ++i) { + for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) { + if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) { + return false; + } + } + } + return true; +} + +} // namespace + +KurpiakovACRSMatMulOMP::KurpiakovACRSMatMulOMP(const InType &in) { + SetTypeOfTask(GetStaticTypeOfTask()); + GetInput() = in; + GetOutput() = SparseMatrix(); +} + +bool KurpiakovACRSMatMulOMP::ValidationImpl() { + const auto &[a, b] = GetInput(); + + if (!ValidateCSR(a) || !ValidateCSR(b)) { + return false; + } + + return a.cols == b.rows; +} + +bool KurpiakovACRSMatMulOMP::PreProcessingImpl() { + return true; +} + +bool KurpiakovACRSMatMulOMP::RunImpl() { + const auto &[a, b] = GetInput(); + GetOutput() = a.OMPMultiply(b); + return true; +} + +bool KurpiakovACRSMatMulOMP::PostProcessingImpl() { + return true; +} + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp new file mode 100644 index 000000000..8531d8d87 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "task/include/task.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +class KurpiakovACRSMatMulSTL : public BaseTask { + public: + static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() { + return ppc::task::TypeOfTask::kSTL; + } + explicit KurpiakovACRSMatMulSTL(const InType &in); + + private: + bool ValidationImpl() override; + bool PreProcessingImpl() override; + bool RunImpl() override; + bool PostProcessingImpl() override; +}; + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp new file mode 100644 index 000000000..120fdcfc4 --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp @@ -0,0 +1,176 @@ +#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp" + +#include +#include +#include +#include +#include +#include + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "util/include/util.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +namespace { + +bool ValidateCSR(const SparseMatrix &m) { + if (m.rows <= 0 || m.cols <= 0) { + return false; + } + if (static_cast(m.row_ptr.size()) != m.rows + 1) { + return false; + } + if (m.row_ptr[0] != 0) { + return false; + } + if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) { + return false; + } + if (m.col_indices.size() != m.values.size()) { + return false; + } + for (int i = 0; i < m.rows; ++i) { + for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) { + if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) { + return false; + } + } + } + return true; +} + +struct ThreadLocalRowState { + explicit ThreadLocalRowState(int cols) : row_acc(cols), row_used(cols, 0) {} + + std::vector row_acc; + std::vector row_used; + std::vector used_cols; +}; + +void MultiplySingleRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, ThreadLocalRowState &state, + std::vector &out_values, std::vector &out_cols) { + state.used_cols.clear(); + + for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) { + const int ka = a.col_indices[ja]; + const ComplexD &a_val = a.values[ja]; + + for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) { + const int cb = b.col_indices[jb]; + const ComplexD &b_val = b.values[jb]; + + if (state.row_used[cb] == 0) { + state.row_used[cb] = 1; + state.row_acc[cb] = ComplexD(); + state.used_cols.push_back(cb); + } + + state.row_acc[cb] += a_val * b_val; + } + } + + std::ranges::sort(state.used_cols); + + out_values.clear(); + out_cols.clear(); + out_values.reserve(state.used_cols.size()); + out_cols.reserve(state.used_cols.size()); + + for (int col : state.used_cols) { + out_values.push_back(state.row_acc[col]); + out_cols.push_back(col); + state.row_used[col] = 0; + } +} + +void ComputeRowsWithThreads(const SparseMatrix &a, const SparseMatrix &b, int rows, int num_threads, + std::vector> &row_values, std::vector> &row_cols) { + std::atomic next_row(0); + std::vector workers; + workers.reserve(num_threads); + + for (int thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + workers.emplace_back([&]() { + ThreadLocalRowState state(b.cols); + + while (true) { + const int row_idx = next_row.fetch_add(1, std::memory_order_relaxed); + if (row_idx >= rows) { + break; + } + + MultiplySingleRow(a, b, row_idx, state, row_values[row_idx], row_cols[row_idx]); + } + }); + } + + for (auto &worker : workers) { + worker.join(); + } +} + +SparseMatrix BuildResult(int rows, int cols, const std::vector> &row_values, + const std::vector> &row_cols) { + SparseMatrix result(rows, cols); + + std::size_t total_nnz = 0; + for (int i = 0; i < rows; ++i) { + total_nnz += row_values[i].size(); + } + + result.values.reserve(total_nnz); + result.col_indices.reserve(total_nnz); + + for (int i = 0; i < rows; ++i) { + result.values.insert(result.values.end(), row_values[i].begin(), row_values[i].end()); + result.col_indices.insert(result.col_indices.end(), row_cols[i].begin(), row_cols[i].end()); + result.row_ptr[i + 1] = static_cast(result.values.size()); + } + + return result; +} + +} // namespace + +KurpiakovACRSMatMulSTL::KurpiakovACRSMatMulSTL(const InType &in) { + SetTypeOfTask(GetStaticTypeOfTask()); + GetInput() = in; + GetOutput() = SparseMatrix(); +} + +bool KurpiakovACRSMatMulSTL::ValidationImpl() { + const auto &[a, b] = GetInput(); + + if (!ValidateCSR(a) || !ValidateCSR(b)) { + return false; + } + + return a.cols == b.rows; +} + +bool KurpiakovACRSMatMulSTL::PreProcessingImpl() { + return true; +} + +bool KurpiakovACRSMatMulSTL::RunImpl() { + const auto &[a, b] = GetInput(); + const int rows = a.rows; + const int cols = b.cols; + + const int requested_threads = ppc::util::GetNumThreads(); + const int num_threads = std::max(1, std::min(requested_threads, rows)); + + std::vector> row_values(rows); + std::vector> row_cols(rows); + + ComputeRowsWithThreads(a, b, rows, num_threads, row_values, row_cols); + GetOutput() = BuildResult(rows, cols, row_values, row_cols); + return true; +} + +bool KurpiakovACRSMatMulSTL::PostProcessingImpl() { + return true; +} + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp new file mode 100644 index 000000000..18e545e3d --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "task/include/task.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +class KurpiakovACRSMatMulTBB : public BaseTask { + public: + static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() { + return ppc::task::TypeOfTask::kTBB; + } + explicit KurpiakovACRSMatMulTBB(const InType &in); + + private: + bool ValidationImpl() override; + bool PreProcessingImpl() override; + bool RunImpl() override; + bool PostProcessingImpl() override; +}; + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp new file mode 100644 index 000000000..b2822049e --- /dev/null +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp @@ -0,0 +1,157 @@ +#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp" + +#include + +#include +#include +#include +#include + +#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" + +namespace kurpiakov_a_sp_comp_mat_mul { + +namespace { + +bool ValidateCSR(const SparseMatrix &m) { + if (m.rows <= 0 || m.cols <= 0) { + return false; + } + if (static_cast(m.row_ptr.size()) != m.rows + 1) { + return false; + } + if (m.row_ptr[0] != 0) { + return false; + } + if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) { + return false; + } + if (m.col_indices.size() != m.values.size()) { + return false; + } + for (int i = 0; i < m.rows; ++i) { + for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) { + if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) { + return false; + } + } + } + return true; +} + +struct ThreadLocalRowBuffers { + explicit ThreadLocalRowBuffers(int cols) : row_acc(cols), row_used(cols, 0) {} + + std::vector row_acc; + std::vector row_used; + std::vector used_cols; +}; + +void ComputeRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, ThreadLocalRowBuffers &buffers, + std::vector &out_values, std::vector &out_cols) { + buffers.used_cols.clear(); + + for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) { + const int ka = a.col_indices[ja]; + const ComplexD &a_val = a.values[ja]; + + for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) { + const int cb = b.col_indices[jb]; + const ComplexD &b_val = b.values[jb]; + + if (buffers.row_used[cb] == 0) { + buffers.row_used[cb] = 1; + buffers.row_acc[cb] = ComplexD(); + buffers.used_cols.push_back(cb); + } + + buffers.row_acc[cb] += a_val * b_val; + } + } + + std::ranges::sort(buffers.used_cols); + + out_values.clear(); + out_cols.clear(); + out_values.reserve(buffers.used_cols.size()); + out_cols.reserve(buffers.used_cols.size()); + + for (int c : buffers.used_cols) { + out_values.push_back(buffers.row_acc[c]); + out_cols.push_back(c); + buffers.row_used[c] = 0; + } +} + +SparseMatrix BuildResultFromRows(int rows, int cols, const std::vector> &row_values, + const std::vector> &row_cols) { + SparseMatrix result(rows, cols); + + for (int i = 0; i < rows; ++i) { + result.row_ptr[i + 1] = result.row_ptr[i] + static_cast(row_values[i].size()); + } + + const auto total_nnz = static_cast(result.row_ptr[rows]); + + result.values.resize(total_nnz); + result.col_indices.resize(total_nnz); + + tbb::parallel_for(tbb::blocked_range(0, rows), [&](const tbb::blocked_range &range) { + for (int i = range.begin(); i < range.end(); ++i) { + const auto offset = static_cast::difference_type>(result.row_ptr[i]); + std::copy(row_values[i].begin(), row_values[i].end(), result.values.begin() + offset); + std::copy(row_cols[i].begin(), row_cols[i].end(), result.col_indices.begin() + offset); + } + }); + + return result; +} + +} // namespace + +KurpiakovACRSMatMulTBB::KurpiakovACRSMatMulTBB(const InType &in) { + SetTypeOfTask(GetStaticTypeOfTask()); + GetInput() = in; + GetOutput() = SparseMatrix(); +} + +bool KurpiakovACRSMatMulTBB::ValidationImpl() { + const auto &[a, b] = GetInput(); + + if (!ValidateCSR(a) || !ValidateCSR(b)) { + return false; + } + + return a.cols == b.rows; +} + +bool KurpiakovACRSMatMulTBB::PreProcessingImpl() { + return true; +} + +bool KurpiakovACRSMatMulTBB::RunImpl() { + const auto &[a, b] = GetInput(); + const int rows = a.rows; + const int cols = b.cols; + + std::vector> row_values(rows); + std::vector> row_cols(rows); + + tbb::enumerable_thread_specific tls_buffers([&] { return ThreadLocalRowBuffers(cols); }); + + tbb::parallel_for(tbb::blocked_range(0, rows), [&](const tbb::blocked_range &range) { + auto &buffers = tls_buffers.local(); + for (int i = range.begin(); i < range.end(); ++i) { + ComputeRow(a, b, i, buffers, row_values[i], row_cols[i]); + } + }); + + GetOutput() = BuildResultFromRows(rows, cols, row_values, row_cols); + return true; +} + +bool KurpiakovACRSMatMulTBB::PostProcessingImpl() { + return true; +} + +} // namespace kurpiakov_a_sp_comp_mat_mul diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp index 4c0294cee..03862e345 100644 --- a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp @@ -5,8 +5,12 @@ #include #include +#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp" #include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp" #include "kurpiakov_a_sp_comp_mat_mul/seq/include/ops_seq.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp" #include "util/include/func_test_util.hpp" #include "util/include/util.hpp" @@ -146,7 +150,11 @@ TEST_P(KurpiakovRunFuncTestsThreads, SparseMatMulFromParams) { } const auto kTestTasksList = std::tuple_cat( - ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul)); + ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul), + ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul), + ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul), + ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul), + ppc::util::AddFuncTask(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul)); const auto kGtestValues = ppc::util::ExpandToValues(kTestTasksList); const auto kPerfTestName = KurpiakovRunFuncTestsThreads::PrintFuncTestName; diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp index c00e2c8a2..249bd5cc4 100644 --- a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp +++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp @@ -3,8 +3,12 @@ #include #include +#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp" #include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp" #include "kurpiakov_a_sp_comp_mat_mul/seq/include/ops_seq.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp" +#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp" #include "util/include/perf_test_util.hpp" namespace kurpiakov_a_sp_comp_mat_mul { @@ -65,7 +69,9 @@ TEST_P(KurpiakovRunPerfTests, SparseMatMulPerf) { namespace { const auto kAllPerfTasks = - ppc::util::MakeAllPerfTasks(PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul); + ppc::util::MakeAllPerfTasks( + PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul); const auto kGtestValues = ppc::util::TupleToGTestValues(kAllPerfTasks); const auto kPerfTestName = KurpiakovRunPerfTests::CustomPerfTestName;