diff --git a/tasks/example_threads/stl/report.md b/tasks/example_threads/stl/report.md
deleted file mode 100644
index e69de29bb..000000000
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp
new file mode 100644
index 000000000..41a24cc06
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "task/include/task.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+class KurpiakovACRSMatMulALL : public BaseTask {
+ public:
+  static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() {
+    return ppc::task::TypeOfTask::kALL;
+  }
+  explicit KurpiakovACRSMatMulALL(const InType &in);
+
+ private:
+  bool ValidationImpl() override;
+  bool PreProcessingImpl() override;
+  bool RunImpl() override;
+  bool PostProcessingImpl() override;
+};
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp
new file mode 100644
index 000000000..2fedadf40
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/all/src/ops_all.cpp
@@ -0,0 +1,248 @@
+#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp"
+
+#include <mpi.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "util/include/util.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+namespace {
+
+bool ValidateCSR(const SparseMatrix &m) {
+  if (m.rows <= 0 || m.cols <= 0) {
+    return false;
+  }
+  if (static_cast<int>(m.row_ptr.size()) != m.rows + 1) {
+    return false;
+  }
+  if (m.row_ptr[0] != 0) {
+    return false;
+  }
+  if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) {
+    return false;
+  }
+  if (m.col_indices.size() != m.values.size()) {
+    return false;
+  }
+  for (int i = 0; i < m.rows; ++i) {
+    for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) {
+      if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+std::pair<int, int> GetRowRange(int total_rows, int rank, int size) {
+  const int begin = (total_rows * rank) / size;
+  const int end = (total_rows * (rank + 1)) / size;
+  return {begin, end};
+}
+
+void MultiplySingleRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, std::vector<ComplexD> &row_acc,
+                       std::vector<char> &row_used, std::vector<int> &used_cols, std::vector<ComplexD> &out_values,
+                       std::vector<int> &out_cols) {
+  used_cols.clear();
+
+  for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) {
+    const int ka = a.col_indices[ja];
+    const ComplexD &a_val = a.values[ja];
+
+    for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) {
+      const int cb = b.col_indices[jb];
+      const ComplexD &b_val = b.values[jb];
+
+      if (row_used[cb] == 0) {
+        row_used[cb] = 1;
+        row_acc[cb] = ComplexD();
+        used_cols.push_back(cb);
+      }
+
+      row_acc[cb] += a_val * b_val;
+    }
+  }
+
+  std::ranges::sort(used_cols);
+
+  out_values.clear();
+  out_cols.clear();
+  out_values.reserve(used_cols.size());
+  out_cols.reserve(used_cols.size());
+
+  for (int col : used_cols) {
+    out_values.push_back(row_acc[col]);
+    out_cols.push_back(col);
+    row_used[col] = 0;
+  }
+}
+
+void ComputeLocalRowsThreads(const SparseMatrix &a, const SparseMatrix &b, int row_begin, int row_end,
+                             std::vector<std::vector<ComplexD>> &local_values,
+                             std::vector<std::vector<int>> &local_cols) {
+  const int local_rows = row_end - row_begin;
+  const int requested_threads = ppc::util::GetNumThreads();
+  const int max_threads = std::max(1, local_rows);
+  const int num_threads = std::max(1, std::min(requested_threads, max_threads));
+
+  std::atomic<int> next_row(row_begin);
+  std::vector<std::thread> workers;
+  workers.reserve(num_threads);
+
+  for (int tid = 0; tid < num_threads; ++tid) {
+    workers.emplace_back([&]() {
+      std::vector<ComplexD> row_acc(b.cols);
+      std::vector<char> row_used(b.cols, 0);
+      std::vector<int> used_cols;
+
+      while (true) {
+        const int row = next_row.fetch_add(1, std::memory_order_relaxed);
+        if (row >= row_end) {
+          break;
+        }
+
+        const int local_idx = row - row_begin;
+        MultiplySingleRow(a, b, row, row_acc, row_used, used_cols, local_values[local_idx], local_cols[local_idx]);
+      }
+    });
+  }
+
+  for (auto &worker : workers) {
+    worker.join();
+  }
+}
+
+}  // namespace
+
+KurpiakovACRSMatMulALL::KurpiakovACRSMatMulALL(const InType &in) {
+  SetTypeOfTask(GetStaticTypeOfTask());
+  GetInput() = in;
+  GetOutput() = SparseMatrix();
+}
+
+bool KurpiakovACRSMatMulALL::ValidationImpl() {
+  const auto &[a, b] = GetInput();
+
+  if (!ValidateCSR(a) || !ValidateCSR(b)) {
+    return false;
+  }
+
+  return a.cols == b.rows;
+}
+
+bool KurpiakovACRSMatMulALL::PreProcessingImpl() {
+  return true;
+}
+
+bool KurpiakovACRSMatMulALL::RunImpl() {
+  const auto &[a, b] = GetInput();
+  const int rows = a.rows;
+  const int cols = b.cols;
+
+  int rank = 0;
+  int world_size = 1;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+  const auto [row_begin, row_end] = GetRowRange(rows, rank, world_size);
+  const int local_rows = row_end - row_begin;
+
+  std::vector<std::vector<ComplexD>> local_values(local_rows);
+  std::vector<std::vector<int>> local_cols(local_rows);
+
+  ComputeLocalRowsThreads(a, b, row_begin, row_end, local_values, local_cols);
+
+  std::vector<int> local_row_nnz(rows, 0);
+  for (int local_i = 0; local_i < local_rows; ++local_i) {
+    local_row_nnz[row_begin + local_i] = static_cast<int>(local_values[local_i].size());
+  }
+
+  std::vector<int> global_row_nnz(rows, 0);
+  MPI_Allreduce(local_row_nnz.data(), global_row_nnz.data(), rows, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+  std::vector<int> global_row_ptr(rows + 1, 0);
+  for (int i = 0; i < rows; ++i) {
+    global_row_ptr[i + 1] = global_row_ptr[i] + global_row_nnz[i];
+  }
+
+  const int total_nnz = global_row_ptr[rows];
+  const int local_nnz = global_row_ptr[row_end] - global_row_ptr[row_begin];
+
+  std::vector<double> local_re(local_nnz);
+  std::vector<double> local_im(local_nnz);
+  std::vector<int> local_col_indices(local_nnz);
+
+  int pos = 0;
+  for (int local_i = 0; local_i < local_rows; ++local_i) {
+    const auto &vals = local_values[local_i];
+    const auto &cols_row = local_cols[local_i];
+
+    for (size_t j = 0; j < vals.size(); ++j) {
+      local_re[pos] = vals[j].re;
+      local_im[pos] = vals[j].im;
+      local_col_indices[pos] = cols_row[j];
+      ++pos;
+    }
+  }
+
+  std::vector<int> recv_counts(world_size, 0);
+  MPI_Allgather(&local_nnz, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+  std::vector<int> recv_displs(world_size, 0);
+  for (int rec = 1; rec < world_size; ++rec) {
+    recv_displs[rec] = recv_displs[rec - 1] + recv_counts[rec - 1];
+  }
+
+  std::vector<double> global_re;
+  std::vector<double> global_im;
+  std::vector<int> global_col_indices;
+
+  if (rank == 0) {
+    global_re.resize(total_nnz);
+    global_im.resize(total_nnz);
+    global_col_indices.resize(total_nnz);
+  }
+
+  MPI_Gatherv(local_re.data(), local_nnz, MPI_DOUBLE, global_re.data(), recv_counts.data(), recv_displs.data(),
+              MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  MPI_Gatherv(local_im.data(), local_nnz, MPI_DOUBLE, global_im.data(), recv_counts.data(), recv_displs.data(),
+              MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  MPI_Gatherv(local_col_indices.data(), local_nnz, MPI_INT, global_col_indices.data(), recv_counts.data(),
+              recv_displs.data(), MPI_INT, 0, MPI_COMM_WORLD);
+
+  if (rank != 0) {
+    global_re.resize(total_nnz);
+    global_im.resize(total_nnz);
+    global_col_indices.resize(total_nnz);
+  }
+
+  MPI_Bcast(global_re.data(), total_nnz, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  MPI_Bcast(global_im.data(), total_nnz, MPI_DOUBLE, 0, MPI_COMM_WORLD);
+  MPI_Bcast(global_col_indices.data(), total_nnz, MPI_INT, 0, MPI_COMM_WORLD);
+
+  SparseMatrix result(rows, cols);
+  result.row_ptr = std::move(global_row_ptr);
+  result.col_indices = std::move(global_col_indices);
+  result.values.resize(static_cast<size_t>(total_nnz));
+
+  for (int i = 0; i < total_nnz; ++i) {
+    result.values[static_cast<size_t>(i)] = ComplexD(global_re[i], global_im[i]);
+  }
+
+  GetOutput() = std::move(result);
+  return true;
+}
+
+bool KurpiakovACRSMatMulALL::PostProcessingImpl() {
+  return true;
+}
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp
index 9b407ba0c..3dc8f9511 100644
--- a/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp
@@ -131,6 +131,88 @@ class CSRMatrix {
     return result;
   }
 
+ private:
+  static void ProcessRow(int i, const CSRMatrix &self, const CSRMatrix &other, std::vector<T> &acc_re,
+                         std::vector<T> &acc_im, std::vector<bool> &local_used,
+                         std::vector<std::vector<Complex<T>>> &row_values,
+                         std::vector<std::vector<int>> &row_col_indices) {
+    std::vector<int> used_cols;
+    used_cols.reserve(other.cols);
+
+    for (int ja = self.row_ptr[i]; ja < self.row_ptr[i + 1]; ++ja) {
+      const int ka = self.col_indices[ja];
+      const T a_re = self.values[ja].re;
+      const T a_im = self.values[ja].im;
+      const int jb_start = other.row_ptr[ka];
+      const int jb_end = other.row_ptr[ka + 1];
+
+      for (int jb = jb_start; jb < jb_end; ++jb) {
+        const int cb = other.col_indices[jb];
+        if (!local_used[cb]) {
+          local_used[cb] = true;
+          acc_re[cb] = T(0);
+          acc_im[cb] = T(0);
+          used_cols.push_back(cb);
+        }
+        acc_re[cb] += (a_re * other.values[jb].re) - (a_im * other.values[jb].im);
+        acc_im[cb] += (a_re * other.values[jb].im) + (a_im * other.values[jb].re);
+      }
+    }
+
+    std::ranges::sort(used_cols);
+    row_values[i].reserve(used_cols.size());
+    row_col_indices[i].reserve(used_cols.size());
+
+    for (const int c : used_cols) {
+      row_values[i].emplace_back(acc_re[c], acc_im[c]);
+      row_col_indices[i].push_back(c);
+      local_used[c] = false;
+    }
+  }
+
+ public:
+  [[nodiscard]] CSRMatrix OMPMultiply(const CSRMatrix &other) const {
+    if (cols != other.rows) {
+      return {};
+    }
+
+    CSRMatrix result(rows, other.cols);
+    std::vector<std::vector<Complex<T>>> row_values(rows);
+    std::vector<std::vector<int>> row_col_indices(rows);
+
+    const CSRMatrix &self = *this;
+    const int nrows = rows;
+    const int ncols = other.cols;
+
+#pragma omp parallel default(none) shared(self, other, row_values, row_col_indices, nrows, ncols)
+    {
+      std::vector<T> acc_re(static_cast<std::size_t>(ncols));
+      std::vector<T> acc_im(static_cast<std::size_t>(ncols));
+      std::vector<bool> local_used(static_cast<std::size_t>(ncols), false);
+
+#pragma omp for schedule(dynamic)
+      for (int i = 0; i < nrows; ++i) {
+        ProcessRow(i, self, other, acc_re, acc_im, local_used, row_values, row_col_indices);
+      }
+    }
+
+    int total_nnz = 0;
+    for (int i = 0; i < rows; ++i) {
+      total_nnz += static_cast<int>(row_values[i].size());
+    }
+
+    result.values.reserve(static_cast<std::size_t>(total_nnz));
+    result.col_indices.reserve(static_cast<std::size_t>(total_nnz));
+
+    for (int i = 0; i < rows; ++i) {
+      result.values.insert(result.values.end(), row_values[i].begin(), row_values[i].end());
+      result.col_indices.insert(result.col_indices.end(), row_col_indices[i].begin(), row_col_indices[i].end());
+      result.row_ptr[i + 1] = static_cast<int>(result.values.size());
+    }
+
+    return result;
+  }
+
   [[nodiscard]] std::vector<Complex<T>> ToDense() const {
     std::vector<Complex<T>> dense(rows * cols);
     for (int i = 0; i < rows; ++i) {
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp
new file mode 100644
index 000000000..a4aa74df7
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "task/include/task.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+class KurpiakovACRSMatMulOMP : public BaseTask {
+ public:
+  static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() {
+    return ppc::task::TypeOfTask::kOMP;
+  }
+  explicit KurpiakovACRSMatMulOMP(const InType &in);
+
+ private:
+  bool ValidationImpl() override;
+  bool PreProcessingImpl() override;
+  bool RunImpl() override;
+  bool PostProcessingImpl() override;
+};
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp
new file mode 100644
index 000000000..49b72b9f4
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/omp/src/ops_omp.cpp
@@ -0,0 +1,70 @@
+#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp"
+
+#include <utility>
+#include <vector>
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+namespace {
+
+bool ValidateCSR(const SparseMatrix &m) {
+  if (m.rows <= 0 || m.cols <= 0) {
+    return false;
+  }
+  if (static_cast<int>(m.row_ptr.size()) != m.rows + 1) {
+    return false;
+  }
+  if (m.row_ptr[0] != 0) {
+    return false;
+  }
+  if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) {
+    return false;
+  }
+  if (m.col_indices.size() != m.values.size()) {
+    return false;
+  }
+  for (int i = 0; i < m.rows; ++i) {
+    for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) {
+      if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+KurpiakovACRSMatMulOMP::KurpiakovACRSMatMulOMP(const InType &in) {
+  SetTypeOfTask(GetStaticTypeOfTask());
+  GetInput() = in;
+  GetOutput() = SparseMatrix();
+}
+
+bool KurpiakovACRSMatMulOMP::ValidationImpl() {
+  const auto &[a, b] = GetInput();
+
+  if (!ValidateCSR(a) || !ValidateCSR(b)) {
+    return false;
+  }
+
+  return a.cols == b.rows;
+}
+
+bool KurpiakovACRSMatMulOMP::PreProcessingImpl() {
+  return true;
+}
+
+bool KurpiakovACRSMatMulOMP::RunImpl() {
+  const auto &[a, b] = GetInput();
+  GetOutput() = a.OMPMultiply(b);
+  return true;
+}
+
+bool KurpiakovACRSMatMulOMP::PostProcessingImpl() {
+  return true;
+}
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp
new file mode 100644
index 000000000..8531d8d87
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "task/include/task.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+class KurpiakovACRSMatMulSTL : public BaseTask {
+ public:
+  static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() {
+    return ppc::task::TypeOfTask::kSTL;
+  }
+  explicit KurpiakovACRSMatMulSTL(const InType &in);
+
+ private:
+  bool ValidationImpl() override;
+  bool PreProcessingImpl() override;
+  bool RunImpl() override;
+  bool PostProcessingImpl() override;
+};
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp
new file mode 100644
index 000000000..120fdcfc4
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/stl/src/ops_stl.cpp
@@ -0,0 +1,176 @@
+#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp"
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "util/include/util.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+namespace {
+
+bool ValidateCSR(const SparseMatrix &m) {
+  if (m.rows <= 0 || m.cols <= 0) {
+    return false;
+  }
+  if (static_cast<int>(m.row_ptr.size()) != m.rows + 1) {
+    return false;
+  }
+  if (m.row_ptr[0] != 0) {
+    return false;
+  }
+  if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) {
+    return false;
+  }
+  if (m.col_indices.size() != m.values.size()) {
+    return false;
+  }
+  for (int i = 0; i < m.rows; ++i) {
+    for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) {
+      if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+struct ThreadLocalRowState {
+  explicit ThreadLocalRowState(int cols) : row_acc(cols), row_used(cols, 0) {}
+
+  std::vector<ComplexD> row_acc;
+  std::vector<char> row_used;
+  std::vector<int> used_cols;
+};
+
+void MultiplySingleRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, ThreadLocalRowState &state,
+                       std::vector<ComplexD> &out_values, std::vector<int> &out_cols) {
+  state.used_cols.clear();
+
+  for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) {
+    const int ka = a.col_indices[ja];
+    const ComplexD &a_val = a.values[ja];
+
+    for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) {
+      const int cb = b.col_indices[jb];
+      const ComplexD &b_val = b.values[jb];
+
+      if (state.row_used[cb] == 0) {
+        state.row_used[cb] = 1;
+        state.row_acc[cb] = ComplexD();
+        state.used_cols.push_back(cb);
+      }
+
+      state.row_acc[cb] += a_val * b_val;
+    }
+  }
+
+  std::ranges::sort(state.used_cols);
+
+  out_values.clear();
+  out_cols.clear();
+  out_values.reserve(state.used_cols.size());
+  out_cols.reserve(state.used_cols.size());
+
+  for (int col : state.used_cols) {
+    out_values.push_back(state.row_acc[col]);
+    out_cols.push_back(col);
+    state.row_used[col] = 0;
+  }
+}
+
+void ComputeRowsWithThreads(const SparseMatrix &a, const SparseMatrix &b, int rows, int num_threads,
+                            std::vector<std::vector<ComplexD>> &row_values, std::vector<std::vector<int>> &row_cols) {
+  std::atomic<int> next_row(0);
+  std::vector<std::thread> workers;
+  workers.reserve(num_threads);
+
+  for (int thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
+    workers.emplace_back([&]() {
+      ThreadLocalRowState state(b.cols);
+
+      while (true) {
+        const int row_idx = next_row.fetch_add(1, std::memory_order_relaxed);
+        if (row_idx >= rows) {
+          break;
+        }
+
+        MultiplySingleRow(a, b, row_idx, state, row_values[row_idx], row_cols[row_idx]);
+      }
+    });
+  }
+
+  for (auto &worker : workers) {
+    worker.join();
+  }
+}
+
+SparseMatrix BuildResult(int rows, int cols, const std::vector<std::vector<ComplexD>> &row_values,
+                         const std::vector<std::vector<int>> &row_cols) {
+  SparseMatrix result(rows, cols);
+
+  std::size_t total_nnz = 0;
+  for (int i = 0; i < rows; ++i) {
+    total_nnz += row_values[i].size();
+  }
+
+  result.values.reserve(total_nnz);
+  result.col_indices.reserve(total_nnz);
+
+  for (int i = 0; i < rows; ++i) {
+    result.values.insert(result.values.end(), row_values[i].begin(), row_values[i].end());
+    result.col_indices.insert(result.col_indices.end(), row_cols[i].begin(), row_cols[i].end());
+    result.row_ptr[i + 1] = static_cast<int>(result.values.size());
+  }
+
+  return result;
+}
+
+}  // namespace
+
+KurpiakovACRSMatMulSTL::KurpiakovACRSMatMulSTL(const InType &in) {
+  SetTypeOfTask(GetStaticTypeOfTask());
+  GetInput() = in;
+  GetOutput() = SparseMatrix();
+}
+
+bool KurpiakovACRSMatMulSTL::ValidationImpl() {
+  const auto &[a, b] = GetInput();
+
+  if (!ValidateCSR(a) || !ValidateCSR(b)) {
+    return false;
+  }
+
+  return a.cols == b.rows;
+}
+
+bool KurpiakovACRSMatMulSTL::PreProcessingImpl() {
+  return true;
+}
+
+bool KurpiakovACRSMatMulSTL::RunImpl() {
+  const auto &[a, b] = GetInput();
+  const int rows = a.rows;
+  const int cols = b.cols;
+
+  const int requested_threads = ppc::util::GetNumThreads();
+  const int num_threads = std::max(1, std::min(requested_threads, rows));
+
+  std::vector<std::vector<ComplexD>> row_values(rows);
+  std::vector<std::vector<int>> row_cols(rows);
+
+  ComputeRowsWithThreads(a, b, rows, num_threads, row_values, row_cols);
+  GetOutput() = BuildResult(rows, cols, row_values, row_cols);
+  return true;
+}
+
+bool KurpiakovACRSMatMulSTL::PostProcessingImpl() {
+  return true;
+}
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp
new file mode 100644
index 000000000..18e545e3d
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "task/include/task.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+class KurpiakovACRSMatMulTBB : public BaseTask {
+ public:
+  static constexpr ppc::task::TypeOfTask GetStaticTypeOfTask() {
+    return ppc::task::TypeOfTask::kTBB;
+  }
+  explicit KurpiakovACRSMatMulTBB(const InType &in);
+
+ private:
+  bool ValidationImpl() override;
+  bool PreProcessingImpl() override;
+  bool RunImpl() override;
+  bool PostProcessingImpl() override;
+};
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp
new file mode 100644
index 000000000..b2822049e
--- /dev/null
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tbb/src/ops_tbb.cpp
@@ -0,0 +1,157 @@
+#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp"
+
+#include <tbb/tbb.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+
+namespace kurpiakov_a_sp_comp_mat_mul {
+
+namespace {
+
+bool ValidateCSR(const SparseMatrix &m) {
+  if (m.rows <= 0 || m.cols <= 0) {
+    return false;
+  }
+  if (static_cast<int>(m.row_ptr.size()) != m.rows + 1) {
+    return false;
+  }
+  if (m.row_ptr[0] != 0) {
+    return false;
+  }
+  if (std::cmp_not_equal(m.values.size(), m.row_ptr[m.rows])) {
+    return false;
+  }
+  if (m.col_indices.size() != m.values.size()) {
+    return false;
+  }
+  for (int i = 0; i < m.rows; ++i) {
+    for (int j = m.row_ptr[i]; j < m.row_ptr[i + 1]; ++j) {
+      if (m.col_indices[j] < 0 || m.col_indices[j] >= m.cols) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+struct ThreadLocalRowBuffers {
+  explicit ThreadLocalRowBuffers(int cols) : row_acc(cols), row_used(cols, 0) {}
+
+  std::vector<ComplexD> row_acc;
+  std::vector<char> row_used;
+  std::vector<int> used_cols;
+};
+
+void ComputeRow(const SparseMatrix &a, const SparseMatrix &b, int row_idx, ThreadLocalRowBuffers &buffers,
+                std::vector<ComplexD> &out_values, std::vector<int> &out_cols) {
+  buffers.used_cols.clear();
+
+  for (int ja = a.row_ptr[row_idx]; ja < a.row_ptr[row_idx + 1]; ++ja) {
+    const int ka = a.col_indices[ja];
+    const ComplexD &a_val = a.values[ja];
+
+    for (int jb = b.row_ptr[ka]; jb < b.row_ptr[ka + 1]; ++jb) {
+      const int cb = b.col_indices[jb];
+      const ComplexD &b_val = b.values[jb];
+
+      if (buffers.row_used[cb] == 0) {
+        buffers.row_used[cb] = 1;
+        buffers.row_acc[cb] = ComplexD();
+        buffers.used_cols.push_back(cb);
+      }
+
+      buffers.row_acc[cb] += a_val * b_val;
+    }
+  }
+
+  std::ranges::sort(buffers.used_cols);
+
+  out_values.clear();
+  out_cols.clear();
+  out_values.reserve(buffers.used_cols.size());
+  out_cols.reserve(buffers.used_cols.size());
+
+  for (int c : buffers.used_cols) {
+    out_values.push_back(buffers.row_acc[c]);
+    out_cols.push_back(c);
+    buffers.row_used[c] = 0;
+  }
+}
+
+SparseMatrix BuildResultFromRows(int rows, int cols, const std::vector<std::vector<ComplexD>> &row_values,
+                                 const std::vector<std::vector<int>> &row_cols) {
+  SparseMatrix result(rows, cols);
+
+  for (int i = 0; i < rows; ++i) {
+    result.row_ptr[i + 1] = result.row_ptr[i] + static_cast<int>(row_values[i].size());
+  }
+
+  const auto total_nnz = static_cast<std::size_t>(result.row_ptr[rows]);
+
+  result.values.resize(total_nnz);
+  result.col_indices.resize(total_nnz);
+
+  tbb::parallel_for(tbb::blocked_range<int>(0, rows), [&](const tbb::blocked_range<int> &range) {
+    for (int i = range.begin(); i < range.end(); ++i) {
+      const auto offset = static_cast<std::vector<ComplexD>::difference_type>(result.row_ptr[i]);
+      std::copy(row_values[i].begin(), row_values[i].end(), result.values.begin() + offset);
+      std::copy(row_cols[i].begin(), row_cols[i].end(), result.col_indices.begin() + offset);
+    }
+  });
+
+  return result;
+}
+
+}  // namespace
+
+KurpiakovACRSMatMulTBB::KurpiakovACRSMatMulTBB(const InType &in) {
+  SetTypeOfTask(GetStaticTypeOfTask());
+  GetInput() = in;
+  GetOutput() = SparseMatrix();
+}
+
+bool KurpiakovACRSMatMulTBB::ValidationImpl() {
+  const auto &[a, b] = GetInput();
+
+  if (!ValidateCSR(a) || !ValidateCSR(b)) {
+    return false;
+  }
+
+  return a.cols == b.rows;
+}
+
+bool KurpiakovACRSMatMulTBB::PreProcessingImpl() {
+  return true;
+}
+
+bool KurpiakovACRSMatMulTBB::RunImpl() {
+  const auto &[a, b] = GetInput();
+  const int rows = a.rows;
+  const int cols = b.cols;
+
+  std::vector<std::vector<ComplexD>> row_values(rows);
+  std::vector<std::vector<int>> row_cols(rows);
+
+  tbb::enumerable_thread_specific<ThreadLocalRowBuffers> tls_buffers([&] { return ThreadLocalRowBuffers(cols); });
+
+  tbb::parallel_for(tbb::blocked_range<int>(0, rows), [&](const tbb::blocked_range<int> &range) {
+    auto &buffers = tls_buffers.local();
+    for (int i = range.begin(); i < range.end(); ++i) {
+      ComputeRow(a, b, i, buffers, row_values[i], row_cols[i]);
+    }
+  });
+
+  GetOutput() = BuildResultFromRows(rows, cols, row_values, row_cols);
+  return true;
+}
+
+bool KurpiakovACRSMatMulTBB::PostProcessingImpl() {
+  return true;
+}
+
+}  // namespace kurpiakov_a_sp_comp_mat_mul
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp
index 4c0294cee..03862e345 100644
--- a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/functional/main.cpp
@@ -5,8 +5,12 @@
 #include <string>
 #include <tuple>
 
+#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp"
 #include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp"
 #include "kurpiakov_a_sp_comp_mat_mul/seq/include/ops_seq.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp"
 #include "util/include/func_test_util.hpp"
 #include "util/include/util.hpp"
 
@@ -146,7 +150,11 @@ TEST_P(KurpiakovRunFuncTestsThreads, SparseMatMulFromParams) {
 }
 
 const auto kTestTasksList = std::tuple_cat(
-    ppc::util::AddFuncTask<KurpiskovACRSMatMulSEQ, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul));
+    ppc::util::AddFuncTask<KurpiskovACRSMatMulSEQ, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul),
+    ppc::util::AddFuncTask<KurpiakovACRSMatMulOMP, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul),
+    ppc::util::AddFuncTask<KurpiakovACRSMatMulTBB, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul),
+    ppc::util::AddFuncTask<KurpiakovACRSMatMulSTL, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul),
+    ppc::util::AddFuncTask<KurpiakovACRSMatMulALL, InType>(kTestParam, PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul));
 
 const auto kGtestValues = ppc::util::ExpandToValues(kTestTasksList);
 const auto kPerfTestName = KurpiakovRunFuncTestsThreads::PrintFuncTestName<KurpiakovRunFuncTestsThreads>;
diff --git a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp
index c00e2c8a2..249bd5cc4 100644
--- a/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp
+++ b/tasks/kurpiakov_a_sp_comp_mat_mul/tests/performance/main.cpp
@@ -3,8 +3,12 @@
 #include <utility>
 #include <vector>
 
+#include "kurpiakov_a_sp_comp_mat_mul/all/include/ops_all.hpp"
 #include "kurpiakov_a_sp_comp_mat_mul/common/include/common.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/omp/include/ops_omp.hpp"
 #include "kurpiakov_a_sp_comp_mat_mul/seq/include/ops_seq.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/stl/include/ops_stl.hpp"
+#include "kurpiakov_a_sp_comp_mat_mul/tbb/include/ops_tbb.hpp"
 #include "util/include/perf_test_util.hpp"
 
 namespace kurpiakov_a_sp_comp_mat_mul {
@@ -65,7 +69,9 @@ TEST_P(KurpiakovRunPerfTests, SparseMatMulPerf) {
 namespace {
 
 const auto kAllPerfTasks =
-    ppc::util::MakeAllPerfTasks<InType, KurpiskovACRSMatMulSEQ>(PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul);
+    ppc::util::MakeAllPerfTasks<InType, KurpiskovACRSMatMulSEQ, KurpiakovACRSMatMulOMP, KurpiakovACRSMatMulTBB,
+                                KurpiakovACRSMatMulSTL, KurpiakovACRSMatMulALL>(
+        PPC_SETTINGS_kurpiakov_a_sp_comp_mat_mul);
 const auto kGtestValues = ppc::util::TupleToGTestValues(kAllPerfTasks);
 const auto kPerfTestName = KurpiakovRunPerfTests::CustomPerfTestName;