diff --git a/.gitignore b/.gitignore index 9ed92a0..5b53e9d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build GNUmakefile +benchmark diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d6a8f8..6cbd4e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,19 @@ cmake_minimum_required(VERSION 3.10) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_BUILD_TYPE Release) project(main LANGUAGES CXX) +# set(BUILD_SHARED_LIBS 1) +# add_subdirectory(benchmark) add_executable(main main.cpp) #find_package(OpenMP REQUIRED) #target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) find_package(TBB REQUIRED) -target_link_libraries(main PUBLIC TBB::tbb) +target_link_libraries(main PUBLIC tbb) #find_package(benchmark REQUIRED) #target_link_libraries(main PUBLIC benchmark::benchmark) diff --git a/main.cpp b/main.cpp index a1d2625..416727b 100644 --- a/main.cpp +++ b/main.cpp @@ -5,15 +5,28 @@ #include #include #include "ticktock.h" +#include "tbb/tbb.h" +#include +#include "pod.h" +//显式定义线程数 +#define NUM_THREADS 8 // TODO: 并行化所有这些 for 循环 template std::vector fill(std::vector &arr, Func const &func) { TICK(fill); - for (size_t i = 0; i < arr.size(); i++) { - arr[i] = func(i); - } + // for (size_t i = 0; i < arr.size(); i++) { + // arr[i] = func(i); + // } + tbb::parallel_for( + tbb::blocked_range(0,arr.size()), + [&](tbb::blocked_range r){ + for ( size_t i=r.begin();i fill(std::vector &arr, Func const &func) { template void saxpy(T a, std::vector &x, std::vector const &y) { TICK(saxpy); - for (size_t i = 0; i < x.size(); i++) { - x[i] = a * x[i] + y[i]; - } + // for (size_t i = 0; i < x.size(); i++) { + // x[i] = a * x[i] + y[i]; + // } + tbb::parallel_for( + tbb::blocked_range (0 , x.size()), + [&](tbb::blocked_range r){ + for ( size_t i=r.begin();i T sqrtdot(std::vector const &x, std::vector const &y) { TICK(sqrtdot); - T ret = 0; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - ret += x[i] * y[i]; - } + // T ret = 0; + // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { + // ret += x[i] * y[i]; + // } + T ret=tbb::parallel_reduce( + tbb::blocked_range(0,std::min(x.size(), y.size())),T(0), + [&](tbb::blocked_range r,T local_ret){ + for (size_t i=r.begin();i const &x, std::vector const &y) { template T minvalue(std::vector const &x) { TICK(minvalue); - T ret = x[0]; - for (size_t i = 1; i < x.size(); i++) { - if (x[i] < ret) - ret = x[i]; - } + // T ret = x[0]; + // for (size_t i = 1; i < x.size(); i++) { + // if (x[i] < ret) + // ret = x[i]; + // } + T ret=tbb::parallel_reduce( + tbb::blocked_range(0,x.size()),x[0], + [&](tbb::blocked_range r,T local_ret){ + for(size_t i=r.begin();i -std::vector magicfilter(std::vector const &x, std::vector const &y) { +std::vector> magicfilter(std::vector const &x, std::vector const &y) { TICK(magicfilter); - std::vector res; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - if (x[i] > y[i]) { - res.push_back(x[i]); - } else if (y[i] > x[i] && y[i] > 0.5f) { - res.push_back(y[i]); - res.push_back(x[i] * y[i]); + // std::vector res; + // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { + // if (x[i] > y[i]) { + // res.push_back(x[i]); + // } else if (y[i] > x[i] && y[i] > 0.5f) { + // res.push_back(y[i]); + // res.push_back(x[i] * y[i]); + // } + // } + std::vector> res(2*std::min(x.size(), y.size())); + std::atomic index{0}; + tbb::parallel_for( + tbb::blocked_range(0,std::min(x.size(), y.size())), + [&](tbb::blocked_range r){ + std::vector> local_res; + // local_res.reserve(y.size()/NUM_THREADS); + for(size_t i=r.begin();i y[i]) { + local_res.push_back(x[i]); + } else if (y[i] > x[i] && y[i] > 0.5f) { + local_res.push_back(y[i]); + local_res.push_back(x[i] * y[i]); + } + } + int beg=index.fetch_add(local_res.size()); + std::memcpy(&res[beg],&local_res[0],local_res.size()); } - } + ); + res.resize(index); TOCK(magicfilter); return res; } @@ -70,13 +137,66 @@ std::vector magicfilter(std::vector const &x, std::vector const &y) { template T scanner(std::vector &x) { TICK(scanner); - T ret = 0; - for (size_t i = 0; i < x.size(); i++) { - ret += x[i]; - x[i] = ret; + // T ret = 0; + // for (size_t i = 0; i < x.size(); i++) { + // ret += x[i]; + // x[i] = ret; + // } + + //实测下面的手动划分task方式比auto_partiioner的parallel_scan更快 + // float ret = tbb::parallel_scan(tbb::blocked_range(0, x.size()), (float)0, + // [&] (tbb::blocked_range r, float local_res, auto is_final) { + // for (size_t i = r.begin(); i < r.end(); i++) { + // local_res += x[i]; + // if (is_final) { + // x[i] = local_res; + // } + // } + // return local_res; + // }, [] (float x, float y) { + // return x + y; + // }); + + //手动划分任务区间 + tbb::task_group tg; + std::vector local_res(NUM_THREADS); + for (size_t k=0;k x(n); std::vector y(n); + tbb::task_scheduler_init init(NUM_THREADS); fill(x, [&] (size_t i) { return std::sin(i); }); fill(y, [&] (size_t i) { return std::cos(i); }); diff --git a/run.sh b/run.sh index 99e6ef6..904ee03 100755 --- a/run.sh +++ b/run.sh @@ -1,5 +1,8 @@ -#!/bin/sh +#!/bin/bash +if [[ $1 = "clean" && -d build ]];then + rm -rf build +fi set -e cmake -B build -cmake --build build +cmake --build build -- -j 4 build/main diff --git a/score.md b/score.md index edea7fd..32a39c8 100644 --- a/score.md +++ b/score.md @@ -1,55 +1,41 @@ # 原版 -fill: 0.691299s -fill: 0.694203s -saxpy: 0.0268882s -sqrtdot: 0.0655007s +fill: 0.728694s +fill: 0.74406s +saxpy: 0.0409412s +sqrtdot: 0.0724723s 5165.4 -minvalue: 0.0654602s +minvalue: 0.0691992s -1.11803 -magicfilter: 0.280727s +magicfilter: 0.371052s 55924034 -scanner: 0.0651282s +scanner: 0.0702286s 6.18926e+07 # 2 -fill: 0.135927s -fill: 0.135436s -saxpy: 0.0261193s -sqrtdot: 0.0166558s +fill: 0.110683s +fill: 0.10929s +saxpy: 0.0116205s +sqrtdot: 0.011483s 5792.62 -minvalue: 0.00855201s +minvalue: 0.00942511s -1.11803 -magicfilter: 0.0343181s +magicfilter: 0.0292501s 55924034 -scanner: 0.0292899s -6.19238e+07 +scanner: 0.0187589s +6.19048e+07 # 3 - -fill: 0.151911s -fill: 0.149576s -saxpy: 0.0256344s -sqrtdot: 0.0161882s -5792.61 -minvalue: 0.00839197s --1.11803 -magicfilter: 0.174838s -55924034 -scanner: 0.0305014s -6.19266e+07 - -# 4 - -fill: 0.135299s -fill: 0.135698s -saxpy: 0.0259649s -sqrtdot: 0.016133s -5792.63 -minvalue: 0.0083628s +只是针对parallel scan,发现手动划分task并行比parallel_scan更快,应该是我没找到最佳的partitioner,但是我测试了好几种都是手动划分更快 +fill: 0.111643s +fill: 0.113284s +saxpy: 0.0120721s +sqrtdot: 0.0143203s +5792.62 +minvalue: 0.00966027s -1.11803 -magicfilter: 0.0378731s +magicfilter: 0.0297295s 55924034 -scanner: 0.0257618s -6.19406e+07 +scanner: 0.016231s +6.19332e+07 \ No newline at end of file