Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
build
GNUmakefile
benchmark
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
cmake_minimum_required(VERSION 3.10)

set(CMAKE_CXX_STANDARD 17)

# Default to an optimized build, but let the user override it on the
# command line (e.g. -DCMAKE_BUILD_TYPE=Debug).
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

project(main LANGUAGES CXX)

# set(BUILD_SHARED_LIBS 1)
# add_subdirectory(benchmark)
add_executable(main main.cpp)

#find_package(OpenMP REQUIRED)
#target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

find_package(TBB REQUIRED)
# Prefer the imported target (it carries include paths and definitions);
# fall back to the plain library name when the find module does not define it.
if(TARGET TBB::tbb)
    target_link_libraries(main PUBLIC TBB::tbb)
else()
    target_link_libraries(main PUBLIC tbb)
endif()

#find_package(benchmark REQUIRED)
#target_link_libraries(main PUBLIC benchmark::benchmark)
179 changes: 150 additions & 29 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,69 @@
#include <numeric>
#include <algorithm>
#include "ticktock.h"
#include "tbb/tbb.h"
#include <atomic>
#include "pod.h"

// Explicitly define the number of threads. This value is used to split the
// scan in scanner() into tasks and is passed to tbb::task_scheduler_init in main().
#define NUM_THREADS 8
// TODO: parallelize all of these for loops

/// Overwrites every element of `arr` with `func(index)`, in parallel.
///
/// The index range [0, arr.size()) is split into sub-ranges by
/// tbb::parallel_for, so `func` is invoked concurrently and in no fixed
/// order; it must be safe to call from several threads at once.
///
/// TICK/TOCK come from ticktock.h (not shown here); presumably they time the
/// region between them -- confirm against that header.
///
/// @param arr  Vector to fill in place; its size is not changed.
/// @param func Callable taking a size_t index and returning a value
///             assignable to T.
/// @return A copy of `arr` after filling (the signature returns by value).
template <class T, class Func>
std::vector<T> fill(std::vector<T> &arr, Func const &func) {
    TICK(fill);
    tbb::parallel_for(
        tbb::blocked_range<size_t>(0, arr.size()),
        [&](tbb::blocked_range<size_t> const &r) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                arr[i] = func(i);
            }
        });
    TOCK(fill);
    return arr;
}

/// Computes x[i] = a * x[i] + y[i] for every index of `x`, in parallel.
///
/// The update is applied exactly once per element; each sub-range handed out
/// by tbb::parallel_for touches a disjoint slice of `x`.
///
/// @param a Scalar multiplier.
/// @param x Vector updated in place.
/// @param y Addend vector; it is indexed up to x.size() without a bounds
///          check, so it must hold at least x.size() elements.
template <class T>
void saxpy(T a, std::vector<T> &x, std::vector<T> const &y) {
    TICK(saxpy);
    tbb::parallel_for(
        tbb::blocked_range<size_t>(0, x.size()),
        [&](tbb::blocked_range<size_t> const &r) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                x[i] = a * x[i] + y[i];
            }
        });
    TOCK(saxpy);
}

template <class T>
T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
TICK(sqrtdot);
T ret = 0;
for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
ret += x[i] * y[i];
}
// T ret = 0;
// for (size_t i = 0; i < std::min(x.size(), y.size()); i++) {
// ret += x[i] * y[i];
// }
T ret=tbb::parallel_reduce(
tbb::blocked_range<size_t>(0,std::min(x.size(), y.size())),T(0),
[&](tbb::blocked_range<size_t> r,T local_ret){
for (size_t i=r.begin();i<r.end();i++){
local_ret+=x[i]*y[i];
}
return local_ret;
},
[](T a,T b){
return a+b;
}
);
ret = std::sqrt(ret);
TOCK(sqrtdot);
return ret;
Expand All @@ -42,48 +76,135 @@ T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
/// Returns the smallest element of `x` using a parallel reduction.
///
/// x[0] is used as the reduction's initial value; since it is itself an
/// element of `x`, combining it with any partial minimum cannot produce a
/// value outside the input.
///
/// @param x Input vector. Must be non-empty: x[0] is read unconditionally.
/// @return The minimum element according to operator<.
template <class T>
T minvalue(std::vector<T> const &x) {
    TICK(minvalue);
    T ret = tbb::parallel_reduce(
        tbb::blocked_range<size_t>(0, x.size()), x[0],
        // Fold one sub-range into the running minimum passed in by TBB.
        [&](tbb::blocked_range<size_t> const &r, T local_ret) {
            for (size_t i = r.begin(); i < r.end(); i++) {
                if (x[i] < local_ret)
                    local_ret = x[i];
            }
            return local_ret;
        },
        // Combine the minima of two sub-ranges.
        [](T a, T b) {
            return a < b ? a : b;
        });
    TOCK(minvalue);
    return ret;
}

/// Filters pairs (x[i], y[i]) into a flat result vector, in parallel.
///
/// For each index i below min(x.size(), y.size()):
///   - if x[i] > y[i], x[i] is emitted;
///   - otherwise, if y[i] > x[i] and y[i] > 0.5f, y[i] and then x[i] * y[i]
///     are emitted;
///   - otherwise nothing is emitted.
///
/// Each sub-range collects its output in a thread-local buffer, reserves a
/// slot in the shared result with one atomic fetch_add, and copies the buffer
/// there. Values from one sub-range stay contiguous and in index order, but
/// the order of sub-ranges in the result depends on scheduling, so the output
/// order is not deterministic (the element count is).
///
/// pod<T> comes from pod.h (not shown here); presumably it lets the
/// preallocated slots skip value-initialization -- confirm against that header.
///
/// @param x First input vector.
/// @param y Second input vector.
/// @return The emitted values, wrapped in pod<T>.
template <class T>
std::vector<pod<T>> magicfilter(std::vector<T> const &x, std::vector<T> const &y) {
    TICK(magicfilter);
    size_t const n = std::min(x.size(), y.size());
    // Every index emits at most two values, so 2 * n slots always suffice.
    std::vector<pod<T>> res(2 * n);
    std::atomic<size_t> index{0};
    tbb::parallel_for(
        tbb::blocked_range<size_t>(0, n),
        [&](tbb::blocked_range<size_t> const &r) {
            std::vector<pod<T>> local_res;
            for (size_t i = r.begin(); i < r.end(); i++) {
                if (x[i] > y[i]) {
                    local_res.push_back(x[i]);
                } else if (y[i] > x[i] && y[i] > 0.5f) {
                    local_res.push_back(y[i]);
                    local_res.push_back(x[i] * y[i]);
                }
            }
            if (local_res.empty())
                return;
            // Keep the offset as size_t (an int would truncate large offsets)
            // and copy whole elements rather than a raw byte count.
            size_t const beg = index.fetch_add(local_res.size());
            std::copy(local_res.begin(), local_res.end(), res.data() + beg);
        });
    // Trim the unused tail of the preallocated buffer.
    res.resize(index.load());
    TOCK(magicfilter);
    return res;
}

/// Replaces `x` with its inclusive prefix sum, in parallel, and returns the
/// total.
///
/// The range is split by hand into NUM_THREADS contiguous chunks:
///   1. each chunk is scanned independently and its sum is recorded;
///   2. the chunk sums are turned into running totals serially;
///   3. every chunk except the first adds the running total of all preceding
///      chunks to its elements.
///
/// The original author reports measuring this manual split as faster than
/// tbb::parallel_scan with the partitioners they tried. For floating-point T
/// the additions are grouped differently than in a serial scan, so results
/// may differ from it by rounding.
///
/// @param x Vector scanned in place.
/// @return The sum of all elements (T(0) for an empty vector).
template <class T>
T scanner(std::vector<T> &x) {
    TICK(scanner);
    size_t const n = x.size();
    // Start of chunk k, clamped to n; chunk k covers
    // [chunk_begin(k), chunk_begin(k + 1)). Trailing chunks may be empty
    // when n is small.
    auto chunk_begin = [n](size_t k) {
        return std::min(k * (n + NUM_THREADS - 1) / NUM_THREADS, n);
    };

    tbb::task_group tg;
    std::vector<T> local_res(NUM_THREADS);

    // Pass 1: inclusive scan inside each chunk; remember the chunk's sum.
    for (size_t k = 0; k < NUM_THREADS; k++) {
        size_t const beg = chunk_begin(k);
        size_t const end = chunk_begin(k + 1);
        tg.run([&x, &local_res, k, beg, end]() {
            T tmp = T(0);
            for (size_t i = beg; i < end; i++) {
                tmp += x[i];
                x[i] = tmp;
            }
            local_res[k] = tmp;
        });
    }
    tg.wait();

    // Serial step: local_res[k] becomes the sum of chunks 0..k.
    T pre_sum = T(0);
    for (size_t k = 0; k < NUM_THREADS; k++) {
        pre_sum += local_res[k];
        local_res[k] = pre_sum;
    }

    // Pass 2: shift each later chunk by the total of everything before it.
    for (size_t k = 1; k < NUM_THREADS; k++) {
        size_t const beg = chunk_begin(k);
        size_t const end = chunk_begin(k + 1);
        tg.run([&x, &local_res, k, beg, end]() {
            for (size_t i = beg; i < end; i++) {
                x[i] += local_res[k - 1];
            }
        });
    }
    tg.wait();

    TOCK(scanner);
    return local_res[NUM_THREADS - 1];
}

int main() {
size_t n = 1<<26;
std::vector<float> x(n);
std::vector<float> y(n);

tbb::task_scheduler_init init(NUM_THREADS);
fill(x, [&] (size_t i) { return std::sin(i); });
fill(y, [&] (size_t i) { return std::cos(i); });

Expand Down
7 changes: 5 additions & 2 deletions run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash
# Configure, build and run the benchmark binary.
# Usage: ./run.sh [clean]
#   clean  remove the existing build directory before configuring.
# bash is required because of the [[ ... ]] test below.
if [[ "$1" = "clean" && -d build ]]; then
    rm -rf build
fi
# Abort on the first failing step so a broken build is never run.
set -e
cmake -B build
# Arguments after "--" are passed to the native build tool (4 parallel jobs).
cmake --build build -- -j 4
build/main
64 changes: 25 additions & 39 deletions score.md
Original file line number Diff line number Diff line change
@@ -1,55 +1,41 @@
# 原版

fill: 0.691299s
fill: 0.694203s
saxpy: 0.0268882s
sqrtdot: 0.0655007s
fill: 0.728694s
fill: 0.74406s
saxpy: 0.0409412s
sqrtdot: 0.0724723s
5165.4
minvalue: 0.0654602s
minvalue: 0.0691992s
-1.11803
magicfilter: 0.280727s
magicfilter: 0.371052s
55924034
scanner: 0.0651282s
scanner: 0.0702286s
6.18926e+07

# 2

fill: 0.135927s
fill: 0.135436s
saxpy: 0.0261193s
sqrtdot: 0.0166558s
fill: 0.110683s
fill: 0.10929s
saxpy: 0.0116205s
sqrtdot: 0.011483s
5792.62
minvalue: 0.00855201s
minvalue: 0.00942511s
-1.11803
magicfilter: 0.0343181s
magicfilter: 0.0292501s
55924034
scanner: 0.0292899s
6.19238e+07
scanner: 0.0187589s
6.19048e+07

# 3

fill: 0.151911s
fill: 0.149576s
saxpy: 0.0256344s
sqrtdot: 0.0161882s
5792.61
minvalue: 0.00839197s
-1.11803
magicfilter: 0.174838s
55924034
scanner: 0.0305014s
6.19266e+07

# 4

fill: 0.135299s
fill: 0.135698s
saxpy: 0.0259649s
sqrtdot: 0.016133s
5792.63
minvalue: 0.0083628s
只是针对parallel scan,发现手动划分task并行比parallel_scan更快,应该是我没找到最佳的partitioner,但是我测试了好几种都是手动划分更快
fill: 0.111643s
fill: 0.113284s
saxpy: 0.0120721s
sqrtdot: 0.0143203s
5792.62
minvalue: 0.00966027s
-1.11803
magicfilter: 0.0378731s
magicfilter: 0.0297295s
55924034
scanner: 0.0257618s
6.19406e+07
scanner: 0.016231s
6.19332e+07