diff --git a/main.cpp b/main.cpp index a1d2625..41774f2 100644 --- a/main.cpp +++ b/main.cpp @@ -5,15 +5,32 @@ #include #include #include "ticktock.h" - +#include +#include +#include "pod.h" +#include +#include // TODO: 并行化所有这些 for 循环 +// 使用任务域指定使用的线程数4 template std::vector fill(std::vector &arr, Func const &func) { TICK(fill); - for (size_t i = 0; i < arr.size(); i++) { - arr[i] = func(i); - } + // for (size_t i = 0; i < arr.size(); i++) { + // arr[i] = func(i); + // } + + // fiil(x) 0.951s -> 0.258s 提高3.69倍 + // fill(y) 0.978s -> 0.253s 提高3.86倍 + tbb::task_arena ta(4); + ta.execute([&] { + tbb::parallel_for(tbb::blocked_range(0, arr.size()), + [&] (tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); i++) { + arr[i] = func(i); + } + }); + }); TOCK(fill); return arr; } @@ -21,48 +38,123 @@ std::vector fill(std::vector &arr, Func const &func) { template void saxpy(T a, std::vector &x, std::vector const &y) { TICK(saxpy); - for (size_t i = 0; i < x.size(); i++) { - x[i] = a * x[i] + y[i]; - } + // for (size_t i = 0; i < x.size(); i++) { + // x[i] = a * x[i] + y[i]; + // } + // 0.042s -> 0.021s 提高2倍 + tbb::task_arena ta(4); + ta.execute([&] { + tbb::parallel_for(tbb::blocked_range(0, x.size()), + [&] (tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); i++) { + x[i] = a * x[i] + y[i]; + } + }, tbb::auto_partitioner{}); + }); TOCK(saxpy); } template T sqrtdot(std::vector const &x, std::vector const &y) { TICK(sqrtdot); + // 串行reduce float类型会出现浮点误差 + // T ret = 0; + // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { + // ret += x[i] * y[i]; + // } + + // 0.0916s -> 0.0214s 提高3.88倍 T ret = 0; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - ret += x[i] * y[i]; - } + tbb::task_arena ta(4); + + ta.execute([&] { + ret = tbb::parallel_reduce(tbb::blocked_range(0, std::min(x.size(), y.size())), T(0), + [&] (tbb::blocked_range r, T local_ret) { + for (size_t i = r.begin(); i < r.end(); i++) { + local_ret += x[i] * y[i]; + } + return local_ret; + }, [] (T x, T y) { + return x + y; + }); + }); ret = std::sqrt(ret); TOCK(sqrtdot); return ret; } +// 求最小值也是reduce template T minvalue(std::vector const &x) { TICK(minvalue); + // T ret = x[0]; + // for (size_t i = 1; i < x.size(); i++) { + // if (x[i] < ret) + // ret = x[i]; + // } + + // 0.092s -> 0.023s 提高4倍 T ret = x[0]; - for (size_t i = 1; i < x.size(); i++) { - if (x[i] < ret) - ret = x[i]; - } + tbb::task_arena ta(4); + + ta.execute([&] { + ret = tbb::parallel_reduce(tbb::blocked_range(0, x.size()), x[1], + [&] (tbb::blocked_range r, T local_ret) { + for (size_t i = r.begin(); i < r.end(); i++) { + if (x[i] < local_ret) + local_ret = x[i]; + } + return local_ret; + }, [] (T x, T y) { + return std::min(x, y); + }); + }); TOCK(minvalue); return ret; } +// 这里使用了小彭老师写的pod.h 使用resize不初始化其中的值 template -std::vector magicfilter(std::vector const &x, std::vector const &y) { +auto magicfilter(std::vector const &x, std::vector const &y) { TICK(magicfilter); - std::vector res; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - if (x[i] > y[i]) { - res.push_back(x[i]); - } else if (y[i] > x[i] && y[i] > 0.5f) { - res.push_back(y[i]); - res.push_back(x[i] * y[i]); - } - } + // std::vector res; + // for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { + // if (x[i] > y[i]) { + // res.push_back(x[i]); + // } else if (y[i] > x[i] && y[i] > 0.5f) { + // res.push_back(y[i]); + // res.push_back(x[i] * y[i]); + // } + // } + + // 0.41s -> 0.05s 提高8倍 + std::vector> res; // 避免初始化 + std::atomic res_size = 0; + size_t n = std::min(x.size(), y.size()); + res.resize(n); + + tbb::task_arena ta(4); + + ta.execute([&] { + tbb::parallel_for(tbb::blocked_range(0, n), + [&] (tbb::blocked_range r) { + std::vector> local_res(r.size()); + size_t lrsize = 0; + for (size_t i = r.begin(); i < r.end(); i++) { + if (x[i] > y[i]) { + local_res[lrsize++] = x[i]; + } else if (y[i] > x[i] && y[i] > 0.5f) { + local_res[lrsize++] = y[i]; + local_res[lrsize++] = x[i] * y[i]; + } + } + size_t base = res_size.fetch_add(lrsize); + for (size_t i = 0; i < lrsize; i++) { + res[base + i] = local_res[i]; + } + }); + }); + res.resize(res_size); TOCK(magicfilter); return res; } @@ -70,11 +162,30 @@ std::vector magicfilter(std::vector const &x, std::vector const &y) { template T scanner(std::vector &x) { TICK(scanner); + // T ret = 0; + // for (size_t i = 0; i < x.size(); i++) { + // ret += x[i]; + // x[i] = ret; + // } + + // 0.09s -> 0.047s 提高1.91倍 T ret = 0; - for (size_t i = 0; i < x.size(); i++) { - ret += x[i]; - x[i] = ret; - } + tbb::task_arena ta(4); + + ta.execute([&] { + ret = tbb::parallel_scan(tbb::blocked_range(0, x.size()), T(0), + [&] (tbb::blocked_range r, T local_ret, auto is_final) { + for (size_t i = r.begin(); i < r.end(); i++) { + local_ret += x[i]; + if (is_final) { + x[i] = local_ret; + } + } + return local_ret; + }, [] (T x, T y) { + return x + y; + }); + }); TOCK(scanner); return ret; }