diff --git a/main.cpp b/main.cpp index a1d2625..dc59d63 100644 --- a/main.cpp +++ b/main.cpp @@ -1,102 +1,278 @@ -#include -#include -#include -#include -#include -#include -#include "ticktock.h" - -// TODO: 并行化所有这些 for 循环 - -template -std::vector fill(std::vector &arr, Func const &func) { - TICK(fill); - for (size_t i = 0; i < arr.size(); i++) { - arr[i] = func(i); - } - TOCK(fill); - return arr; -} - -template -void saxpy(T a, std::vector &x, std::vector const &y) { - TICK(saxpy); - for (size_t i = 0; i < x.size(); i++) { - x[i] = a * x[i] + y[i]; - } - TOCK(saxpy); -} - -template -T sqrtdot(std::vector const &x, std::vector const &y) { - TICK(sqrtdot); - T ret = 0; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - ret += x[i] * y[i]; - } - ret = std::sqrt(ret); - TOCK(sqrtdot); - return ret; -} - -template -T minvalue(std::vector const &x) { - TICK(minvalue); - T ret = x[0]; - for (size_t i = 1; i < x.size(); i++) { - if (x[i] < ret) - ret = x[i]; - } - TOCK(minvalue); - return ret; -} - -template -std::vector magicfilter(std::vector const &x, std::vector const &y) { - TICK(magicfilter); - std::vector res; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - if (x[i] > y[i]) { - res.push_back(x[i]); - } else if (y[i] > x[i] && y[i] > 0.5f) { - res.push_back(y[i]); - res.push_back(x[i] * y[i]); - } - } - TOCK(magicfilter); - return res; -} - -template -T scanner(std::vector &x) { - TICK(scanner); - T ret = 0; - for (size_t i = 0; i < x.size(); i++) { - ret += x[i]; - x[i] = ret; - } - TOCK(scanner); - return ret; -} - -int main() { - size_t n = 1<<26; - std::vector x(n); - std::vector y(n); - - fill(x, [&] (size_t i) { return std::sin(i); }); - fill(y, [&] (size_t i) { return std::cos(i); }); - - saxpy(0.5f, x, y); - - std::cout << sqrtdot(x, y) << std::endl; - std::cout << minvalue(x) << std::endl; - - auto arr = magicfilter(x, y); - std::cout << arr.size() << std::endl; - - scanner(x); - std::cout << std::reduce(x.begin(), x.end()) << std::endl; - - return 0; -} +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ticktock.h" + + +// TODO: 并行化所有这些 for 循环 +// 测试平台:Windows 10 专业版 20H2 +// CPU: Intel(R) Core(TM) i5-5200U CPU @ 2.20GHz 2.19 GHz +// 编译工具:VS2017 C++17 +// TBB版本: 2020_U3#6 + +// PARALLEL_TYPE_TEST +#define PARALLEL_TYPE_NULL 0 +#define PARALLEL_TYPE_FOR 1 +#define PARALLEL_TYPE_RANDOM_ALLOC 2 +// 用于切换并行方案 +#define SPTT PARALLEL_TYPE_RANDOM_ALLOC + +// 排序测试 +// PARALLEL_SORT_TEST +#define PARALLEL_SORT_NULL 0 +#define PARALLEL_FOR_SORT 1 +#define PARALLEL_SORT_SORT 2 +// 用于切换并行方案 +#define SPST PARALLEL_FOR_SORT + +template +std::vector fill(std::vector &arr, Func const &func) { + TICK(fill); + +#if (SPTT == PARALLEL_TYPE_FOR) + // 并行赋值 1.7s左右 + std::cout << "PARALLEL_TYPE_FOR" << std::endl; + tbb::parallel_for(tbb::blocked_range(0, arr.size()), + [&](tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); ++i) + { + arr[i] = func(i); + } + }); +#elif (SPTT == PARALLEL_TYPE_RANDOM_ALLOC) + // 建立四个线程 任务域 + // 使用tbb::affinity_partitioner自动负载均衡,第二次比第一次快0.4s。 + // 使用tbb::simple_partitioner 8s左右 + // 使用tbb::auto_partitioner 1.9s左右 + tbb::task_arena ta(4); + ta.execute([&] { + tbb::affinity_partitioner affinity; + tbb::parallel_for(tbb::blocked_range(0, arr.size()), + [&](tbb::blocked_range r) + { + for (size_t i = r.begin(); i < r.end(); ++i) + { + arr[i] = func(i); + } + }, affinity); + }); +#else + for (size_t i = 0; i < arr.size(); i++) { + arr[i] = func(i); + } +#endif + + TOCK(fill); + return arr; +} + +template +void saxpy(T a, std::vector &x, std::vector const &y) { + TICK(saxpy); + // 并行 0.081 -> 0.049提升0.032s + auto mincnt = std::min(x.size(), y.size()); + tbb::parallel_for(tbb::blocked_range(0, mincnt), + [&](tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); ++i) + { + // 直接赋值开销小 + x[i] = a * x[i] + y[i]; + } + }); + TOCK(saxpy); +} + +template +T sqrtdot(std::vector const &x, std::vector const &y) { + TICK(sqrtdot); + T ret = 0; + // 避免循环中重复计算vector的size, 提升0.05s。 + auto mincnt = std::min(x.size(), y.size()); + tbb::spin_mutex spin_mtx; + + // 求和需要考虑线程同步 +#if (SPTT == PARALLEL_TYPE_FOR) + // 并行 0.16-> 0.07s左右。 + tbb::parallel_for(tbb::blocked_range(0, mincnt), + [&](tbb::blocked_range r) + { + // 以下语句内存不足, 不采用小彭老师的推荐方案 + // std::vector temp_a(r.size()); + T val; + T total{ 0 }; + for (size_t i = r.begin(); i < r.end(); ++i) + { + val = x[i] * y[i]; + if (val > 0) + { + total += val;; + } + } + + std::lock_guard lck(spin_mtx); + ret += total; + } + ); +#elif (SPTT == PARALLEL_TYPE_RANDOM_ALLOC) + // 建立四个线程 + // 使用tbb::affinity_partitioner自动负载均衡,0.16-> 0.05s左右。 + // 使用tbb::simple_partitioner 0.16-> 0.052s左右 + // 使用tbb::auto_partitioner 0.16-> 0.05s左右 + tbb::task_arena ta(4); + ta.execute([&] { + tbb::affinity_partitioner affinity; + tbb::parallel_for(tbb::blocked_range(0, mincnt), + [&](tbb::blocked_range r) + { + T val; + T total{ 0 }; + for (size_t i = r.begin(); i < r.end(); ++i) + { + val = x[i] * y[i]; + if (val > 0) + { + total += val;; + } + } + + std::lock_guard lck(spin_mtx); + ret += total; + }, affinity); + }); + +#else + for (size_t i = 0; i < mincnt; i++) { + ret += x[i] * y[i]; + } +#endif + + ret = std::sqrt(ret); + TOCK(sqrtdot); + return ret; +} + +template +T minvalue(std::vector const &x) { + TICK(minvalue); + + T ret = x[0]; +#if (SPST == PARALLEL_FOR_SORT) + // 采用parallel for 0.092 -> 0.033s 左右 + tbb::spin_mutex spin_mtx; + tbb::parallel_for(tbb::blocked_range(0, x.size()), + [&](tbb::blocked_range r) { + T min_val{x[r.begin()]}; + for (size_t i = r.begin(); i < r.end(); ++i) + { + if (x[i] < min_val) + { + min_val = x[i]; + } + } + std::lock_guard lck(spin_mtx); + if (min_val < ret) + { + ret = min_val; + } + }); + +#elif (SPST == PARALLEL_SORT_SORT) + // 采用parallel sort 0.092 -> 4.7s 左右 + std::vector vec_temp = std::move(x); + tbb::parallel_sort(vec_temp.begin(), vec_temp.end(), std::less{}); + ret = vec_temp[0]; + +#else + // 非并行取最小值 + for (size_t i = 1; i < x.size(); i++) { + if (x[i] < ret) + ret = x[i]; + } +#endif + + TOCK(minvalue); + return ret; +} + +template +std::vector magicfilter(std::vector const &x, std::vector const &y) { + TICK(magicfilter); + std::mutex mtx; + auto mincnt = std::min(x.size(), y.size()); + + // 无法事先预计返回数据长度 + // 预分配空间反而会导致性能降低 + //std::vector res(mincnt); + std::vector res; + +#if 1 + // 优化前后对比 0.8s -> 0.56s + tbb::task_arena ta(4); + ta.execute([&] { + tbb::parallel_for(tbb::blocked_range(0, mincnt), + [&](tbb::blocked_range r) + { + std::vector temp; + for (size_t i = r.begin(); i < r.end(); ++i) + { + if (x[i] > y[i]) { + temp.push_back(x[i]); + } + else if (y[i] > 0.5f && y[i] < x[i]) { + // 主观预计(y[i] > 0.5f) 为false的概率大于(y[i] < x[i]) + // 故将其放在前面判断 + temp.push_back(y[i]); + temp.push_back(x[i] * y[i]); + } + } + std::lock_guard lck(mtx); + std::copy(temp.begin(), temp.end(), std::back_inserter(res)); + }, tbb::auto_partitioner{}); + }); + +#else + + for (size_t i = 0; i < mincnt; i++) { + if (x[i] > y[i]) { + res.push_back(x[i]); + } + else if (y[i] > x[i] && y[i] > 0.5f) { + res.push_back(y[i]); + res.push_back(x[i] * y[i]); + } + } + +#endif + + TOCK(magicfilter); + return res; +} + +int main() { + size_t n = 1<<26; + std::vector x(n); + std::vector y(n); + + fill(x, [&] (size_t i) { return std::sin(i); }); + fill(y, [&] (size_t i) { return std::cos(i); }); + + saxpy(0.5f, x, y); + + std::cout << sqrtdot(x, y) << std::endl; + std::cout << minvalue(x) << std::endl; + + auto arr = magicfilter(x, y); + std::cout << arr.size() << std::endl; + + return 0; +}