From dc44641dcbc8a99ef9ec049f4fd9708e05576728 Mon Sep 17 00:00:00 2001 From: Mwsxy Date: Sun, 23 Jan 2022 15:33:25 +0800 Subject: [PATCH] homework --- CMakeLists.txt | 12 +++-- main.cpp | 130 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 110 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d6a8f8..237d7bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,17 +1,23 @@ cmake_minimum_required(VERSION 3.10) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_BUILD_TYPE Release) project(main LANGUAGES CXX) add_executable(main main.cpp) -#find_package(OpenMP REQUIRED) -#target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) +find_package(OpenMP REQUIRED) +target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) find_package(TBB REQUIRED) target_link_libraries(main PUBLIC TBB::tbb) #find_package(benchmark REQUIRED) #target_link_libraries(main PUBLIC benchmark::benchmark) + +if (MSVC) + target_compile_options(main PUBLIC /fp:fast /arch:AVX) +else() + target_compile_options(main PUBLIC -ffast-math -march=native) +endif() \ No newline at end of file diff --git a/main.cpp b/main.cpp index a1d2625..98f9559 100644 --- a/main.cpp +++ b/main.cpp @@ -6,47 +6,93 @@ #include #include "ticktock.h" +#include +#include +#include +#include +#include +#include +#include +#include + // TODO: 并行化所有这些 for 循环 template std::vector fill(std::vector &arr, Func const &func) { TICK(fill); - for (size_t i = 0; i < arr.size(); i++) { + + tbb::parallel_for((size_t)0, arr.size(), + [&] (size_t i) { arr[i] = func(i); - } + }); + TOCK(fill); return arr; } template -void saxpy(T a, std::vector &x, std::vector const &y) { +void saxpy(T a, std::vector __restrict &x, std::vector const __restrict &y) { TICK(saxpy); - for (size_t i = 0; i < x.size(); i++) { - x[i] = a * x[i] + y[i]; - } + tbb::task_arena ta(4); + ta.execute([&] { + tbb::parallel_for(tbb::blocked_range(0, x.size(), 4), + [&] (tbb::blocked_range r) { + for (size_t i = r.begin(); i < r.end(); i+=r.grainsize()) { + auto ma = _mm_set_ps1(a); + auto mx = _mm_load_ps(&x[i]); + auto my = _mm_load_ps(&y[i]); + auto res = _mm_add_ps(_mm_mul_ps(ma, mx), my); + _mm_store_ps(&x[i], res); + // x[i] = a * x[i] + y[i]; + } + }); + }); + TOCK(saxpy); } template T sqrtdot(std::vector const &x, std::vector const &y) { TICK(sqrtdot); - T ret = 0; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - ret += x[i] * y[i]; - } - ret = std::sqrt(ret); + + size_t n = std::min(x.size(), y.size()); + std::atomic aret = ATOMIC_VAR_INIT(0); + + tbb::parallel_for(tbb::blocked_range(0, n), + [&](tbb::blocked_range r) { + T local_xmy = 0; +#pragma omp simd + for (size_t i = r.begin(); i < r.end(); i++) { + local_xmy += x[i]*y[i]; + } + T val = aret.load(); + while (!aret.compare_exchange_strong(val, val+local_xmy)); + }); + + T aaret = std::sqrt(aret.load()); TOCK(sqrtdot); - return ret; + return aaret; } template T minvalue(std::vector const &x) { TICK(minvalue); - T ret = x[0]; - for (size_t i = 1; i < x.size(); i++) { - if (x[i] < ret) - ret = x[i]; - } + std::atomic atm = ATOMIC_VAR_INIT(x[0]); + + tbb::parallel_for(tbb::blocked_range(0, x.size()), + [&] (tbb::blocked_range r) { + T local_min_value = x[r.begin()]; + for (size_t i = r.begin() + 1; i < r.end(); i++) { + if (x[i] < local_min_value) + local_min_value = x[i]; + } + + T old = atm.load(); + while (local_min_value < old && !atm.compare_exchange_weak(old, local_min_value)); + }, tbb::auto_partitioner{}); + + T ret = atm.load(); + TOCK(minvalue); return ret; } @@ -54,15 +100,29 @@ T minvalue(std::vector const &x) { template std::vector magicfilter(std::vector const &x, std::vector const &y) { TICK(magicfilter); + + size_t n = std::min(x.size(), y.size()); + std::mutex mtx; + std::vector res; - for (size_t i = 0; i < std::min(x.size(), y.size()); i++) { - if (x[i] > y[i]) { - res.push_back(x[i]); - } else if (y[i] > x[i] && y[i] > 0.5f) { - res.push_back(y[i]); - res.push_back(x[i] * y[i]); + res.reserve(n*3); + tbb::parallel_for(tbb::blocked_range(0, n), + [&](tbb::blocked_range r) { + static thread_local std::vector local; + local.clear(); + local.reserve(r.size()*2); + for (size_t i = r.begin(); i < r.end(); i++) { + if (x[i] > y[i]) { + local.push_back(x[i]); + } + else if (y[i] > 0.5f && y[i] > x[i]) { + local.push_back(y[i]); + local.push_back(x[i]*y[i]); + } } - } + std::lock_guard lck(mtx); + std::copy(local.begin(), local.end(), std::back_inserter(res)); + }); TOCK(magicfilter); return res; } @@ -71,10 +131,22 @@ template T scanner(std::vector &x) { TICK(scanner); T ret = 0; - for (size_t i = 0; i < x.size(); i++) { - ret += x[i]; - x[i] = ret; - } + + ret = tbb::parallel_scan(tbb::blocked_range(0, x.size()), (float)0, + [&] (tbb::blocked_range r, T sum, auto is_final_scan)->T { + T temp = sum; + for (size_t i = r.begin(); i < r.end(); i++) { + temp += x[i]; + if (is_final_scan) { + x[i] = temp; + } + } + return temp; + }, + [] (T left, T right) { + return left + right; + }); + TOCK(scanner); return ret; } @@ -99,4 +171,4 @@ int main() { std::cout << std::reduce(x.begin(), x.end()) << std::endl; return 0; -} +} \ No newline at end of file