Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,23 @@
cmake_minimum_required(VERSION 3.10)

# Language standard used for every target declared below.
set(CMAKE_CXX_STANDARD 20)

# Default to an optimized build, but respect a build type passed on the
# command line (-DCMAKE_BUILD_TYPE=Debug) instead of overwriting it.
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif()

project(main LANGUAGES CXX)

add_executable(main main.cpp)

# OpenMP is required so that the `#pragma omp simd` hints in main.cpp are honored.
find_package(OpenMP REQUIRED)
target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)

# oneTBB provides parallel_for / parallel_scan / task_arena used by main.cpp.
find_package(TBB REQUIRED)
target_link_libraries(main PUBLIC TBB::tbb)

#find_package(benchmark REQUIRED)
#target_link_libraries(main PUBLIC benchmark::benchmark)

# Fast floating-point math and wide vector instructions on both toolchains.
if (MSVC)
    target_compile_options(main PUBLIC /fp:fast /arch:AVX)
else()
    target_compile_options(main PUBLIC -ffast-math -march=native)
endif()
130 changes: 101 additions & 29 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,63 +6,123 @@
#include <algorithm>
#include "ticktock.h"

#include <mutex>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>
#include <tbb/spin_mutex.h>
#include <tbb/parallel_scan.h>
#include <atomic>
#include <memory>
#include <xmmintrin.h>

// TODO: 并行化所有这些 for 循环

/// Overwrites every element of `arr` with `func(i)`, where `i` is its index.
///
/// The assignments are issued through tbb::parallel_for, so `func` may be
/// called concurrently from several threads and in any index order.
/// TICK/TOCK come from ticktock.h (presumably timing macros -- confirm there).
///
/// @param arr  Vector overwritten in place; its size is left unchanged.
/// @param func Callable invoked as `func(i)` for each i in [0, arr.size()).
/// @return A copy of `arr` after filling (the signature returns by value).
template <class T, class Func>
std::vector<T> fill(std::vector<T> &arr, Func const &func) {
    TICK(fill);

    // Each index is written by exactly one invocation, so no synchronization
    // is needed between iterations.
    tbb::parallel_for(static_cast<size_t>(0), arr.size(), [&](size_t i) {
        arr[i] = func(i);
    });

    TOCK(fill);
    return arr;
}

template <class T>
void saxpy(T a, std::vector<T> &x, std::vector<T> const &y) {
void saxpy(T a, std::vector<T> __restrict &x, std::vector<T> const __restrict &y) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

看第四课,这个没有用哦!需要取出 x.data() 存到一个 __restrict 指针才行。

TICK(saxpy);
for (size_t i = 0; i < x.size(); i++) {
x[i] = a * x[i] + y[i];
}
tbb::task_arena ta(4);
ta.execute([&] {
tbb::parallel_for(tbb::blocked_range<size_t>(0, x.size(), 4),
[&] (tbb::blocked_range<size_t> r) {
for (size_t i = r.begin(); i < r.end(); i+=r.grainsize()) {
auto ma = _mm_set_ps1(a);
auto mx = _mm_load_ps(&x[i]);
auto my = _mm_load_ps(&y[i]);
auto res = _mm_add_ps(_mm_mul_ps(ma, mx), my);
_mm_store_ps(&x[i], res);
// x[i] = a * x[i] + y[i];
}
});
});

TOCK(saxpy);
}

/// Returns sqrt(sum of x[i] * y[i]) over the first min(x.size(), y.size()) elements.
///
/// Each TBB sub-range accumulates a private partial sum and then adds it to a
/// shared atomic total with a compare-exchange loop, so the shared variable is
/// touched once per sub-range rather than once per element. Because partial
/// sums are combined in whatever order the sub-ranges finish, floating-point
/// results may differ in the last bits between runs.
///
/// @param x First operand.
/// @param y Second operand.
/// @return Square root of the dot product; sqrt(0) when either vector is empty.
template <class T>
T sqrtdot(std::vector<T> const &x, std::vector<T> const &y) {
    TICK(sqrtdot);

    size_t const n = std::min(x.size(), y.size());
    // Accumulate in T (not a hard-coded float) so the template stays generic.
    std::atomic<T> total{T(0)};

    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
    [&](tbb::blocked_range<size_t> const &r) {
        T partial = 0;
        // The reduction clause is required: without it the SIMD lanes would
        // all update `partial` as a loop-carried dependency.
        #pragma omp simd reduction(+:partial)
        for (size_t i = r.begin(); i < r.end(); i++) {
            partial += x[i] * y[i];
        }

        // On failure `seen` is refreshed with the current total, so the sum
        // is recomputed from up-to-date data on every retry.
        T seen = total.load();
        while (!total.compare_exchange_weak(seen, seen + partial)) {
        }
    });

    T const ret = std::sqrt(total.load());
    TOCK(sqrtdot);
    return ret;
}

/// Returns the smallest element of `x`.
///
/// Each TBB sub-range finds its own minimum and then lowers a shared atomic
/// minimum with a compare-exchange loop. The loop gives up as soon as the
/// shared value is already less than or equal to the local one.
///
/// @param x Input vector. Must be non-empty: `x[0]` is read unconditionally
///          to seed the result.
/// @return The minimum value found in `x`.
template <class T>
T minvalue(std::vector<T> const &x) {
    TICK(minvalue);

    std::atomic<T> best{x[0]};

    tbb::parallel_for(tbb::blocked_range<size_t>(0, x.size()),
    [&](tbb::blocked_range<size_t> const &r) {
        T local_min = x[r.begin()];
        for (size_t i = r.begin() + 1; i < r.end(); i++) {
            if (x[i] < local_min) {
                local_min = x[i];
            }
        }

        // A failed exchange reloads `seen`, so the comparison is repeated
        // against the latest shared minimum before trying again.
        T seen = best.load();
        while (local_min < seen && !best.compare_exchange_weak(seen, local_min)) {
        }
    }, tbb::auto_partitioner{});

    T const ret = best.load();

    TOCK(minvalue);
    return ret;
}

/// Builds a filtered vector from the element-wise comparison of `x` and `y`.
///
/// For each index i below min(x.size(), y.size()):
///   - if x[i] > y[i], x[i] is emitted;
///   - otherwise, if y[i] > x[i] and y[i] > 0.5f, y[i] and then x[i] * y[i]
///     are emitted;
///   - otherwise nothing is emitted.
///
/// Every TBB sub-range collects its output in a thread-local buffer and
/// appends it to the result under a mutex, so the lock is taken once per
/// sub-range. Values from one sub-range stay contiguous and in index order,
/// but sub-ranges are appended in completion order, so the overall ordering
/// can differ from a serial pass and between runs.
///
/// @param x First input vector.
/// @param y Second input vector.
/// @return The emitted values (at most two per index).
template <class T>
std::vector<T> magicfilter(std::vector<T> const &x, std::vector<T> const &y) {
    TICK(magicfilter);

    size_t const n = std::min(x.size(), y.size());
    std::mutex mtx;

    std::vector<T> res;
    // An index contributes at most two values, so 2 * n is the upper bound.
    res.reserve(n * 2);

    tbb::parallel_for(tbb::blocked_range<size_t>(0, n),
    [&](tbb::blocked_range<size_t> const &r) {
        // Reused across sub-ranges handled by the same thread; clear() keeps
        // the capacity, which avoids reallocating for every sub-range.
        static thread_local std::vector<T> local;
        local.clear();
        local.reserve(r.size() * 2);

        for (size_t i = r.begin(); i < r.end(); i++) {
            if (x[i] > y[i]) {
                local.push_back(x[i]);
            } else if (y[i] > x[i] && y[i] > 0.5f) {
                local.push_back(y[i]);
                local.push_back(x[i] * y[i]);
            }
        }

        std::lock_guard<std::mutex> lck(mtx);
        res.insert(res.end(), local.begin(), local.end());
    });

    TOCK(magicfilter);
    return res;
}
Expand All @@ -71,10 +131,22 @@ template <class T>
/// Replaces `x` with its inclusive prefix sum and returns the grand total.
///
/// After the call x[i] holds the sum of the original x[0..i]. The work is done
/// by tbb::parallel_scan, which may run the body over a sub-range twice: once
/// as a pre-scan that only computes the sub-range sum, and once as the final
/// scan that receives the correct running sum and is the only pass allowed to
/// write into `x`.
///
/// @param x Input/output vector, overwritten in place.
/// @return Sum of all original elements; T(0) for an empty vector.
T scanner(std::vector<T> &x) {
    TICK(scanner);

    T const ret = tbb::parallel_scan(
        tbb::blocked_range<size_t>(0, x.size()), T(0),
        [&](tbb::blocked_range<size_t> const &r, T sum, bool is_final_scan) -> T {
            T running = sum;
            for (size_t i = r.begin(); i < r.end(); i++) {
                running += x[i];
                if (is_final_scan) {
                    x[i] = running;
                }
            }
            return running;
        },
        // Combines the sums of two adjacent sub-ranges.
        [](T left, T right) {
            return left + right;
        });

    TOCK(scanner);
    return ret;
}
Expand All @@ -99,4 +171,4 @@ int main() {
std::cout << std::reduce(x.begin(), x.end()) << std::endl;

return 0;
}
}