-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmatrix_operations.cpp
More file actions
109 lines (90 loc) · 2.87 KB
/
matrix_operations.cpp
File metadata and controls
109 lines (90 loc) · 2.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include "matrix_operations.h"
#include <iostream>
#include <random>
#include <chrono>
#include <stdexcept>
#ifdef __x86_64__
#include <immintrin.h>
#define USE_X86_SIMD 1
#else
#define USE_X86_SIMD 0
#endif
// Constructs an r-by-c matrix whose elements are all 0.0.
//   r: number of rows
//   c: number of columns
Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
    // A single zero-filled row acts as the prototype copied into every slot.
    const std::vector<double> zero_row(cols, 0.0);
    data.resize(rows, zero_row);
}
// Overwrites every element with a value drawn from a uniform real
// distribution constructed with bounds 0.0 and 10.0. A fresh std::mt19937
// engine is seeded from std::random_device on each call, so successive calls
// are not reproducible.
void Matrix::randomize() {
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_real_distribution<> uniform(0.0, 10.0);

    // Fill row by row, left to right, one draw per element.
    for (size_t r = 0; r != rows; ++r) {
        auto& row = data[r];
        for (size_t c = 0; c != cols; ++c) {
            row[c] = uniform(engine);
        }
    }
}
// Returns the matrix product (*this) * other as a new matrix with `rows` rows
// and `other.cols` columns.
//
// Throws std::runtime_error when this matrix's column count differs from
// other's row count.
//
// When USE_X86_SIMD is non-zero, each dot product is accumulated two terms at
// a time with SSE2 intrinsics; otherwise a scalar triple loop is used. The two
// paths add the terms in a different order, so their results may differ in
// the last bits.
Matrix Matrix::multiply(const Matrix& other) const {
    if (cols != other.rows) {
        throw std::runtime_error("Invalid matrix dimensions for multiplication");
    }

    Matrix result(rows, other.cols);

#if USE_X86_SIMD
    // SSE2 path: one output element per (row, col) pair.
    for (size_t row = 0; row < rows; ++row) {
        for (size_t col = 0; col < other.cols; ++col) {
            // Lane 0 collects the even-k products, lane 1 the odd-k products.
            __m128d acc = _mm_setzero_pd();
            size_t k = 0;

            while (k + 1 < cols) {
                // Two consecutive entries of this row (unaligned load).
                const __m128d lhs = _mm_loadu_pd(&data[row][k]);
                // Matching entries down other's column; _mm_set_pd takes the
                // high lane first, so the arguments are given in reverse.
                const __m128d rhs = _mm_set_pd(other.data[k + 1][col], other.data[k][col]);
                acc = _mm_add_pd(acc, _mm_mul_pd(lhs, rhs));
                k += 2;
            }

            // Collapse the two lanes into one scalar.
            double lanes[2];
            _mm_storeu_pd(lanes, acc);
            double dot = lanes[0] + lanes[1];

            // An odd inner dimension leaves one trailing term.
            if (k < cols) {
                dot += data[row][k] * other.data[k][col];
            }

            result.data[row][col] = dot;
        }
    }
#else
    // Portable scalar path.
    for (size_t row = 0; row < rows; ++row) {
        for (size_t col = 0; col < other.cols; ++col) {
            double dot = 0.0;
            for (size_t k = 0; k < cols; ++k) {
                dot += data[row][k] * other.data[k][col];
            }
            result.data[row][col] = dot;
        }
    }
#endif

    return result;
}
// Returns the sum of all elements, accumulated row by row, left to right.
// Yields 0.0 when the matrix has no rows or no columns.
double Matrix::sum() const {
    double acc = 0.0;
    for (size_t r = 0; r != rows; ++r) {
        const auto& row = data[r];
        for (size_t c = 0; c != cols; ++c) {
            acc += row[c];
        }
    }
    return acc;
}
void benchmark_matrix_ops() {
std::cout << "\n=== Matrix Multiplication Benchmark ===" << std::endl;
const size_t size = 200;
Matrix a(size, size);
Matrix b(size, size);
a.randomize();
b.randomize();
auto start = std::chrono::high_resolution_clock::now();
Matrix c = a.multiply(b);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Matrix size: " << size << "x" << size << std::endl;
std::cout << "Time: " << duration.count() << " ms" << std::endl;
std::cout << "Result sum: " << c.sum() << std::endl;
}