-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimd_add.c
More file actions
107 lines (89 loc) · 2.81 KB
/
Copy pathsimd_add.c
File metadata and controls
107 lines (89 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
* simd_add.c
* Simple demonstration of SIMD vectorization: vector addition using Intel SSE2 C intrinsics.
* The program times the addition of two vectors with and without SIMD vectorization.
*
* Chirag Dhamange
*
* Updated: 12 December, 2024
*/
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include "timer.h"
/******************************************************
* Macros
*
* CLS is not defined in this file and must come from outside.
* main prints it as the cache line size.
* NOTE(review): presumably supplied in bytes via -DCLS=... or by
* timer.h -- confirm against the build.
*******************************************************/
// Number of elements in each of the arrays a, b and res
#define N (10000000U)
// Number of unsigned int elements that fit in CLS bytes;
// main uses it as the step of the outer (blocking) SIMD loop
#define SM (CLS / sizeof (unsigned int))
// max vector register size = 128 bits
// NOTE(review): this divides a size in bits by a size in bytes, so STRIDE
// evaluates to 32 when unsigned int is 4 bytes, not the 4 elements that a
// 128-bit register holds. The intended value looks like
// (16 / sizeof(unsigned int)) -- confirm, and review the inner loop in main
// together with any change here, since it advances by STRIDE.
#define STRIDE (128 / sizeof(unsigned int))
/******************************************************
* Declare the input and output arrays
*
* All three are one-dimensional arrays of N unsigned ints with
* file scope, each aligned to a CLS-byte boundary.
******************************************************/
// Output: main stores the element-wise sum a[i] + b[i] here
unsigned int res[N] __attribute__ ((aligned (CLS)));
// First input operand
unsigned int a[N] __attribute__ ((aligned (CLS)));
// Second input operand
unsigned int b[N] __attribute__ ((aligned (CLS)));
/*
 * Times the element-wise addition of the global arrays a and b into res,
 * first with a plain scalar loop and then with SSE2 intrinsics, and finally
 * verifies the SIMD result.
 *
 * Returns 0 when every element of res holds the expected sum (2), and -1
 * when a wrong value is found or when a cache line (CLS bytes) cannot be
 * split into whole 128-bit vectors.
 */
int main (void)
{
    /* unsigned int lanes in one 128-bit SSE register (4 when unsigned int is 32 bits) */
    const unsigned int lanes = (unsigned int)(sizeof(__m128i) / sizeof(unsigned int));
    /* elements per cache line; the SIMD loop is blocked by this amount */
    const unsigned int block = (unsigned int)SM;
    double start, end;

    /* SM has type size_t, so it must be printed with %zu */
    printf("Cache Line size:%d\n%zu\n", (int)CLS, (size_t)SM);

    /*
     * The aligned loads/stores below need every vector to start on a
     * 16-byte boundary, and the blocked loop needs a whole number of
     * vectors per cache line. Both hold exactly when a cache line is a
     * non-zero multiple of the vector size.
     */
    if (block == 0 || block % lanes != 0)
    {
        fprintf(stderr, "CLS must be a non-zero multiple of %zu bytes\n",
                sizeof(__m128i));
        return -1;
    }

    GET_TIME(start);
    for (unsigned int k = 0; k < N; k++)
    {
        a[k] = 1;
    }
    GET_TIME(end);
    printf("Intialized array A\nTime taken = %15.14f seconds\n",
           (end - start));

    GET_TIME(start);
    for (unsigned int k = 0; k < N; k++)
    {
        b[k] = 1;
    }
    GET_TIME(end);
    printf("\nIntialized array B\nTime taken = %15.14f seconds\n",
           (end - start));

    // Vector addition without any SIMD optimization
    GET_TIME(start);
    for (unsigned int k = 0; k < N; k++)
    {
        res[k] = a[k] + b[k];
    }
    GET_TIME(end);
    printf("\nNo Optimization\nArray A + Array B\nTime taken = %15.14f seconds\n",
           (end - start));

    // Clear the result so the check at the end only sees what the SIMD pass wrote
    for (unsigned int k = 0; k < N; k++)
    {
        res[k] = 0;
    }

    // Vector addition with SIMD optimization:
    // walk the arrays one cache line (block elements) at a time and add
    // `lanes` elements per instruction inside each cache line
    unsigned int i;
    GET_TIME(start);
    for (i = 0; N - i >= block; i += block)
    {
        // Hint the next cache line of a into the cache. A prefetch is only a
        // hint and does not fault, so the one-past-the-end address produced
        // on the last block is harmless.
        _mm_prefetch((const char *)&a[i + block], _MM_HINT_NTA);

        for (unsigned int j = 0; j < block; j += lanes)
        {
            // Load 4 consecutive 32-bit integers from each array, in memory
            // order, into 128-bit vector registers
            const __m128i va = _mm_load_si128((const __m128i *)&a[i + j]);
            const __m128i vb = _mm_load_si128((const __m128i *)&b[i + j]);

            // add lane-wise and store the 4 sums at the matching position
            _mm_store_si128((__m128i *)&res[i + j], _mm_add_epi32(va, vb));
        }
    }
    // Scalar tail for the elements left over when N is not a multiple of block
    for (; i < N; i++)
    {
        res[i] = a[i] + b[i];
    }
    GET_TIME(end);
    printf("\nSIMD Vectorization\nArray A + Array B\nTime taken = %15.14f seconds\n",
           (end - start));

    // Verify that every value produced by the SIMD pass is as expected
    // (1 + 1 == 2 for each element)
    for (unsigned int k = 0; k < N; k++)
    {
        if (res[k] != 2)
        {
            printf("False\n");
            return -1;
        }
    }
    printf("All values correct\n");
    return 0;
}