-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsse2.cpp
More file actions
137 lines (137 loc) · 4.09 KB
/
Copy pathsse2.cpp
File metadata and controls
137 lines (137 loc) · 4.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#ifndef JUST32BIT
#include "lifealgo.h"
#include "util.h"
#include <algorithm>
#include <cstdlib>
#include <x86intrin.h>
using namespace std ;
typedef __m128i lifeword ;
const int lifewordwidth = 128 ;
const int shift = 7 ;
class sse2algo : public lifealgo {
public:
virtual void init(int w, int h) ;
virtual void setcell(int x, int y) ;
virtual int getpopulation() ;
virtual int nextstep(int, int, int) ;
virtual void swap() ;
int w, h, wordwidth ;
long long wh ;
lifeword *u0, *u1 ;
} ;
static class sse2algofactory : public lifealgofactory {
public:
sse2algofactory() ;
virtual lifealgo *createInstance() {
return new sse2algo() ;
}
} factory ;
sse2algofactory::sse2algofactory() {
registerAlgo("sse2", &factory) ;
}
void sse2algo::init(int w_, int h_) {
w = w_ ;
h = h_ ;
wordwidth = (w + lifewordwidth-1) >> shift ;
wh = wordwidth * h ;
u0 = (lifeword *)calloc(wordwidth*sizeof(lifeword), h+1) ;
u1 = (lifeword *)calloc(wordwidth*sizeof(lifeword), h+1) ;
}
void sse2algo::setcell(int x, int y) {
static lifeword bigbig = _mm_set_epi64x(1, 0) ;
static lifeword bigone = _mm_cvtsi64_si128(1) ;
int xx = x & (lifewordwidth - 1) ;
if (xx < 64) {
u0[(x>>shift)*h+y] |= bigone << xx ;
} else {
u0[(x>>shift)*h+y] |= bigbig << (xx - 64) ;
}
}
static inline int popcount128(__m128i n) {
const __m128i n_hi = _mm_unpackhi_epi64(n, n);
return __builtin_popcountll(_mm_cvtsi128_si64(n)) +
__builtin_popcountll(_mm_cvtsi128_si64(n_hi));
}
int sse2algo::getpopulation() {
int r = 0 ;
for (int i=0; i<wh; i++)
r += popcount128(u0[i]) ;
return r ;
}
static inline void add2(lifeword a, lifeword b,
lifeword &c0, lifeword &c1) {
c0 = a ^ b ;
c1 = a & b ;
}
static inline void add3(lifeword a, lifeword b, lifeword c,
lifeword &c0, lifeword &c1) {
lifeword t0, t1, t2 ;
add2(a, b, t0, t1) ;
add2(t0, c, c0, t2) ;
c1 = t2 | t1 ;
}
static lifeword leftshift(lifeword v) {
return _mm_slli_epi64(v, 1) | _mm_slli_si128(_mm_srli_epi64(v, 63), 8) ;
}
static lifeword rightshift(lifeword v) {
return _mm_srli_epi64(v, 1) | _mm_srli_si128(_mm_slli_epi64(v, 63), 8) ;
}
static lifeword bigleftshift(lifeword v) {
return _mm_slli_epi64(_mm_slli_si128(v, 8), 63) ;
}
static lifeword bigrightshift(lifeword v) {
return _mm_srli_epi64(_mm_srli_si128(v, 8), 63) ;
}
void sse2algo::swap() { ::swap(u0, u1) ; }
int sse2algo::nextstep(int id, int n, int needpop) {
int r = 0 ;
int loi = id * wordwidth / n ;
int hii = (id + 1) * wordwidth / n ;
for (int i=loi; i<hii; i++) {
lifeword w300 = _mm_cvtsi64_si128(0) ;
lifeword w301 = _mm_cvtsi64_si128(0) ;
lifeword *col = u0 + i * h + 1 ;
lifeword *pcol = u0 + (i-1) * h + 1 ;
lifeword *ncol = u0 + (i+1) * h + 1 ;
lifeword *wcol = u1 + i * h + 1 ;
lifeword w1 = *col ;
lifeword w1l = leftshift(w1) ;
lifeword w1r = rightshift(w1) ;
if (i > 0)
w1l += bigrightshift(*pcol) ;
if (i+1 < wordwidth)
w1r += bigleftshift(*ncol) ;
lifeword w210, w211, w310, w311 ;
add2(w1l, w1r, w210, w211) ;
add2(w1, w210, w310, w311) ;
w311 |= w211 ;
for (int j=1; j+1<h; j++, col++, pcol++, ncol++, wcol++) {
lifeword w2 = col[1] ;
lifeword w2l = leftshift(w2) ;
lifeword w2r = rightshift(w2) ;
if (i > 0)
w2l |= bigrightshift(pcol[1]) ;
if (i+1 < wordwidth)
w2r |= bigleftshift(ncol[1]) ;
lifeword w220, w221, w320, w321, a0, a1 ;
add2(w2l, w2r, w220, w221) ;
add2(w2, w220, w320, w321) ;
w321 |= w221 ;
add3(w300, w210, w320, a0, a1) ;
lifeword ng1 = (a1 ^ w301 ^ w211 ^ w321) &
((a1 | w301) ^ (w211 | w321)) & (a0 | w1) ;
wcol[0] = ng1 ;
if (needpop)
r += popcount128(ng1) ;
w300 = w310 ;
w301 = w311 ;
w310 = w320 ;
w311 = w321 ;
w210 = w220 ;
w211 = w221 ;
w1 = w2 ;
}
}
return r ;
}
#endif