Skip to content

Commit 68d0328

Browse files
author
peng.li24
committed
refactor: condense SIMD and reduction code (-215 lines)
- core.h: replace 18 element-wise function bodies with NUMPY_UNROLL4 macro - core.h: simplify dot/norm_sq stack allocation with ternary - einsum.h: simplify fast-path stack/heap with ternary + single D8 define - einsum.h: simplify scalar path — use memset instead of partial axis tracking - einsum.h: deduplicate stride/coord pattern into single ternary assignments - All 336 tests pass, bit-exact alignment preserved
1 parent 5eb0daa commit 68d0328

2 files changed

Lines changed: 118 additions & 333 deletions

File tree

numpy/core.h

Lines changed: 51 additions & 196 deletions
Original file line numberDiff line numberDiff line change
@@ -53,272 +53,144 @@ inline void full_like(T* dst, size_t n, T fill_value) {
5353

5454
// ============================================================================
5555
// Element-wise math — T in → T out
56-
// All unrolled 4x to reduce loop overhead on small arrays.
57-
// ============================================================================
56+
//
57+
// NUMPY_UNROLL4 macro: 4x loop unrolling to reduce branch overhead.
58+
// All calls are inlined → identical codegen to hand-written unrolled loops.
59+
// ============================================================================
60+
61+
#define NUMPY_UNROLL4(dst_i, body) \
62+
do { size_t _i = 0; \
63+
for (; _i + 3 < n; _i += 4) { \
64+
size_t dst_i = _i + 0; body; \
65+
dst_i = _i + 1; body; \
66+
dst_i = _i + 2; body; \
67+
dst_i = _i + 3; body; \
68+
} \
69+
for (; _i < n; ++_i) { \
70+
size_t dst_i = _i; body; \
71+
} \
72+
} while(0)
5873

5974
/// numpy.sqrt(x, /, out=None, *, where=True, ...)
6075
template<typename T>
6176
inline void sqrt(const T* src, T* dst, size_t n) {
62-
size_t i = 0;
63-
for (; i + 3 < n; i += 4) {
64-
dst[i+0] = std::sqrt(src[i+0]);
65-
dst[i+1] = std::sqrt(src[i+1]);
66-
dst[i+2] = std::sqrt(src[i+2]);
67-
dst[i+3] = std::sqrt(src[i+3]);
68-
}
69-
for (; i < n; ++i) dst[i] = std::sqrt(src[i]);
77+
NUMPY_UNROLL4(i, dst[i] = std::sqrt(src[i]));
7078
}
7179

7280
/// numpy.abs(x, /, out=None, *, where=True, ...)
7381
template<typename T>
7482
inline void abs(const T* src, T* dst, size_t n) {
75-
size_t i = 0;
76-
for (; i + 3 < n; i += 4) {
77-
dst[i+0] = std::abs(src[i+0]);
78-
dst[i+1] = std::abs(src[i+1]);
79-
dst[i+2] = std::abs(src[i+2]);
80-
dst[i+3] = std::abs(src[i+3]);
81-
}
82-
for (; i < n; ++i) dst[i] = std::abs(src[i]);
83+
NUMPY_UNROLL4(i, dst[i] = std::abs(src[i]));
8384
}
8485

8586
/// numpy.exp(x, /, out=None, *, where=True, ...)
8687
template<typename T>
8788
inline void exp(const T* src, T* dst, size_t n) {
88-
size_t i = 0;
89-
for (; i + 3 < n; i += 4) {
90-
dst[i+0] = svml::exp(src[i+0]);
91-
dst[i+1] = svml::exp(src[i+1]);
92-
dst[i+2] = svml::exp(src[i+2]);
93-
dst[i+3] = svml::exp(src[i+3]);
94-
}
95-
for (; i < n; ++i) dst[i] = svml::exp(src[i]);
89+
NUMPY_UNROLL4(i, dst[i] = svml::exp(src[i]));
9690
}
9791

9892
/// numpy.log(x, /, out=None, *, where=True, ...)
9993
template<typename T>
10094
inline void log(const T* src, T* dst, size_t n) {
101-
size_t i = 0;
102-
for (; i + 3 < n; i += 4) {
103-
dst[i+0] = svml::log(src[i+0]);
104-
dst[i+1] = svml::log(src[i+1]);
105-
dst[i+2] = svml::log(src[i+2]);
106-
dst[i+3] = svml::log(src[i+3]);
107-
}
108-
for (; i < n; ++i) dst[i] = svml::log(src[i]);
95+
NUMPY_UNROLL4(i, dst[i] = svml::log(src[i]));
10996
}
11097

11198
/// numpy.sin(x, /, out=None, *, where=True, ...)
11299
template<typename T>
113100
inline void sin(const T* src, T* dst, size_t n) {
114-
size_t i = 0;
115-
for (; i + 3 < n; i += 4) {
116-
dst[i+0] = svml::sin(src[i+0]);
117-
dst[i+1] = svml::sin(src[i+1]);
118-
dst[i+2] = svml::sin(src[i+2]);
119-
dst[i+3] = svml::sin(src[i+3]);
120-
}
121-
for (; i < n; ++i) dst[i] = svml::sin(src[i]);
101+
NUMPY_UNROLL4(i, dst[i] = svml::sin(src[i]));
122102
}
123103

124104
/// numpy.cos(x, /, out=None, *, where=True, ...)
125105
template<typename T>
126106
inline void cos(const T* src, T* dst, size_t n) {
127-
size_t i = 0;
128-
for (; i + 3 < n; i += 4) {
129-
dst[i+0] = svml::cos(src[i+0]);
130-
dst[i+1] = svml::cos(src[i+1]);
131-
dst[i+2] = svml::cos(src[i+2]);
132-
dst[i+3] = svml::cos(src[i+3]);
133-
}
134-
for (; i < n; ++i) dst[i] = svml::cos(src[i]);
107+
NUMPY_UNROLL4(i, dst[i] = svml::cos(src[i]));
135108
}
136109

137110
/// numpy.tan(x, /, out=None, *, where=True, ...)
138111
template<typename T>
139112
inline void tan(const T* src, T* dst, size_t n) {
140-
size_t i = 0;
141-
for (; i + 3 < n; i += 4) {
142-
dst[i+0] = svml::tan(src[i+0]);
143-
dst[i+1] = svml::tan(src[i+1]);
144-
dst[i+2] = svml::tan(src[i+2]);
145-
dst[i+3] = svml::tan(src[i+3]);
146-
}
147-
for (; i < n; ++i) dst[i] = svml::tan(src[i]);
113+
NUMPY_UNROLL4(i, dst[i] = svml::tan(src[i]));
148114
}
149115

150116
/// numpy.power(x1, x2, /, out=None, *, where=True, ...)
151117
template<typename T>
152118
inline void power(const T* src, T* dst, size_t n, T exponent) {
153-
size_t i = 0;
154-
for (; i + 3 < n; i += 4) {
155-
dst[i+0] = svml::pow(src[i+0], exponent);
156-
dst[i+1] = svml::pow(src[i+1], exponent);
157-
dst[i+2] = svml::pow(src[i+2], exponent);
158-
dst[i+3] = svml::pow(src[i+3], exponent);
159-
}
160-
for (; i < n; ++i) dst[i] = svml::pow(src[i], exponent);
119+
NUMPY_UNROLL4(i, dst[i] = svml::pow(src[i], exponent));
161120
}
162121

163122
/// numpy.clip(a, a_min, a_max, out=None, **kwargs)
164123
template<typename T>
165124
inline void clip(const T* src, T* dst, size_t n, T min_val, T max_val) {
166-
size_t i = 0;
167-
for (; i + 3 < n; i += 4) {
168-
dst[i+0] = std::max(min_val, std::min(max_val, src[i+0]));
169-
dst[i+1] = std::max(min_val, std::min(max_val, src[i+1]));
170-
dst[i+2] = std::max(min_val, std::min(max_val, src[i+2]));
171-
dst[i+3] = std::max(min_val, std::min(max_val, src[i+3]));
172-
}
173-
for (; i < n; ++i)
174-
dst[i] = std::max(min_val, std::min(max_val, src[i]));
125+
NUMPY_UNROLL4(i, dst[i] = std::max(min_val, std::min(max_val, src[i])));
175126
}
176127

177128
/// numpy.log10(x, /, out=None, *, where=True, ...)
178129
template<typename T>
179130
inline void log10(const T* src, T* dst, size_t n) {
180-
size_t i = 0;
181-
for (; i + 3 < n; i += 4) {
182-
dst[i+0] = svml::log10(src[i+0]);
183-
dst[i+1] = svml::log10(src[i+1]);
184-
dst[i+2] = svml::log10(src[i+2]);
185-
dst[i+3] = svml::log10(src[i+3]);
186-
}
187-
for (; i < n; ++i) dst[i] = svml::log10(src[i]);
131+
NUMPY_UNROLL4(i, dst[i] = svml::log10(src[i]));
188132
}
189133

190134
/// numpy.log2(x, /, out=None, *, where=True, ...)
191135
template<typename T>
192136
inline void log2(const T* src, T* dst, size_t n) {
193-
size_t i = 0;
194-
for (; i + 3 < n; i += 4) {
195-
dst[i+0] = svml::log2(src[i+0]);
196-
dst[i+1] = svml::log2(src[i+1]);
197-
dst[i+2] = svml::log2(src[i+2]);
198-
dst[i+3] = svml::log2(src[i+3]);
199-
}
200-
for (; i < n; ++i) dst[i] = svml::log2(src[i]);
137+
NUMPY_UNROLL4(i, dst[i] = svml::log2(src[i]));
201138
}
202139

203140
/// numpy.arcsin(x, /, out=None, *, where=True, ...)
204141
template<typename T>
205142
inline void arcsin(const T* src, T* dst, size_t n) {
206-
size_t i = 0;
207-
for (; i + 3 < n; i += 4) {
208-
dst[i+0] = svml::asin(src[i+0]);
209-
dst[i+1] = svml::asin(src[i+1]);
210-
dst[i+2] = svml::asin(src[i+2]);
211-
dst[i+3] = svml::asin(src[i+3]);
212-
}
213-
for (; i < n; ++i) dst[i] = svml::asin(src[i]);
143+
NUMPY_UNROLL4(i, dst[i] = svml::asin(src[i]));
214144
}
215145

216146
/// numpy.arccos(x, /, out=None, *, where=True, ...)
217147
template<typename T>
218148
inline void arccos(const T* src, T* dst, size_t n) {
219-
size_t i = 0;
220-
for (; i + 3 < n; i += 4) {
221-
dst[i+0] = svml::acos(src[i+0]);
222-
dst[i+1] = svml::acos(src[i+1]);
223-
dst[i+2] = svml::acos(src[i+2]);
224-
dst[i+3] = svml::acos(src[i+3]);
225-
}
226-
for (; i < n; ++i) dst[i] = svml::acos(src[i]);
149+
NUMPY_UNROLL4(i, dst[i] = svml::acos(src[i]));
227150
}
228151

229152
/// numpy.arctan(x, /, out=None, *, where=True, ...)
230153
template<typename T>
231154
inline void arctan(const T* src, T* dst, size_t n) {
232-
size_t i = 0;
233-
for (; i + 3 < n; i += 4) {
234-
dst[i+0] = svml::atan(src[i+0]);
235-
dst[i+1] = svml::atan(src[i+1]);
236-
dst[i+2] = svml::atan(src[i+2]);
237-
dst[i+3] = svml::atan(src[i+3]);
238-
}
239-
for (; i < n; ++i) dst[i] = svml::atan(src[i]);
155+
NUMPY_UNROLL4(i, dst[i] = svml::atan(src[i]));
240156
}
241157

242158
/// numpy.round(a, decimals=0, out=None)
243159
template<typename T>
244160
inline void round(const T* src, T* dst, size_t n) {
245-
size_t i = 0;
246-
for (; i + 3 < n; i += 4) {
247-
dst[i+0] = std::round(src[i+0]);
248-
dst[i+1] = std::round(src[i+1]);
249-
dst[i+2] = std::round(src[i+2]);
250-
dst[i+3] = std::round(src[i+3]);
251-
}
252-
for (; i < n; ++i) dst[i] = std::round(src[i]);
161+
NUMPY_UNROLL4(i, dst[i] = std::round(src[i]));
253162
}
254163

255164
/// numpy.floor(x, /, out=None, *, where=True, ...)
256165
template<typename T>
257166
inline void floor(const T* src, T* dst, size_t n) {
258-
size_t i = 0;
259-
for (; i + 3 < n; i += 4) {
260-
dst[i+0] = std::floor(src[i+0]);
261-
dst[i+1] = std::floor(src[i+1]);
262-
dst[i+2] = std::floor(src[i+2]);
263-
dst[i+3] = std::floor(src[i+3]);
264-
}
265-
for (; i < n; ++i) dst[i] = std::floor(src[i]);
167+
NUMPY_UNROLL4(i, dst[i] = std::floor(src[i]));
266168
}
267169

268170
/// numpy.ceil(x, /, out=None, *, where=True, ...)
269171
template<typename T>
270172
inline void ceil(const T* src, T* dst, size_t n) {
271-
size_t i = 0;
272-
for (; i + 3 < n; i += 4) {
273-
dst[i+0] = std::ceil(src[i+0]);
274-
dst[i+1] = std::ceil(src[i+1]);
275-
dst[i+2] = std::ceil(src[i+2]);
276-
dst[i+3] = std::ceil(src[i+3]);
277-
}
278-
for (; i < n; ++i) dst[i] = std::ceil(src[i]);
173+
NUMPY_UNROLL4(i, dst[i] = std::ceil(src[i]));
279174
}
280175

281176
/// numpy.degrees(x, /, out=None, *, where=True, ...)
282-
// Must use native-type division to match numpy's compute path:
283-
// numpy does float32(180.0) / float32(pi), NOT float32(double(180) / double(pi)).
284177
template<typename T>
285178
inline void degrees(const T* src, T* dst, size_t n) {
286179
T factor = T(180.0) / T(M_PI);
287-
size_t i = 0;
288-
for (; i + 3 < n; i += 4) {
289-
dst[i+0] = src[i+0] * factor;
290-
dst[i+1] = src[i+1] * factor;
291-
dst[i+2] = src[i+2] * factor;
292-
dst[i+3] = src[i+3] * factor;
293-
}
294-
for (; i < n; ++i) dst[i] = src[i] * factor;
180+
NUMPY_UNROLL4(i, dst[i] = src[i] * factor);
295181
}
296182

297183
/// numpy.radians(x, /, out=None, *, where=True, ...)
298184
template<typename T>
299185
inline void radians(const T* src, T* dst, size_t n) {
300186
T factor = T(M_PI) / T(180.0);
301-
size_t i = 0;
302-
for (; i + 3 < n; i += 4) {
303-
dst[i+0] = src[i+0] * factor;
304-
dst[i+1] = src[i+1] * factor;
305-
dst[i+2] = src[i+2] * factor;
306-
dst[i+3] = src[i+3] * factor;
307-
}
308-
for (; i < n; ++i) dst[i] = src[i] * factor;
187+
NUMPY_UNROLL4(i, dst[i] = src[i] * factor);
309188
}
310189

311190
/// numpy.sign(x, /, out=None, *, where=True, ...)
312191
template<typename T>
313192
inline void sign(const T* src, T* dst, size_t n) {
314-
size_t i = 0;
315-
for (; i + 3 < n; i += 4) {
316-
dst[i+0] = T((src[i+0] > T(0)) - (src[i+0] < T(0)));
317-
dst[i+1] = T((src[i+1] > T(0)) - (src[i+1] < T(0)));
318-
dst[i+2] = T((src[i+2] > T(0)) - (src[i+2] < T(0)));
319-
dst[i+3] = T((src[i+3] > T(0)) - (src[i+3] < T(0)));
320-
}
321-
for (; i < n; ++i) dst[i] = T((src[i] > T(0)) - (src[i] < T(0)));
193+
NUMPY_UNROLL4(i, dst[i] = T((src[i] > T(0)) - (src[i] < T(0))));
322194
}
323195

324196
// ============================================================================
@@ -931,43 +803,26 @@ inline void mean_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, in
931803
// norm, dot — used by linalg
932804
// ============================================================================
933805

934-
/// squared L2 norm (internal helper for linalg.norm)
935-
// Stack allocation for n ≤ NUMPY_SMALL_STACK (128).
806+
/// squared L2 norm / dot — stack allocation for n ≤ 128
936807
template<typename T>
937808
inline T norm_sq(const T* data, size_t n) {
938-
T stack_buf[NUMPY_SMALL_STACK];
939-
T* squares;
940-
std::vector<T> heap_buf;
941-
if (n <= NUMPY_SMALL_STACK) {
942-
squares = stack_buf;
943-
} else {
944-
heap_buf.resize(n);
945-
squares = heap_buf.data();
946-
}
947-
for (size_t i = 0; i < n; ++i) {
948-
T v = data[i];
949-
squares[i] = v * v;
950-
}
951-
return pairwise_sum(squares, n);
809+
T buf[NUMPY_SMALL_STACK];
810+
T* squares = (n <= NUMPY_SMALL_STACK) ? buf : new T[n];
811+
for (size_t i = 0; i < n; ++i) squares[i] = data[i] * data[i];
812+
T result = pairwise_sum(squares, n);
813+
if (n > NUMPY_SMALL_STACK) delete[] squares;
814+
return result;
952815
}
953816

954-
/// numpy.dot(a, b, out=None) — 1D vector dot product (pairwise sum)
955-
// Matches numpy's np.sum(a * b) bit-exactly.
956-
// Stack allocation for n ≤ NUMPY_SMALL_STACK (128).
817+
/// numpy.dot(a, b, out=None) — pairwise sum, matches np.sum(a*b)
957818
template<typename T>
958819
inline T dot(const T* a, const T* b, size_t n) {
959-
T stack_buf[NUMPY_SMALL_STACK];
960-
T* products;
961-
std::vector<T> heap_buf;
962-
if (n <= NUMPY_SMALL_STACK) {
963-
products = stack_buf;
964-
} else {
965-
heap_buf.resize(n);
966-
products = heap_buf.data();
967-
}
968-
for (size_t i = 0; i < n; ++i)
969-
products[i] = a[i] * b[i];
970-
return pairwise_sum(products, n);
820+
T buf[NUMPY_SMALL_STACK];
821+
T* prods = (n <= NUMPY_SMALL_STACK) ? buf : new T[n];
822+
for (size_t i = 0; i < n; ++i) prods[i] = a[i] * b[i];
823+
T result = pairwise_sum(prods, n);
824+
if (n > NUMPY_SMALL_STACK) delete[] prods;
825+
return result;
971826
}
972827

973828
/// numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D

0 commit comments

Comments
 (0)