Skip to content
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ project(cneuron C CXX)

set(CMAKE_C_STANDARD 17)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

Expand Down Expand Up @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g)

function(apply_target target)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src)
target_compile_options(${target} PRIVATE ${WARNING_FLAGS})
target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m)

Expand Down
137 changes: 137 additions & 0 deletions external/shishua/shishua-avx2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#ifndef SHISHUA_AVX2_H
#define SHISHUA_AVX2_H
#include <assert.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
typedef struct prng_state {
__m256i state[4];
__m256i output[4];
__m256i counter;
} prng_state;

// buf's size must be a multiple of 128 bytes.
static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
__m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3],
s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3],
t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter;
// The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
// additions to strong positions for enrichment. The low 32-bit part of a
// 64-bit chunk never moves to the same 64-bit chunk as its high part.
// They do not remain in the same chunk. Each part eventually reaches all
// positions ringwise: A to B, B to C, …, H to A.
// You may notice that they are simply 256-bit rotations (96 and 160).
__m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
// The counter is not necessary to beat PractRand.
// It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
// or about 7 millenia at 10 GiB/s.
// The increments are picked as odd numbers,
// since only coprimes of the base cover the full cycle,
// and all odd numbers are coprime of 2.
// I use different odd numbers for each 64-bit chunk
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);

// TODO: consider adding proper uneven write handling
assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

for (size_t i = 0; i < size; i += 128) {
if (buf != NULL) {
_mm256_storeu_si256((__m256i *)&buf[i + 0], o0);
_mm256_storeu_si256((__m256i *)&buf[i + 32], o1);
_mm256_storeu_si256((__m256i *)&buf[i + 64], o2);
_mm256_storeu_si256((__m256i *)&buf[i + 96], o3);
}

// I apply the counter to s1,
// since it is the one whose shift loses most entropy.
s1 = _mm256_add_epi64(s1, counter);
s3 = _mm256_add_epi64(s3, counter);
counter = _mm256_add_epi64(counter, increment);

// SIMD does not support rotations. Shift is the next best thing to entangle
// bits with other 64-bit positions. We must shift by an odd number so that
// each bit reaches all 64-bit positions, not just half. We must lose bits
// of information, so we minimize it: 1 and 3. We use different shift values
// to increase divergence between the two sides. We use rightward shift
// because the rightmost bits have the least diffusion in addition (the low
// bit is just a XOR of the low bits).
u0 = _mm256_srli_epi64(s0, 1);
u1 = _mm256_srli_epi64(s1, 3);
u2 = _mm256_srli_epi64(s2, 1);
u3 = _mm256_srli_epi64(s3, 3);
t0 = _mm256_permutevar8x32_epi32(s0, shu0);
t1 = _mm256_permutevar8x32_epi32(s1, shu1);
t2 = _mm256_permutevar8x32_epi32(s2, shu0);
t3 = _mm256_permutevar8x32_epi32(s3, shu1);
// Addition is the main source of diffusion.
// Storing the output in the state keeps that diffusion permanently.
s0 = _mm256_add_epi64(t0, u0);
s1 = _mm256_add_epi64(t1, u1);
s2 = _mm256_add_epi64(t2, u2);
s3 = _mm256_add_epi64(t3, u3);

// Two orthogonally grown pieces evolving independently, XORed.
o0 = _mm256_xor_si256(u0, t1);
o1 = _mm256_xor_si256(u2, t3);
o2 = _mm256_xor_si256(s0, s3);
o3 = _mm256_xor_si256(s2, s1);
}
s->output[0] = o0;
s->output[1] = o1;
s->output[2] = o2;
s->output[3] = o3;
s->state[0] = s0;
s->state[1] = s1;
s->state[2] = s2;
s->state[3] = s3;
s->counter = counter;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
0x9E3779B97F4A7C15,
0xF39CC0605CEDC834,
0x1082276BF3A27251,
0xF86C6A11D0C18E95,
0x2767F0B153D27B7F,
0x0347045B5BF1827F,
0x01886F0928403002,
0xC1D64BA40F335E36,
0xF06AD7AE9717877E,
0x85839D6EFFBD7DC6,
0x64D325D1C5371682,
0xCADD0CCCFDFFBBE1,
0x626E33B8D04B4331,
0xBBF73C790D94F79D,
0x471C4AB3ED3D82A5,
0xFEC507705E4AE6E5,
};

void prng_init(prng_state *s, uint64_t seed[4]) {
memset(s, 0, sizeof(prng_state));
#define STEPS 1
#define ROUNDS 13
uint8_t buf[128 * STEPS];
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]);
s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]);
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(s, buf, 128 * STEPS);
s->state[0] = s->output[3];
s->state[1] = s->output[2];
s->state[2] = s->output[1];
s->state[3] = s->output[0];
}
#undef STEPS
#undef ROUNDS
}
#endif
107 changes: 107 additions & 0 deletions external/shishua/shishua-half-avx2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#ifndef SHISHUA_H
#define SHISHUA_H
#include <assert.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
typedef struct prng_state {
__m256i state[2];
__m256i output;
__m256i counter;
} prng_state;

// buf's size must be a multiple of 32 bytes.
static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
__m256i s0 = s->state[0], counter = s->counter,
s1 = s->state[1], o = s->output,
t0, t1, t2, t3, u0, u1, u2, u3;
// The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
// additions to strong positions for enrichment. The low 32-bit part of a
// 64-bit chunk never moves to the same 64-bit chunk as its high part.
// They do not remain in the same chunk. Each part eventually reaches all
// positions ringwise: A to B, B to C, …, H to A.
// You may notice that they are simply 256-bit rotations (96 and 160).
__m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
// The counter is not necessary to beat PractRand.
// It sets a lower bound of 2^69 bytes = 512 EiB to the period,
// or about 1 millenia at 10 GiB/s.
// The increments are picked as odd numbers,
// since only coprimes of the base cover the full cycle,
// and all odd numbers are coprime of 2.
// I use different odd numbers for each 64-bit chunk
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);

// TODO: consider adding proper uneven write handling
assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes.");

for (size_t i = 0; i < size; i += 32) {
if (buf != NULL) {
_mm256_storeu_si256((__m256i *)&buf[i], o);
}

// I apply the counter to s1,
// since it is the one whose shift loses most entropy.
s1 = _mm256_add_epi64(s1, counter);
counter = _mm256_add_epi64(counter, increment);

// SIMD does not support rotations. Shift is the next best thing to entangle
// bits with other 64-bit positions. We must shift by an odd number so that
// each bit reaches all 64-bit positions, not just half. We must lose bits
// of information, so we minimize it: 1 and 3. We use different shift values
// to increase divergence between the two sides. We use rightward shift
// because the rightmost bits have the least diffusion in addition (the low
// bit is just a XOR of the low bits).
u0 = _mm256_srli_epi64(s0, 1);
u1 = _mm256_srli_epi64(s1, 3);
t0 = _mm256_permutevar8x32_epi32(s0, shu0);
t1 = _mm256_permutevar8x32_epi32(s1, shu1);
// Addition is the main source of diffusion.
// Storing the output in the state keeps that diffusion permanently.
s0 = _mm256_add_epi64(t0, u0);
s1 = _mm256_add_epi64(t1, u1);

// Two orthogonally grown pieces evolving independently, XORed.
o = _mm256_xor_si256(u0, t1);
}
s->state[0] = s0;
s->counter = counter;
s->state[1] = s1;
s->output = o;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
0x9E3779B97F4A7C15,
0xF39CC0605CEDC834,
0x1082276BF3A27251,
0xF86C6A11D0C18E95,
0x2767F0B153D27B7F,
0x0347045B5BF1827F,
0x01886F0928403002,
0xC1D64BA40F335E36,
};

void prng_init(prng_state *s, uint64_t seed[4]) {
memset(s, 0, sizeof(prng_state));
#define STEPS 5
#define ROUNDS 4
uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD.
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(s, buf, 32 * STEPS);
s->state[0] = s->state[1];
s->state[1] = s->output;
}
#undef STEPS
#undef ROUNDS
}
#endif
Loading
Loading