Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g)

function(apply_target target)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external)
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src)
target_compile_options(${target} PRIVATE ${WARNING_FLAGS})
target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m)

Expand Down
137 changes: 137 additions & 0 deletions external/shishua/shishua-avx2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#ifndef SHISHUA_AVX2_H
#define SHISHUA_AVX2_H
#include <assert.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
typedef struct prng_state {
__m256i state[4];
__m256i output[4];
__m256i counter;
} prng_state;

// buf's size must be a multiple of 128 bytes.
static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
__m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3],
s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3],
t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter;
// The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
// additions to strong positions for enrichment. The low 32-bit part of a
// 64-bit chunk never moves to the same 64-bit chunk as its high part.
// They do not remain in the same chunk. Each part eventually reaches all
// positions ringwise: A to B, B to C, …, H to A.
// You may notice that they are simply 256-bit rotations (96 and 160).
__m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
// The counter is not necessary to beat PractRand.
// It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
// or about 7 millenia at 10 GiB/s.
// The increments are picked as odd numbers,
// since only coprimes of the base cover the full cycle,
// and all odd numbers are coprime of 2.
// I use different odd numbers for each 64-bit chunk
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);

// TODO: consider adding proper uneven write handling
assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

for (size_t i = 0; i < size; i += 128) {
if (buf != NULL) {
_mm256_storeu_si256((__m256i *)&buf[i + 0], o0);
_mm256_storeu_si256((__m256i *)&buf[i + 32], o1);
_mm256_storeu_si256((__m256i *)&buf[i + 64], o2);
_mm256_storeu_si256((__m256i *)&buf[i + 96], o3);
}

// I apply the counter to s1,
// since it is the one whose shift loses most entropy.
s1 = _mm256_add_epi64(s1, counter);
s3 = _mm256_add_epi64(s3, counter);
counter = _mm256_add_epi64(counter, increment);

// SIMD does not support rotations. Shift is the next best thing to entangle
// bits with other 64-bit positions. We must shift by an odd number so that
// each bit reaches all 64-bit positions, not just half. We must lose bits
// of information, so we minimize it: 1 and 3. We use different shift values
// to increase divergence between the two sides. We use rightward shift
// because the rightmost bits have the least diffusion in addition (the low
// bit is just a XOR of the low bits).
u0 = _mm256_srli_epi64(s0, 1);
u1 = _mm256_srli_epi64(s1, 3);
u2 = _mm256_srli_epi64(s2, 1);
u3 = _mm256_srli_epi64(s3, 3);
t0 = _mm256_permutevar8x32_epi32(s0, shu0);
t1 = _mm256_permutevar8x32_epi32(s1, shu1);
t2 = _mm256_permutevar8x32_epi32(s2, shu0);
t3 = _mm256_permutevar8x32_epi32(s3, shu1);
// Addition is the main source of diffusion.
// Storing the output in the state keeps that diffusion permanently.
s0 = _mm256_add_epi64(t0, u0);
s1 = _mm256_add_epi64(t1, u1);
s2 = _mm256_add_epi64(t2, u2);
s3 = _mm256_add_epi64(t3, u3);

// Two orthogonally grown pieces evolving independently, XORed.
o0 = _mm256_xor_si256(u0, t1);
o1 = _mm256_xor_si256(u2, t3);
o2 = _mm256_xor_si256(s0, s3);
o3 = _mm256_xor_si256(s2, s1);
}
s->output[0] = o0;
s->output[1] = o1;
s->output[2] = o2;
s->output[3] = o3;
s->state[0] = s0;
s->state[1] = s1;
s->state[2] = s2;
s->state[3] = s3;
s->counter = counter;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
0x9E3779B97F4A7C15,
0xF39CC0605CEDC834,
0x1082276BF3A27251,
0xF86C6A11D0C18E95,
0x2767F0B153D27B7F,
0x0347045B5BF1827F,
0x01886F0928403002,
0xC1D64BA40F335E36,
0xF06AD7AE9717877E,
0x85839D6EFFBD7DC6,
0x64D325D1C5371682,
0xCADD0CCCFDFFBBE1,
0x626E33B8D04B4331,
0xBBF73C790D94F79D,
0x471C4AB3ED3D82A5,
0xFEC507705E4AE6E5,
};

void prng_init(prng_state *s, uint64_t seed[4]) {
memset(s, 0, sizeof(prng_state));
#define STEPS 1
#define ROUNDS 13
uint8_t buf[128 * STEPS];
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]);
s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]);
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(s, buf, 128 * STEPS);
s->state[0] = s->output[3];
s->state[1] = s->output[2];
s->state[2] = s->output[1];
s->state[3] = s->output[0];
}
#undef STEPS
#undef ROUNDS
}
#endif
107 changes: 107 additions & 0 deletions external/shishua/shishua-half-avx2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#ifndef SHISHUA_H
#define SHISHUA_H
#include <assert.h>
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
typedef struct prng_state {
__m256i state[2];
__m256i output;
__m256i counter;
} prng_state;

// buf's size must be a multiple of 32 bytes.
static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
__m256i s0 = s->state[0], counter = s->counter,
s1 = s->state[1], o = s->output,
t0, t1, t2, t3, u0, u1, u2, u3;
// The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
// additions to strong positions for enrichment. The low 32-bit part of a
// 64-bit chunk never moves to the same 64-bit chunk as its high part.
// They do not remain in the same chunk. Each part eventually reaches all
// positions ringwise: A to B, B to C, …, H to A.
// You may notice that they are simply 256-bit rotations (96 and 160).
__m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
// The counter is not necessary to beat PractRand.
// It sets a lower bound of 2^69 bytes = 512 EiB to the period,
// or about 1 millenia at 10 GiB/s.
// The increments are picked as odd numbers,
// since only coprimes of the base cover the full cycle,
// and all odd numbers are coprime of 2.
// I use different odd numbers for each 64-bit chunk
// for a tiny amount of variation stirring.
// I used the smallest odd numbers to avoid having a magic number.
__m256i increment = _mm256_set_epi64x(1, 3, 5, 7);

// TODO: consider adding proper uneven write handling
assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes.");

for (size_t i = 0; i < size; i += 32) {
if (buf != NULL) {
_mm256_storeu_si256((__m256i *)&buf[i], o);
}

// I apply the counter to s1,
// since it is the one whose shift loses most entropy.
s1 = _mm256_add_epi64(s1, counter);
counter = _mm256_add_epi64(counter, increment);

// SIMD does not support rotations. Shift is the next best thing to entangle
// bits with other 64-bit positions. We must shift by an odd number so that
// each bit reaches all 64-bit positions, not just half. We must lose bits
// of information, so we minimize it: 1 and 3. We use different shift values
// to increase divergence between the two sides. We use rightward shift
// because the rightmost bits have the least diffusion in addition (the low
// bit is just a XOR of the low bits).
u0 = _mm256_srli_epi64(s0, 1);
u1 = _mm256_srli_epi64(s1, 3);
t0 = _mm256_permutevar8x32_epi32(s0, shu0);
t1 = _mm256_permutevar8x32_epi32(s1, shu1);
// Addition is the main source of diffusion.
// Storing the output in the state keeps that diffusion permanently.
s0 = _mm256_add_epi64(t0, u0);
s1 = _mm256_add_epi64(t1, u1);

// Two orthogonally grown pieces evolving independently, XORed.
o = _mm256_xor_si256(u0, t1);
}
s->state[0] = s0;
s->counter = counter;
s->state[1] = s1;
s->output = o;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
0x9E3779B97F4A7C15,
0xF39CC0605CEDC834,
0x1082276BF3A27251,
0xF86C6A11D0C18E95,
0x2767F0B153D27B7F,
0x0347045B5BF1827F,
0x01886F0928403002,
0xC1D64BA40F335E36,
};

void prng_init(prng_state *s, uint64_t seed[4]) {
memset(s, 0, sizeof(prng_state));
#define STEPS 5
#define ROUNDS 4
uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD.
// Diffuse first two seed elements in s0, then the last two. Same for s1.
// We must keep half of the state unchanged so users cannot set a bad state.
s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
for (size_t i = 0; i < ROUNDS; i++) {
prng_gen(s, buf, 32 * STEPS);
s->state[0] = s->state[1];
s->state[1] = s->output;
}
#undef STEPS
#undef ROUNDS
}
#endif
Loading