From 593edc02d578a290a6a8ec85851a2083a930d5db Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 12:52:03 +0800 Subject: [PATCH 01/11] helper function for random --- src/helper/shishua/rand.c | 55 ++++++ src/helper/shishua/rand.h | 16 ++ src/helper/shishua/shishua-avx2.h | 111 ++++++++++++ src/helper/shishua/shishua-half-avx2.h | 96 ++++++++++ src/helper/shishua/shishua-half-neon.h | 155 ++++++++++++++++ src/helper/shishua/shishua-half-sse2.h | 194 ++++++++++++++++++++ src/helper/shishua/shishua-half.h | 195 +++++++++++++++++++++ src/helper/shishua/shishua-neon.h | 180 +++++++++++++++++++ src/helper/shishua/shishua-sse2.h | 214 ++++++++++++++++++++++ src/helper/shishua/shishua.h | 234 +++++++++++++++++++++++++ 10 files changed, 1450 insertions(+) create mode 100644 src/helper/shishua/rand.c create mode 100644 src/helper/shishua/rand.h create mode 100644 src/helper/shishua/shishua-avx2.h create mode 100644 src/helper/shishua/shishua-half-avx2.h create mode 100644 src/helper/shishua/shishua-half-neon.h create mode 100644 src/helper/shishua/shishua-half-sse2.h create mode 100644 src/helper/shishua/shishua-half.h create mode 100644 src/helper/shishua/shishua-neon.h create mode 100644 src/helper/shishua/shishua-sse2.h create mode 100644 src/helper/shishua/shishua.h diff --git a/src/helper/shishua/rand.c b/src/helper/shishua/rand.c new file mode 100644 index 000000000..fe566e4dd --- /dev/null +++ b/src/helper/shishua/rand.c @@ -0,0 +1,55 @@ +#include +#include + +#include "rand.h" + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { + assert(max > min); + assert(randc->count <= 127); + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; + randnum += min; + randc->count++; + return randnum; +} + +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { + assert(max > min); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + uint32_t randnum 
= ((uint64_t)value * (max - min)) >> 32; + randnum += min; + randc->count += sizeof(uint32_t); + return randnum; +} + +float randf(struct rand_chunk *randc, float range, float offset) { + assert(range); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + // float randfloat = ((float)value / UINT32_MAX) * range + offset; + float randfloat = ((value >> 8) / 16777216.0f) * range + offset; + randc->count += sizeof(float); + return randfloat; +} + +//NOTE: example + +// int main() { +// prng_state state; +// static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +// struct rand_chunk randc = {0}; +// prng_init(&state, seed); +// prng_gen(&state, randc.buf, 128); +// +// for (int i = 0; i < 1000; i++) { +// float randfloat = randf(&randc, 2, -1); +// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// randc.count = 0; +// prng_gen(&state, randc.buf, 128); +// } +// printf("%f, ", randfloat); +// } +// return 0; +// } diff --git a/src/helper/shishua/rand.h b/src/helper/shishua/rand.h new file mode 100644 index 000000000..bdfc71f84 --- /dev/null +++ b/src/helper/shishua/rand.h @@ -0,0 +1,16 @@ +#ifndef SHISHUA_RAND_H +#define SHISHUA_RAND_H + +#include "shishua.h" +#include + +struct rand_chunk{ + size_t count; + uint8_t buf[128]; //NOTE: must be multiple of 128 +}; + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +float randf(struct rand_chunk *randc, float range, float offset); + +#endif diff --git a/src/helper/shishua/shishua-avx2.h b/src/helper/shishua/shishua-avx2.h new file mode 100644 index 000000000..c2c865e80 --- /dev/null +++ b/src/helper/shishua/shishua-avx2.h @@ -0,0 +1,111 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + 
__m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. + o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; + s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 1 +# define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); + s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); + s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 128 * STEPS); + s->state[0] = s->output[3]; s->state[1] = s->output[2]; + s->state[2] = s->output[1]; s->state[3] = s->output[0]; + } +# undef STEPS +# undef ROUNDS +} +#endif diff --git a/src/helper/shishua/shishua-half-avx2.h b/src/helper/shishua/shishua-half-avx2.h new file mode 100644 index 000000000..b21f46174 --- /dev/null +++ b/src/helper/shishua/shishua-half-avx2.h @@ -0,0 +1,96 @@ +#ifndef SHISHUA_H +#define SHISHUA_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[2]; + __m256i output; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i s0 = s->state[0], counter = s->counter, + s1 = s->state[1], o = s->output, + t0, t1, t2, t3, u0, u1, u2, u3; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^69 bytes = 512 EiB to the period, + // or about 1 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i*)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. 
We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; s->counter = counter; + s->state[1] = s1; s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 5 +# define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +# undef STEPS +# undef ROUNDS +} +#endif diff --git a/src/helper/shishua/shishua-half-neon.h b/src/helper/shishua/shishua-half-neon.h new file mode 100644 index 000000000..f946a9aee --- /dev/null +++ b/src/helper/shishua/shishua-half-neon.h @@ -0,0 +1,155 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm) \ + ) \ + ) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size/32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. 
We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + __asm__("" : "+w" (u_lo), "+w" (u_hi)); +#endif + + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; s->state[1] = s0_hi; + s->state[2] = s1_lo; s->state[3] = s1_hi; + s->output[0] = o_lo; s->output[1] = o_hi; + s->counter[0] = counter_lo; s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; s->state[1] = s->state[3]; + s->state[2] = s->output[0]; s->state[3] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/src/helper/shishua/shishua-half-sse2.h b/src/helper/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..d6f958930 --- /dev/null +++ b/src/helper/shishua/shishua-half-sse2.h @@ -0,0 +1,194 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +# include // SSSE3 +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +# include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. 
+# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt) \ + ) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +# define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a) \ + ) +# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i*)&buf[i+ 0], s->output[0]); + _mm_storeu_si128((__m128i*)&buf[i+16], s->output[1]); + } + + // Lane 0 + + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = _mm_srli_epi64(s0_lo, 1); + u_hi = _mm_srli_epi64(s0_hi, 1); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. 
+ s0_lo = _mm_add_epi64(u_lo, t0_lo); + s0_hi = _mm_add_epi64(u_hi, t0_hi); + + // Lane 1 + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = _mm_add_epi64(s1_lo, counter_lo); + s1_hi = _mm_add_epi64(s1_hi, counter_hi); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + + // Same as above, but with different shift amounts. + t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); + t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); + + s1_lo = _mm_srli_epi64(s1_lo, 3); + s1_hi = _mm_srli_epi64(s1_hi, 3); + + s1_lo = _mm_add_epi64(s1_lo, t1_lo); + s1_hi = _mm_add_epi64(s1_hi, t1_hi); + + // Two orthogonally grown pieces evolving independently, XORed. + s->output[0] = _mm_xor_si128(u_lo, t1_lo); + s->output[1] = _mm_xor_si128(u_hi, t1_hi); + } + + s->state[0] = s0_lo; s->state[1] = s0_hi; + s->state[2] = s1_lo; s->state[3] = s1_hi; + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + // XXX: better way to do this? 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; s->state[1] = s->state[3]; + s->state[2] = s->output[0]; s->state[3] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT + +#endif diff --git a/src/helper/shishua/shishua-half.h b/src/helper/shishua/shishua-half.h new file mode 100644 index 000000000..a1ea715b1 --- /dev/null +++ b/src/helper/shishua/shishua-half.h @@ -0,0 +1,195 @@ +#ifndef SHISHUA_HALF_H +#define SHISHUA_HALF_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) \ + || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The +// scalar path ends up being faster. 
+// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +# define SHISHUA_TARGET SHISHUA_TARGET_NEON +# else +# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +# endif +#endif + +// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only +// need this header. +// Additionally, these headers can also be used on their own. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +# include "shishua-half-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +# include "shishua-half-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +# include "shishua-half-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua half. +// Aims for a balance between code size and performance. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[8]; // 2 lanes + uint64_t output[4]; // 1 lane + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +static inline void shishua_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+# if defined(SHISHUA_NATIVE_ENDIAN) \ + || defined(_WIN32) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + uint64_t t[8]; + // Write to buf + if (buf != NULL) { + for (size_t j = 0; j < 4; j++) { + shishua_write_le64(b, state->output[j]); + b += 8; + } + } + + for (size_t j = 0; j < 4; j++) { + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + state->state[j + 4] += state->counter[j]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. 
Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // + // It would be much easier and cleaner to just reinterpret as a uint32_t + // pointer or use memcpy, but that is unfortunately endian-dependent, and + // the former also breaks strict aliasing. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + // + // See shishua.h for details. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left + 3,0,1,2, 6,7,4,5 }; // right + for (size_t j = 0; j < 8; j++) { + t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); + } + for (size_t j = 0; j < 4; j++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = state->state[j + 0] >> 1; + uint64_t u_hi = state->state[j + 4] >> 3; + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + state->state[j + 0] = u_lo + t[j + 0]; + state->state[j + 4] = u_hi + t[j + 4]; + + // Two orthogonally grown pieces evolving independently, XORed. 
+ state->output[j] = u_lo ^ t[j + 4]; + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); + +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2] ^= seed[i]; + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j + 0] = s->state[j + 4]; + s->state[j + 4] = s->output[j]; + } + } +# undef STEPS +# undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_HALF_SCALAR_H diff --git a/src/helper/shishua/shishua-neon.h b/src/helper/shishua/shishua-neon.h new file mode 100644 index 000000000..b3ecbafdd --- /dev/null +++ b/src/helper/shishua/shishua-neon.h @@ -0,0 +1,180 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_NEON_H +#define SHISHUA_NEON_H +#include +#include +#include +#include + +typedef struct prng_state { + uint64x2_t state[8]; + uint64x2_t output[8]; + uint64x2_t counter[2]; +} prng_state; + +#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__) +# define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t) { a, b }) +#else +# define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) +#endif +// To hide the vreinterpret ritual. 
+#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm) \ + ) \ + ) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); + b += 16; + } + } + // NEON has less register pressure than SSE2, but we reroll it anyways for + // code size. + for (size_t j = 0; j < 2; j++) { + uint64x2_t s0_lo = s->state[j * 4 + 0], + s0_hi = s->state[j * 4 + 1], + s1_lo = s->state[j * 4 + 2], + s1_hi = s->state[j * 4 + 3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1_lo = vaddq_u64(s1_lo, counter_lo); s1_hi = vaddq_u64(s1_hi, counter_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + __asm__("" : "+w" (u_lo), "+w" (u_hi)); +#endif + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. 
+ s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); + s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); + s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); + + // The first orthogonally grown pieces evolving independently, XORed. + s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); + s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); + + } + // The second orthogonally grown piece evolving independently, XORed. + s->output[4] = veorq_u64(s->state[0], s->state[6]); + s->output[5] = veorq_u64(s->state[1], s->state[7]); + + s->output[6] = veorq_u64(s->state[2], s->state[4]); + s->output[7] = veorq_u64(s->state[3], s->state[5]); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + } + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +# define ROUNDS 13 +# define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); + uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); + uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); + uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); + s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[ 0])); + s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[ 2])); + s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[ 4])); + s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[ 6])); + s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[ 8])); + s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); + s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); + s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; s->state[1] = s->output[7]; + s->state[2] = s->output[4]; s->state[3] = s->output[5]; + s->state[4] = s->output[2]; s->state[5] = s->output[3]; + s->state[6] = s->output[0]; s->state[7] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT +#endif diff --git a/src/helper/shishua/shishua-sse2.h b/src/helper/shishua/shishua-sse2.h new file mode 100644 index 000000000..3a9807638 --- /dev/null +++ b/src/helper/shishua/shishua-sse2.h @@ -0,0 +1,214 @@ +// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +// Consider the half version if AVX2 is not available. +#ifndef SHISHUA_SSE2_H +#define SHISHUA_SSE2_H +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +# include // SSSE3 +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +# include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
+// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt) \ + ) +#endif + +typedef struct prng_state { + __m128i state[8]; + __m128i output[8]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +# define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a) \ + ) +# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ + // increment = { 7, 5, 3, 1 }; + __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); + } + } + + // There are only 16 SSE registers (8 on i686), and we have to account for + // temporary copies due to being stuck with 2-operand instructions. + // Therefore, we use fixed iteration loops to reduce code complexity while + // still allowing the compiler to easily unroll the loop. + // We also try to keep variables active for as short as possible. + for (size_t j = 0; j < 2; j++) { + __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; + + // Lane 0 + s_lo = s->state[4 * j + 0]; + s_hi = s->state[4 * j + 1]; + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0_lo = _mm_srli_epi64(s_lo, 1); + u0_hi = _mm_srli_epi64(s_hi, 1); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); + t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); + s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); + + // Lane 1 + s_lo = s->state[4 * j + 2]; + s_hi = s->state[4 * j + 3]; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s_lo = _mm_add_epi64(s_lo, counter_lo); + s_hi = _mm_add_epi64(s_hi, counter_hi); + + // Same as above but with different shifts + u1_lo = _mm_srli_epi64(s_lo, 3); + u1_hi = _mm_srli_epi64(s_hi, 3); + + t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); + t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); + + s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); + s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); + + // Merge lane 0 and lane 1 + // The first orthogonally grown piece evolving independently, XORed. + s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); + s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); + } + + // The second orthogonally grown piece evolving independently, XORed. 
+ s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); + s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); + s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); + s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + } + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + // Note: output is uninitialized at first, but since we pass NULL, its value + // is initially ignored. + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +# define ROUNDS 13 +# define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[ 0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[ 2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[ 6])); + s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 8])); + s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); + s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); + s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; s->state[1] = s->output[7]; + s->state[2] = s->output[4]; s->state[3] = s->output[5]; + s->state[4] = s->output[2]; s->state[5] = s->output[3]; + s->state[6] = s->output[0]; s->state[7] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT +#endif diff --git a/src/helper/shishua/shishua.h b/src/helper/shishua/shishua.h new file mode 100644 index 000000000..352a8aac2 --- /dev/null +++ b/src/helper/shishua/shishua.h @@ -0,0 +1,234 @@ +#ifndef SHISHUA_H +#define SHISHUA_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 
9.2.0. The +// scalar path ends up being faster. +// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// algorithm | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +# define SHISHUA_TARGET SHISHUA_TARGET_NEON +# else +# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +# endif +#endif + +// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only +// need this header. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +# include "shishua-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +# include "shishua-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +# include "shishua-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua. +// Designed to balance performance and code size. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[16]; // 4 lanes + uint64_t output[16]; // 4 lanes, 2 parts + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// Writes a 64-bit little endian integer to dst +static inline void prng_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) \ + || defined(_WIN32) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 16; j++) { + prng_write_le64(b, state->output[j]); b += 8; + } + } + // Similar to SSE, use fixed iteration loops to reduce code complexity + // and allow the compiler more control over optimization. + for (size_t j = 0; j < 2; j++) { + // I don't want to type this 15 times. + uint64_t *s = &state->state[j * 8]; // 2 lanes + uint64_t *o = &state->output[j * 4]; // 1 lane + uint64_t t[8]; // temp buffer + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + for (size_t k = 0; k < 4; k++) { + s[k + 4] += state->counter[k]; + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // + // You may notice that they are simply 256-bit rotations (96 and 160): + // + // t0 = (s0 << 96) | (s0 >> (256 - 96)); + // t1 = (s1 << 160) | (s1 >> (256 - 160)); + // + // The easiest way to do this would be to cast s and t to uint32_t * + // and operate on them that way. + // + // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; + // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; + // for (size_t k = 0; k < 4; k++) { + // t0_32[k] = s0_32[(k + 5) % 8]; + // t1_32[k] = s1_32[(k + 3) % 8]; + // } + // + // This is pretty, but it violates strict aliasing and relies on little + // endian data layout. + // + // A common workaround to strict aliasing would be to use memcpy: + // + // // legal casts + // unsigned char *t8 = (unsigned char *)t; + // unsigned char *s8 = (unsigned char *)s; + // memcpy(&t8[0], &s8[20], 32 - 20); + // memcpy(&t8[32 - 20], &s8[0], 20); + // + // However, this still doesn't fix the endianness issue, and is very + // ugly. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left + 3,0,1,2, 6,7,4,5 }; // right + for (size_t k = 0; k < 8; k++) { + t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); + } + + for (size_t k = 0; k < 4; k++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = s[k + 0] >> 1; + uint64_t u_hi = s[k + 4] >> 3; + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s[k + 0] = u_lo + t[k + 0]; + s[k + 4] = u_hi + t[k + 4]; + + // The first orthogonally grown piece evolving independently, XORed. + o[k] = u_lo ^ t[k + 4]; + } + } + + // Merge together. + for (size_t j = 0; j < 4; j++) { + // The second orthogonally grown piece evolving independently, XORed. + state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; + state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 1 +# define ROUNDS 13 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } + s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j+ 0] = s->output[j+12]; + s->state[j+ 4] = s->output[j+ 8]; + s->state[j+ 8] = s->output[j+ 4]; + s->state[j+12] = s->output[j+ 0]; + } + } +# undef STEPS +# undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_SCALAR_H From 8753eee66349c2e518ef26682520f0e689e3a868 Mon Sep 17 00:00:00 2001 From: Lee Hoon Lim Date: Thu, 12 Jun 2025 14:48:29 +0800 Subject: [PATCH 02/11] Cleanup --- CMakeLists.txt | 2 + external/shishua/shishua-avx2.h | 137 ++++++++++++++ external/shishua/shishua-half-avx2.h | 107 +++++++++++ external/shishua/shishua-half-neon.h | 169 +++++++++++++++++ external/shishua/shishua-half-sse2.h | 200 ++++++++++++++++++++ external/shishua/shishua-half.h | 196 ++++++++++++++++++++ external/shishua/shishua-neon.h | 199 ++++++++++++++++++++ external/shishua/shishua-sse2.h | 228 +++++++++++++++++++++++ external/shishua/shishua.h | 244 +++++++++++++++++++++++++ src/helper/shishua/shishua-avx2.h | 111 ----------- 
src/helper/shishua/shishua-half-avx2.h | 96 ---------- src/helper/shishua/shishua-half-neon.h | 155 ---------------- src/helper/shishua/shishua-half-sse2.h | 194 -------------------- src/helper/shishua/shishua-half.h | 195 -------------------- src/helper/shishua/shishua-neon.h | 180 ------------------ src/helper/shishua/shishua-sse2.h | 214 ---------------------- src/helper/shishua/shishua.h | 234 ------------------------ src/{helper/shishua => }/rand.c | 6 +- src/{helper/shishua => }/rand.h | 6 +- 19 files changed, 1488 insertions(+), 1385 deletions(-) create mode 100644 external/shishua/shishua-avx2.h create mode 100644 external/shishua/shishua-half-avx2.h create mode 100644 external/shishua/shishua-half-neon.h create mode 100644 external/shishua/shishua-half-sse2.h create mode 100644 external/shishua/shishua-half.h create mode 100644 external/shishua/shishua-neon.h create mode 100644 external/shishua/shishua-sse2.h create mode 100644 external/shishua/shishua.h delete mode 100644 src/helper/shishua/shishua-avx2.h delete mode 100644 src/helper/shishua/shishua-half-avx2.h delete mode 100644 src/helper/shishua/shishua-half-neon.h delete mode 100644 src/helper/shishua/shishua-half-sse2.h delete mode 100644 src/helper/shishua/shishua-half.h delete mode 100644 src/helper/shishua/shishua-neon.h delete mode 100644 src/helper/shishua/shishua-sse2.h delete mode 100644 src/helper/shishua/shishua.h rename src/{helper/shishua => }/rand.c (98%) rename src/{helper/shishua => }/rand.h (76%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4591d9ea1..95245ad12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g) function(apply_target target) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) 
target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m) diff --git a/external/shishua/shishua-avx2.h b/external/shishua/shishua-avx2.h new file mode 100644 index 000000000..e25dcf20f --- /dev/null +++ b/external/shishua/shishua-avx2.h @@ -0,0 +1,137 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i *)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i *)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i *)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); + u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); + t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); + s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. 
+ o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; + s->output[1] = o1; + s->output[2] = o2; + s->output[3] = o3; + s->state[0] = s0; + s->state[1] = s1; + s->state[2] = s2; + s->state[3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
+ s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
+ s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]);
+ s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]);
+ for (size_t i = 0; i < ROUNDS; i++) {
+ prng_gen(s, buf, 128 * STEPS);
+ s->state[0] = s->output[3];
+ s->state[1] = s->output[2];
+ s->state[2] = s->output[1];
+ s->state[3] = s->output[0];
+ }
+#undef STEPS
+#undef ROUNDS
+}
+#endif
diff --git a/external/shishua/shishua-half-avx2.h b/external/shishua/shishua-half-avx2.h
new file mode 100644
index 000000000..d1b3df42a
--- /dev/null
+++ b/external/shishua/shishua-half-avx2.h
@@ -0,0 +1,107 @@
+#ifndef SHISHUA_HALF_AVX2_H
+#define SHISHUA_HALF_AVX2_H
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <immintrin.h>
+typedef struct prng_state {
+ __m256i state[2];
+ __m256i output;
+ __m256i counter;
+} prng_state;
+
+// buf's size must be a multiple of 32 bytes.
+static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
+ __m256i s0 = s->state[0], counter = s->counter,
+ s1 = s->state[1], o = s->output,
+ t0, t1, t2, t3, u0, u1, u2, u3;
+ // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
+ // additions to strong positions for enrichment. The low 32-bit part of a
+ // 64-bit chunk never moves to the same 64-bit chunk as its high part.
+ // They do not remain in the same chunk. Each part eventually reaches all
+ // positions ringwise: A to B, B to C, …, H to A.
+ // You may notice that they are simply 256-bit rotations (96 and 160).
+ __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
+ shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
+ // The counter is not necessary to beat PractRand.
+ // It sets a lower bound of 2^69 bytes = 512 EiB to the period,
+ // or about 1 millennium at 10 GiB/s.
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; + s->counter = counter; + s->state[1] = s1; + s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 5 +#define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-neon.h b/external/shishua/shishua-half-neon.h new file mode 100644 index 000000000..ba5655b14 --- /dev/null +++ b/external/shishua/shishua-half-neon.h @@ -0,0 +1,169 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size / 32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); + b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); + b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). 
+ // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. 
+ s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + s->output[0] = o_lo; + s->output[1] = o_hi; + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half-sse2.h b/external/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..91e1194fa --- /dev/null +++ b/external/shishua/shishua-half-sse2.h @@ -0,0 +1,200 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. 
+#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i *)&buf[i + 0], s->output[0]); + _mm_storeu_si128((__m128i *)&buf[i + 16], s->output[1]); + } + + // Lane 0 + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = _mm_srli_epi64(s0_lo, 1); + u_hi = _mm_srli_epi64(s0_hi, 1); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = _mm_add_epi64(u_lo, t0_lo); + s0_hi = _mm_add_epi64(u_hi, t0_hi); + + // Lane 1 + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = _mm_add_epi64(s1_lo, counter_lo); + s1_hi = _mm_add_epi64(s1_hi, counter_hi); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + + // Same as above, but with different shift amounts. + t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); + t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); + + s1_lo = _mm_srli_epi64(s1_lo, 3); + s1_hi = _mm_srli_epi64(s1_hi, 3); + + s1_lo = _mm_add_epi64(s1_lo, t1_lo); + s1_hi = _mm_add_epi64(s1_hi, t1_hi); + + // Two orthogonally grown pieces evolving independently, XORed. + s->output[0] = _mm_xor_si128(u_lo, t1_lo); + s->output[1] = _mm_xor_si128(u_hi, t1_hi); + } + + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. 
+ // We must keep half of the state unchanged so users cannot set a bad state. + // XXX: better way to do this? + __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half.h b/external/shishua/shishua-half.h new file mode 100644 index 000000000..70a9f4be4 --- /dev/null +++ b/external/shishua/shishua-half.h @@ -0,0 +1,196 @@ +#ifndef SHISHUA_HALF_H +#define SHISHUA_HALF_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +#if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +#define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +#elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The +// scalar path ends up being faster. 
+// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +#define SHISHUA_TARGET SHISHUA_TARGET_NEON +#else +#define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +#endif +#endif + +// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only +// need this header. +// Additionally, these headers can also be used on their own. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +#include "shishua-half-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +#include "shishua-half-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +#include "shishua-half-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua half. +// Aims for a balance between code size and performance. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[8]; // 2 lanes + uint64_t output[4]; // 1 lane + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +static inline void shishua_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + uint64_t t[8]; + // Write to buf + if (buf != NULL) { + for (size_t j = 0; j < 4; j++) { + shishua_write_le64(b, state->output[j]); + b += 8; + } + } + + for (size_t j = 0; j < 4; j++) { + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + state->state[j + 4] += state->counter[j]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. 
Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // + // It would be much easier and cleaner to just reinterpret as a uint32_t + // pointer or use memcpy, but that is unfortunately endian-dependent, and + // the former also breaks strict aliasing. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + // + // See shishua.h for details. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4, // left + 3, 0, 1, 2, 6, 7, 4, 5}; // right + for (size_t j = 0; j < 8; j++) { + t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); + } + for (size_t j = 0; j < 4; j++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = state->state[j + 0] >> 1; + uint64_t u_hi = state->state[j + 4] >> 3; + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + state->state[j + 0] = u_lo + t[j + 0]; + state->state[j + 4] = u_hi + t[j + 4]; + + // Two orthogonally grown pieces evolving independently, XORed. 
+ state->output[j] = u_lo ^ t[j + 4];
+ }
+ }
+}
+#undef SHISHUA_RESTRICT
+
+// Nothing up my sleeve: those are the hex digits of Φ,
+// the least approximable irrational number.
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
+static uint64_t phi[8] = {
+ 0x9E3779B97F4A7C15,
+ 0xF39CC0605CEDC834,
+ 0x1082276BF3A27251,
+ 0xF86C6A11D0C18E95,
+ 0x2767F0B153D27B7F,
+ 0x0347045B5BF1827F,
+ 0x01886F0928403002,
+ 0xC1D64BA40F335E36,
+};
+
+void prng_init(prng_state *s, uint64_t seed[4]) {
+ memset(s, 0, sizeof(prng_state));
+
+#define STEPS 5
+#define ROUNDS 4
+ // Diffuse first two seed elements in s0, then the last two. Same for s1.
+ // We must keep half of the state unchanged so users cannot set a bad state.
+ memcpy(s->state, phi, sizeof(phi));
+ for (size_t i = 0; i < 4; i++) {
+ s->state[i * 2] ^= seed[i];
+ }
+ for (size_t i = 0; i < ROUNDS; i++) {
+ prng_gen(s, NULL, 32 * STEPS);
+ for (size_t j = 0; j < 4; j++) {
+ s->state[j + 0] = s->state[j + 4];
+ s->state[j + 4] = s->output[j];
+ }
+ }
+#undef STEPS
+#undef ROUNDS
+}
+#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR
+#endif // SHISHUA_HALF_H
diff --git a/external/shishua/shishua-neon.h b/external/shishua/shishua-neon.h
new file mode 100644
index 000000000..6cd5124bd
--- /dev/null
+++ b/external/shishua/shishua-neon.h
@@ -0,0 +1,199 @@
+// An ARM NEON version of shishua.
+#ifndef SHISHUA_NEON_H
+#define SHISHUA_NEON_H
+#include <stddef.h>
+#include <stdint.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+typedef struct prng_state {
+ uint64x2_t state[8];
+ uint64x2_t output[8];
+ uint64x2_t counter[2];
+} prng_state;
+
+#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t){a, b})
+#else
+#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b))
+#endif
+// To hide the vreinterpret ritual.
+#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); + b += 16; + } + } + // NEON has less register pressure than SSE2, but we reroll it anyways for + // code size. + for (size_t j = 0; j < 2; j++) { + uint64x2_t s0_lo = s->state[j * 4 + 0], + s0_hi = s->state[j * 4 + 1], + s1_lo = s->state[j * 4 + 2], + s1_hi = s->state[j * 4 + 3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Addition is the main source of diffusion. 
+ // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); + s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); + s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); + + // The first orthogonally grown pieces evolving independently, XORed. + s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); + s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); + } + // The second orthogonally grown piece evolving independently, XORed. + s->output[4] = veorq_u64(s->state[0], s->state[6]); + s->output[5] = veorq_u64(s->state[1], s->state[7]); + + s->output[6] = veorq_u64(s->state[2], s->state[4]); + s->output[7] = veorq_u64(s->state[3], s->state[5]); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + } + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define ROUNDS 13 +#define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); + uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); + uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); + uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); + s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[0])); + s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[2])); + s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[4])); + s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[6])); + s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[8])); + s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); + s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); + s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; + s->state[1] = s->output[7]; + s->state[2] = s->output[4]; + s->state[3] = s->output[5]; + s->state[4] = s->output[2]; + s->state[5] = s->output[3]; + s->state[6] = s->output[0]; + s->state[7] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT +#endif diff --git a/external/shishua/shishua-sse2.h b/external/shishua/shishua-sse2.h new file mode 100644 index 000000000..af28300ad --- /dev/null +++ b/external/shishua/shishua-sse2.h @@ -0,0 +1,228 @@ +// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +// Consider the half version if AVX2 is not available. +#ifndef SHISHUA_SSE2_H +#define SHISHUA_SSE2_H +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
+// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[8]; + __m128i output[8]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ + // increment = { 7, 5, 3, 1 }; + __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); + } + } + + // There are only 16 SSE registers (8 on i686), and we have to account for + // temporary copies due to being stuck with 2-operand instructions. + // Therefore, we use fixed iteration loops to reduce code complexity while + // still allowing the compiler to easily unroll the loop. + // We also try to keep variables active for as short as possible. + for (size_t j = 0; j < 2; j++) { + __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; + + // Lane 0 + s_lo = s->state[4 * j + 0]; + s_hi = s->state[4 * j + 1]; + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0_lo = _mm_srli_epi64(s_lo, 1); + u0_hi = _mm_srli_epi64(s_hi, 1); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); + t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); + s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); + + // Lane 1 + s_lo = s->state[4 * j + 2]; + s_hi = s->state[4 * j + 3]; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s_lo = _mm_add_epi64(s_lo, counter_lo); + s_hi = _mm_add_epi64(s_hi, counter_hi); + + // Same as above but with different shifts + u1_lo = _mm_srli_epi64(s_lo, 3); + u1_hi = _mm_srli_epi64(s_hi, 3); + + t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); + t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); + + s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); + s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); + + // Merge lane 0 and lane 1 + // The first orthogonally grown piece evolving independently, XORed. + s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); + s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); + } + + // The second orthogonally grown piece evolving independently, XORed. 
+ s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); + s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); + s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); + s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + } + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + // Note: output is uninitialized at first, but since we pass NULL, its value + // is initially ignored. + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +#define ROUNDS 13 +#define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[8])); + s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); + s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); + s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; + s->state[1] = s->output[7]; + s->state[2] = s->output[4]; + s->state[3] = s->output[5]; + s->state[4] = s->output[2]; + s->state[5] = s->output[3]; + s->state[6] = s->output[0]; + s->state[7] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT +#endif diff --git a/external/shishua/shishua.h b/external/shishua/shishua.h new file mode 100644 index 000000000..a0bb9248d --- /dev/null +++ b/external/shishua/shishua.h @@ -0,0 +1,244 @@ +#ifndef SHISHUA_H +#define SHISHUA_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +#if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +#define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +#elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. 
The +// scalar path ends up being faster. +// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// algorithm | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +#define SHISHUA_TARGET SHISHUA_TARGET_NEON +#else +#define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +#endif +#endif + +// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only +// need this header. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +#include "shishua-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +#include "shishua-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +#include "shishua-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua. +// Designed to balance performance and code size. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[16]; // 4 lanes + uint64_t output[16]; // 4 lanes, 2 parts + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// Writes a 64-bit little endian integer to dst +static inline void prng_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 16; j++) { + prng_write_le64(b, state->output[j]); + b += 8; + } + } + // Similar to SSE, use fixed iteration loops to reduce code complexity + // and allow the compiler more control over optimization. + for (size_t j = 0; j < 2; j++) { + // I don't want to type this 15 times. + uint64_t *s = &state->state[j * 8]; // 2 lanes + uint64_t *o = &state->output[j * 4]; // 1 lane + uint64_t t[8]; // temp buffer + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + for (size_t k = 0; k < 4; k++) { + s[k + 4] += state->counter[k]; + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // + // You may notice that they are simply 256-bit rotations (96 and 160): + // + // t0 = (s0 << 96) | (s0 >> (256 - 96)); + // t1 = (s1 << 160) | (s1 >> (256 - 160)); + // + // The easiest way to do this would be to cast s and t to uint32_t * + // and operate on them that way. + // + // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; + // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; + // for (size_t k = 0; k < 4; k++) { + // t0_32[k] = s0_32[(k + 5) % 8]; + // t1_32[k] = s1_32[(k + 3) % 8]; + // } + // + // This is pretty, but it violates strict aliasing and relies on little + // endian data layout. + // + // A common workaround to strict aliasing would be to use memcpy: + // + // // legal casts + // unsigned char *t8 = (unsigned char *)t; + // unsigned char *s8 = (unsigned char *)s; + // memcpy(&t8[0], &s8[20], 32 - 20); + // memcpy(&t8[32 - 20], &s8[0], 20); + // + // However, this still doesn't fix the endianness issue, and is very + // ugly. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4, // left + 3, 0, 1, 2, 6, 7, 4, 5}; // right + for (size_t k = 0; k < 8; k++) { + t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); + } + + for (size_t k = 0; k < 4; k++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = s[k + 0] >> 1; + uint64_t u_hi = s[k + 4] >> 3; + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s[k + 0] = u_lo + t[k + 0]; + s[k + 4] = u_hi + t[k + 4]; + + // The first orthogonally grown piece evolving independently, XORed. + o[k] = u_lo ^ t[k + 4]; + } + } + + // Merge together. + for (size_t j = 0; j < 4; j++) { + // The second orthogonally grown piece evolving independently, XORed. + state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; + state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } + s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j + 0] = s->output[j + 12]; + s->state[j + 4] = s->output[j + 8]; + s->state[j + 8] = s->output[j + 4]; + s->state[j + 12] = s->output[j + 0]; + } + } +#undef STEPS +#undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_SCALAR_H diff --git a/src/helper/shishua/shishua-avx2.h b/src/helper/shishua/shishua-avx2.h deleted file mode 100644 index c2c865e80..000000000 --- a/src/helper/shishua/shishua-avx2.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef SHISHUA_AVX2_H -#define SHISHUA_AVX2_H -#include -#include -#include -#include -#include -typedef struct prng_state { - __m256i state[4]; - __m256i output[4]; - __m256i counter; -} prng_state; - -// buf's size must be a multiple of 128 bytes. 
-static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { - __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], - s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], - t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), - shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - if (buf != NULL) { - _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); - _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); - _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); - _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. 
- s1 = _mm256_add_epi64(s1, counter); - s3 = _mm256_add_epi64(s3, counter); - counter = _mm256_add_epi64(counter, increment); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); - u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); - t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); - t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); - s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); - - // Two orthogonally grown pieces evolving independently, XORed. - o0 = _mm256_xor_si256(u0, t1); - o1 = _mm256_xor_si256(u2, t3); - o2 = _mm256_xor_si256(s0, s3); - o3 = _mm256_xor_si256(s2, s1); - } - s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; - s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; - s->counter = counter; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 1 -# define ROUNDS 13 - uint8_t buf[128 * STEPS]; - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); - s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); - s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); - s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, buf, 128 * STEPS); - s->state[0] = s->output[3]; s->state[1] = s->output[2]; - s->state[2] = s->output[1]; s->state[3] = s->output[0]; - } -# undef STEPS -# undef ROUNDS -} -#endif diff --git a/src/helper/shishua/shishua-half-avx2.h b/src/helper/shishua/shishua-half-avx2.h deleted file mode 100644 index b21f46174..000000000 --- a/src/helper/shishua/shishua-half-avx2.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef SHISHUA_H -#define SHISHUA_H -#include -#include -#include -#include -#include -typedef struct prng_state { - __m256i state[2]; - __m256i output; - __m256i counter; -} prng_state; - -// buf's size must be a multiple of 32 bytes. 
-static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { - __m256i s0 = s->state[0], counter = s->counter, - s1 = s->state[1], o = s->output, - t0, t1, t2, t3, u0, u1, u2, u3; - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), - shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^69 bytes = 512 EiB to the period, - // or about 1 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - if (buf != NULL) { - _mm256_storeu_si256((__m256i*)&buf[i], o); - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1 = _mm256_add_epi64(s1, counter); - counter = _mm256_add_epi64(counter, increment); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. 
We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); - t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); - - // Two orthogonally grown pieces evolving independently, XORed. - o = _mm256_xor_si256(u0, t1); - } - s->state[0] = s0; s->counter = counter; - s->state[1] = s1; s->output = o; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 5 -# define ROUNDS 4 - uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); - s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, buf, 32 * STEPS); - s->state[0] = s->state[1]; - s->state[1] = s->output; - } -# undef STEPS -# undef ROUNDS -} -#endif diff --git a/src/helper/shishua/shishua-half-neon.h b/src/helper/shishua/shishua-half-neon.h deleted file mode 100644 index f946a9aee..000000000 --- a/src/helper/shishua/shishua-half-neon.h +++ /dev/null @@ -1,155 +0,0 @@ -// An ARM NEON version of shishua. -#ifndef SHISHUA_HALF_NEON_H -#define SHISHUA_HALF_NEON_H -#include -#include -#include -#include -// A NEON version. -typedef struct prng_state { - uint64x2_t state[4]; - uint64x2_t output[2]; - uint64x2_t counter[2]; -} prng_state; - -#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) - -// To hide the vreinterpret ritual. -#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ - vreinterpretq_u64_u8( \ - vextq_u8( \ - vreinterpretq_u8_u64(Rn), \ - vreinterpretq_u8_u64(Rm), \ - (Imm) \ - ) \ - ) - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], - s1_lo = s->state[2], s1_hi = s->state[3], - o_lo = s->output[0], o_hi = s->output[1], - t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, - counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. 
- // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); - uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size/32; i++) { - if (buf != NULL) { - vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); b += 16; - vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); b += 16; - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1_lo = vaddq_u64(s1_lo, counter_lo); - s1_hi = vaddq_u64(s1_hi, counter_hi); - - counter_lo = vaddq_u64(counter_lo, increment_lo); - counter_hi = vaddq_u64(counter_hi, increment_hi); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 vext.8 instructions. - t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); - t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); - t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); - t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. 
We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); -#if defined(__clang__) - // UGLY HACK: Clang enjoys merging the above statements with the vadds below - // into vsras. - // This is dumb, as it still needs to do the original vshr for the xor mix, - // causing it to shift twice. - // This makes Clang assume that this line has side effects, preventing the - // combination and speeding things up significantly. - __asm__("" : "+w" (u_lo), "+w" (u_hi)); -#endif - - - // Two orthogonally grown pieces evolving independently, XORed. - // Note: We do this first because on the assembly side, vsra would overwrite - // t1_lo and t1_hi. - o_lo = veorq_u64(u_lo, t1_lo); - o_hi = veorq_u64(u_hi, t1_hi); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0_lo = vaddq_u64(t0_lo, u_lo); - s0_hi = vaddq_u64(t0_hi, u_hi); - // Use vsra here directly. - s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); - s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); - } - s->state[0] = s0_lo; s->state[1] = s0_hi; - s->state[2] = s1_lo; s->state[3] = s1_hi; - s->output[0] = o_lo; s->output[1] = o_hi; - s->counter[0] = counter_lo; s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = vdupq_n_u64(0); - s->counter[1] = vdupq_n_u64(0); -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - for (size_t i = 0; i < 4; i++) { - s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - s->state[0] = s->state[2]; s->state[1] = s->state[3]; - s->state[2] = s->output[0]; s->state[3] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_VSETQ_N_U64 -#undef SHISHUA_VEXTQ_U8 -#undef SHISHUA_RESTRICT - -#endif diff --git a/src/helper/shishua/shishua-half-sse2.h b/src/helper/shishua/shishua-half-sse2.h deleted file mode 100644 index d6f958930..000000000 --- a/src/helper/shishua/shishua-half-sse2.h +++ /dev/null @@ -1,194 +0,0 @@ -// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. -// Also compatible with 32-bit x86. -// -// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. -// We can still emulate it on SSE2, but it is slower. -#ifndef SHISHUA_HALF_SSE2_H -#define SHISHUA_HALF_SSE2_H - -#include -#include -#include -// Note: cl.exe doesn't define __SSSE3__ -#if defined(__SSSE3__) || defined(__AVX__) -# include // SSSE3 -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_alignr_epi8(hi, lo, amt) -#else -# include // SSE2 -// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. -// The compiler may convert it to a sequence of shufps instructions, which is -// perfectly fine. 
-# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_or_si128( \ - _mm_slli_si128(hi, 16 - (amt)), \ - _mm_srli_si128(lo, amt) \ - ) -#endif - -typedef struct prng_state { - __m128i state[4]; - __m128i output[2]; - __m128i counter[2]; -} prng_state; - -// Wrappers for x86 targets which usually lack these intrinsics. -// Don't call these with side effects. -#if defined(__x86_64__) || defined(_M_X64) -# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) -# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) -#else -# define SHISHUA_SET_EPI64X(b, a) \ - _mm_set_epi32( \ - (int)(((uint64_t)(b)) >> 32), \ - (int)(b), \ - (int)(((uint64_t)(a)) >> 32), \ - (int)(a) \ - ) -# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) -#endif - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - // We need a lot of variables... - __m128i s0_lo = s->state[0], s0_hi = s->state[1], - s1_lo = s->state[2], s1_hi = s->state[3], - t0_lo, t0_hi, t1_lo, t1_hi, - u_lo, u_hi, - counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. 
- // increment = { 7, 5, 3, 1 }; - const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); - const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); - - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - // Split to try to reduce register pressure - if (buf != NULL) { - _mm_storeu_si128((__m128i*)&buf[i+ 0], s->output[0]); - _mm_storeu_si128((__m128i*)&buf[i+16], s->output[1]); - } - - // Lane 0 - - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 _mm_alignr_epi8 instructions. - t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); - t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = _mm_srli_epi64(s0_lo, 1); - u_hi = _mm_srli_epi64(s0_hi, 1); - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. 
- s0_lo = _mm_add_epi64(u_lo, t0_lo); - s0_hi = _mm_add_epi64(u_hi, t0_hi); - - // Lane 1 - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1_lo = _mm_add_epi64(s1_lo, counter_lo); - s1_hi = _mm_add_epi64(s1_hi, counter_hi); - - // Increment the counter - counter_lo = _mm_add_epi64(counter_lo, increment_lo); - counter_hi = _mm_add_epi64(counter_hi, increment_hi); - - // Same as above, but with different shift amounts. - t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); - t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); - - s1_lo = _mm_srli_epi64(s1_lo, 3); - s1_hi = _mm_srli_epi64(s1_hi, 3); - - s1_lo = _mm_add_epi64(s1_lo, t1_lo); - s1_hi = _mm_add_epi64(s1_hi, t1_hi); - - // Two orthogonally grown pieces evolving independently, XORed. - s->output[0] = _mm_xor_si128(u_lo, t1_lo); - s->output[1] = _mm_xor_si128(u_hi, t1_hi); - } - - s->state[0] = s0_lo; s->state[1] = s0_hi; - s->state[2] = s1_lo; s->state[3] = s1_hi; - - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = _mm_setzero_si128(); - s->counter[1] = _mm_setzero_si128(); -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - // XXX: better way to do this? 
- __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); - __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); - __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); - __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); - s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); - s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); - s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); - s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - s->state[0] = s->state[2]; s->state[1] = s->state[3]; - s->state[2] = s->output[0]; s->state[3] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_ALIGNR_EPI8 -#undef SHISHUA_CVTSI64_SI128 -#undef SHISHUA_SET_EPI64X -#undef SHISHUA_RESTRICT - -#endif diff --git a/src/helper/shishua/shishua-half.h b/src/helper/shishua/shishua-half.h deleted file mode 100644 index a1ea715b1..000000000 --- a/src/helper/shishua/shishua-half.h +++ /dev/null @@ -1,195 +0,0 @@ -#ifndef SHISHUA_HALF_H -#define SHISHUA_HALF_H - -#define SHISHUA_TARGET_SCALAR 0 -#define SHISHUA_TARGET_AVX2 1 -#define SHISHUA_TARGET_SSE2 2 -#define SHISHUA_TARGET_NEON 3 - -#ifndef SHISHUA_TARGET -# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) -# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 -# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) \ - || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 -// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The -// scalar path ends up being faster. 
-// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 -// | GCC 9.2.0 | Clang 9.0.1 -// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte -// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte -// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte -// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte -// Therefore, we only autoselect the NEON path on Clang, at least until GCC's -// NEON codegen improves. -# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) -# define SHISHUA_TARGET SHISHUA_TARGET_NEON -# else -# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR -# endif -#endif - -// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only -// need this header. -// Additionally, these headers can also be used on their own. -#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 -# include "shishua-half-avx2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 -# include "shishua-half-sse2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON -# include "shishua-half-neon.h" -#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR - -// Portable scalar implementation of shishua half. -// Aims for a balance between code size and performance. -#include -#include -#include -#include - -// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. -typedef struct prng_state { - uint64_t state[8]; // 2 lanes - uint64_t output[4]; // 1 lane - uint64_t counter[4]; // 1 lane -} prng_state; - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -static inline void shishua_write_le64(void *dst, uint64_t val) { - // Define to write in native endianness with memcpy - // Also, use memcpy on known little endian setups. 
-# if defined(SHISHUA_NATIVE_ENDIAN) \ - || defined(_WIN32) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - || defined(__LITTLE_ENDIAN__) - memcpy(dst, &val, sizeof(uint64_t)); -#else - // Byteshift write. - uint8_t *d = (uint8_t *)dst; - for (size_t i = 0; i < 8; i++) { - d[i] = (uint8_t)(val & 0xff); - val >>= 8; - } -#endif -} - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - uint64_t t[8]; - // Write to buf - if (buf != NULL) { - for (size_t j = 0; j < 4; j++) { - shishua_write_le64(b, state->output[j]); - b += 8; - } - } - - for (size_t j = 0; j < 4; j++) { - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - state->state[j + 4] += state->counter[j]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - // - // For the scalar version, we calculate this dynamically, as it is - // simple enough. - state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 - } - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. 
Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // - // It would be much easier and cleaner to just reinterpret as a uint32_t - // pointer or use memcpy, but that is unfortunately endian-dependent, and - // the former also breaks strict aliasing. - // - // The only known solution which doesn't rely on endianness is to - // read two 64-bit integers and do a funnel shift. - // - // See shishua.h for details. - - // Lookup table for the _offsets_ in the shuffle. Even lanes rotate - // by 5, odd lanes rotate by 3. - // If it were by 32-bit lanes, it would be - // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } - const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left - 3,0,1,2, 6,7,4,5 }; // right - for (size_t j = 0; j < 8; j++) { - t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); - } - for (size_t j = 0; j < 4; j++) { - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - uint64_t u_lo = state->state[j + 0] >> 1; - uint64_t u_hi = state->state[j + 4] >> 3; - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - state->state[j + 0] = u_lo + t[j + 0]; - state->state[j + 4] = u_hi + t[j + 4]; - - // Two orthogonally grown pieces evolving independently, XORed. 
- state->output[j] = u_lo ^ t[j + 4]; - } - } -} -#undef SHISHUA_RESTRICT - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); - -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - memcpy(s->state, phi, sizeof(phi)); - for (size_t i = 0; i < 4; i++) { - s->state[i * 2] ^= seed[i]; - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - for (size_t j = 0; j < 4; j++) { - s->state[j + 0] = s->state[j + 4]; - s->state[j + 4] = s->output[j]; - } - } -# undef STEPS -# undef ROUNDS -} -#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR -#endif // SHISHUA_HALF_SCALAR_H diff --git a/src/helper/shishua/shishua-neon.h b/src/helper/shishua/shishua-neon.h deleted file mode 100644 index b3ecbafdd..000000000 --- a/src/helper/shishua/shishua-neon.h +++ /dev/null @@ -1,180 +0,0 @@ -// An ARM NEON version of shishua. -#ifndef SHISHUA_NEON_H -#define SHISHUA_NEON_H -#include -#include -#include -#include - -typedef struct prng_state { - uint64x2_t state[8]; - uint64x2_t output[8]; - uint64x2_t counter[2]; -} prng_state; - -#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__) -# define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t) { a, b }) -#else -# define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) -#endif -// To hide the vreinterpret ritual. 
-#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ - vreinterpretq_u64_u8( \ - vextq_u8( \ - vreinterpretq_u8_u64(Rn), \ - vreinterpretq_u8_u64(Rm), \ - (Imm) \ - ) \ - ) - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint8_t *b = buf; - uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); - uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 8; j++) { - vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); - b += 16; - } - } - // NEON has less register pressure than SSE2, but we reroll it anyways for - // code size. - for (size_t j = 0; j < 2; j++) { - uint64x2_t s0_lo = s->state[j * 4 + 0], - s0_hi = s->state[j * 4 + 1], - s1_lo = s->state[j * 4 + 2], - s1_hi = s->state[j * 4 + 3], - t0_lo, t0_hi, t1_lo, t1_hi, - u_lo, u_hi; - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. 
- s1_lo = vaddq_u64(s1_lo, counter_lo); s1_hi = vaddq_u64(s1_hi, counter_hi); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 vext.8 instructions. - t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); - t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); -#if defined(__clang__) - // UGLY HACK: Clang enjoys merging the above statements with the vadds below - // into vsras. - // This is dumb, as it still needs to do the original vshr for the xor mix, - // causing it to shift twice. - // This makes Clang assume that this line has side effects, preventing the - // combination and speeding things up significantly. - __asm__("" : "+w" (u_lo), "+w" (u_hi)); -#endif - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. 
- s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); - s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); - // Use vsra here directly. - s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); - s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); - - // The first orthogonally grown pieces evolving independently, XORed. - s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); - s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); - - } - // The second orthogonally grown piece evolving independently, XORed. - s->output[4] = veorq_u64(s->state[0], s->state[6]); - s->output[5] = veorq_u64(s->state[1], s->state[7]); - - s->output[6] = veorq_u64(s->state[2], s->state[4]); - s->output[7] = veorq_u64(s->state[3], s->state[5]); - - counter_lo = vaddq_u64(counter_lo, increment_lo); - counter_hi = vaddq_u64(counter_hi, increment_hi); - } - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = vdupq_n_u64(0); - s->counter[1] = vdupq_n_u64(0); -# define ROUNDS 13 -# define STEPS 1 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); - uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); - uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); - uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); - s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[ 0])); - s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[ 2])); - s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[ 4])); - s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[ 6])); - s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[ 8])); - s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); - s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); - s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - s->state[0] = s->output[6]; s->state[1] = s->output[7]; - s->state[2] = s->output[4]; s->state[3] = s->output[5]; - s->state[4] = s->output[2]; s->state[5] = s->output[3]; - s->state[6] = s->output[0]; s->state[7] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_VSETQ_N_U64 -#undef SHISHUA_VEXTQ_U8 -#undef SHISHUA_RESTRICT -#endif diff --git a/src/helper/shishua/shishua-sse2.h b/src/helper/shishua/shishua-sse2.h deleted file mode 100644 index 3a9807638..000000000 --- a/src/helper/shishua/shishua-sse2.h +++ /dev/null @@ -1,214 +0,0 @@ -// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. -// Also compatible with 32-bit x86. -// -// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. -// We can still emulate it on SSE2, but it is slower. -// Consider the half version if AVX2 is not available. -#ifndef SHISHUA_SSE2_H -#define SHISHUA_SSE2_H -#include -#include -#include -// Note: cl.exe doesn't define __SSSE3__ -#if defined(__SSSE3__) || defined(__AVX__) -# include // SSSE3 -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_alignr_epi8(hi, lo, amt) -#else -# include // SSE2 -// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
-// The compiler may convert it to a sequence of shufps instructions, which is -// perfectly fine. -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_or_si128( \ - _mm_slli_si128(hi, 16 - (amt)), \ - _mm_srli_si128(lo, amt) \ - ) -#endif - -typedef struct prng_state { - __m128i state[8]; - __m128i output[8]; - __m128i counter[2]; -} prng_state; - -// Wrappers for x86 targets which usually lack these intrinsics. -// Don't call these with side effects. -#if defined(__x86_64__) || defined(_M_X64) -# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) -# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) -#else -# define SHISHUA_SET_EPI64X(b, a) \ - _mm_set_epi32( \ - (int)(((uint64_t)(b)) >> 32), \ - (int)(b), \ - (int)(((uint64_t)(a)) >> 32), \ - (int)(a) \ - ) -# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) -#endif - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. 
- - // increment = { 7, 5, 3, 1 }; - __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); - __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); - - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); - } - } - - // There are only 16 SSE registers (8 on i686), and we have to account for - // temporary copies due to being stuck with 2-operand instructions. - // Therefore, we use fixed iteration loops to reduce code complexity while - // still allowing the compiler to easily unroll the loop. - // We also try to keep variables active for as short as possible. - for (size_t j = 0; j < 2; j++) { - __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; - - // Lane 0 - s_lo = s->state[4 * j + 0]; - s_hi = s->state[4 * j + 1]; - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0_lo = _mm_srli_epi64(s_lo, 1); - u0_hi = _mm_srli_epi64(s_hi, 1); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. 
- // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 _mm_alignr_epi8 instructions. - t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); - t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); - s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); - - // Lane 1 - s_lo = s->state[4 * j + 2]; - s_hi = s->state[4 * j + 3]; - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s_lo = _mm_add_epi64(s_lo, counter_lo); - s_hi = _mm_add_epi64(s_hi, counter_hi); - - // Same as above but with different shifts - u1_lo = _mm_srli_epi64(s_lo, 3); - u1_hi = _mm_srli_epi64(s_hi, 3); - - t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); - t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); - - s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); - s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); - - // Merge lane 0 and lane 1 - // The first orthogonally grown piece evolving independently, XORed. - s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); - s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); - } - - // The second orthogonally grown piece evolving independently, XORed. 
- s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); - s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); - s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); - s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); - - // Increment the counter - counter_lo = _mm_add_epi64(counter_lo, increment_lo); - counter_hi = _mm_add_epi64(counter_hi, increment_hi); - } - - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - // Note: output is uninitialized at first, but since we pass NULL, its value - // is initially ignored. - s->counter[0] = _mm_setzero_si128(); - s->counter[1] = _mm_setzero_si128(); -# define ROUNDS 13 -# define STEPS 1 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); - __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); - __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); - __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); - s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[ 0])); - s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[ 2])); - s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 4])); - s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[ 6])); - s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 8])); - s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); - s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); - s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - s->state[0] = s->output[6]; s->state[1] = s->output[7]; - s->state[2] = s->output[4]; s->state[3] = s->output[5]; - s->state[4] = s->output[2]; s->state[5] = s->output[3]; - s->state[6] = s->output[0]; s->state[7] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_CVTSI64_SI128 -#undef SHISHUA_ALIGNR_EPI8 -#undef SHISHUA_SET_EPI64X -#undef SHISHUA_RESTRICT -#endif diff --git a/src/helper/shishua/shishua.h b/src/helper/shishua/shishua.h deleted file mode 100644 index 352a8aac2..000000000 --- a/src/helper/shishua/shishua.h +++ /dev/null @@ -1,234 +0,0 @@ -#ifndef SHISHUA_H -#define SHISHUA_H - -#define SHISHUA_TARGET_SCALAR 0 -#define SHISHUA_TARGET_AVX2 1 -#define SHISHUA_TARGET_SSE2 2 -#define SHISHUA_TARGET_NEON 3 - -#ifndef SHISHUA_TARGET -# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) -# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 -# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 -// GCC's NEON codegen leaves much to be desired, at least as of 
9.2.0. The -// scalar path ends up being faster. -// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 -// algorithm | GCC 9.2.0 | Clang 9.0.1 -// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte -// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte -// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte -// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte -// Therefore, we only autoselect the NEON path on Clang, at least until GCC's -// NEON codegen improves. -# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) -# define SHISHUA_TARGET SHISHUA_TARGET_NEON -# else -# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR -# endif -#endif - -// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only -// need this header. -#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 -# include "shishua-avx2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 -# include "shishua-sse2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON -# include "shishua-neon.h" -#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR - -// Portable scalar implementation of shishua. -// Designed to balance performance and code size. -#include -#include -#include -#include - -// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. -typedef struct prng_state { - uint64_t state[16]; // 4 lanes - uint64_t output[16]; // 4 lanes, 2 parts - uint64_t counter[4]; // 1 lane -} prng_state; - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// Writes a 64-bit little endian integer to dst -static inline void prng_write_le64(void *dst, uint64_t val) { - // Define to write in native endianness with memcpy - // Also, use memcpy on known little endian setups. 
-#if defined(SHISHUA_NATIVE_ENDIAN) \ - || defined(_WIN32) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - || defined(__LITTLE_ENDIAN__) - memcpy(dst, &val, sizeof(uint64_t)); -#else - // Byteshift write. - uint8_t *d = (uint8_t *)dst; - for (size_t i = 0; i < 8; i++) { - d[i] = (uint8_t)(val & 0xff); - val >>= 8; - } -#endif -} - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 16; j++) { - prng_write_le64(b, state->output[j]); b += 8; - } - } - // Similar to SSE, use fixed iteration loops to reduce code complexity - // and allow the compiler more control over optimization. - for (size_t j = 0; j < 2; j++) { - // I don't want to type this 15 times. - uint64_t *s = &state->state[j * 8]; // 2 lanes - uint64_t *o = &state->output[j * 4]; // 1 lane - uint64_t t[8]; // temp buffer - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - for (size_t k = 0; k < 4; k++) { - s[k + 4] += state->counter[k]; - } - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. 
- // - // You may notice that they are simply 256-bit rotations (96 and 160): - // - // t0 = (s0 << 96) | (s0 >> (256 - 96)); - // t1 = (s1 << 160) | (s1 >> (256 - 160)); - // - // The easiest way to do this would be to cast s and t to uint32_t * - // and operate on them that way. - // - // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; - // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; - // for (size_t k = 0; k < 4; k++) { - // t0_32[k] = s0_32[(k + 5) % 8]; - // t1_32[k] = s1_32[(k + 3) % 8]; - // } - // - // This is pretty, but it violates strict aliasing and relies on little - // endian data layout. - // - // A common workaround to strict aliasing would be to use memcpy: - // - // // legal casts - // unsigned char *t8 = (unsigned char *)t; - // unsigned char *s8 = (unsigned char *)s; - // memcpy(&t8[0], &s8[20], 32 - 20); - // memcpy(&t8[32 - 20], &s8[0], 20); - // - // However, this still doesn't fix the endianness issue, and is very - // ugly. - // - // The only known solution which doesn't rely on endianness is to - // read two 64-bit integers and do a funnel shift. - - // Lookup table for the _offsets_ in the shuffle. Even lanes rotate - // by 5, odd lanes rotate by 3. - // If it were by 32-bit lanes, it would be - // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } - const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left - 3,0,1,2, 6,7,4,5 }; // right - for (size_t k = 0; k < 8; k++) { - t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); - } - - for (size_t k = 0; k < 4; k++) { - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. 
We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - uint64_t u_lo = s[k + 0] >> 1; - uint64_t u_hi = s[k + 4] >> 3; - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s[k + 0] = u_lo + t[k + 0]; - s[k + 4] = u_hi + t[k + 4]; - - // The first orthogonally grown piece evolving independently, XORed. - o[k] = u_lo ^ t[k + 4]; - } - } - - // Merge together. - for (size_t j = 0; j < 4; j++) { - // The second orthogonally grown piece evolving independently, XORed. - state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; - state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - // - // For the scalar version, we calculate this dynamically, as it is - // simple enough. - state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 - } - } -} -#undef SHISHUA_RESTRICT - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 1 -# define ROUNDS 13 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - memcpy(s->state, phi, sizeof(phi)); - for (size_t i = 0; i < 4; i++) { - s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } - s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - for (size_t j = 0; j < 4; j++) { - s->state[j+ 0] = s->output[j+12]; - s->state[j+ 4] = s->output[j+ 8]; - s->state[j+ 8] = s->output[j+ 4]; - s->state[j+12] = s->output[j+ 0]; - } - } -# undef STEPS -# undef ROUNDS -} -#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR -#endif // SHISHUA_SCALAR_H diff --git a/src/helper/shishua/rand.c b/src/rand.c similarity index 98% rename from src/helper/shishua/rand.c rename to src/rand.c index fe566e4dd..7d31a04cc 100644 --- a/src/helper/shishua/rand.c +++ b/src/rand.c @@ -1,8 +1,8 @@ +#include "rand.h" + #include #include -#include "rand.h" - uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { assert(max > min); assert(randc->count <= 127); @@ -34,7 +34,7 @@ float randf(struct rand_chunk *randc, float range, float offset) { return randfloat; } -//NOTE: example +// NOTE: example // int main() { // prng_state state; diff --git a/src/helper/shishua/rand.h b/src/rand.h similarity index 76% rename from src/helper/shishua/rand.h rename to src/rand.h 
index bdfc71f84..40a6d4d2e 100644 --- a/src/helper/shishua/rand.h +++ b/src/rand.h @@ -1,12 +1,12 @@ #ifndef SHISHUA_RAND_H #define SHISHUA_RAND_H -#include "shishua.h" +#include #include -struct rand_chunk{ +struct rand_chunk { size_t count; - uint8_t buf[128]; //NOTE: must be multiple of 128 + uint8_t buf[128]; // NOTE: must be multiple of 128 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); From 4e1fc685a5d02cefdd89e5759bf9d46bc672785c Mon Sep 17 00:00:00 2001 From: BrBa2 <108572893+barium-bromide@users.noreply.github.com> Date: Thu, 12 Jun 2025 14:49:08 +0800 Subject: [PATCH 03/11] helper function for random (#13) * helper function for random * Cleanup --------- Co-authored-by: Lee Hoon Lim --- CMakeLists.txt | 2 + external/shishua/shishua-avx2.h | 137 +++++++++++++++ external/shishua/shishua-half-avx2.h | 107 ++++++++++++ external/shishua/shishua-half-neon.h | 169 +++++++++++++++++++ external/shishua/shishua-half-sse2.h | 200 ++++++++++++++++++++++ external/shishua/shishua-half.h | 196 +++++++++++++++++++++ external/shishua/shishua-neon.h | 199 ++++++++++++++++++++++ external/shishua/shishua-sse2.h | 228 +++++++++++++++++++++++++ external/shishua/shishua.h | 244 +++++++++++++++++++++++++++ src/rand.c | 55 ++++++ src/rand.h | 16 ++ 11 files changed, 1553 insertions(+) create mode 100644 external/shishua/shishua-avx2.h create mode 100644 external/shishua/shishua-half-avx2.h create mode 100644 external/shishua/shishua-half-neon.h create mode 100644 external/shishua/shishua-half-sse2.h create mode 100644 external/shishua/shishua-half.h create mode 100644 external/shishua/shishua-neon.h create mode 100644 external/shishua/shishua-sse2.h create mode 100644 external/shishua/shishua.h create mode 100644 src/rand.c create mode 100644 src/rand.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4591d9ea1..95245ad12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g) function(apply_target target) 
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m) diff --git a/external/shishua/shishua-avx2.h b/external/shishua/shishua-avx2.h new file mode 100644 index 000000000..e25dcf20f --- /dev/null +++ b/external/shishua/shishua-avx2.h @@ -0,0 +1,137 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. 
+ // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i *)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i *)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i *)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); + u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); + t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); + s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. 
+ o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; + s->output[1] = o1; + s->output[2] = o2; + s->output[3] = o3; + s->state[0] = s0; + s->state[1] = s1; + s->state[2] = s2; + s->state[3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]); + s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 128 * STEPS); + s->state[0] = s->output[3]; + s->state[1] = s->output[2]; + s->state[2] = s->output[1]; + s->state[3] = s->output[0]; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-avx2.h b/external/shishua/shishua-half-avx2.h new file mode 100644 index 000000000..d1b3df42a --- /dev/null +++ b/external/shishua/shishua-half-avx2.h @@ -0,0 +1,107 @@ +#ifndef SHISHUA_H +#define SHISHUA_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[2]; + __m256i output; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i s0 = s->state[0], counter = s->counter, + s1 = s->state[1], o = s->output, + t0, t1, t2, t3, u0, u1, u2, u3; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^69 bytes = 512 EiB to the period, + // or about 1 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; + s->counter = counter; + s->state[1] = s1; + s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 5 +#define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-neon.h b/external/shishua/shishua-half-neon.h new file mode 100644 index 000000000..ba5655b14 --- /dev/null +++ b/external/shishua/shishua-half-neon.h @@ -0,0 +1,169 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size / 32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); + b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); + b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). 
+ // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. 
+ s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + s->output[0] = o_lo; + s->output[1] = o_hi; + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half-sse2.h b/external/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..91e1194fa --- /dev/null +++ b/external/shishua/shishua-half-sse2.h @@ -0,0 +1,200 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. 
+#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i *)&buf[i + 0], s->output[0]); + _mm_storeu_si128((__m128i *)&buf[i + 16], s->output[1]); + } + + // Lane 0 + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
    // We use rightward shift
    // because the rightmost bits have the least diffusion in addition (the low
    // bit is just a XOR of the low bits).
    u_lo = _mm_srli_epi64(s0_lo, 1);
    u_hi = _mm_srli_epi64(s0_hi, 1);

    // Addition is the main source of diffusion.
    // Storing the output in the state keeps that diffusion permanently.
    s0_lo = _mm_add_epi64(u_lo, t0_lo);
    s0_hi = _mm_add_epi64(u_hi, t0_hi);

    // Lane 1

    // I apply the counter to s1,
    // since it is the one whose shift loses most entropy.
    s1_lo = _mm_add_epi64(s1_lo, counter_lo);
    s1_hi = _mm_add_epi64(s1_hi, counter_hi);

    // Increment the counter
    counter_lo = _mm_add_epi64(counter_lo, increment_lo);
    counter_hi = _mm_add_epi64(counter_hi, increment_hi);

    // Same as above, but with different shift amounts.
    t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12);
    t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12);

    s1_lo = _mm_srli_epi64(s1_lo, 3);
    s1_hi = _mm_srli_epi64(s1_hi, 3);

    s1_lo = _mm_add_epi64(s1_lo, t1_lo);
    s1_hi = _mm_add_epi64(s1_hi, t1_hi);

    // Two orthogonally grown pieces evolving independently, XORed.
    s->output[0] = _mm_xor_si128(u_lo, t1_lo);
    s->output[1] = _mm_xor_si128(u_hi, t1_hi);
  }

  // Persist the working copies of the state and counter.
  s->state[0] = s0_lo;
  s->state[1] = s0_hi;
  s->state[2] = s1_lo;
  s->state[3] = s1_hi;

  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
};

// Seed the generator from 4x64 bits of seed material.
// Only every other 64-bit word of the state is XORed with the seed; the
// remaining words keep their φ-derived constants so a caller cannot select a
// degenerate (e.g. all-zero) state. The warm-up loop below then runs the
// generator with a NULL buffer (state advance only) and folds the output back
// into the state so the seed diffuses through every word before first use.
void prng_init(prng_state *s, uint64_t seed[4]) {
  s->counter[0] = _mm_setzero_si128();
  s->counter[1] = _mm_setzero_si128();
#define STEPS 5
#define ROUNDS 4
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  // XXX: better way to do this?
  __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]);
  __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]);
  __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]);
  __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]);
  s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0]));
  s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2]));
  s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4]));
  s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6]));

  // Each round advances the generator STEPS * 32 bytes, then rotates the
  // output lane back into the state.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 32 * STEPS);
    s->state[0] = s->state[2];
    s->state[1] = s->state[3];
    s->state[2] = s->output[0];
    s->state[3] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_ALIGNR_EPI8
#undef SHISHUA_CVTSI64_SI128
#undef SHISHUA_SET_EPI64X
#undef SHISHUA_RESTRICT

#endif
// Portable scalar implementation of shishua half.
// Half the state of full shishua (2 lanes instead of 4); balances code size
// against performance. Bit-compatible with the SIMD half variants.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t.
typedef struct prng_state {
  uint64_t state[8];   // 2 lanes
  uint64_t output[4];  // 1 lane
  uint64_t counter[4]; // 1 lane
} prng_state;

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Store a 64-bit value at dst in little-endian byte order.
static inline void shishua_write_le64(void *dst, uint64_t val) {
  // On targets known to be little-endian (or when SHISHUA_NATIVE_ENDIAN is
  // defined), a plain memcpy already produces the right bytes.
#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__)
  memcpy(dst, &val, sizeof(uint64_t));
#else
  // Endian-agnostic byte-shift write.
  uint8_t *d = (uint8_t *)dst;
  for (size_t i = 0; i < 8; i++) {
    d[i] = (uint8_t)(val & 0xff);
    val >>= 8;
  }
#endif
}

// Generate `size` bytes into buf (size must be a multiple of 32).
// Passing buf == NULL advances the state without writing output; prng_init
// relies on this for its warm-up rounds.
static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *cursor = buf;
  // TODO: consider adding proper uneven write handling
  assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes.");

  for (size_t done = 0; done < size; done += 32) {
    uint64_t shuffled[8];

    // Flush the previously computed output lane, unless the caller only
    // wants to advance the state.
    if (buf != NULL) {
      for (size_t w = 0; w < 4; w++) {
        shishua_write_le64(cursor, state->output[w]);
        cursor += 8;
      }
    }

    for (size_t w = 0; w < 4; w++) {
      // The counter is applied to the second lane (s1), the one whose shift
      // below loses the most entropy. It is not needed to beat PractRand, but
      // it puts a lower bound of 2^71 bytes on the period.
      state->state[w + 4] += state->counter[w];
      // Increments 7, 5, 3, 1: odd numbers are coprime with the 2^64 base, so
      // each 64-bit chunk covers its full cycle; a different odd number per
      // chunk adds a tiny amount of stirring. Computed dynamically here since
      // the scalar code makes that trivial.
      state->counter[w] += 7 - (w * 2); // 7, 5, 3, 1
    }

    // 256-bit rotations by 96 and 160 bits, expressed as per-word 64-bit
    // funnel shifts. This moves weak (low-diffusion) 32-bit halves of the
    // additions into strong positions; each part walks the ring A→B→…→H→A.
    // Reinterpreting as uint32_t would be simpler but is endian-dependent and
    // breaks strict aliasing — the funnel-shift form avoids both problems.
    //
    // Offset table for the shuffle: even lanes rotate by 5 (32-bit units),
    // odd lanes by 3. In 32-bit lanes this would be
    // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 }.
    const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4,  // left
                                      3, 0, 1, 2, 6, 7, 4, 5}; // right
    for (size_t w = 0; w < 8; w++) {
      shuffled[w] = (state->state[shuf_offsets[w]] >> 32) | (state->state[shuf_offsets[w + 8]] << 32);
    }

    for (size_t w = 0; w < 4; w++) {
      // Rightward shifts by the odd amounts 1 and 3 entangle bits across
      // 64-bit positions (odd so every bit reaches all positions; rightward
      // because the low bits have the least diffusion under addition; small
      // to minimize the information lost). Different amounts per lane
      // increase divergence between the two sides.
      uint64_t lo = state->state[w + 0] >> 1;
      uint64_t hi = state->state[w + 4] >> 3;

      // Addition is the main source of diffusion, and storing the sum back
      // into the state keeps that diffusion permanently.
      state->state[w + 0] = lo + shuffled[w + 0];
      state->state[w + 4] = hi + shuffled[w + 4];

      // Two orthogonally grown pieces evolving independently, XORed.
      state->output[w] = lo ^ shuffled[w + 4];
    }
  }
}
#undef SHISHUA_RESTRICT

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
};

// Seed the generator from 4x64 bits.
// The seed is XORed into every other state word only, so half of the state
// keeps its φ constants and users cannot set a bad (e.g. all-zero) state.
// The warm-up rounds then diffuse the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  memset(s, 0, sizeof(prng_state));

#define STEPS 5
#define ROUNDS 4
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  memcpy(s->state, phi, sizeof(phi));
  for (size_t i = 0; i < 4; i++) {
    s->state[i * 2] ^= seed[i];
  }
  for (size_t i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 32 * STEPS);
    for (size_t j = 0; j < 4; j++) {
      s->state[j + 0] = s->state[j + 4];
      s->state[j + 4] = s->output[j];
    }
  }
#undef STEPS
#undef ROUNDS
}
// Hides the vreinterpret ritual: vextq_u8 operates on byte vectors, so each
// uint64x2_t operand must be reinterpreted to bytes and back.
#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \
  vreinterpretq_u64_u8( \
      vextq_u8( \
          vreinterpretq_u8_u64(Rn), \
          vreinterpretq_u8_u64(Rm), \
          (Imm)))

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Generate `size` bytes into buf (size must be a multiple of 128 bytes).
// When buf is NULL the state is advanced without writing output (used by
// prng_init for its warm-up rounds).
static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *b = buf;
  uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1];
  // The counter is not necessary to beat PractRand.
  // It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
  // or about 7 millenia at 10 GiB/s.
  // The increments are picked as odd numbers,
  // since only coprimes of the base cover the full cycle,
  // and all odd numbers are coprime of 2.
  // I use different odd numbers for each 64-bit chunk
  // for a tiny amount of variation stirring.
  // I used the smallest odd numbers to avoid having a magic number.
  uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5);
  uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1);
  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t i = 0; i < size; i += 128) {
    // Write the current output block to buf if buf is not NULL.
    if (buf != NULL) {
      for (size_t j = 0; j < 8; j++) {
        vst1q_u8(b, vreinterpretq_u8_u64(s->output[j]));
        b += 16;
      }
    }
    // NEON has less register pressure than SSE2, but we reroll it anyways for
    // code size.
    for (size_t j = 0; j < 2; j++) {
      uint64x2_t s0_lo = s->state[j * 4 + 0],
                 s0_hi = s->state[j * 4 + 1],
                 s1_lo = s->state[j * 4 + 2],
                 s1_hi = s->state[j * 4 + 3],
                 t0_lo, t0_hi, t1_lo, t1_hi,
                 u_lo, u_hi;

      // I apply the counter to s1,
      // since it is the one whose shift loses most entropy.
      s1_lo = vaddq_u64(s1_lo, counter_lo);
      s1_hi = vaddq_u64(s1_hi, counter_hi);

      // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
      // additions to strong positions for enrichment. The low 32-bit part of a
      // 64-bit chunk never moves to the same 64-bit chunk as its high part.
      // They do not remain in the same chunk. Each part eventually reaches all
      // positions ringwise: A to B, B to C, …, H to A.
      // You may notice that they are simply 256-bit rotations (96 and 160).
      // Note: This:
      //   x = (y << 96) | (y >> 160)
      // can be rewritten as this
      //   x_lo = (y_lo << 96) | (y_hi >> 32)
      //   x_hi = (y_hi << 96) | (y_lo >> 32)
      // which we can do with 2 vext.8 instructions.
      t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4);
      t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4);
      t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12);
      t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12);

      // SIMD does not support rotations. Shift is the next best thing to entangle
      // bits with other 64-bit positions. We must shift by an odd number so that
      // each bit reaches all 64-bit positions, not just half. We must lose bits
      // of information, so we minimize it: 1 and 3. We use different shift values
      // to increase divergence between the two sides. We use rightward shift
      // because the rightmost bits have the least diffusion in addition (the low
      // bit is just a XOR of the low bits).
      u_lo = vshrq_n_u64(s0_lo, 1);
      u_hi = vshrq_n_u64(s0_hi, 1);
#if defined(__clang__)
      // UGLY HACK: Clang enjoys merging the above statements with the vadds below
      // into vsras.
      // This is dumb, as it still needs to do the original vshr for the xor mix,
      // causing it to shift twice.
      // This makes Clang assume that this line has side effects, preventing the
      // combination and speeding things up significantly.
      // clang-format off
      __asm__("" : "+w"(u_lo), "+w"(u_hi));
      // clang-format on
#endif

      // Addition is the main source of diffusion.
      // Storing the output in the state keeps that diffusion permanently.
      s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo);
      s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi);
      // Use vsra here directly (fused shift-right-and-accumulate, equivalent
      // to the shift-by-3 plus add of the other backends).
      s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3);
      s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3);

      // The first orthogonally grown pieces evolving independently, XORed.
      s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo);
      s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi);
    }
    // The second orthogonally grown piece evolving independently, XORed.
    s->output[4] = veorq_u64(s->state[0], s->state[6]);
    s->output[5] = veorq_u64(s->state[1], s->state[7]);

    s->output[6] = veorq_u64(s->state[2], s->state[4]);
    s->output[7] = veorq_u64(s->state[3], s->state[5]);

    counter_lo = vaddq_u64(counter_lo, increment_lo);
    counter_hi = vaddq_u64(counter_hi, increment_hi);
  }
  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits. Each seed word is XORed into the low
// half of a state vector only, so half of the state keeps its φ constants and
// users cannot set a bad state; ROUNDS warm-up passes (buf == NULL) then
// diffuse the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  s->counter[0] = vdupq_n_u64(0);
  s->counter[1] = vdupq_n_u64(0);
#define ROUNDS 13
#define STEPS 1
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0);
  uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0);
  uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0);
  uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0);
  s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[0]));
  s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[2]));
  s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[4]));
  s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[6]));
  s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[8]));
  s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10]));
  s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12]));
  s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14]));

  // Each round advances 128 * STEPS bytes, then rotates the output vectors
  // back into the state in reverse order.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    s->state[0] = s->output[6];
    s->state[1] = s->output[7];
    s->state[2] = s->output[4];
    s->state[3] = s->output[5];
    s->state[4] = s->output[2];
    s->state[5] = s->output[3];
    s->state[6] = s->output[0];
    s->state[7] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_VSETQ_N_U64
#undef SHISHUA_VEXTQ_U8
#undef SHISHUA_RESTRICT
#endif
// The compiler may convert it to a sequence of shufps instructions, which is
// perfectly fine.
#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \
  _mm_or_si128( \
      _mm_slli_si128(hi, 16 - (amt)), \
      _mm_srli_si128(lo, amt))
#endif // closes the SSSE3-vs-SSE2 selection above

typedef struct prng_state {
  __m128i state[8];
  __m128i output[8];
  __m128i counter[2];
} prng_state;

// Wrappers for x86 targets which usually lack these intrinsics.
// Don't call these with side effects.
#if defined(__x86_64__) || defined(_M_X64)
#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a)
#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x)
#else
#define SHISHUA_SET_EPI64X(b, a) \
  _mm_set_epi32( \
      (int)(((uint64_t)(b)) >> 32), \
      (int)(b), \
      (int)(((uint64_t)(a)) >> 32), \
      (int)(a))
#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x)
#endif

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Generate `size` bytes into buf (size must be a multiple of 128 bytes).
// When buf is NULL the state is advanced without writing output (used by
// prng_init for its warm-up rounds).
static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  __m128i counter_lo = s->counter[0], counter_hi = s->counter[1];
  // The counter is not necessary to beat PractRand.
  // It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
  // or about 7 millenia at 10 GiB/s.
  // The increments are picked as odd numbers,
  // since only coprimes of the base cover the full cycle,
  // and all odd numbers are coprime of 2.
  // I use different odd numbers for each 64-bit chunk
  // for a tiny amount of variation stirring.
  // I used the smallest odd numbers to avoid having a magic number.

  // increment = { 7, 5, 3, 1 };
  __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7);
  __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3);

  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t i = 0; i < size; i += 128) {
    // Write the current output block to buf if buf is not NULL.
    if (buf != NULL) {
      for (size_t j = 0; j < 8; j++) {
        _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]);
      }
    }

    // There are only 16 SSE registers (8 on i686), and we have to account for
    // temporary copies due to being stuck with 2-operand instructions.
    // Therefore, we use fixed iteration loops to reduce code complexity while
    // still allowing the compiler to easily unroll the loop.
    // We also try to keep variables active for as short as possible.
    for (size_t j = 0; j < 2; j++) {
      __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi;

      // Lane 0
      s_lo = s->state[4 * j + 0];
      s_hi = s->state[4 * j + 1];

      // SIMD does not support rotations. Shift is the next best thing to entangle
      // bits with other 64-bit positions. We must shift by an odd number so that
      // each bit reaches all 64-bit positions, not just half. We must lose bits
      // of information, so we minimize it: 1 and 3. We use different shift values
      // to increase divergence between the two sides. We use rightward shift
      // because the rightmost bits have the least diffusion in addition (the low
      // bit is just a XOR of the low bits).
      u0_lo = _mm_srli_epi64(s_lo, 1);
      u0_hi = _mm_srli_epi64(s_hi, 1);

      // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
      // additions to strong positions for enrichment. The low 32-bit part of a
      // 64-bit chunk never moves to the same 64-bit chunk as its high part.
      // They do not remain in the same chunk. Each part eventually reaches all
      // positions ringwise: A to B, B to C, …, H to A.
      // You may notice that they are simply 256-bit rotations (96 and 160).
      // Note: This:
      //   x = (y << 96) | (y >> 160)
      // can be rewritten as this
      //   x_lo = (y_lo << 96) | (y_hi >> 32)
      //   x_hi = (y_hi << 96) | (y_lo >> 32)
      // which we can do with 2 _mm_alignr_epi8 instructions.
      t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4);
      t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4);

      // Addition is the main source of diffusion.
      // Storing the output in the state keeps that diffusion permanently.
      s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo);
      s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi);

      // Lane 1
      s_lo = s->state[4 * j + 2];
      s_hi = s->state[4 * j + 3];

      // I apply the counter to s1,
      // since it is the one whose shift loses most entropy.
      s_lo = _mm_add_epi64(s_lo, counter_lo);
      s_hi = _mm_add_epi64(s_hi, counter_hi);

      // Same as above but with different shifts
      u1_lo = _mm_srli_epi64(s_lo, 3);
      u1_hi = _mm_srli_epi64(s_hi, 3);

      t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12);
      t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12);

      s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo);
      s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi);

      // Merge lane 0 and lane 1
      // The first orthogonally grown piece evolving independently, XORed.
      s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo);
      s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi);
    }

    // The second orthogonally grown piece evolving independently, XORed.
    s->output[4] = _mm_xor_si128(s->state[0], s->state[6]);
    s->output[5] = _mm_xor_si128(s->state[1], s->state[7]);
    s->output[6] = _mm_xor_si128(s->state[4], s->state[2]);
    s->output[7] = _mm_xor_si128(s->state[5], s->state[3]);

    // Increment the counter
    counter_lo = _mm_add_epi64(counter_lo, increment_lo);
    counter_hi = _mm_add_epi64(counter_hi, increment_hi);
  }

  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits. Each seed word lands in the low half of
// a state vector only, so half of the state keeps its φ constants and users
// cannot set a bad state; the warm-up rounds diffuse the seed afterwards.
void prng_init(prng_state *s, uint64_t seed[4]) {
  // Note: output is uninitialized at first, but since we pass NULL, its value
  // is initially ignored.
  s->counter[0] = _mm_setzero_si128();
  s->counter[1] = _mm_setzero_si128();
#define ROUNDS 13
#define STEPS 1
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]);
  __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]);
  __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]);
  __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]);
  s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0]));
  s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2]));
  s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4]));
  s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6]));
  s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[8]));
  s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10]));
  s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12]));
  s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14]));

  // Each round advances 128 * STEPS bytes, then rotates the output vectors
  // back into the state in reverse order.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    s->state[0] = s->output[6];
    s->state[1] = s->output[7];
    s->state[2] = s->output[4];
    s->state[3] = s->output[5];
    s->state[4] = s->output[2];
    s->state[5] = s->output[3];
    s->state[6] = s->output[0];
    s->state[7] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_CVTSI64_SI128
#undef SHISHUA_ALIGNR_EPI8
#undef SHISHUA_SET_EPI64X
#undef SHISHUA_RESTRICT
#endif
// Portable scalar implementation of shishua (full width: 4 lanes of state).
// Balances performance against code size; bit-compatible with the SIMD paths.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t.
typedef struct prng_state {
  uint64_t state[16];  // 4 lanes
  uint64_t output[16]; // 4 lanes, 2 parts
  uint64_t counter[4]; // 1 lane
} prng_state;

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Store a 64-bit value at dst in little-endian byte order.
static inline void prng_write_le64(void *dst, uint64_t val) {
  // On targets known to be little-endian (or with SHISHUA_NATIVE_ENDIAN
  // defined), memcpy already produces the right bytes.
#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__)
  memcpy(dst, &val, sizeof(uint64_t));
#else
  // Endian-agnostic byte-shift write.
  uint8_t *d = (uint8_t *)dst;
  for (size_t i = 0; i < 8; i++) {
    d[i] = (uint8_t)(val & 0xff);
    val >>= 8;
  }
#endif
}

// Generate `size` bytes into buf (size must be a multiple of 128).
// Passing buf == NULL advances the state without writing output; prng_init
// relies on this for its warm-up rounds.
static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *cursor = buf;
  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t done = 0; done < size; done += 128) {
    // Flush the previously computed output block, unless the caller only
    // wants to advance the state.
    if (buf != NULL) {
      for (size_t w = 0; w < 16; w++) {
        prng_write_le64(cursor, state->output[w]);
        cursor += 8;
      }
    }

    // Like the SSE version: fixed iteration counts keep the code small while
    // letting the compiler unroll freely.
    for (size_t half = 0; half < 2; half++) {
      uint64_t *s = &state->state[half * 8];  // 2 lanes of state
      uint64_t *o = &state->output[half * 4]; // 1 lane of output
      uint64_t mixed[8];                      // shuffle scratch

      // The counter is applied to the second lane (s1), the one whose shift
      // below loses the most entropy.
      for (size_t w = 0; w < 4; w++) {
        s[w + 4] += state->counter[w];
      }

      // 256-bit rotations by 96 and 160 bits:
      //   t0 = (s0 << 96)  | (s0 >> (256 - 96));
      //   t1 = (s1 << 160) | (s1 >> (256 - 160));
      // expressed as per-word 64-bit funnel shifts. Casting to uint32_t* or
      // memcpy'ing bytes would be shorter, but both are endian-dependent and
      // the pointer cast additionally violates strict aliasing; the funnel
      // shift is the only known portable form.
      //
      // Offset table for the shuffle: even lanes rotate by 5 (32-bit units),
      // odd lanes by 3. In 32-bit lanes this would be
      // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 }.
      const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4,  // left
                                        3, 0, 1, 2, 6, 7, 4, 5}; // right
      for (size_t w = 0; w < 8; w++) {
        mixed[w] = (s[shuf_offsets[w]] >> 32) | (s[shuf_offsets[w + 8]] << 32);
      }

      for (size_t w = 0; w < 4; w++) {
        // Rightward shifts by the odd amounts 1 and 3 entangle bits across
        // 64-bit positions (odd so every bit reaches all positions; rightward
        // because the low bits diffuse least under addition; small to
        // minimize lost information). Different amounts per lane increase
        // divergence between the two sides.
        uint64_t lo = s[w + 0] >> 1;
        uint64_t hi = s[w + 4] >> 3;

        // Addition is the main source of diffusion; storing the sum back in
        // the state keeps that diffusion permanently.
        s[w + 0] = lo + mixed[w + 0];
        s[w + 4] = hi + mixed[w + 4];

        // The first orthogonally grown piece evolving independently, XORed.
        o[w] = lo ^ mixed[w + 4];
      }
    }

    for (size_t w = 0; w < 4; w++) {
      // The second orthogonally grown piece evolving independently, XORed.
      state->output[w + 8] = state->state[w + 0] ^ state->state[w + 12];
      state->output[w + 12] = state->state[w + 8] ^ state->state[w + 4];
      // Counter increments 7, 5, 3, 1: odd numbers are coprime with the 2^64
      // base, so each chunk covers its full cycle (lower-bounding the period
      // by 2^71 bytes); a different odd number per chunk adds stirring.
      // Computed dynamically since the scalar code makes that trivial.
      state->counter[w] += 7 - (w * 2); // 7, 5, 3, 1
    }
  }
}
#undef SHISHUA_RESTRICT

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits.
// The seed is XORed into every other state word only, so half of the state
// keeps its φ constants and users cannot set a bad state; the second pair of
// lanes receives the seed words swapped. ROUNDS warm-up passes then diffuse
// the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  memset(s, 0, sizeof(prng_state));
#define STEPS 1
#define ROUNDS 13
  memcpy(s->state, phi, sizeof(phi));
  for (size_t i = 0; i < 4; i++) {
    s->state[i * 2 + 0] ^= seed[i];           // { s0,0,s1,0,s2,0,s3,0 }
    s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 }
  }
  for (size_t i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    // Rotate the output lanes back into the state in reverse order.
    for (size_t j = 0; j < 4; j++) {
      s->state[j + 0] = s->output[j + 12];
      s->state[j + 4] = s->output[j + 8];
      s->state[j + 8] = s->output[j + 4];
      s->state[j + 12] = s->output[j + 0];
    }
  }
#undef STEPS
#undef ROUNDS
}
min)) >> 32; + randnum += min; + randc->count += sizeof(uint32_t); + return randnum; +} + +float randf(struct rand_chunk *randc, float range, float offset) { + assert(range); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + // float randfloat = ((float)value / UINT32_MAX) * range + offset; + float randfloat = ((value >> 8) / 16777216.0f) * range + offset; + randc->count += sizeof(float); + return randfloat; +} + +// NOTE: example + +// int main() { +// prng_state state; +// static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +// struct rand_chunk randc = {0}; +// prng_init(&state, seed); +// prng_gen(&state, randc.buf, 128); +// +// for (int i = 0; i < 1000; i++) { +// float randfloat = randf(&randc, 2, -1); +// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// randc.count = 0; +// prng_gen(&state, randc.buf, 128); +// } +// printf("%f, ", randfloat); +// } +// return 0; +// } diff --git a/src/rand.h b/src/rand.h new file mode 100644 index 000000000..40a6d4d2e --- /dev/null +++ b/src/rand.h @@ -0,0 +1,16 @@ +#ifndef SHISHUA_RAND_H +#define SHISHUA_RAND_H + +#include +#include + +struct rand_chunk { + size_t count; + uint8_t buf[128]; // NOTE: must be multiple of 128 +}; + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +float randf(struct rand_chunk *randc, float range, float offset); + +#endif From a62e6fd6faa0978ced1ba04688b537793e09b7a2 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 16:45:29 +0800 Subject: [PATCH 04/11] fuck this shit multiple definition go fuck yousrlf --- include/cneuron/cneuron.h | 2 +- src/data.c | 6 +- src/main.c | 9 +- src/network.c | 10 +- src/rand.c | 32 ++- src/rand.h | 5 +- test/data.cpp | 334 +++++++++++++------------- test/linear_algebra.cpp | 100 ++++---- 
test/main.cpp | 14 +- test/network.cpp | 491 +++++++++++++++++++------------------- test/test_utils.cpp | 154 ++++++------ test/test_utils.h | 24 +- 12 files changed, 597 insertions(+), 584 deletions(-) diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index c2c5c2d7e..4a68fae06 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -3,6 +3,7 @@ #include #include +#include "rand.h" /** * @brief Represents a single data element with its inputs and expected output index. @@ -164,7 +165,6 @@ typedef struct { * @param max Maximum value for the random number. * @return A random float between min and max. */ -float random_float(float min, float max); /** * @brief Allocates and initializes a new layer. diff --git a/src/data.c b/src/data.c index 51e319b43..e85c6ff9c 100644 --- a/src/data.c +++ b/src/data.c @@ -136,7 +136,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[rand() % source_dataset->length], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, 0, source_dataset->length)], source_dataset->inputs_length); } return new_dataset; @@ -242,9 +242,9 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = rand() / (float)RAND_MAX; + float random_value = randf(&randc, 1, 0); if (random_value <= probability) { - float noise = (rand() / (float)RAND_MAX * noise_factor); + float noise = randf(&randc, noise_factor, 0); float new_value = data->inputs[i] + noise; data->inputs[i] = fmin(new_value, 1.0f); diff --git a/src/main.c b/src/main.c index 7f43f0789..cc9fafb73 100644 --- a/src/main.c +++ b/src/main.c @@ -39,9 +39,9 @@ void train(neural_network *nn, dataset *train_dataset, 
dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-3.0f, 3.0f), random_float(-3.0f, 3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -5.0f, 5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 0.9f, 1.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -3.0f, 3.0f), randf(&randc, -3.0f, 3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); @@ -97,7 +97,8 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - srand(time(NULL)); + static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + prng_init(&__randstate, seed); dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 7352d4901..3d06103f7 100644 --- a/src/network.c +++ b/src/network.c @@ -10,12 +10,6 @@ #include "cneuron/cneuron.h" -float random_float(float min, float max) { - assert(min < max); - - return (float)rand() / (float)RAND_MAX * (max - min) + min; -} - layer *get_layer(size_t length, size_t prev_length) { layer *new_layer = calloc(1, sizeof(layer)); if (!new_layer) @@ -30,7 +24,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f); + new_layer->weights[i] = randf(&randc, 2.0f, -1.0f); new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -206,7 +200,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer 
*output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[rand() % test_dataset->length]; + data *test_data = test_dataset->datas[randnum_u32(&randc, 0, test_dataset->length)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index 7d31a04cc..c01a6adbb 100644 --- a/src/rand.c +++ b/src/rand.c @@ -5,35 +5,49 @@ uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { assert(max > min); - assert(randc->count <= 127); + assert(randc->count <= 255); uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; randnum += min; randc->count++; + if (randc->count > 255) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randnum; } uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { assert(max > min); - assert(randc->count <= 124); + assert(randc->count <= 252); uint32_t value; memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * (max - min)) >> 32; randnum += min; randc->count += sizeof(uint32_t); + if (randc->count > 252) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 124); + assert(randc->count <= 252); uint32_t value; - memcpy(&value, &randc->buf[randc->count], sizeof(value)); + memcpy(&value, &randc->buf[randc->count], sizeof(value) - 1); // float randfloat = ((float)value / UINT32_MAX) * range + offset; - float randfloat = ((value >> 8) / 16777216.0f) * range + offset; - randc->count += sizeof(float); + float randfloat = (value / 16777216.0f) * range + offset; + randc->count += sizeof(value) - 1; + if (randc->count > 253) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randfloat; } + + // 
NOTE: example // int main() { @@ -41,13 +55,13 @@ float randf(struct rand_chunk *randc, float range, float offset) { // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; // struct rand_chunk randc = {0}; // prng_init(&state, seed); -// prng_gen(&state, randc.buf, 128); +// prng_gen(&state, randc.buf, 256); // // for (int i = 0; i < 1000; i++) { // float randfloat = randf(&randc, 2, -1); -// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// if(randc.count > 256 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function // randc.count = 0; -// prng_gen(&state, randc.buf, 128); +// prng_gen(&state, randc.buf, 256); // } // printf("%f, ", randfloat); // } diff --git a/src/rand.h b/src/rand.h index 40a6d4d2e..0d06992cb 100644 --- a/src/rand.h +++ b/src/rand.h @@ -3,14 +3,17 @@ #include #include +#include "../external/shishua/shishua.h" struct rand_chunk { size_t count; - uint8_t buf[128]; // NOTE: must be multiple of 128 + uint8_t buf[256]; // NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); float randf(struct rand_chunk *randc, float range, float offset); +prng_state __randstate; +struct rand_chunk randc = {0}; #endif diff --git a/test/data.cpp b/test/data.cpp index abe560d7f..0eb88c354 100644 --- a/test/data.cpp +++ b/test/data.cpp @@ -1,167 +1,167 @@ -#include - -extern "C" { -#include "cneuron/cneuron.h" -} - -TEST(DataTest, CreateData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 10; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - EXPECT_NE(test_data, nullptr); - EXPECT_NE(test_data->inputs, nullptr); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = static_cast(i); - } - free_data(test_data); - // No crash -} - -TEST(DataTest, 
GetDatasetFileNotFound) { - dataset *test_dataset = get_dataset("non_existent_file.dat"); - ASSERT_EQ(test_dataset, nullptr); -} - -TEST(DataTest, GetDatasetValidFile) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - ASSERT_NE(test_dataset, nullptr); - ASSERT_GT(test_dataset->length, 0); - ASSERT_GT(test_dataset->inputs_length, 0); - - ASSERT_NE(test_dataset, nullptr); - ASSERT_NE(test_dataset->datas[0], nullptr); - ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); - - free_dataset(test_dataset); -} - -TEST(DataTest, FreeDataset) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - free_dataset(test_dataset); - // No crash -} - -TEST(DataTest, FreeData) { - data *test_data = (data *)malloc(sizeof(data)); - test_data->inputs = (float *)malloc(sizeof(float) * 10); - - free_data(test_data); - // No crash -} - -TEST(DataTest, CopyData) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); - ASSERT_NE(data_copy, nullptr); - ASSERT_NE(data_copy->inputs, nullptr); - - for (size_t i = 0; i < test_dataset->inputs_length; i++) { - ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); - } - - ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); - - free_data(data_copy); - free_dataset(test_dataset); -} - -TEST(DataTest, RandomSampleDataset) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); - ASSERT_NE(dataset_sample, nullptr); - ASSERT_NE(dataset_sample->datas, nullptr); - ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); - - free_dataset(dataset_sample); - free_dataset(test_dataset); -} - -TEST(DataTest, RotateData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for 
(size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = static_cast(i) + 1.0f; - } - - rotate_data(test_data, 3, 3, 90); - ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); - ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); - ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); - - free_data(test_data); -} - -TEST(DataTest, ScaleData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - - scale_data(test_data, 3, 3, 2.0f); - ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); - ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); - ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); - - free_data(test_data); -} - -TEST(DataTest, OffsetData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - - offset_data(test_data, 3, 3, 1.0f, 1.0f); - ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); - ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); - ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); - ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); - - free_data(test_data); -} - -TEST(DataTest, NoiseData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - data *data_copy = get_data_copy(test_data, inputs_length); - - bool same = true; - noise_data(test_data, inputs_length, 1.0f, 1.0f); - for (size_t i = 0; i < inputs_length; i++) { - if (data_copy->inputs[i] != test_data->inputs[i]) { - same = false; - break; - } - } - ASSERT_FALSE(same); - - free_data(test_data); - free_data(data_copy); -} - -TEST(DataTest, OutputExpected) { - data *test_data = (data 
*)malloc(sizeof(data)); - test_data->expected_index = 1; - - ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); - ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); - - free(test_data); -} +// #include +// +// extern "C" { +// #include "cneuron/cneuron.h" +// } +// +// TEST(DataTest, CreateData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 10; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// EXPECT_NE(test_data, nullptr); +// EXPECT_NE(test_data->inputs, nullptr); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = static_cast(i); +// } +// free_data(test_data); +// // No crash +// } +// +// TEST(DataTest, GetDatasetFileNotFound) { +// dataset *test_dataset = get_dataset("non_existent_file.dat"); +// ASSERT_EQ(test_dataset, nullptr); +// } +// +// TEST(DataTest, GetDatasetValidFile) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// ASSERT_NE(test_dataset, nullptr); +// ASSERT_GT(test_dataset->length, 0); +// ASSERT_GT(test_dataset->inputs_length, 0); +// +// ASSERT_NE(test_dataset, nullptr); +// ASSERT_NE(test_dataset->datas[0], nullptr); +// ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); +// +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, FreeDataset) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// free_dataset(test_dataset); +// // No crash +// } +// +// TEST(DataTest, FreeData) { +// data *test_data = (data *)malloc(sizeof(data)); +// test_data->inputs = (float *)malloc(sizeof(float) * 10); +// +// free_data(test_data); +// // No crash +// } +// +// TEST(DataTest, CopyData) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); +// ASSERT_NE(data_copy, nullptr); +// ASSERT_NE(data_copy->inputs, nullptr); +// +// for (size_t i = 0; i < test_dataset->inputs_length; i++) { +// 
ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); +// } +// +// ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); +// +// free_data(data_copy); +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, RandomSampleDataset) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); +// ASSERT_NE(dataset_sample, nullptr); +// ASSERT_NE(dataset_sample->datas, nullptr); +// ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); +// +// free_dataset(dataset_sample); +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, RotateData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = static_cast(i) + 1.0f; +// } +// +// rotate_data(test_data, 3, 3, 90); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, ScaleData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } +// +// scale_data(test_data, 3, 3, 2.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, OffsetData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } 
+// +// offset_data(test_data, 3, 3, 1.0f, 1.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, NoiseData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } +// data *data_copy = get_data_copy(test_data, inputs_length); +// +// bool same = true; +// noise_data(test_data, inputs_length, 1.0f, 1.0f); +// for (size_t i = 0; i < inputs_length; i++) { +// if (data_copy->inputs[i] != test_data->inputs[i]) { +// same = false; +// break; +// } +// } +// ASSERT_FALSE(same); +// +// free_data(test_data); +// free_data(data_copy); +// } +// +// TEST(DataTest, OutputExpected) { +// data *test_data = (data *)malloc(sizeof(data)); +// test_data->expected_index = 1; +// +// ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); +// ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); +// +// free(test_data); +// } diff --git a/test/linear_algebra.cpp b/test/linear_algebra.cpp index fee0c10d7..31b5edc9b 100644 --- a/test/linear_algebra.cpp +++ b/test/linear_algebra.cpp @@ -1,50 +1,50 @@ -#include -#include - -extern "C" { -#include "cneuron/cneuron.h" -} - -#include "test_utils.h" - -TEST(LinearAlgebraTest, VectorApplyActivation) { - size_t length = 3; - float *a = (float *)malloc(sizeof(float) * length); - float *b = (float *)malloc(sizeof(float) * length); - - a[0] = 1.0f; - a[1] = -1.2f; - a[2] = -0.2f; - - vector_apply_activation(a, b, length, sigmoid, false); - - for (size_t i = 0; i < length; i++) { - ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); - } - - free(a); - free(b); -} - -TEST(LinearAlgebraTest, HadamardProduct) { - size_t length = 3; - float *a = (float 
*)malloc(sizeof(float) * length); - float *b = (float *)malloc(sizeof(float) * length); - - a[0] = 1.0f; - a[1] = -1.2f; - a[2] = -0.2f; - - b[0] = 1.2f; - b[1] = -2.2f; - b[2] = -0.4f; - - hadamard_product(a, b, b, length); - - ASSERT_FLOAT_EQ(b[0], 1.2f); - ASSERT_FLOAT_EQ(b[1], 2.64f); - ASSERT_FLOAT_EQ(b[2], 0.08f); - - free(a); - free(b); -} +// #include +// #include +// +// extern "C" { +// #include "cneuron/cneuron.h" +// } +// +// #include "test_utils.h" +// +// TEST(LinearAlgebraTest, VectorApplyActivation) { +// size_t length = 3; +// float *a = (float *)malloc(sizeof(float) * length); +// float *b = (float *)malloc(sizeof(float) * length); +// +// a[0] = 1.0f; +// a[1] = -1.2f; +// a[2] = -0.2f; +// +// vector_apply_activation(a, b, length, sigmoid, false); +// +// for (size_t i = 0; i < length; i++) { +// ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); +// } +// +// free(a); +// free(b); +// } +// +// TEST(LinearAlgebraTest, HadamardProduct) { +// size_t length = 3; +// float *a = (float *)malloc(sizeof(float) * length); +// float *b = (float *)malloc(sizeof(float) * length); +// +// a[0] = 1.0f; +// a[1] = -1.2f; +// a[2] = -0.2f; +// +// b[0] = 1.2f; +// b[1] = -2.2f; +// b[2] = -0.4f; +// +// hadamard_product(a, b, b, length); +// +// ASSERT_FLOAT_EQ(b[0], 1.2f); +// ASSERT_FLOAT_EQ(b[1], 2.64f); +// ASSERT_FLOAT_EQ(b[2], 0.08f); +// +// free(a); +// free(b); +// } diff --git a/test/main.cpp b/test/main.cpp index 0a4376670..5bebbc677 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,7 @@ -#include - -int main(int argc, char **argv) { - srand(time(NULL)); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// #include +// +// int main(int argc, char **argv) { +// srand(time(NULL)); +// ::testing::InitGoogleTest(&argc, argv); +// return RUN_ALL_TESTS(); +// } diff --git a/test/network.cpp b/test/network.cpp index 4082a2052..6e56ca377 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,245 +1,246 @@ -#include - -extern 
"C" { -#include "cneuron/cneuron.h" -} - -#include - -#include "test_utils.h" - -TEST(NetworkTest, RandomFloat) { - float test = random_float(0.0f, 1.0f); - bool same = true; - for (int i = 0; i < 10; i++) { - if (test != random_float(0.0f, 1.0f)) { - same = false; - break; - } - } - ASSERT_FALSE(same); -} - -TEST(NetworkTest, GetLayer) { - size_t layer_length = 3; - layer *test_layer = get_layer(layer_length, 5); - - ASSERT_NE(test_layer, nullptr); - ASSERT_NE(test_layer->delta, nullptr); - ASSERT_NE(test_layer->weighted_input, nullptr); - ASSERT_NE(test_layer->weights, nullptr); - ASSERT_NE(test_layer->bias, nullptr); - ASSERT_NE(test_layer->output, nullptr); - - ASSERT_EQ(test_layer->length, layer_length); - - free_layer(test_layer); -} - -TEST(NetworkTest, GetNeuralNetwork) { - size_t layer_length = 3; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - layer_lengths[1] = 3; - layer_lengths[2] = 4; - size_t inputs_length = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - ASSERT_NE(nn, nullptr); - ASSERT_EQ(nn->length, layer_length); - ASSERT_EQ(nn->inputs_length, inputs_length); - ASSERT_EQ(nn->activation_function, &sigmoid); - ASSERT_NE(nn->layers, nullptr); - for (size_t i = 0; i < layer_length; i++) { - ASSERT_NE(nn->layers[i], nullptr); - ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); - ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); - ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); - } - - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, FreeDataset) { - size_t layer_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); - - free_neural_network(nn); - // No crash - free(layer_lengths); -} - -TEST(NetworkTest, FreeLayer) { - size_t layer_length = 2; - layer *test_layer = get_layer(layer_length, 3); - - free_layer(test_layer); - // No crash -} - -TEST(NetworkTest, ComputeNetwork) { - size_t layer_length = 1; - size_t inputs_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 1; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - float *inputs = (float *)malloc(sizeof(float) * inputs_length); - inputs[0] = 0.2f; - - nn->layers[0]->weights[0] = 0.5f; - nn->layers[0]->bias[0] = 0.3f; - - compute_network(nn, inputs); - - ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); - - free(inputs); - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, Softmax) { - size_t layer_length = 1; - size_t inputs_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 3; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - nn->layers[0]->output[0] = 0.2f; - nn->layers[0]->output[1] = 0.3f; - nn->layers[0]->output[2] = 0.5f; - - ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); - ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); - ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); - - free_neural_network(nn); - free(layer_lengths); -} - -// Output layer only -TEST(NetworkTest, StochasticGDSingleLayer) { - dataset *test_dataset = get_or(); - - // Create network - size_t layer_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - 
layer_lengths[0] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 50000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 10000 == 0) { - printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_dataset(test_dataset); - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, StochasticGDTests) { - dataset *test_dataset = get_xor(); - - // Multi layer test - size_t layer_length = 2; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 4; - layer_lengths[1] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 500000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 100000 == 0) { - printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - - // Non-linearly separable test - layer_length = 1; - layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 50000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 10000 == 0) { - printf("Stochastic Non-linearly separable learn 
cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - free_dataset(test_dataset); -} - -TEST(NetworkTest, MiniBatchGDTests) { - dataset *test_dataset = get_xor(); - - // Multi layer test - size_t layer_length = 2; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 4; - layer_lengths[1] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 1500000; i++) { - dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); - mini_batch_gd(nn, 0.001f, batch_dataset); - free_dataset(batch_dataset); - if (i % 200000 == 0) { - printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - - // Non-linearly separable test - layer_length = 1; - layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 100000; i++) { - dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); - mini_batch_gd(nn, 0.001f, batch_dataset); - free_dataset(batch_dataset); - if (i % 20000 == 0) { - printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - free_dataset(test_dataset); -} +// #include +// +// extern "C" { 
+// #include "cneuron/cneuron.h" +// #include "../src/rand.h" +// } +// +// #include +// +// #include "test_utils.h" +// +// TEST(NetworkTest, RandomFloat) { +// float test = randf(&randc, 0.0f, 1.0f); +// bool same = true; +// for (int i = 0; i < 10; i++) { +// if (test != randf(&randc, 0.0f, 1.0f)) { +// same = false; +// break; +// } +// } +// ASSERT_FALSE(same); +// } +// +// TEST(NetworkTest, GetLayer) { +// size_t layer_length = 3; +// layer *test_layer = get_layer(layer_length, 5); +// +// ASSERT_NE(test_layer, nullptr); +// ASSERT_NE(test_layer->delta, nullptr); +// ASSERT_NE(test_layer->weighted_input, nullptr); +// ASSERT_NE(test_layer->weights, nullptr); +// ASSERT_NE(test_layer->bias, nullptr); +// ASSERT_NE(test_layer->output, nullptr); +// +// ASSERT_EQ(test_layer->length, layer_length); +// +// free_layer(test_layer); +// } +// +// TEST(NetworkTest, GetNeuralNetwork) { +// size_t layer_length = 3; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// layer_lengths[1] = 3; +// layer_lengths[2] = 4; +// size_t inputs_length = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// ASSERT_NE(nn, nullptr); +// ASSERT_EQ(nn->length, layer_length); +// ASSERT_EQ(nn->inputs_length, inputs_length); +// ASSERT_EQ(nn->activation_function, &sigmoid); +// ASSERT_NE(nn->layers, nullptr); +// for (size_t i = 0; i < layer_length; i++) { +// ASSERT_NE(nn->layers[i], nullptr); +// ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); +// ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); +// ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); +// } +// +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, FreeDataset) { +// size_t layer_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); +// +// free_neural_network(nn); +// // No crash +// free(layer_lengths); +// } +// +// TEST(NetworkTest, FreeLayer) { +// size_t layer_length = 2; +// layer *test_layer = get_layer(layer_length, 3); +// +// free_layer(test_layer); +// // No crash +// } +// +// TEST(NetworkTest, ComputeNetwork) { +// size_t layer_length = 1; +// size_t inputs_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 1; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// float *inputs = (float *)malloc(sizeof(float) * inputs_length); +// inputs[0] = 0.2f; +// +// nn->layers[0]->weights[0] = 0.5f; +// nn->layers[0]->bias[0] = 0.3f; +// +// compute_network(nn, inputs); +// +// ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); +// +// free(inputs); +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, Softmax) { +// size_t layer_length = 1; +// size_t inputs_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 3; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// nn->layers[0]->output[0] = 0.2f; +// nn->layers[0]->output[1] = 0.3f; +// nn->layers[0]->output[2] = 0.5f; +// +// ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); +// ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); +// ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); +// +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// // Output layer only +// TEST(NetworkTest, StochasticGDSingleLayer) { +// dataset 
*test_dataset = get_or(); +// +// // Create network +// size_t layer_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 50000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 10000 == 0) { +// printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_dataset(test_dataset); +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, StochasticGDTests) { +// dataset *test_dataset = get_xor(); +// +// // Multi layer test +// size_t layer_length = 2; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 4; +// layer_lengths[1] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 500000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 100000 == 0) { +// printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// +// // Non-linearly separable test +// layer_length = 1; +// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, 
&sigmoid); +// +// for (size_t i = 0; i < 50000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 10000 == 0) { +// printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// free_dataset(test_dataset); +// } +// +// TEST(NetworkTest, MiniBatchGDTests) { +// dataset *test_dataset = get_xor(); +// +// // Multi layer test +// size_t layer_length = 2; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 4; +// layer_lengths[1] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 1500000; i++) { +// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); +// mini_batch_gd(nn, 0.001f, batch_dataset); +// free_dataset(batch_dataset); +// if (i % 200000 == 0) { +// printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// +// // Non-linearly separable test +// layer_length = 1; +// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 100000; i++) { +// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); +// mini_batch_gd(nn, 0.001f, batch_dataset); +// free_dataset(batch_dataset); +// if (i % 20000 
== 0) { +// printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// free_dataset(test_dataset); +// } diff --git a/test/test_utils.cpp b/test/test_utils.cpp index 5513f015d..def5f48ca 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -1,77 +1,77 @@ -#include "test_utils.h" - -#include - -float sigmoid(float val, bool is_deravative) { - float result = 1.0f / (1.0f + expf(-val)); - if (is_deravative == 1) { - return result * (1.0f - result); - } - return result; -} - -dataset *get_xor() { - // Create data - dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); - test_dataset->length = 4; - data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); - test_dataset->datas = datas; - test_dataset->inputs_length = 2; - - for (size_t i = 0; i < test_dataset->length; i++) { - test_dataset->datas[i] = (data *)malloc(sizeof(data)); - test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); - } - - // XOR gate - test_dataset->datas[0]->inputs[0] = 1.0f; - test_dataset->datas[0]->inputs[1] = 1.0f; - test_dataset->datas[0]->expected_index = 0; - - test_dataset->datas[1]->inputs[0] = 0.0f; - test_dataset->datas[1]->inputs[1] = 0.0f; - test_dataset->datas[1]->expected_index = 0; - - test_dataset->datas[2]->inputs[0] = 0.0f; - test_dataset->datas[2]->inputs[1] = 1.0f; - test_dataset->datas[2]->expected_index = 1; - - test_dataset->datas[3]->inputs[0] = 1.0f; - test_dataset->datas[3]->inputs[1] = 0.0f; - test_dataset->datas[3]->expected_index = 1; - - return test_dataset; -} - -dataset *get_or() { - // Create data - dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); - test_dataset->length = 4; - data **datas = (data **)malloc(sizeof(data *) * 
test_dataset->length); - test_dataset->datas = datas; - test_dataset->inputs_length = 2; - - for (size_t i = 0; i < test_dataset->length; i++) { - test_dataset->datas[i] = (data *)malloc(sizeof(data)); - test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); - } - - // OR gate - test_dataset->datas[1]->inputs[0] = 0.0f; - test_dataset->datas[1]->inputs[1] = 0.0f; - test_dataset->datas[1]->expected_index = 0; - - test_dataset->datas[0]->inputs[0] = 1.0f; - test_dataset->datas[0]->inputs[1] = 1.0f; - test_dataset->datas[0]->expected_index = 1; - - test_dataset->datas[2]->inputs[0] = 0.0f; - test_dataset->datas[2]->inputs[1] = 1.0f; - test_dataset->datas[2]->expected_index = 1; - - test_dataset->datas[3]->inputs[0] = 1.0f; - test_dataset->datas[3]->inputs[1] = 0.0f; - test_dataset->datas[3]->expected_index = 1; - - return test_dataset; -} +// #include "test_utils.h" +// +// #include +// +// float sigmoid(float val, bool is_deravative) { +// float result = 1.0f / (1.0f + expf(-val)); +// if (is_deravative == 1) { +// return result * (1.0f - result); +// } +// return result; +// } +// +// dataset *get_xor() { +// // Create data +// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); +// test_dataset->length = 4; +// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); +// test_dataset->datas = datas; +// test_dataset->inputs_length = 2; +// +// for (size_t i = 0; i < test_dataset->length; i++) { +// test_dataset->datas[i] = (data *)malloc(sizeof(data)); +// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); +// } +// +// // XOR gate +// test_dataset->datas[0]->inputs[0] = 1.0f; +// test_dataset->datas[0]->inputs[1] = 1.0f; +// test_dataset->datas[0]->expected_index = 0; +// +// test_dataset->datas[1]->inputs[0] = 0.0f; +// test_dataset->datas[1]->inputs[1] = 0.0f; +// test_dataset->datas[1]->expected_index = 0; +// +// test_dataset->datas[2]->inputs[0] = 0.0f; 
+// test_dataset->datas[2]->inputs[1] = 1.0f; +// test_dataset->datas[2]->expected_index = 1; +// +// test_dataset->datas[3]->inputs[0] = 1.0f; +// test_dataset->datas[3]->inputs[1] = 0.0f; +// test_dataset->datas[3]->expected_index = 1; +// +// return test_dataset; +// } +// +// dataset *get_or() { +// // Create data +// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); +// test_dataset->length = 4; +// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); +// test_dataset->datas = datas; +// test_dataset->inputs_length = 2; +// +// for (size_t i = 0; i < test_dataset->length; i++) { +// test_dataset->datas[i] = (data *)malloc(sizeof(data)); +// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); +// } +// +// // OR gate +// test_dataset->datas[1]->inputs[0] = 0.0f; +// test_dataset->datas[1]->inputs[1] = 0.0f; +// test_dataset->datas[1]->expected_index = 0; +// +// test_dataset->datas[0]->inputs[0] = 1.0f; +// test_dataset->datas[0]->inputs[1] = 1.0f; +// test_dataset->datas[0]->expected_index = 1; +// +// test_dataset->datas[2]->inputs[0] = 0.0f; +// test_dataset->datas[2]->inputs[1] = 1.0f; +// test_dataset->datas[2]->expected_index = 1; +// +// test_dataset->datas[3]->inputs[0] = 1.0f; +// test_dataset->datas[3]->inputs[1] = 0.0f; +// test_dataset->datas[3]->expected_index = 1; +// +// return test_dataset; +// } diff --git a/test/test_utils.h b/test/test_utils.h index 3c310aef2..02dbbe9ce 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -1,12 +1,12 @@ -#ifndef TEST_UTILS_H -#define TEST_UTILS_H - -#include "cneuron/cneuron.h" - -float sigmoid(float val, bool is_deravative); - -dataset *get_xor(); - -dataset *get_or(); - -#endif +// #ifndef TEST_UTILS_H +// #define TEST_UTILS_H +// +// #include "cneuron/cneuron.h" +// +// float sigmoid(float val, bool is_deravative); +// +// dataset *get_xor(); +// +// dataset *get_or(); +// +// #endif From 209307cce56fffeafe4fe774fb31394600ce8d03 
Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 17:06:22 +0800 Subject: [PATCH 05/11] fix: not working rand --- external/shishua/shishua-sse2.h | 2 +- include/cneuron/cneuron.h | 1 - src/data.c | 1 + src/main.c | 1 + src/network.c | 1 + src/rand.h | 4 +- test/data.cpp | 334 +++++++++++----------- test/linear_algebra.cpp | 100 +++---- test/main.cpp | 14 +- test/network.cpp | 492 ++++++++++++++++---------------- test/test_utils.cpp | 154 +++++----- test/test_utils.h | 24 +- 12 files changed, 565 insertions(+), 563 deletions(-) diff --git a/external/shishua/shishua-sse2.h b/external/shishua/shishua-sse2.h index af28300ad..a9648fb0b 100644 --- a/external/shishua/shishua-sse2.h +++ b/external/shishua/shishua-sse2.h @@ -185,7 +185,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static void prng_init(prng_state *s, uint64_t seed[4]) { // Note: output is uninitialized at first, but since we pass NULL, its value // is initially ignored. s->counter[0] = _mm_setzero_si128(); diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index 4a68fae06..1d583f2e7 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -3,7 +3,6 @@ #include #include -#include "rand.h" /** * @brief Represents a single data element with its inputs and expected output index. 
diff --git a/src/data.c b/src/data.c index e85c6ff9c..f3bae3913 100644 --- a/src/data.c +++ b/src/data.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" #define BACKGROUND_VALUE 0.0f diff --git a/src/main.c b/src/main.c index cc9fafb73..e1b6f4e06 100644 --- a/src/main.c +++ b/src/main.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" const size_t IMAGE_SIZE = 28; diff --git a/src/network.c b/src/network.c index 3d06103f7..7e5e2db6b 100644 --- a/src/network.c +++ b/src/network.c @@ -9,6 +9,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" layer *get_layer(size_t length, size_t prev_length) { layer *new_layer = calloc(1, sizeof(layer)); diff --git a/src/rand.h b/src/rand.h index 0d06992cb..ba6f5159d 100644 --- a/src/rand.h +++ b/src/rand.h @@ -14,6 +14,6 @@ uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); float randf(struct rand_chunk *randc, float range, float offset); -prng_state __randstate; -struct rand_chunk randc = {0}; +static prng_state __randstate; +static struct rand_chunk randc = {0}; #endif diff --git a/test/data.cpp b/test/data.cpp index 0eb88c354..abe560d7f 100644 --- a/test/data.cpp +++ b/test/data.cpp @@ -1,167 +1,167 @@ -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// } -// -// TEST(DataTest, CreateData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 10; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// EXPECT_NE(test_data, nullptr); -// EXPECT_NE(test_data->inputs, nullptr); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = static_cast(i); -// } -// free_data(test_data); -// // No crash -// } -// -// TEST(DataTest, GetDatasetFileNotFound) { -// dataset *test_dataset = get_dataset("non_existent_file.dat"); -// ASSERT_EQ(test_dataset, nullptr); -// } -// -// TEST(DataTest, 
GetDatasetValidFile) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// ASSERT_NE(test_dataset, nullptr); -// ASSERT_GT(test_dataset->length, 0); -// ASSERT_GT(test_dataset->inputs_length, 0); -// -// ASSERT_NE(test_dataset, nullptr); -// ASSERT_NE(test_dataset->datas[0], nullptr); -// ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); -// -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, FreeDataset) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// free_dataset(test_dataset); -// // No crash -// } -// -// TEST(DataTest, FreeData) { -// data *test_data = (data *)malloc(sizeof(data)); -// test_data->inputs = (float *)malloc(sizeof(float) * 10); -// -// free_data(test_data); -// // No crash -// } -// -// TEST(DataTest, CopyData) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); -// ASSERT_NE(data_copy, nullptr); -// ASSERT_NE(data_copy->inputs, nullptr); -// -// for (size_t i = 0; i < test_dataset->inputs_length; i++) { -// ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); -// } -// -// ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); -// -// free_data(data_copy); -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, RandomSampleDataset) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); -// ASSERT_NE(dataset_sample, nullptr); -// ASSERT_NE(dataset_sample->datas, nullptr); -// ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); -// -// free_dataset(dataset_sample); -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, RotateData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; 
i < inputs_length; i++) { -// test_data->inputs[i] = static_cast(i) + 1.0f; -// } -// -// rotate_data(test_data, 3, 3, 90); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, ScaleData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// -// scale_data(test_data, 3, 3, 2.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, OffsetData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// -// offset_data(test_data, 3, 3, 1.0f, 1.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, NoiseData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// data *data_copy = get_data_copy(test_data, inputs_length); -// -// bool same = true; -// noise_data(test_data, inputs_length, 1.0f, 1.0f); -// for (size_t i = 0; i < inputs_length; i++) { -// if (data_copy->inputs[i] != test_data->inputs[i]) { -// same = false; -// break; -// } -// } -// ASSERT_FALSE(same); -// -// 
free_data(test_data); -// free_data(data_copy); -// } -// -// TEST(DataTest, OutputExpected) { -// data *test_data = (data *)malloc(sizeof(data)); -// test_data->expected_index = 1; -// -// ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); -// ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); -// -// free(test_data); -// } +#include + +extern "C" { +#include "cneuron/cneuron.h" +} + +TEST(DataTest, CreateData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 10; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + EXPECT_NE(test_data, nullptr); + EXPECT_NE(test_data->inputs, nullptr); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = static_cast(i); + } + free_data(test_data); + // No crash +} + +TEST(DataTest, GetDatasetFileNotFound) { + dataset *test_dataset = get_dataset("non_existent_file.dat"); + ASSERT_EQ(test_dataset, nullptr); +} + +TEST(DataTest, GetDatasetValidFile) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + ASSERT_NE(test_dataset, nullptr); + ASSERT_GT(test_dataset->length, 0); + ASSERT_GT(test_dataset->inputs_length, 0); + + ASSERT_NE(test_dataset, nullptr); + ASSERT_NE(test_dataset->datas[0], nullptr); + ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); + + free_dataset(test_dataset); +} + +TEST(DataTest, FreeDataset) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + free_dataset(test_dataset); + // No crash +} + +TEST(DataTest, FreeData) { + data *test_data = (data *)malloc(sizeof(data)); + test_data->inputs = (float *)malloc(sizeof(float) * 10); + + free_data(test_data); + // No crash +} + +TEST(DataTest, CopyData) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); + ASSERT_NE(data_copy, nullptr); + ASSERT_NE(data_copy->inputs, nullptr); + + for (size_t i = 0; i < test_dataset->inputs_length; i++) { + 
ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); + } + + ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); + + free_data(data_copy); + free_dataset(test_dataset); +} + +TEST(DataTest, RandomSampleDataset) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); + ASSERT_NE(dataset_sample, nullptr); + ASSERT_NE(dataset_sample->datas, nullptr); + ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); + + free_dataset(dataset_sample); + free_dataset(test_dataset); +} + +TEST(DataTest, RotateData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = static_cast(i) + 1.0f; + } + + rotate_data(test_data, 3, 3, 90); + ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); + ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); + ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); + + free_data(test_data); +} + +TEST(DataTest, ScaleData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + + scale_data(test_data, 3, 3, 2.0f); + ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); + ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); + ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); + + free_data(test_data); +} + +TEST(DataTest, OffsetData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + + offset_data(test_data, 3, 3, 1.0f, 1.0f); + ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); + ASSERT_FLOAT_EQ(test_data->inputs[5], 
2.0f); + ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); + ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); + + free_data(test_data); +} + +TEST(DataTest, NoiseData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + data *data_copy = get_data_copy(test_data, inputs_length); + + bool same = true; + noise_data(test_data, inputs_length, 1.0f, 1.0f); + for (size_t i = 0; i < inputs_length; i++) { + if (data_copy->inputs[i] != test_data->inputs[i]) { + same = false; + break; + } + } + ASSERT_FALSE(same); + + free_data(test_data); + free_data(data_copy); +} + +TEST(DataTest, OutputExpected) { + data *test_data = (data *)malloc(sizeof(data)); + test_data->expected_index = 1; + + ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); + ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); + + free(test_data); +} diff --git a/test/linear_algebra.cpp b/test/linear_algebra.cpp index 31b5edc9b..fee0c10d7 100644 --- a/test/linear_algebra.cpp +++ b/test/linear_algebra.cpp @@ -1,50 +1,50 @@ -// #include -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// } -// -// #include "test_utils.h" -// -// TEST(LinearAlgebraTest, VectorApplyActivation) { -// size_t length = 3; -// float *a = (float *)malloc(sizeof(float) * length); -// float *b = (float *)malloc(sizeof(float) * length); -// -// a[0] = 1.0f; -// a[1] = -1.2f; -// a[2] = -0.2f; -// -// vector_apply_activation(a, b, length, sigmoid, false); -// -// for (size_t i = 0; i < length; i++) { -// ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); -// } -// -// free(a); -// free(b); -// } -// -// TEST(LinearAlgebraTest, HadamardProduct) { -// size_t length = 3; -// float *a = (float *)malloc(sizeof(float) * length); -// float *b = (float *)malloc(sizeof(float) * length); -// -// a[0] = 1.0f; -// a[1] = -1.2f; -// a[2] = -0.2f; -// -// b[0] = 
1.2f; -// b[1] = -2.2f; -// b[2] = -0.4f; -// -// hadamard_product(a, b, b, length); -// -// ASSERT_FLOAT_EQ(b[0], 1.2f); -// ASSERT_FLOAT_EQ(b[1], 2.64f); -// ASSERT_FLOAT_EQ(b[2], 0.08f); -// -// free(a); -// free(b); -// } +#include +#include + +extern "C" { +#include "cneuron/cneuron.h" +} + +#include "test_utils.h" + +TEST(LinearAlgebraTest, VectorApplyActivation) { + size_t length = 3; + float *a = (float *)malloc(sizeof(float) * length); + float *b = (float *)malloc(sizeof(float) * length); + + a[0] = 1.0f; + a[1] = -1.2f; + a[2] = -0.2f; + + vector_apply_activation(a, b, length, sigmoid, false); + + for (size_t i = 0; i < length; i++) { + ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); + } + + free(a); + free(b); +} + +TEST(LinearAlgebraTest, HadamardProduct) { + size_t length = 3; + float *a = (float *)malloc(sizeof(float) * length); + float *b = (float *)malloc(sizeof(float) * length); + + a[0] = 1.0f; + a[1] = -1.2f; + a[2] = -0.2f; + + b[0] = 1.2f; + b[1] = -2.2f; + b[2] = -0.4f; + + hadamard_product(a, b, b, length); + + ASSERT_FLOAT_EQ(b[0], 1.2f); + ASSERT_FLOAT_EQ(b[1], 2.64f); + ASSERT_FLOAT_EQ(b[2], 0.08f); + + free(a); + free(b); +} diff --git a/test/main.cpp b/test/main.cpp index 5bebbc677..0a4376670 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,7 @@ -// #include -// -// int main(int argc, char **argv) { -// srand(time(NULL)); -// ::testing::InitGoogleTest(&argc, argv); -// return RUN_ALL_TESTS(); -// } +#include + +int main(int argc, char **argv) { + srand(time(NULL)); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/network.cpp b/test/network.cpp index 6e56ca377..36010ab0b 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,246 +1,246 @@ -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// #include "../src/rand.h" -// } -// -// #include -// -// #include "test_utils.h" -// -// TEST(NetworkTest, RandomFloat) { -// float test = randf(&randc, 0.0f, 1.0f); -// bool same = 
true; -// for (int i = 0; i < 10; i++) { -// if (test != randf(&randc, 0.0f, 1.0f)) { -// same = false; -// break; -// } -// } -// ASSERT_FALSE(same); -// } -// -// TEST(NetworkTest, GetLayer) { -// size_t layer_length = 3; -// layer *test_layer = get_layer(layer_length, 5); -// -// ASSERT_NE(test_layer, nullptr); -// ASSERT_NE(test_layer->delta, nullptr); -// ASSERT_NE(test_layer->weighted_input, nullptr); -// ASSERT_NE(test_layer->weights, nullptr); -// ASSERT_NE(test_layer->bias, nullptr); -// ASSERT_NE(test_layer->output, nullptr); -// -// ASSERT_EQ(test_layer->length, layer_length); -// -// free_layer(test_layer); -// } -// -// TEST(NetworkTest, GetNeuralNetwork) { -// size_t layer_length = 3; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// layer_lengths[1] = 3; -// layer_lengths[2] = 4; -// size_t inputs_length = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// ASSERT_NE(nn, nullptr); -// ASSERT_EQ(nn->length, layer_length); -// ASSERT_EQ(nn->inputs_length, inputs_length); -// ASSERT_EQ(nn->activation_function, &sigmoid); -// ASSERT_NE(nn->layers, nullptr); -// for (size_t i = 0; i < layer_length; i++) { -// ASSERT_NE(nn->layers[i], nullptr); -// ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); -// ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); -// ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); -// } -// -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, FreeDataset) { -// size_t layer_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); -// -// free_neural_network(nn); -// // No crash -// free(layer_lengths); -// } -// -// TEST(NetworkTest, FreeLayer) { -// size_t layer_length = 2; -// layer *test_layer = get_layer(layer_length, 3); -// -// free_layer(test_layer); -// // No crash -// } -// -// TEST(NetworkTest, ComputeNetwork) { -// size_t layer_length = 1; -// size_t inputs_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 1; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// float *inputs = (float *)malloc(sizeof(float) * inputs_length); -// inputs[0] = 0.2f; -// -// nn->layers[0]->weights[0] = 0.5f; -// nn->layers[0]->bias[0] = 0.3f; -// -// compute_network(nn, inputs); -// -// ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); -// -// free(inputs); -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, Softmax) { -// size_t layer_length = 1; -// size_t inputs_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 3; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// nn->layers[0]->output[0] = 0.2f; -// nn->layers[0]->output[1] = 0.3f; -// nn->layers[0]->output[2] = 0.5f; -// -// ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); -// ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); -// ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); -// -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// // Output layer only -// TEST(NetworkTest, StochasticGDSingleLayer) { -// dataset 
*test_dataset = get_or(); -// -// // Create network -// size_t layer_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 50000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 10000 == 0) { -// printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_dataset(test_dataset); -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, StochasticGDTests) { -// dataset *test_dataset = get_xor(); -// -// // Multi layer test -// size_t layer_length = 2; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 4; -// layer_lengths[1] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 500000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 100000 == 0) { -// printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// -// // Non-linearly separable test -// layer_length = 1; -// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, 
&sigmoid); -// -// for (size_t i = 0; i < 50000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 10000 == 0) { -// printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// free_dataset(test_dataset); -// } -// -// TEST(NetworkTest, MiniBatchGDTests) { -// dataset *test_dataset = get_xor(); -// -// // Multi layer test -// size_t layer_length = 2; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 4; -// layer_lengths[1] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 1500000; i++) { -// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); -// mini_batch_gd(nn, 0.001f, batch_dataset); -// free_dataset(batch_dataset); -// if (i % 200000 == 0) { -// printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// -// // Non-linearly separable test -// layer_length = 1; -// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 100000; i++) { -// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); -// mini_batch_gd(nn, 0.001f, batch_dataset); -// free_dataset(batch_dataset); -// if (i % 20000 
== 0) { -// printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// free_dataset(test_dataset); -// } +#include + +extern "C" { +#include "cneuron/cneuron.h" +#include "../src/rand.h" +} + +#include + +#include "test_utils.h" + +TEST(NetworkTest, RandomFloat) { + float test = randf(&randc, 0.0f, 1.0f); + bool same = true; + for (int i = 0; i < 10; i++) { + if (test != randf(&randc, 0.0f, 1.0f)) { + same = false; + break; + } + } + ASSERT_FALSE(same); +} + +TEST(NetworkTest, GetLayer) { + size_t layer_length = 3; + layer *test_layer = get_layer(layer_length, 5); + + ASSERT_NE(test_layer, nullptr); + ASSERT_NE(test_layer->delta, nullptr); + ASSERT_NE(test_layer->weighted_input, nullptr); + ASSERT_NE(test_layer->weights, nullptr); + ASSERT_NE(test_layer->bias, nullptr); + ASSERT_NE(test_layer->output, nullptr); + + ASSERT_EQ(test_layer->length, layer_length); + + free_layer(test_layer); +} + +TEST(NetworkTest, GetNeuralNetwork) { + size_t layer_length = 3; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + layer_lengths[1] = 3; + layer_lengths[2] = 4; + size_t inputs_length = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + ASSERT_NE(nn, nullptr); + ASSERT_EQ(nn->length, layer_length); + ASSERT_EQ(nn->inputs_length, inputs_length); + ASSERT_EQ(nn->activation_function, &sigmoid); + ASSERT_NE(nn->layers, nullptr); + for (size_t i = 0; i < layer_length; i++) { + ASSERT_NE(nn->layers[i], nullptr); + ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); + ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); + ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); + } + + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, FreeDataset) { + size_t layer_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); + + free_neural_network(nn); + // No crash + free(layer_lengths); +} + +TEST(NetworkTest, FreeLayer) { + size_t layer_length = 2; + layer *test_layer = get_layer(layer_length, 3); + + free_layer(test_layer); + // No crash +} + +TEST(NetworkTest, ComputeNetwork) { + size_t layer_length = 1; + size_t inputs_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 1; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + float *inputs = (float *)malloc(sizeof(float) * inputs_length); + inputs[0] = 0.2f; + + nn->layers[0]->weights[0] = 0.5f; + nn->layers[0]->bias[0] = 0.3f; + + compute_network(nn, inputs); + + ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); + + free(inputs); + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, Softmax) { + size_t layer_length = 1; + size_t inputs_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 3; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + nn->layers[0]->output[0] = 0.2f; + nn->layers[0]->output[1] = 0.3f; + nn->layers[0]->output[2] = 0.5f; + + ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); + ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); + ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); + + free_neural_network(nn); + free(layer_lengths); +} + +// Output layer only +TEST(NetworkTest, StochasticGDSingleLayer) { + dataset *test_dataset = get_or(); + + // Create network + size_t layer_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + 
layer_lengths[0] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 50000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 10000 == 0) { + printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_dataset(test_dataset); + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, StochasticGDTests) { + dataset *test_dataset = get_xor(); + + // Multi layer test + size_t layer_length = 2; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 4; + layer_lengths[1] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 500000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 100000 == 0) { + printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + + // Non-linearly separable test + layer_length = 1; + layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 50000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 10000 == 0) { + printf("Stochastic Non-linearly separable learn 
cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + free_dataset(test_dataset); +} + +TEST(NetworkTest, MiniBatchGDTests) { + dataset *test_dataset = get_xor(); + + // Multi layer test + size_t layer_length = 2; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 4; + layer_lengths[1] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 1500000; i++) { + dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); + mini_batch_gd(nn, 0.001f, batch_dataset); + free_dataset(batch_dataset); + if (i % 200000 == 0) { + printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + + // Non-linearly separable test + layer_length = 1; + layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 100000; i++) { + dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); + mini_batch_gd(nn, 0.001f, batch_dataset); + free_dataset(batch_dataset); + if (i % 20000 == 0) { + printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + free_dataset(test_dataset); +} diff --git a/test/test_utils.cpp 
b/test/test_utils.cpp index def5f48ca..5513f015d 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -1,77 +1,77 @@ -// #include "test_utils.h" -// -// #include -// -// float sigmoid(float val, bool is_deravative) { -// float result = 1.0f / (1.0f + expf(-val)); -// if (is_deravative == 1) { -// return result * (1.0f - result); -// } -// return result; -// } -// -// dataset *get_xor() { -// // Create data -// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); -// test_dataset->length = 4; -// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); -// test_dataset->datas = datas; -// test_dataset->inputs_length = 2; -// -// for (size_t i = 0; i < test_dataset->length; i++) { -// test_dataset->datas[i] = (data *)malloc(sizeof(data)); -// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); -// } -// -// // XOR gate -// test_dataset->datas[0]->inputs[0] = 1.0f; -// test_dataset->datas[0]->inputs[1] = 1.0f; -// test_dataset->datas[0]->expected_index = 0; -// -// test_dataset->datas[1]->inputs[0] = 0.0f; -// test_dataset->datas[1]->inputs[1] = 0.0f; -// test_dataset->datas[1]->expected_index = 0; -// -// test_dataset->datas[2]->inputs[0] = 0.0f; -// test_dataset->datas[2]->inputs[1] = 1.0f; -// test_dataset->datas[2]->expected_index = 1; -// -// test_dataset->datas[3]->inputs[0] = 1.0f; -// test_dataset->datas[3]->inputs[1] = 0.0f; -// test_dataset->datas[3]->expected_index = 1; -// -// return test_dataset; -// } -// -// dataset *get_or() { -// // Create data -// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); -// test_dataset->length = 4; -// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); -// test_dataset->datas = datas; -// test_dataset->inputs_length = 2; -// -// for (size_t i = 0; i < test_dataset->length; i++) { -// test_dataset->datas[i] = (data *)malloc(sizeof(data)); -// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * 
test_dataset->inputs_length); -// } -// -// // OR gate -// test_dataset->datas[1]->inputs[0] = 0.0f; -// test_dataset->datas[1]->inputs[1] = 0.0f; -// test_dataset->datas[1]->expected_index = 0; -// -// test_dataset->datas[0]->inputs[0] = 1.0f; -// test_dataset->datas[0]->inputs[1] = 1.0f; -// test_dataset->datas[0]->expected_index = 1; -// -// test_dataset->datas[2]->inputs[0] = 0.0f; -// test_dataset->datas[2]->inputs[1] = 1.0f; -// test_dataset->datas[2]->expected_index = 1; -// -// test_dataset->datas[3]->inputs[0] = 1.0f; -// test_dataset->datas[3]->inputs[1] = 0.0f; -// test_dataset->datas[3]->expected_index = 1; -// -// return test_dataset; -// } +#include "test_utils.h" + +#include + +float sigmoid(float val, bool is_deravative) { + float result = 1.0f / (1.0f + expf(-val)); + if (is_deravative == 1) { + return result * (1.0f - result); + } + return result; +} + +dataset *get_xor() { + // Create data + dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); + test_dataset->length = 4; + data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); + test_dataset->datas = datas; + test_dataset->inputs_length = 2; + + for (size_t i = 0; i < test_dataset->length; i++) { + test_dataset->datas[i] = (data *)malloc(sizeof(data)); + test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); + } + + // XOR gate + test_dataset->datas[0]->inputs[0] = 1.0f; + test_dataset->datas[0]->inputs[1] = 1.0f; + test_dataset->datas[0]->expected_index = 0; + + test_dataset->datas[1]->inputs[0] = 0.0f; + test_dataset->datas[1]->inputs[1] = 0.0f; + test_dataset->datas[1]->expected_index = 0; + + test_dataset->datas[2]->inputs[0] = 0.0f; + test_dataset->datas[2]->inputs[1] = 1.0f; + test_dataset->datas[2]->expected_index = 1; + + test_dataset->datas[3]->inputs[0] = 1.0f; + test_dataset->datas[3]->inputs[1] = 0.0f; + test_dataset->datas[3]->expected_index = 1; + + return test_dataset; +} + +dataset *get_or() { + // Create data + 
dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); + test_dataset->length = 4; + data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); + test_dataset->datas = datas; + test_dataset->inputs_length = 2; + + for (size_t i = 0; i < test_dataset->length; i++) { + test_dataset->datas[i] = (data *)malloc(sizeof(data)); + test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); + } + + // OR gate + test_dataset->datas[1]->inputs[0] = 0.0f; + test_dataset->datas[1]->inputs[1] = 0.0f; + test_dataset->datas[1]->expected_index = 0; + + test_dataset->datas[0]->inputs[0] = 1.0f; + test_dataset->datas[0]->inputs[1] = 1.0f; + test_dataset->datas[0]->expected_index = 1; + + test_dataset->datas[2]->inputs[0] = 0.0f; + test_dataset->datas[2]->inputs[1] = 1.0f; + test_dataset->datas[2]->expected_index = 1; + + test_dataset->datas[3]->inputs[0] = 1.0f; + test_dataset->datas[3]->inputs[1] = 0.0f; + test_dataset->datas[3]->expected_index = 1; + + return test_dataset; +} diff --git a/test/test_utils.h b/test/test_utils.h index 02dbbe9ce..3c310aef2 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -1,12 +1,12 @@ -// #ifndef TEST_UTILS_H -// #define TEST_UTILS_H -// -// #include "cneuron/cneuron.h" -// -// float sigmoid(float val, bool is_deravative); -// -// dataset *get_xor(); -// -// dataset *get_or(); -// -// #endif +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +#include "cneuron/cneuron.h" + +float sigmoid(float val, bool is_deravative); + +dataset *get_xor(); + +dataset *get_or(); + +#endif From 3c5a2c3747c75e533115909b600d420a9e8c31f2 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 21:37:28 +0800 Subject: [PATCH 06/11] optimize rand a bit --- src/data.c | 2 +- src/main.c | 2 -- src/network.c | 2 +- src/rand.c | 46 ++++++++++++++++++++++++++++------------------ src/rand.h | 9 ++++----- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/data.c b/src/data.c index f3bae3913..d7c0eb984 
100644 --- a/src/data.c +++ b/src/data.c @@ -137,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, 0, source_dataset->length)], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, source_dataset->length, 0)], source_dataset->inputs_length); } return new_dataset; diff --git a/src/main.c b/src/main.c index e1b6f4e06..fc3cb1b30 100644 --- a/src/main.c +++ b/src/main.c @@ -98,8 +98,6 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; - prng_init(&__randstate, seed); dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 7e5e2db6b..93ae307a1 100644 --- a/src/network.c +++ b/src/network.c @@ -201,7 +201,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[randnum_u32(&randc, 0, test_dataset->length)]; + data *test_data = test_dataset->datas[randnum_u32(&randc, test_dataset->length, 0)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index c01a6adbb..29428c44c 100644 --- a/src/rand.c +++ b/src/rand.c @@ -2,46 +2,56 @@ #include #include +#include "shishua/shishua-sse2.h" -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { +static prng_state __randstate; + +__attribute__((constructor)) static void auto_seed_rand(void) { + static uint64_t seed[4] = {0x9E3779B97F4A7C15, 
0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + prng_init(&__randstate, seed); + prng_gen(&__randstate, randc.buf, 1024); +} + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { assert(max > min); - assert(randc->count <= 255); - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; - randnum += min; + assert(randc->count <= 1023); + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + randnum += offset; randc->count++; - if (randc->count > 255) { + if (randc->count > 1023) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randnum; } -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { assert(max > min); - assert(randc->count <= 252); + assert(randc->count <= 1020); uint32_t value; + // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; memcpy(&value, &randc->buf[randc->count], sizeof(value)); - uint32_t randnum = ((uint64_t)value * (max - min)) >> 32; - randnum += min; - randc->count += sizeof(uint32_t); - if (randc->count > 252) { + uint32_t randnum = ((uint64_t)value * range) >> 32; + randnum += offset; + // randc->count += sizeof(uint32_t); + if (randc->count > 1020) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 252); + assert(randc->count <= 1021); uint32_t value; - memcpy(&value, &randc->buf[randc->count], sizeof(value) - 1); + memcpy(&value, &randc->buf[randc->count], 3); // float randfloat = ((float)value / UINT32_MAX) * range + offset; float randfloat = (value / 16777216.0f) * range + offset; - randc->count += sizeof(value) 
- 1; - if (randc->count > 253) { + randc->count += 3; + if (randc->count > 1021) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randfloat; } diff --git a/src/rand.h b/src/rand.h index ba6f5159d..ed1918824 100644 --- a/src/rand.h +++ b/src/rand.h @@ -7,13 +7,12 @@ struct rand_chunk { size_t count; - uint8_t buf[256]; // NOTE: must be multiple of 256 + uint8_t buf[1024]; // NOTE: must be multiple of 256 }; -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); float randf(struct rand_chunk *randc, float range, float offset); - -static prng_state __randstate; static struct rand_chunk randc = {0}; + #endif From 046e062091752f11c0fc1a9c75a57ed66ddeb330 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 09:55:29 +0800 Subject: [PATCH 07/11] randf optimized --- src/rand.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/rand.c b/src/rand.c index 29428c44c..fe364809a 100644 --- a/src/rand.c +++ b/src/rand.c @@ -1,6 +1,7 @@ #include "rand.h" #include +#include #include #include "shishua/shishua-sse2.h" @@ -44,11 +45,15 @@ uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) float randf(struct rand_chunk *randc, float range, float offset) { assert(range); assert(randc->count <= 1021); - uint32_t value; - memcpy(&value, &randc->buf[randc->count], 3); - // float randfloat = ((float)value / UINT32_MAX) * range + offset; - float randfloat = (value / 16777216.0f) * range + offset; - randc->count += 3; + // uint32_t value; + // memcpy(&value, &randc->buf[randc->count], 4); + uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; + 
// value |= 0x3F800000; + // float randfloat; + // randfloat = *(float *)&value; + // memcpy(&randfloat, &value, sizeof(randfloat)); + // randfloat = (randfloat - 1) * range + offset; + float randfloat = (value) / 16777216.0f * range + offset; if (randc->count > 1021) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); From 0fef7593f9649678b389868b703f0ea919e70039 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 12:13:55 +0800 Subject: [PATCH 08/11] Fix: Fixed index bug --- src/rand.c | 2 +- src/rand.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/rand.c b/src/rand.c index fe364809a..852382a54 100644 --- a/src/rand.c +++ b/src/rand.c @@ -34,7 +34,7 @@ uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; - // randc->count += sizeof(uint32_t); + randc->count += sizeof(randnum); if (randc->count > 1020) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); diff --git a/src/rand.h b/src/rand.h index ed1918824..0becfd0a4 100644 --- a/src/rand.h +++ b/src/rand.h @@ -1,13 +1,14 @@ #ifndef SHISHUA_RAND_H #define SHISHUA_RAND_H +#include #include #include #include "../external/shishua/shishua.h" struct rand_chunk { - size_t count; - uint8_t buf[1024]; // NOTE: must be multiple of 256 + size_t count __attribute__((aligned(32))); + uint8_t buf[1024] __attribute__((aligned(32)));// NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); From 75bc06a461e17f1cef88a3d15d223df67033231e Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 13:35:28 +0800 Subject: [PATCH 09/11] Fix: prng_gen work at beginning --- src/rand.c | 35 ++++++++++++++--------------------- src/rand.h | 10 +++++++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/rand.c b/src/rand.c index 852382a54..0b0f60e9b 100644 
--- a/src/rand.c +++ b/src/rand.c @@ -1,11 +1,7 @@ -#include "rand.h" - #include -#include #include -#include "shishua/shishua-sse2.h" -static prng_state __randstate; +#include "rand.h" __attribute__((constructor)) static void auto_seed_rand(void) { static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; @@ -14,37 +10,38 @@ __attribute__((constructor)) static void auto_seed_rand(void) { } uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { - assert(max > min); - assert(randc->count <= 1023); - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; - randnum += offset; - randc->count++; + assert(range); if (randc->count > 1023) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); } + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + randnum += offset; + randc->count++; return randnum; } uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { - assert(max > min); - assert(randc->count <= 1020); + assert(range); + if (randc->count > 1020) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 1024); + } uint32_t value; // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; randc->count += sizeof(randnum); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); - } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 1021); + if (randc->count > 1020) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 1024); + } // uint32_t value; // memcpy(&value, &randc->buf[randc->count], 4); uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | 
randc->buf[randc->count++]; @@ -54,10 +51,6 @@ float randf(struct rand_chunk *randc, float range, float offset) { // memcpy(&randfloat, &value, sizeof(randfloat)); // randfloat = (randfloat - 1) * range + offset; float randfloat = (value) / 16777216.0f * range + offset; - if (randc->count > 1021) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); - } return randfloat; } diff --git a/src/rand.h b/src/rand.h index 0becfd0a4..5d864f4e3 100644 --- a/src/rand.h +++ b/src/rand.h @@ -7,13 +7,17 @@ #include "../external/shishua/shishua.h" struct rand_chunk { - size_t count __attribute__((aligned(32))); - uint8_t buf[1024] __attribute__((aligned(32)));// NOTE: must be multiple of 256 + size_t count; + uint8_t buf[1024];// NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); float randf(struct rand_chunk *randc, float range, float offset); -static struct rand_chunk randc = {0}; +static struct rand_chunk randc = { + .count = 2000, + .buf = {0} +}; +static prng_state __randstate; #endif From b2bd7b298034179b481d6ec8b56568d7f2b2ae2d Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 13:46:09 +0800 Subject: [PATCH 10/11] Fix: range issue with randf usage throughout the files --- src/main.c | 6 +++--- test/network.cpp | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main.c b/src/main.c index fc3cb1b30..f63223be5 100644 --- a/src/main.c +++ b/src/main.c @@ -40,9 +40,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, 
IMAGE_SIZE, randf(&randc, -3.0f, 3.0f), randf(&randc, -3.0f, 3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 10.0f, -5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 1.2f, -0.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 6.0f, -3.0f), randf(&randc, 6.0f, -3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); diff --git a/test/network.cpp b/test/network.cpp index 36010ab0b..a213b27b1 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -10,10 +10,13 @@ extern "C" { #include "test_utils.h" TEST(NetworkTest, RandomFloat) { - float test = randf(&randc, 0.0f, 1.0f); + // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + // prng_init(&__randstate, seed); + // prng_gen(&__randstate, randc.buf, 1024); + float test = randf(&randc, 1.0f, 0.0f); bool same = true; for (int i = 0; i < 10; i++) { - if (test != randf(&randc, 0.0f, 1.0f)) { + if (test != randf(&randc, 1.0f, 0.0f)) { same = false; break; } From 6a28349489bb5dcd0c2594a9248d1cf561bfaeff Mon Sep 17 00:00:00 2001 From: Lee Hoon Lim Date: Fri, 13 Jun 2025 16:07:21 +0800 Subject: [PATCH 11/11] Remove warnings and cleanup --- CMakeLists.txt | 2 +- external/shishua/shishua-sse2.h | 2 +- include/cneuron/cneuron.h | 8 ----- src/data.c | 6 ++-- src/main.c | 6 ++-- src/network.c | 4 +-- src/rand.c | 53 +++++++++++++++++++-------------- src/rand.h | 19 ++++++------ test/main.cpp | 1 - test/network.cpp | 15 ++++------ 10 files changed, 56 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95245ad12..359fce592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(cneuron C CXX) set(CMAKE_C_STANDARD 17) set(CMAKE_C_STANDARD_REQUIRED ON) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/external/shishua/shishua-sse2.h 
b/external/shishua/shishua-sse2.h index a9648fb0b..812288e6c 100644 --- a/external/shishua/shishua-sse2.h +++ b/external/shishua/shishua-sse2.h @@ -185,7 +185,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -static void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { // Note: output is uninitialized at first, but since we pass NULL, its value // is initially ignored. s->counter[0] = _mm_setzero_si128(); diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index 1d583f2e7..5ea530423 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -157,14 +157,6 @@ typedef struct { float (*activation_function)(float, bool); /**< Pointer to the activation function used in the network. */ } neural_network; -/** - * @brief Generates a random floating-point number within a given range. - * - * @param min Minimum value for the random number. - * @param max Maximum value for the random number. - * @return A random float between min and max. - */ - /** * @brief Allocates and initializes a new layer. 
* diff --git a/src/data.c b/src/data.c index d7c0eb984..94172cff5 100644 --- a/src/data.c +++ b/src/data.c @@ -137,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, source_dataset->length, 0)], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(source_dataset->length, 0)], source_dataset->inputs_length); } return new_dataset; @@ -243,9 +243,9 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = randf(&randc, 1, 0); + float random_value = randf(1, 0); if (random_value <= probability) { - float noise = randf(&randc, noise_factor, 0); + float noise = randf(noise_factor, 0); float new_value = data->inputs[i] + noise; data->inputs[i] = fmin(new_value, 1.0f); diff --git a/src/main.c b/src/main.c index f63223be5..87174a984 100644 --- a/src/main.c +++ b/src/main.c @@ -40,9 +40,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 10.0f, -5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 1.2f, -0.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 6.0f, -3.0f), randf(&randc, 6.0f, -3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(10.0f, -5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(1.2f, -0.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(6.0f, -3.0f), randf(6.0f, -3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); 
diff --git a/src/network.c b/src/network.c index 93ae307a1..504d777ab 100644 --- a/src/network.c +++ b/src/network.c @@ -25,7 +25,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = randf(&randc, 2.0f, -1.0f); + new_layer->weights[i] = randf(2.0f, -1.0f); new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -201,7 +201,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[randnum_u32(&randc, test_dataset->length, 0)]; + data *test_data = test_dataset->datas[randnum_u32(test_dataset->length, 0)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index 0b0f60e9b..55b27a4be 100644 --- a/src/rand.c +++ b/src/rand.c @@ -1,50 +1,61 @@ +#include "rand.h" + #include #include +#include -#include "rand.h" +struct rand_chunk randc = { + .count = 2000, + .buf = {0}}; + +prng_state __randstate = {0}; -__attribute__((constructor)) static void auto_seed_rand(void) { - static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +__attribute__((constructor)) void auto_seed_rand(void) { + uint64_t seed[4] = {time(NULL), 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; prng_init(&__randstate, seed); prng_gen(&__randstate, randc.buf, 1024); } -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { +uint8_t randnum_u8(uint8_t range, uint8_t offset) { assert(range); - if (randc->count > 1023) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1023) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + 
uint8_t randnum = ((uint16_t)randc.buf[randc.count] * range) >> 8; randnum += offset; - randc->count++; + randc.count++; return randnum; } -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { +uint32_t randnum_u32(uint32_t range, uint32_t offset) { assert(range); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1020) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } uint32_t value; // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; - memcpy(&value, &randc->buf[randc->count], sizeof(value)); + memcpy(&value, &randc.buf[randc.count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; - randc->count += sizeof(randnum); + randc.count += sizeof(randnum); return randnum; } -float randf(struct rand_chunk *randc, float range, float offset) { +float randf(float range, float offset) { assert(range); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1020) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } // uint32_t value; // memcpy(&value, &randc->buf[randc->count], 4); - uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; + uint32_t byte1 = randc.buf[randc.count++]; + uint32_t byte2 = randc.buf[randc.count++]; + uint32_t byte3 = randc.buf[randc.count++]; + + uint32_t value = (byte1 << 16) | (byte2 << 8) | byte3; // value |= 0x3F800000; // float randfloat; // randfloat = *(float *)&value; @@ -54,8 +65,6 @@ float randf(struct rand_chunk *randc, float range, float offset) { return randfloat; } - - // NOTE: example // int main() { diff --git a/src/rand.h b/src/rand.h index 5d864f4e3..7184630e1 100644 --- a/src/rand.h +++ b/src/rand.h @@ -4,20 +4,19 @@ #include #include #include -#include 
"../external/shishua/shishua.h" + +#include "shishua/shishua.h" struct rand_chunk { size_t count; - uint8_t buf[1024];// NOTE: must be multiple of 256 + uint8_t buf[1024]; // NOTE: must be multiple of 256 }; -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); -float randf(struct rand_chunk *randc, float range, float offset); -static struct rand_chunk randc = { - .count = 2000, - .buf = {0} -}; +uint8_t randnum_u8(uint8_t range, uint8_t offset); +uint32_t randnum_u32(uint32_t range, uint32_t offset); +float randf(float range, float offset); + +extern struct rand_chunk randc; -static prng_state __randstate; +extern prng_state __randstate; #endif diff --git a/test/main.cpp b/test/main.cpp index 0a4376670..76f841f1b 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,6 @@ #include int main(int argc, char **argv) { - srand(time(NULL)); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/test/network.cpp b/test/network.cpp index a213b27b1..489d650ae 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,8 +1,8 @@ #include extern "C" { -#include "cneuron/cneuron.h" #include "../src/rand.h" +#include "cneuron/cneuron.h" } #include @@ -10,13 +10,10 @@ extern "C" { #include "test_utils.h" TEST(NetworkTest, RandomFloat) { - // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; - // prng_init(&__randstate, seed); - // prng_gen(&__randstate, randc.buf, 1024); - float test = randf(&randc, 1.0f, 0.0f); + float test = randf(1.0f, 0.0f); bool same = true; for (int i = 0; i < 10; i++) { - if (test != randf(&randc, 1.0f, 0.0f)) { + if (test != randf(1.0f, 0.0f)) { same = false; break; } @@ -137,7 +134,7 @@ TEST(NetworkTest, StochasticGDSingleLayer) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, 
test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 10000 == 0) { printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -164,7 +161,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 500000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.001f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 100000 == 0) { printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -185,7 +182,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 10000 == 0) { printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length));