From 593edc02d578a290a6a8ec85851a2083a930d5db Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 12:52:03 +0800 Subject: [PATCH 01/11] helper function for random --- src/helper/shishua/rand.c | 55 ++++++ src/helper/shishua/rand.h | 16 ++ src/helper/shishua/shishua-avx2.h | 111 ++++++++++++ src/helper/shishua/shishua-half-avx2.h | 96 ++++++++++ src/helper/shishua/shishua-half-neon.h | 155 ++++++++++++++++ src/helper/shishua/shishua-half-sse2.h | 194 ++++++++++++++++++++ src/helper/shishua/shishua-half.h | 195 +++++++++++++++++++++ src/helper/shishua/shishua-neon.h | 180 +++++++++++++++++++ src/helper/shishua/shishua-sse2.h | 214 ++++++++++++++++++++++ src/helper/shishua/shishua.h | 234 +++++++++++++++++++++++++ 10 files changed, 1450 insertions(+) create mode 100644 src/helper/shishua/rand.c create mode 100644 src/helper/shishua/rand.h create mode 100644 src/helper/shishua/shishua-avx2.h create mode 100644 src/helper/shishua/shishua-half-avx2.h create mode 100644 src/helper/shishua/shishua-half-neon.h create mode 100644 src/helper/shishua/shishua-half-sse2.h create mode 100644 src/helper/shishua/shishua-half.h create mode 100644 src/helper/shishua/shishua-neon.h create mode 100644 src/helper/shishua/shishua-sse2.h create mode 100644 src/helper/shishua/shishua.h diff --git a/src/helper/shishua/rand.c b/src/helper/shishua/rand.c new file mode 100644 index 000000000..fe566e4dd --- /dev/null +++ b/src/helper/shishua/rand.c @@ -0,0 +1,55 @@ +#include +#include + +#include "rand.h" + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { + assert(max > min); + assert(randc->count <= 127); + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; + randnum += min; + randc->count++; + return randnum; +} + +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { + assert(max > min); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + uint32_t randnum 
= ((uint64_t)value * (max - min)) >> 32; + randnum += min; + randc->count += sizeof(uint32_t); + return randnum; +} + +float randf(struct rand_chunk *randc, float range, float offset) { + assert(range); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + // float randfloat = ((float)value / UINT32_MAX) * range + offset; + float randfloat = ((value >> 8) / 16777216.0f) * range + offset; + randc->count += sizeof(float); + return randfloat; +} + +//NOTE: example + +// int main() { +// prng_state state; +// static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +// struct rand_chunk randc = {0}; +// prng_init(&state, seed); +// prng_gen(&state, randc.buf, 128); +// +// for (int i = 0; i < 1000; i++) { +// float randfloat = randf(&randc, 2, -1); +// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// randc.count = 0; +// prng_gen(&state, randc.buf, 128); +// } +// printf("%f, ", randfloat); +// } +// return 0; +// } diff --git a/src/helper/shishua/rand.h b/src/helper/shishua/rand.h new file mode 100644 index 000000000..bdfc71f84 --- /dev/null +++ b/src/helper/shishua/rand.h @@ -0,0 +1,16 @@ +#ifndef SHISHUA_RAND_H +#define SHISHUA_RAND_H + +#include "shishua.h" +#include + +struct rand_chunk{ + size_t count; + uint8_t buf[128]; //NOTE: must be multiple of 128 +}; + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +float randf(struct rand_chunk *randc, float range, float offset); + +#endif diff --git a/src/helper/shishua/shishua-avx2.h b/src/helper/shishua/shishua-avx2.h new file mode 100644 index 000000000..c2c865e80 --- /dev/null +++ b/src/helper/shishua/shishua-avx2.h @@ -0,0 +1,111 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + 
__m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. + o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; + s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 1 +# define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); + s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); + s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 128 * STEPS); + s->state[0] = s->output[3]; s->state[1] = s->output[2]; + s->state[2] = s->output[1]; s->state[3] = s->output[0]; + } +# undef STEPS +# undef ROUNDS +} +#endif diff --git a/src/helper/shishua/shishua-half-avx2.h b/src/helper/shishua/shishua-half-avx2.h new file mode 100644 index 000000000..b21f46174 --- /dev/null +++ b/src/helper/shishua/shishua-half-avx2.h @@ -0,0 +1,96 @@ +#ifndef SHISHUA_H +#define SHISHUA_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[2]; + __m256i output; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i s0 = s->state[0], counter = s->counter, + s1 = s->state[1], o = s->output, + t0, t1, t2, t3, u0, u1, u2, u3; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^69 bytes = 512 EiB to the period, + // or about 1 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i*)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. 
We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; s->counter = counter; + s->state[1] = s1; s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 5 +# define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +# undef STEPS +# undef ROUNDS +} +#endif diff --git a/src/helper/shishua/shishua-half-neon.h b/src/helper/shishua/shishua-half-neon.h new file mode 100644 index 000000000..f946a9aee --- /dev/null +++ b/src/helper/shishua/shishua-half-neon.h @@ -0,0 +1,155 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm) \ + ) \ + ) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size/32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. 
We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + __asm__("" : "+w" (u_lo), "+w" (u_hi)); +#endif + + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; s->state[1] = s0_hi; + s->state[2] = s1_lo; s->state[3] = s1_hi; + s->output[0] = o_lo; s->output[1] = o_hi; + s->counter[0] = counter_lo; s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; s->state[1] = s->state[3]; + s->state[2] = s->output[0]; s->state[3] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/src/helper/shishua/shishua-half-sse2.h b/src/helper/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..d6f958930 --- /dev/null +++ b/src/helper/shishua/shishua-half-sse2.h @@ -0,0 +1,194 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +# include // SSSE3 +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +# include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. 
+# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt) \ + ) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +# define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a) \ + ) +# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i*)&buf[i+ 0], s->output[0]); + _mm_storeu_si128((__m128i*)&buf[i+16], s->output[1]); + } + + // Lane 0 + + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = _mm_srli_epi64(s0_lo, 1); + u_hi = _mm_srli_epi64(s0_hi, 1); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. 
+ s0_lo = _mm_add_epi64(u_lo, t0_lo); + s0_hi = _mm_add_epi64(u_hi, t0_hi); + + // Lane 1 + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = _mm_add_epi64(s1_lo, counter_lo); + s1_hi = _mm_add_epi64(s1_hi, counter_hi); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + + // Same as above, but with different shift amounts. + t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); + t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); + + s1_lo = _mm_srli_epi64(s1_lo, 3); + s1_hi = _mm_srli_epi64(s1_hi, 3); + + s1_lo = _mm_add_epi64(s1_lo, t1_lo); + s1_hi = _mm_add_epi64(s1_hi, t1_hi); + + // Two orthogonally grown pieces evolving independently, XORed. + s->output[0] = _mm_xor_si128(u_lo, t1_lo); + s->output[1] = _mm_xor_si128(u_hi, t1_hi); + } + + s->state[0] = s0_lo; s->state[1] = s0_hi; + s->state[2] = s1_lo; s->state[3] = s1_hi; + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + // XXX: better way to do this? 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; s->state[1] = s->state[3]; + s->state[2] = s->output[0]; s->state[3] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT + +#endif diff --git a/src/helper/shishua/shishua-half.h b/src/helper/shishua/shishua-half.h new file mode 100644 index 000000000..a1ea715b1 --- /dev/null +++ b/src/helper/shishua/shishua-half.h @@ -0,0 +1,195 @@ +#ifndef SHISHUA_HALF_H +#define SHISHUA_HALF_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) \ + || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The +// scalar path ends up being faster. 
+// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +# define SHISHUA_TARGET SHISHUA_TARGET_NEON +# else +# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +# endif +#endif + +// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only +// need this header. +// Additionally, these headers can also be used on their own. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +# include "shishua-half-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +# include "shishua-half-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +# include "shishua-half-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua half. +// Aims for a balance between code size and performance. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[8]; // 2 lanes + uint64_t output[4]; // 1 lane + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +static inline void shishua_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+# if defined(SHISHUA_NATIVE_ENDIAN) \ + || defined(_WIN32) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + uint64_t t[8]; + // Write to buf + if (buf != NULL) { + for (size_t j = 0; j < 4; j++) { + shishua_write_le64(b, state->output[j]); + b += 8; + } + } + + for (size_t j = 0; j < 4; j++) { + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + state->state[j + 4] += state->counter[j]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. 
Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // + // It would be much easier and cleaner to just reinterpret as a uint32_t + // pointer or use memcpy, but that is unfortunately endian-dependent, and + // the former also breaks strict aliasing. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + // + // See shishua.h for details. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left + 3,0,1,2, 6,7,4,5 }; // right + for (size_t j = 0; j < 8; j++) { + t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); + } + for (size_t j = 0; j < 4; j++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = state->state[j + 0] >> 1; + uint64_t u_hi = state->state[j + 4] >> 3; + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + state->state[j + 0] = u_lo + t[j + 0]; + state->state[j + 4] = u_hi + t[j + 4]; + + // Two orthogonally grown pieces evolving independently, XORed. 
+ state->output[j] = u_lo ^ t[j + 4]; + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); + +# define STEPS 5 +# define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2] ^= seed[i]; + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j + 0] = s->state[j + 4]; + s->state[j + 4] = s->output[j]; + } + } +# undef STEPS +# undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_HALF_SCALAR_H diff --git a/src/helper/shishua/shishua-neon.h b/src/helper/shishua/shishua-neon.h new file mode 100644 index 000000000..b3ecbafdd --- /dev/null +++ b/src/helper/shishua/shishua-neon.h @@ -0,0 +1,180 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_NEON_H +#define SHISHUA_NEON_H +#include +#include +#include +#include + +typedef struct prng_state { + uint64x2_t state[8]; + uint64x2_t output[8]; + uint64x2_t counter[2]; +} prng_state; + +#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__) +# define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t) { a, b }) +#else +# define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) +#endif +// To hide the vreinterpret ritual. 
+#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm) \ + ) \ + ) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); + b += 16; + } + } + // NEON has less register pressure than SSE2, but we reroll it anyways for + // code size. + for (size_t j = 0; j < 2; j++) { + uint64x2_t s0_lo = s->state[j * 4 + 0], + s0_hi = s->state[j * 4 + 1], + s1_lo = s->state[j * 4 + 2], + s1_hi = s->state[j * 4 + 3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1_lo = vaddq_u64(s1_lo, counter_lo); s1_hi = vaddq_u64(s1_hi, counter_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + __asm__("" : "+w" (u_lo), "+w" (u_hi)); +#endif + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. 
+ s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); + s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); + s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); + + // The first orthogonally grown pieces evolving independently, XORed. + s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); + s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); + + } + // The second orthogonally grown piece evolving independently, XORed. + s->output[4] = veorq_u64(s->state[0], s->state[6]); + s->output[5] = veorq_u64(s->state[1], s->state[7]); + + s->output[6] = veorq_u64(s->state[2], s->state[4]); + s->output[7] = veorq_u64(s->state[3], s->state[5]); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + } + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +# define ROUNDS 13 +# define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); + uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); + uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); + uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); + s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[ 0])); + s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[ 2])); + s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[ 4])); + s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[ 6])); + s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[ 8])); + s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); + s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); + s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; s->state[1] = s->output[7]; + s->state[2] = s->output[4]; s->state[3] = s->output[5]; + s->state[4] = s->output[2]; s->state[5] = s->output[3]; + s->state[6] = s->output[0]; s->state[7] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT +#endif diff --git a/src/helper/shishua/shishua-sse2.h b/src/helper/shishua/shishua-sse2.h new file mode 100644 index 000000000..3a9807638 --- /dev/null +++ b/src/helper/shishua/shishua-sse2.h @@ -0,0 +1,214 @@ +// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +// Consider the half version if AVX2 is not available. +#ifndef SHISHUA_SSE2_H +#define SHISHUA_SSE2_H +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +# include // SSSE3 +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +# include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
+// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt) \ + ) +#endif + +typedef struct prng_state { + __m128i state[8]; + __m128i output[8]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +# define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a) \ + ) +# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ + // increment = { 7, 5, 3, 1 }; + __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); + } + } + + // There are only 16 SSE registers (8 on i686), and we have to account for + // temporary copies due to being stuck with 2-operand instructions. + // Therefore, we use fixed iteration loops to reduce code complexity while + // still allowing the compiler to easily unroll the loop. + // We also try to keep variables active for as short as possible. + for (size_t j = 0; j < 2; j++) { + __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; + + // Lane 0 + s_lo = s->state[4 * j + 0]; + s_hi = s->state[4 * j + 1]; + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0_lo = _mm_srli_epi64(s_lo, 1); + u0_hi = _mm_srli_epi64(s_hi, 1); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); + t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); + s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); + + // Lane 1 + s_lo = s->state[4 * j + 2]; + s_hi = s->state[4 * j + 3]; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s_lo = _mm_add_epi64(s_lo, counter_lo); + s_hi = _mm_add_epi64(s_hi, counter_hi); + + // Same as above but with different shifts + u1_lo = _mm_srli_epi64(s_lo, 3); + u1_hi = _mm_srli_epi64(s_hi, 3); + + t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); + t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); + + s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); + s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); + + // Merge lane 0 and lane 1 + // The first orthogonally grown piece evolving independently, XORed. + s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); + s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); + } + + // The second orthogonally grown piece evolving independently, XORed. 
+ s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); + s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); + s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); + s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + } + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + // Note: output is uninitialized at first, but since we pass NULL, its value + // is initially ignored. + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +# define ROUNDS 13 +# define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[ 0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[ 2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[ 6])); + s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 8])); + s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); + s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); + s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; s->state[1] = s->output[7]; + s->state[2] = s->output[4]; s->state[3] = s->output[5]; + s->state[4] = s->output[2]; s->state[5] = s->output[3]; + s->state[6] = s->output[0]; s->state[7] = s->output[1]; + } +# undef STEPS +# undef ROUNDS +} +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT +#endif diff --git a/src/helper/shishua/shishua.h b/src/helper/shishua/shishua.h new file mode 100644 index 000000000..352a8aac2 --- /dev/null +++ b/src/helper/shishua/shishua.h @@ -0,0 +1,234 @@ +#ifndef SHISHUA_H +#define SHISHUA_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 
9.2.0. The +// scalar path ends up being faster. +// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// algorithm | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +# define SHISHUA_TARGET SHISHUA_TARGET_NEON +# else +# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +# endif +#endif + +// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only +// need this header. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +# include "shishua-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +# include "shishua-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +# include "shishua-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua. +// Designed to balance performance and code size. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[16]; // 4 lanes + uint64_t output[16]; // 4 lanes, 2 parts + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +# define SHISHUA_RESTRICT __restrict +#else +# define SHISHUA_RESTRICT +#endif + +// Writes a 64-bit little endian integer to dst +static inline void prng_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) \ + || defined(_WIN32) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 16; j++) { + prng_write_le64(b, state->output[j]); b += 8; + } + } + // Similar to SSE, use fixed iteration loops to reduce code complexity + // and allow the compiler more control over optimization. + for (size_t j = 0; j < 2; j++) { + // I don't want to type this 15 times. + uint64_t *s = &state->state[j * 8]; // 2 lanes + uint64_t *o = &state->output[j * 4]; // 1 lane + uint64_t t[8]; // temp buffer + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + for (size_t k = 0; k < 4; k++) { + s[k + 4] += state->counter[k]; + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // + // You may notice that they are simply 256-bit rotations (96 and 160): + // + // t0 = (s0 << 96) | (s0 >> (256 - 96)); + // t1 = (s1 << 160) | (s1 >> (256 - 160)); + // + // The easiest way to do this would be to cast s and t to uint32_t * + // and operate on them that way. + // + // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; + // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; + // for (size_t k = 0; k < 4; k++) { + // t0_32[k] = s0_32[(k + 5) % 8]; + // t1_32[k] = s1_32[(k + 3) % 8]; + // } + // + // This is pretty, but it violates strict aliasing and relies on little + // endian data layout. + // + // A common workaround to strict aliasing would be to use memcpy: + // + // // legal casts + // unsigned char *t8 = (unsigned char *)t; + // unsigned char *s8 = (unsigned char *)s; + // memcpy(&t8[0], &s8[20], 32 - 20); + // memcpy(&t8[32 - 20], &s8[0], 20); + // + // However, this still doesn't fix the endianness issue, and is very + // ugly. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left + 3,0,1,2, 6,7,4,5 }; // right + for (size_t k = 0; k < 8; k++) { + t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); + } + + for (size_t k = 0; k < 4; k++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = s[k + 0] >> 1; + uint64_t u_hi = s[k + 4] >> 3; + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s[k + 0] = u_lo + t[k + 0]; + s[k + 4] = u_hi + t[k + 4]; + + // The first orthogonally grown piece evolving independently, XORed. + o[k] = u_lo ^ t[k + 4]; + } + } + + // Merge together. + for (size_t j = 0; j < 4; j++) { + // The second orthogonally grown piece evolving independently, XORed. + state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; + state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +# define STEPS 1 +# define ROUNDS 13 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } + s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j+ 0] = s->output[j+12]; + s->state[j+ 4] = s->output[j+ 8]; + s->state[j+ 8] = s->output[j+ 4]; + s->state[j+12] = s->output[j+ 0]; + } + } +# undef STEPS +# undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_SCALAR_H From 8753eee66349c2e518ef26682520f0e689e3a868 Mon Sep 17 00:00:00 2001 From: Lee Hoon Lim Date: Thu, 12 Jun 2025 14:48:29 +0800 Subject: [PATCH 02/11] Cleanup --- CMakeLists.txt | 2 + external/shishua/shishua-avx2.h | 137 ++++++++++++++ external/shishua/shishua-half-avx2.h | 107 +++++++++++ external/shishua/shishua-half-neon.h | 169 +++++++++++++++++ external/shishua/shishua-half-sse2.h | 200 ++++++++++++++++++++ external/shishua/shishua-half.h | 196 ++++++++++++++++++++ external/shishua/shishua-neon.h | 199 ++++++++++++++++++++ external/shishua/shishua-sse2.h | 228 +++++++++++++++++++++++ external/shishua/shishua.h | 244 +++++++++++++++++++++++++ src/helper/shishua/shishua-avx2.h | 111 ----------- 
src/helper/shishua/shishua-half-avx2.h | 96 ---------- src/helper/shishua/shishua-half-neon.h | 155 ---------------- src/helper/shishua/shishua-half-sse2.h | 194 -------------------- src/helper/shishua/shishua-half.h | 195 -------------------- src/helper/shishua/shishua-neon.h | 180 ------------------ src/helper/shishua/shishua-sse2.h | 214 ---------------------- src/helper/shishua/shishua.h | 234 ------------------------ src/{helper/shishua => }/rand.c | 6 +- src/{helper/shishua => }/rand.h | 6 +- 19 files changed, 1488 insertions(+), 1385 deletions(-) create mode 100644 external/shishua/shishua-avx2.h create mode 100644 external/shishua/shishua-half-avx2.h create mode 100644 external/shishua/shishua-half-neon.h create mode 100644 external/shishua/shishua-half-sse2.h create mode 100644 external/shishua/shishua-half.h create mode 100644 external/shishua/shishua-neon.h create mode 100644 external/shishua/shishua-sse2.h create mode 100644 external/shishua/shishua.h delete mode 100644 src/helper/shishua/shishua-avx2.h delete mode 100644 src/helper/shishua/shishua-half-avx2.h delete mode 100644 src/helper/shishua/shishua-half-neon.h delete mode 100644 src/helper/shishua/shishua-half-sse2.h delete mode 100644 src/helper/shishua/shishua-half.h delete mode 100644 src/helper/shishua/shishua-neon.h delete mode 100644 src/helper/shishua/shishua-sse2.h delete mode 100644 src/helper/shishua/shishua.h rename src/{helper/shishua => }/rand.c (98%) rename src/{helper/shishua => }/rand.h (76%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4591d9ea1..95245ad12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g) function(apply_target target) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) 
target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m) diff --git a/external/shishua/shishua-avx2.h b/external/shishua/shishua-avx2.h new file mode 100644 index 000000000..e25dcf20f --- /dev/null +++ b/external/shishua/shishua-avx2.h @@ -0,0 +1,137 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i *)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i *)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i *)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); + u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); + t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); + s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. 
+ o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; + s->output[1] = o1; + s->output[2] = o2; + s->output[3] = o3; + s->state[0] = s0; + s->state[1] = s1; + s->state[2] = s2; + s->state[3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]);
+ s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]);
+ s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]);
+ s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]);
+ for (size_t i = 0; i < ROUNDS; i++) {
+ prng_gen(s, buf, 128 * STEPS);
+ s->state[0] = s->output[3];
+ s->state[1] = s->output[2];
+ s->state[2] = s->output[1];
+ s->state[3] = s->output[0];
+ }
+#undef STEPS
+#undef ROUNDS
+}
+#endif
diff --git a/external/shishua/shishua-half-avx2.h b/external/shishua/shishua-half-avx2.h
new file mode 100644
index 000000000..d1b3df42a
--- /dev/null
+++ b/external/shishua/shishua-half-avx2.h
@@ -0,0 +1,107 @@
+#ifndef SHISHUA_HALF_AVX2_H
+#define SHISHUA_HALF_AVX2_H
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <immintrin.h>
+typedef struct prng_state {
+ __m256i state[2];
+ __m256i output;
+ __m256i counter;
+} prng_state;
+
+// buf's size must be a multiple of 32 bytes.
+static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) {
+ __m256i s0 = s->state[0], counter = s->counter,
+ s1 = s->state[1], o = s->output,
+ t0, t1, t2, t3, u0, u1, u2, u3;
+ // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
+ // additions to strong positions for enrichment. The low 32-bit part of a
+ // 64-bit chunk never moves to the same 64-bit chunk as its high part.
+ // They do not remain in the same chunk. Each part eventually reaches all
+ // positions ringwise: A to B, B to C, …, H to A.
+ // You may notice that they are simply 256-bit rotations (96 and 160).
+ __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5),
+ shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3);
+ // The counter is not necessary to beat PractRand.
+ // It sets a lower bound of 2^69 bytes = 512 EiB to the period,
+ // or about 1 millennium at 10 GiB/s.
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; + s->counter = counter; + s->state[1] = s1; + s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 5 +#define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-neon.h b/external/shishua/shishua-half-neon.h new file mode 100644 index 000000000..ba5655b14 --- /dev/null +++ b/external/shishua/shishua-half-neon.h @@ -0,0 +1,169 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size / 32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); + b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); + b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). 
+ // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. 
+ s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + s->output[0] = o_lo; + s->output[1] = o_hi; + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half-sse2.h b/external/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..91e1194fa --- /dev/null +++ b/external/shishua/shishua-half-sse2.h @@ -0,0 +1,200 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. 
+#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i *)&buf[i + 0], s->output[0]); + _mm_storeu_si128((__m128i *)&buf[i + 16], s->output[1]); + } + + // Lane 0 + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = _mm_srli_epi64(s0_lo, 1); + u_hi = _mm_srli_epi64(s0_hi, 1); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = _mm_add_epi64(u_lo, t0_lo); + s0_hi = _mm_add_epi64(u_hi, t0_hi); + + // Lane 1 + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = _mm_add_epi64(s1_lo, counter_lo); + s1_hi = _mm_add_epi64(s1_hi, counter_hi); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + + // Same as above, but with different shift amounts. + t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); + t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); + + s1_lo = _mm_srli_epi64(s1_lo, 3); + s1_hi = _mm_srli_epi64(s1_hi, 3); + + s1_lo = _mm_add_epi64(s1_lo, t1_lo); + s1_hi = _mm_add_epi64(s1_hi, t1_hi); + + // Two orthogonally grown pieces evolving independently, XORed. + s->output[0] = _mm_xor_si128(u_lo, t1_lo); + s->output[1] = _mm_xor_si128(u_hi, t1_hi); + } + + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. 
+ // We must keep half of the state unchanged so users cannot set a bad state. + // XXX: better way to do this? + __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half.h b/external/shishua/shishua-half.h new file mode 100644 index 000000000..70a9f4be4 --- /dev/null +++ b/external/shishua/shishua-half.h @@ -0,0 +1,196 @@ +#ifndef SHISHUA_HALF_H +#define SHISHUA_HALF_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +#if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +#define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +#elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The +// scalar path ends up being faster. 
+// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +#define SHISHUA_TARGET SHISHUA_TARGET_NEON +#else +#define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +#endif +#endif + +// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only +// need this header. +// Additionally, these headers can also be used on their own. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +#include "shishua-half-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +#include "shishua-half-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +#include "shishua-half-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua half. +// Aims for a balance between code size and performance. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[8]; // 2 lanes + uint64_t output[4]; // 1 lane + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +static inline void shishua_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + uint64_t t[8]; + // Write to buf + if (buf != NULL) { + for (size_t j = 0; j < 4; j++) { + shishua_write_le64(b, state->output[j]); + b += 8; + } + } + + for (size_t j = 0; j < 4; j++) { + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + state->state[j + 4] += state->counter[j]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. 
Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // + // It would be much easier and cleaner to just reinterpret as a uint32_t + // pointer or use memcpy, but that is unfortunately endian-dependent, and + // the former also breaks strict aliasing. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + // + // See shishua.h for details. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4, // left + 3, 0, 1, 2, 6, 7, 4, 5}; // right + for (size_t j = 0; j < 8; j++) { + t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); + } + for (size_t j = 0; j < 4; j++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = state->state[j + 0] >> 1; + uint64_t u_hi = state->state[j + 4] >> 3; + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + state->state[j + 0] = u_lo + t[j + 0]; + state->state[j + 4] = u_hi + t[j + 4]; + + // Two orthogonally grown pieces evolving independently, XORed. 
+ state->output[j] = u_lo ^ t[j + 4];
+ }
+ }
+}
+#undef SHISHUA_RESTRICT
+
+// Nothing up my sleeve: those are the hex digits of Φ,
+// the least approximable irrational number.
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
+static uint64_t phi[8] = {
+ 0x9E3779B97F4A7C15,
+ 0xF39CC0605CEDC834,
+ 0x1082276BF3A27251,
+ 0xF86C6A11D0C18E95,
+ 0x2767F0B153D27B7F,
+ 0x0347045B5BF1827F,
+ 0x01886F0928403002,
+ 0xC1D64BA40F335E36,
+};
+
+void prng_init(prng_state *s, uint64_t seed[4]) {
+ memset(s, 0, sizeof(prng_state));
+
+#define STEPS 5
+#define ROUNDS 4
+ // Diffuse first two seed elements in s0, then the last two. Same for s1.
+ // We must keep half of the state unchanged so users cannot set a bad state.
+ memcpy(s->state, phi, sizeof(phi));
+ for (size_t i = 0; i < 4; i++) {
+ s->state[i * 2] ^= seed[i];
+ }
+ for (size_t i = 0; i < ROUNDS; i++) {
+ prng_gen(s, NULL, 32 * STEPS);
+ for (size_t j = 0; j < 4; j++) {
+ s->state[j + 0] = s->state[j + 4];
+ s->state[j + 4] = s->output[j];
+ }
+ }
+#undef STEPS
+#undef ROUNDS
+}
+#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR
+#endif // SHISHUA_HALF_H
diff --git a/external/shishua/shishua-neon.h b/external/shishua/shishua-neon.h
new file mode 100644
index 000000000..6cd5124bd
--- /dev/null
+++ b/external/shishua/shishua-neon.h
@@ -0,0 +1,199 @@
+// An ARM NEON version of shishua.
+#ifndef SHISHUA_NEON_H
+#define SHISHUA_NEON_H
+#include <stddef.h>
+#include <stdint.h>
+#include <assert.h>
+#include <arm_neon.h>
+
+typedef struct prng_state {
+ uint64x2_t state[8];
+ uint64x2_t output[8];
+ uint64x2_t counter[2];
+} prng_state;
+
+#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t){a, b})
+#else
+#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b))
+#endif
+// To hide the vreinterpret ritual.
+#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); + b += 16; + } + } + // NEON has less register pressure than SSE2, but we reroll it anyways for + // code size. + for (size_t j = 0; j < 2; j++) { + uint64x2_t s0_lo = s->state[j * 4 + 0], + s0_hi = s->state[j * 4 + 1], + s1_lo = s->state[j * 4 + 2], + s1_hi = s->state[j * 4 + 3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. 
+ s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Addition is the main source of diffusion. 
+ // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); + s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. + s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); + s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); + + // The first orthogonally grown pieces evolving independently, XORed. + s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); + s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); + } + // The second orthogonally grown piece evolving independently, XORed. + s->output[4] = veorq_u64(s->state[0], s->state[6]); + s->output[5] = veorq_u64(s->state[1], s->state[7]); + + s->output[6] = veorq_u64(s->state[2], s->state[4]); + s->output[7] = veorq_u64(s->state[3], s->state[5]); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + } + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define ROUNDS 13 +#define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); + uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); + uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); + uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); + s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[0])); + s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[2])); + s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[4])); + s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[6])); + s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[8])); + s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); + s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); + s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; + s->state[1] = s->output[7]; + s->state[2] = s->output[4]; + s->state[3] = s->output[5]; + s->state[4] = s->output[2]; + s->state[5] = s->output[3]; + s->state[6] = s->output[0]; + s->state[7] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT +#endif diff --git a/external/shishua/shishua-sse2.h b/external/shishua/shishua-sse2.h new file mode 100644 index 000000000..af28300ad --- /dev/null +++ b/external/shishua/shishua-sse2.h @@ -0,0 +1,228 @@ +// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. +// Consider the half version if AVX2 is not available. +#ifndef SHISHUA_SSE2_H +#define SHISHUA_SSE2_H +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
+// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[8]; + __m128i output[8]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. 
+ + // increment = { 7, 5, 3, 1 }; + __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 8; j++) { + _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); + } + } + + // There are only 16 SSE registers (8 on i686), and we have to account for + // temporary copies due to being stuck with 2-operand instructions. + // Therefore, we use fixed iteration loops to reduce code complexity while + // still allowing the compiler to easily unroll the loop. + // We also try to keep variables active for as short as possible. + for (size_t j = 0; j < 2; j++) { + __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; + + // Lane 0 + s_lo = s->state[4 * j + 0]; + s_hi = s->state[4 * j + 1]; + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0_lo = _mm_srli_epi64(s_lo, 1); + u0_hi = _mm_srli_epi64(s_hi, 1); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); + t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); + s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); + + // Lane 1 + s_lo = s->state[4 * j + 2]; + s_hi = s->state[4 * j + 3]; + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s_lo = _mm_add_epi64(s_lo, counter_lo); + s_hi = _mm_add_epi64(s_hi, counter_hi); + + // Same as above but with different shifts + u1_lo = _mm_srli_epi64(s_lo, 3); + u1_hi = _mm_srli_epi64(s_hi, 3); + + t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); + t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); + + s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); + s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); + + // Merge lane 0 and lane 1 + // The first orthogonally grown piece evolving independently, XORed. + s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); + s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); + } + + // The second orthogonally grown piece evolving independently, XORed. 
+ s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); + s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); + s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); + s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); + + // Increment the counter + counter_lo = _mm_add_epi64(counter_lo, increment_lo); + counter_hi = _mm_add_epi64(counter_hi, increment_hi); + } + + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + // Note: output is uninitialized at first, but since we pass NULL, its value + // is initially ignored. + s->counter[0] = _mm_setzero_si128(); + s->counter[1] = _mm_setzero_si128(); +#define ROUNDS 13 +#define STEPS 1 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); + __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); + __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); + __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); + s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); + s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); + s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); + s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); + s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[8])); + s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); + s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); + s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); + + for (int i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + s->state[0] = s->output[6]; + s->state[1] = s->output[7]; + s->state[2] = s->output[4]; + s->state[3] = s->output[5]; + s->state[4] = s->output[2]; + s->state[5] = s->output[3]; + s->state[6] = s->output[0]; + s->state[7] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_CVTSI64_SI128 +#undef SHISHUA_ALIGNR_EPI8 +#undef SHISHUA_SET_EPI64X +#undef SHISHUA_RESTRICT +#endif diff --git a/external/shishua/shishua.h b/external/shishua/shishua.h new file mode 100644 index 000000000..a0bb9248d --- /dev/null +++ b/external/shishua/shishua.h @@ -0,0 +1,244 @@ +#ifndef SHISHUA_H +#define SHISHUA_H + +#define SHISHUA_TARGET_SCALAR 0 +#define SHISHUA_TARGET_AVX2 1 +#define SHISHUA_TARGET_SSE2 2 +#define SHISHUA_TARGET_NEON 3 + +#ifndef SHISHUA_TARGET +#if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) +#define SHISHUA_TARGET SHISHUA_TARGET_AVX2 +#elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) +#define SHISHUA_TARGET SHISHUA_TARGET_SSE2 +// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. 
The +// scalar path ends up being faster. +// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 +// algorithm | GCC 9.2.0 | Clang 9.0.1 +// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte +// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte +// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte +// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte +// Therefore, we only autoselect the NEON path on Clang, at least until GCC's +// NEON codegen improves. +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) +#define SHISHUA_TARGET SHISHUA_TARGET_NEON +#else +#define SHISHUA_TARGET SHISHUA_TARGET_SCALAR +#endif +#endif + +// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only +// need this header. +#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 +#include "shishua-avx2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 +#include "shishua-sse2.h" +#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON +#include "shishua-neon.h" +#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR + +// Portable scalar implementation of shishua. +// Designed to balance performance and code size. +#include +#include +#include +#include + +// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. +typedef struct prng_state { + uint64_t state[16]; // 4 lanes + uint64_t output[16]; // 4 lanes, 2 parts + uint64_t counter[4]; // 1 lane +} prng_state; + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// Writes a 64-bit little endian integer to dst +static inline void prng_write_le64(void *dst, uint64_t val) { + // Define to write in native endianness with memcpy + // Also, use memcpy on known little endian setups. 
+#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__) + memcpy(dst, &val, sizeof(uint64_t)); +#else + // Byteshift write. + uint8_t *d = (uint8_t *)dst; + for (size_t i = 0; i < 8; i++) { + d[i] = (uint8_t)(val & 0xff); + val >>= 8; + } +#endif +} + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + // Write the current output block to state if it is not NULL + if (buf != NULL) { + for (size_t j = 0; j < 16; j++) { + prng_write_le64(b, state->output[j]); + b += 8; + } + } + // Similar to SSE, use fixed iteration loops to reduce code complexity + // and allow the compiler more control over optimization. + for (size_t j = 0; j < 2; j++) { + // I don't want to type this 15 times. + uint64_t *s = &state->state[j * 8]; // 2 lanes + uint64_t *o = &state->output[j * 4]; // 1 lane + uint64_t t[8]; // temp buffer + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + for (size_t k = 0; k < 4; k++) { + s[k + 4] += state->counter[k]; + } + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. 
+ // + // You may notice that they are simply 256-bit rotations (96 and 160): + // + // t0 = (s0 << 96) | (s0 >> (256 - 96)); + // t1 = (s1 << 160) | (s1 >> (256 - 160)); + // + // The easiest way to do this would be to cast s and t to uint32_t * + // and operate on them that way. + // + // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; + // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; + // for (size_t k = 0; k < 4; k++) { + // t0_32[k] = s0_32[(k + 5) % 8]; + // t1_32[k] = s1_32[(k + 3) % 8]; + // } + // + // This is pretty, but it violates strict aliasing and relies on little + // endian data layout. + // + // A common workaround to strict aliasing would be to use memcpy: + // + // // legal casts + // unsigned char *t8 = (unsigned char *)t; + // unsigned char *s8 = (unsigned char *)s; + // memcpy(&t8[0], &s8[20], 32 - 20); + // memcpy(&t8[32 - 20], &s8[0], 20); + // + // However, this still doesn't fix the endianness issue, and is very + // ugly. + // + // The only known solution which doesn't rely on endianness is to + // read two 64-bit integers and do a funnel shift. + + // Lookup table for the _offsets_ in the shuffle. Even lanes rotate + // by 5, odd lanes rotate by 3. + // If it were by 32-bit lanes, it would be + // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } + const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4, // left + 3, 0, 1, 2, 6, 7, 4, 5}; // right + for (size_t k = 0; k < 8; k++) { + t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); + } + + for (size_t k = 0; k < 4; k++) { + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + uint64_t u_lo = s[k + 0] >> 1; + uint64_t u_hi = s[k + 4] >> 3; + + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s[k + 0] = u_lo + t[k + 0]; + s[k + 4] = u_hi + t[k + 4]; + + // The first orthogonally grown piece evolving independently, XORed. + o[k] = u_lo ^ t[k + 4]; + } + } + + // Merge together. + for (size_t j = 0; j < 4; j++) { + // The second orthogonally grown piece evolving independently, XORed. + state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; + state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // + // For the scalar version, we calculate this dynamically, as it is + // simple enough. + state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 + } + } +} +#undef SHISHUA_RESTRICT + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + memcpy(s->state, phi, sizeof(phi)); + for (size_t i = 0; i < 4; i++) { + s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } + s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 128 * STEPS); + for (size_t j = 0; j < 4; j++) { + s->state[j + 0] = s->output[j + 12]; + s->state[j + 4] = s->output[j + 8]; + s->state[j + 8] = s->output[j + 4]; + s->state[j + 12] = s->output[j + 0]; + } + } +#undef STEPS +#undef ROUNDS +} +#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR +#endif // SHISHUA_SCALAR_H diff --git a/src/helper/shishua/shishua-avx2.h b/src/helper/shishua/shishua-avx2.h deleted file mode 100644 index c2c865e80..000000000 --- a/src/helper/shishua/shishua-avx2.h +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef SHISHUA_AVX2_H -#define SHISHUA_AVX2_H -#include -#include -#include -#include -#include -typedef struct prng_state { - __m256i state[4]; - __m256i output[4]; - __m256i counter; -} prng_state; - -// buf's size must be a multiple of 128 bytes. 
-static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { - __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], - s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], - t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), - shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - if (buf != NULL) { - _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); - _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); - _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); - _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. 
- s1 = _mm256_add_epi64(s1, counter); - s3 = _mm256_add_epi64(s3, counter); - counter = _mm256_add_epi64(counter, increment); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); - u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); - t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); - t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); - s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); - - // Two orthogonally grown pieces evolving independently, XORed. - o0 = _mm256_xor_si256(u0, t1); - o1 = _mm256_xor_si256(u2, t3); - o2 = _mm256_xor_si256(s0, s3); - o3 = _mm256_xor_si256(s2, s1); - } - s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; - s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; - s->counter = counter; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 1 -# define ROUNDS 13 - uint8_t buf[128 * STEPS]; - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); - s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); - s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); - s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, buf, 128 * STEPS); - s->state[0] = s->output[3]; s->state[1] = s->output[2]; - s->state[2] = s->output[1]; s->state[3] = s->output[0]; - } -# undef STEPS -# undef ROUNDS -} -#endif diff --git a/src/helper/shishua/shishua-half-avx2.h b/src/helper/shishua/shishua-half-avx2.h deleted file mode 100644 index b21f46174..000000000 --- a/src/helper/shishua/shishua-half-avx2.h +++ /dev/null @@ -1,96 +0,0 @@ -#ifndef SHISHUA_H -#define SHISHUA_H -#include -#include -#include -#include -#include -typedef struct prng_state { - __m256i state[2]; - __m256i output; - __m256i counter; -} prng_state; - -// buf's size must be a multiple of 32 bytes. 
-static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { - __m256i s0 = s->state[0], counter = s->counter, - s1 = s->state[1], o = s->output, - t0, t1, t2, t3, u0, u1, u2, u3; - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), - shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^69 bytes = 512 EiB to the period, - // or about 1 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); - - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - if (buf != NULL) { - _mm256_storeu_si256((__m256i*)&buf[i], o); - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1 = _mm256_add_epi64(s1, counter); - counter = _mm256_add_epi64(counter, increment); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. 
We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); - t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); - - // Two orthogonally grown pieces evolving independently, XORed. - o = _mm256_xor_si256(u0, t1); - } - s->state[0] = s0; s->counter = counter; - s->state[1] = s1; s->output = o; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 5 -# define ROUNDS 4 - uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); - s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, buf, 32 * STEPS); - s->state[0] = s->state[1]; - s->state[1] = s->output; - } -# undef STEPS -# undef ROUNDS -} -#endif diff --git a/src/helper/shishua/shishua-half-neon.h b/src/helper/shishua/shishua-half-neon.h deleted file mode 100644 index f946a9aee..000000000 --- a/src/helper/shishua/shishua-half-neon.h +++ /dev/null @@ -1,155 +0,0 @@ -// An ARM NEON version of shishua. -#ifndef SHISHUA_HALF_NEON_H -#define SHISHUA_HALF_NEON_H -#include -#include -#include -#include -// A NEON version. -typedef struct prng_state { - uint64x2_t state[4]; - uint64x2_t output[2]; - uint64x2_t counter[2]; -} prng_state; - -#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) - -// To hide the vreinterpret ritual. -#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ - vreinterpretq_u64_u8( \ - vextq_u8( \ - vreinterpretq_u8_u64(Rn), \ - vreinterpretq_u8_u64(Rm), \ - (Imm) \ - ) \ - ) - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], - s1_lo = s->state[2], s1_hi = s->state[3], - o_lo = s->output[0], o_hi = s->output[1], - t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, - counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. 
- // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); - uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size/32; i++) { - if (buf != NULL) { - vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); b += 16; - vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); b += 16; - } - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1_lo = vaddq_u64(s1_lo, counter_lo); - s1_hi = vaddq_u64(s1_hi, counter_hi); - - counter_lo = vaddq_u64(counter_lo, increment_lo); - counter_hi = vaddq_u64(counter_hi, increment_hi); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 vext.8 instructions. - t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); - t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); - t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); - t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. 
We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); -#if defined(__clang__) - // UGLY HACK: Clang enjoys merging the above statements with the vadds below - // into vsras. - // This is dumb, as it still needs to do the original vshr for the xor mix, - // causing it to shift twice. - // This makes Clang assume that this line has side effects, preventing the - // combination and speeding things up significantly. - __asm__("" : "+w" (u_lo), "+w" (u_hi)); -#endif - - - // Two orthogonally grown pieces evolving independently, XORed. - // Note: We do this first because on the assembly side, vsra would overwrite - // t1_lo and t1_hi. - o_lo = veorq_u64(u_lo, t1_lo); - o_hi = veorq_u64(u_hi, t1_hi); - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s0_lo = vaddq_u64(t0_lo, u_lo); - s0_hi = vaddq_u64(t0_hi, u_hi); - // Use vsra here directly. - s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); - s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); - } - s->state[0] = s0_lo; s->state[1] = s0_hi; - s->state[2] = s1_lo; s->state[3] = s1_hi; - s->output[0] = o_lo; s->output[1] = o_hi; - s->counter[0] = counter_lo; s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = vdupq_n_u64(0); - s->counter[1] = vdupq_n_u64(0); -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - for (size_t i = 0; i < 4; i++) { - s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - s->state[0] = s->state[2]; s->state[1] = s->state[3]; - s->state[2] = s->output[0]; s->state[3] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_VSETQ_N_U64 -#undef SHISHUA_VEXTQ_U8 -#undef SHISHUA_RESTRICT - -#endif diff --git a/src/helper/shishua/shishua-half-sse2.h b/src/helper/shishua/shishua-half-sse2.h deleted file mode 100644 index d6f958930..000000000 --- a/src/helper/shishua/shishua-half-sse2.h +++ /dev/null @@ -1,194 +0,0 @@ -// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. -// Also compatible with 32-bit x86. -// -// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. -// We can still emulate it on SSE2, but it is slower. -#ifndef SHISHUA_HALF_SSE2_H -#define SHISHUA_HALF_SSE2_H - -#include -#include -#include -// Note: cl.exe doesn't define __SSSE3__ -#if defined(__SSSE3__) || defined(__AVX__) -# include // SSSE3 -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_alignr_epi8(hi, lo, amt) -#else -# include // SSE2 -// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. -// The compiler may convert it to a sequence of shufps instructions, which is -// perfectly fine. 
-# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_or_si128( \ - _mm_slli_si128(hi, 16 - (amt)), \ - _mm_srli_si128(lo, amt) \ - ) -#endif - -typedef struct prng_state { - __m128i state[4]; - __m128i output[2]; - __m128i counter[2]; -} prng_state; - -// Wrappers for x86 targets which usually lack these intrinsics. -// Don't call these with side effects. -#if defined(__x86_64__) || defined(_M_X64) -# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) -# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) -#else -# define SHISHUA_SET_EPI64X(b, a) \ - _mm_set_epi32( \ - (int)(((uint64_t)(b)) >> 32), \ - (int)(b), \ - (int)(((uint64_t)(a)) >> 32), \ - (int)(a) \ - ) -# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) -#endif - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - // We need a lot of variables... - __m128i s0_lo = s->state[0], s0_hi = s->state[1], - s1_lo = s->state[2], s1_hi = s->state[3], - t0_lo, t0_hi, t1_lo, t1_hi, - u_lo, u_hi, - counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. 
- // increment = { 7, 5, 3, 1 }; - const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); - const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); - - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - // Split to try to reduce register pressure - if (buf != NULL) { - _mm_storeu_si128((__m128i*)&buf[i+ 0], s->output[0]); - _mm_storeu_si128((__m128i*)&buf[i+16], s->output[1]); - } - - // Lane 0 - - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 _mm_alignr_epi8 instructions. - t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); - t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = _mm_srli_epi64(s0_lo, 1); - u_hi = _mm_srli_epi64(s0_hi, 1); - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. 
- s0_lo = _mm_add_epi64(u_lo, t0_lo); - s0_hi = _mm_add_epi64(u_hi, t0_hi); - - // Lane 1 - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s1_lo = _mm_add_epi64(s1_lo, counter_lo); - s1_hi = _mm_add_epi64(s1_hi, counter_hi); - - // Increment the counter - counter_lo = _mm_add_epi64(counter_lo, increment_lo); - counter_hi = _mm_add_epi64(counter_hi, increment_hi); - - // Same as above, but with different shift amounts. - t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12); - t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12); - - s1_lo = _mm_srli_epi64(s1_lo, 3); - s1_hi = _mm_srli_epi64(s1_hi, 3); - - s1_lo = _mm_add_epi64(s1_lo, t1_lo); - s1_hi = _mm_add_epi64(s1_hi, t1_hi); - - // Two orthogonally grown pieces evolving independently, XORed. - s->output[0] = _mm_xor_si128(u_lo, t1_lo); - s->output[1] = _mm_xor_si128(u_hi, t1_hi); - } - - s->state[0] = s0_lo; s->state[1] = s0_hi; - s->state[2] = s1_lo; s->state[3] = s1_hi; - - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = _mm_setzero_si128(); - s->counter[1] = _mm_setzero_si128(); -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - // XXX: better way to do this? 
- __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); - __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); - __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); - __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); - s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0])); - s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2])); - s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4])); - s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - s->state[0] = s->state[2]; s->state[1] = s->state[3]; - s->state[2] = s->output[0]; s->state[3] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_ALIGNR_EPI8 -#undef SHISHUA_CVTSI64_SI128 -#undef SHISHUA_SET_EPI64X -#undef SHISHUA_RESTRICT - -#endif diff --git a/src/helper/shishua/shishua-half.h b/src/helper/shishua/shishua-half.h deleted file mode 100644 index a1ea715b1..000000000 --- a/src/helper/shishua/shishua-half.h +++ /dev/null @@ -1,195 +0,0 @@ -#ifndef SHISHUA_HALF_H -#define SHISHUA_HALF_H - -#define SHISHUA_TARGET_SCALAR 0 -#define SHISHUA_TARGET_AVX2 1 -#define SHISHUA_TARGET_SSE2 2 -#define SHISHUA_TARGET_NEON 3 - -#ifndef SHISHUA_TARGET -# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) -# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 -# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) \ - || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 -// GCC's NEON codegen leaves much to be desired, at least as of 9.2.0. The -// scalar path ends up being faster. 
-// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 -// | GCC 9.2.0 | Clang 9.0.1 -// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte -// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte -// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte -// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte -// Therefore, we only autoselect the NEON path on Clang, at least until GCC's -// NEON codegen improves. -# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) -# define SHISHUA_TARGET SHISHUA_TARGET_NEON -# else -# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR -# endif -#endif - -// These are all optional: By defining SHISHUA_TARGET_SCALAR, you only -// need this header. -// Additionally, these headers can also be used on their own. -#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 -# include "shishua-half-avx2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 -# include "shishua-half-sse2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON -# include "shishua-half-neon.h" -#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR - -// Portable scalar implementation of shishua half. -// Aims for a balance between code size and performance. -#include -#include -#include -#include - -// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. -typedef struct prng_state { - uint64_t state[8]; // 2 lanes - uint64_t output[4]; // 1 lane - uint64_t counter[4]; // 1 lane -} prng_state; - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -static inline void shishua_write_le64(void *dst, uint64_t val) { - // Define to write in native endianness with memcpy - // Also, use memcpy on known little endian setups. 
-# if defined(SHISHUA_NATIVE_ENDIAN) \ - || defined(_WIN32) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - || defined(__LITTLE_ENDIAN__) - memcpy(dst, &val, sizeof(uint64_t)); -#else - // Byteshift write. - uint8_t *d = (uint8_t *)dst; - for (size_t i = 0; i < 8; i++) { - d[i] = (uint8_t)(val & 0xff); - val >>= 8; - } -#endif -} - -// buf's size must be a multiple of 32 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); - - for (size_t i = 0; i < size; i += 32) { - uint64_t t[8]; - // Write to buf - if (buf != NULL) { - for (size_t j = 0; j < 4; j++) { - shishua_write_le64(b, state->output[j]); - b += 8; - } - } - - for (size_t j = 0; j < 4; j++) { - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - state->state[j + 4] += state->counter[j]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - // - // For the scalar version, we calculate this dynamically, as it is - // simple enough. - state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 - } - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. 
Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // - // It would be much easier and cleaner to just reinterpret as a uint32_t - // pointer or use memcpy, but that is unfortunately endian-dependent, and - // the former also breaks strict aliasing. - // - // The only known solution which doesn't rely on endianness is to - // read two 64-bit integers and do a funnel shift. - // - // See shishua.h for details. - - // Lookup table for the _offsets_ in the shuffle. Even lanes rotate - // by 5, odd lanes rotate by 3. - // If it were by 32-bit lanes, it would be - // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } - const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left - 3,0,1,2, 6,7,4,5 }; // right - for (size_t j = 0; j < 8; j++) { - t[j] = (state->state[shuf_offsets[j]] >> 32) | (state->state[shuf_offsets[j + 8]] << 32); - } - for (size_t j = 0; j < 4; j++) { - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - uint64_t u_lo = state->state[j + 0] >> 1; - uint64_t u_hi = state->state[j + 4] >> 3; - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - state->state[j + 0] = u_lo + t[j + 0]; - state->state[j + 4] = u_hi + t[j + 4]; - - // Two orthogonally grown pieces evolving independently, XORed. 
- state->output[j] = u_lo ^ t[j + 4]; - } - } -} -#undef SHISHUA_RESTRICT - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[8] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); - -# define STEPS 5 -# define ROUNDS 4 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - memcpy(s->state, phi, sizeof(phi)); - for (size_t i = 0; i < 4; i++) { - s->state[i * 2] ^= seed[i]; - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 32 * STEPS); - for (size_t j = 0; j < 4; j++) { - s->state[j + 0] = s->state[j + 4]; - s->state[j + 4] = s->output[j]; - } - } -# undef STEPS -# undef ROUNDS -} -#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR -#endif // SHISHUA_HALF_SCALAR_H diff --git a/src/helper/shishua/shishua-neon.h b/src/helper/shishua/shishua-neon.h deleted file mode 100644 index b3ecbafdd..000000000 --- a/src/helper/shishua/shishua-neon.h +++ /dev/null @@ -1,180 +0,0 @@ -// An ARM NEON version of shishua. -#ifndef SHISHUA_NEON_H -#define SHISHUA_NEON_H -#include -#include -#include -#include - -typedef struct prng_state { - uint64x2_t state[8]; - uint64x2_t output[8]; - uint64x2_t counter[2]; -} prng_state; - -#if defined(__GNUC__) && (defined(__BYTE_ORDER__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__) -# define SHISHUA_VSETQ_N_U64(a, b) (__extension__(uint64x2_t) { a, b }) -#else -# define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) -#endif -// To hide the vreinterpret ritual. 
-#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ - vreinterpretq_u64_u8( \ - vextq_u8( \ - vreinterpretq_u8_u64(Rn), \ - vreinterpretq_u8_u64(Rm), \ - (Imm) \ - ) \ - ) - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint8_t *b = buf; - uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); - uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 8; j++) { - vst1q_u8(b, vreinterpretq_u8_u64(s->output[j])); - b += 16; - } - } - // NEON has less register pressure than SSE2, but we reroll it anyways for - // code size. - for (size_t j = 0; j < 2; j++) { - uint64x2_t s0_lo = s->state[j * 4 + 0], - s0_hi = s->state[j * 4 + 1], - s1_lo = s->state[j * 4 + 2], - s1_hi = s->state[j * 4 + 3], - t0_lo, t0_hi, t1_lo, t1_hi, - u_lo, u_hi; - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. 
- s1_lo = vaddq_u64(s1_lo, counter_lo); s1_hi = vaddq_u64(s1_hi, counter_hi); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. - // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 vext.8 instructions. - t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); - t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u_lo = vshrq_n_u64(s0_lo, 1); u_hi = vshrq_n_u64(s0_hi, 1); -#if defined(__clang__) - // UGLY HACK: Clang enjoys merging the above statements with the vadds below - // into vsras. - // This is dumb, as it still needs to do the original vshr for the xor mix, - // causing it to shift twice. - // This makes Clang assume that this line has side effects, preventing the - // combination and speeding things up significantly. - __asm__("" : "+w" (u_lo), "+w" (u_hi)); -#endif - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. 
- s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo); - s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi); - // Use vsra here directly. - s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3); - s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3); - - // The first orthogonally grown pieces evolving independently, XORed. - s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo); - s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi); - - } - // The second orthogonally grown piece evolving independently, XORed. - s->output[4] = veorq_u64(s->state[0], s->state[6]); - s->output[5] = veorq_u64(s->state[1], s->state[7]); - - s->output[6] = veorq_u64(s->state[2], s->state[4]); - s->output[7] = veorq_u64(s->state[3], s->state[5]); - - counter_lo = vaddq_u64(counter_lo, increment_lo); - counter_hi = vaddq_u64(counter_hi, increment_hi); - } - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - s->counter[0] = vdupq_n_u64(0); - s->counter[1] = vdupq_n_u64(0); -# define ROUNDS 13 -# define STEPS 1 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0); - uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0); - uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0); - uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0); - s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[ 0])); - s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[ 2])); - s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[ 4])); - s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[ 6])); - s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[ 8])); - s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10])); - s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12])); - s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - s->state[0] = s->output[6]; s->state[1] = s->output[7]; - s->state[2] = s->output[4]; s->state[3] = s->output[5]; - s->state[4] = s->output[2]; s->state[5] = s->output[3]; - s->state[6] = s->output[0]; s->state[7] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_VSETQ_N_U64 -#undef SHISHUA_VEXTQ_U8 -#undef SHISHUA_RESTRICT -#endif diff --git a/src/helper/shishua/shishua-sse2.h b/src/helper/shishua/shishua-sse2.h deleted file mode 100644 index 3a9807638..000000000 --- a/src/helper/shishua/shishua-sse2.h +++ /dev/null @@ -1,214 +0,0 @@ -// An SSE2/SSSE3 version of shishua. Slower than AVX2, but more compatible. -// Also compatible with 32-bit x86. -// -// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. -// We can still emulate it on SSE2, but it is slower. -// Consider the half version if AVX2 is not available. -#ifndef SHISHUA_SSE2_H -#define SHISHUA_SSE2_H -#include -#include -#include -// Note: cl.exe doesn't define __SSSE3__ -#if defined(__SSSE3__) || defined(__AVX__) -# include // SSSE3 -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_alignr_epi8(hi, lo, amt) -#else -# include // SSE2 -// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. 
-// The compiler may convert it to a sequence of shufps instructions, which is -// perfectly fine. -# define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ - _mm_or_si128( \ - _mm_slli_si128(hi, 16 - (amt)), \ - _mm_srli_si128(lo, amt) \ - ) -#endif - -typedef struct prng_state { - __m128i state[8]; - __m128i output[8]; - __m128i counter[2]; -} prng_state; - -// Wrappers for x86 targets which usually lack these intrinsics. -// Don't call these with side effects. -#if defined(__x86_64__) || defined(_M_X64) -# define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) -# define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) -#else -# define SHISHUA_SET_EPI64X(b, a) \ - _mm_set_epi32( \ - (int)(((uint64_t)(b)) >> 32), \ - (int)(b), \ - (int)(((uint64_t)(a)) >> 32), \ - (int)(a) \ - ) -# define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) -#endif - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - __m128i counter_lo = s->counter[0], counter_hi = s->counter[1]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. 
- - // increment = { 7, 5, 3, 1 }; - __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); - __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); - - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 8; j++) { - _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]); - } - } - - // There are only 16 SSE registers (8 on i686), and we have to account for - // temporary copies due to being stuck with 2-operand instructions. - // Therefore, we use fixed iteration loops to reduce code complexity while - // still allowing the compiler to easily unroll the loop. - // We also try to keep variables active for as short as possible. - for (size_t j = 0; j < 2; j++) { - __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi; - - // Lane 0 - s_lo = s->state[4 * j + 0]; - s_hi = s->state[4 * j + 1]; - - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - u0_lo = _mm_srli_epi64(s_lo, 1); - u0_hi = _mm_srli_epi64(s_hi, 1); - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. 
- // You may notice that they are simply 256-bit rotations (96 and 160). - // Note: This: - // x = (y << 96) | (y >> 160) - // can be rewritten as this - // x_lo = (y_lo << 96) | (y_hi >> 32) - // x_hi = (y_hi << 96) | (y_lo >> 32) - // which we can do with 2 _mm_alignr_epi8 instructions. - t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4); - t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4); - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo); - s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi); - - // Lane 1 - s_lo = s->state[4 * j + 2]; - s_hi = s->state[4 * j + 3]; - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - s_lo = _mm_add_epi64(s_lo, counter_lo); - s_hi = _mm_add_epi64(s_hi, counter_hi); - - // Same as above but with different shifts - u1_lo = _mm_srli_epi64(s_lo, 3); - u1_hi = _mm_srli_epi64(s_hi, 3); - - t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12); - t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12); - - s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo); - s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi); - - // Merge lane 0 and lane 1 - // The first orthogonally grown piece evolving independently, XORed. - s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo); - s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi); - } - - // The second orthogonally grown piece evolving independently, XORed. 
- s->output[4] = _mm_xor_si128(s->state[0], s->state[6]); - s->output[5] = _mm_xor_si128(s->state[1], s->state[7]); - s->output[6] = _mm_xor_si128(s->state[4], s->state[2]); - s->output[7] = _mm_xor_si128(s->state[5], s->state[3]); - - // Increment the counter - counter_lo = _mm_add_epi64(counter_lo, increment_lo); - counter_hi = _mm_add_epi64(counter_hi, increment_hi); - } - - s->counter[0] = counter_lo; - s->counter[1] = counter_hi; -} - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. -// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - // Note: output is uninitialized at first, but since we pass NULL, its value - // is initially ignored. - s->counter[0] = _mm_setzero_si128(); - s->counter[1] = _mm_setzero_si128(); -# define ROUNDS 13 -# define STEPS 1 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. 
- __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]); - __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]); - __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]); - __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]); - s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[ 0])); - s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[ 2])); - s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 4])); - s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[ 6])); - s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[ 8])); - s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10])); - s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12])); - s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14])); - - for (int i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - s->state[0] = s->output[6]; s->state[1] = s->output[7]; - s->state[2] = s->output[4]; s->state[3] = s->output[5]; - s->state[4] = s->output[2]; s->state[5] = s->output[3]; - s->state[6] = s->output[0]; s->state[7] = s->output[1]; - } -# undef STEPS -# undef ROUNDS -} -#undef SHISHUA_CVTSI64_SI128 -#undef SHISHUA_ALIGNR_EPI8 -#undef SHISHUA_SET_EPI64X -#undef SHISHUA_RESTRICT -#endif diff --git a/src/helper/shishua/shishua.h b/src/helper/shishua/shishua.h deleted file mode 100644 index 352a8aac2..000000000 --- a/src/helper/shishua/shishua.h +++ /dev/null @@ -1,234 +0,0 @@ -#ifndef SHISHUA_H -#define SHISHUA_H - -#define SHISHUA_TARGET_SCALAR 0 -#define SHISHUA_TARGET_AVX2 1 -#define SHISHUA_TARGET_SSE2 2 -#define SHISHUA_TARGET_NEON 3 - -#ifndef SHISHUA_TARGET -# if defined(__AVX2__) && (defined(__x86_64__) || defined(_M_X64)) -# define SHISHUA_TARGET SHISHUA_TARGET_AVX2 -# elif defined(__x86_64__) || defined(_M_X64) || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) -# define SHISHUA_TARGET SHISHUA_TARGET_SSE2 -// GCC's NEON codegen leaves much to be desired, at least as of 
9.2.0. The -// scalar path ends up being faster. -// Device: Google Pixel 2 XL, 2.46GHz Qualcomm Snapdragon 835 -// algorithm | GCC 9.2.0 | Clang 9.0.1 -// shishua neon | 0.2845 ns/byte | 0.0966 ns/byte -// shishua scalar | 0.2056 ns/byte | 0.2958 ns/byte -// shishua half neon | 0.5169 ns/byte | 0.1929 ns/byte -// shishua half scalar | 0.2496 ns/byte | 0.2911 ns/byte -// Therefore, we only autoselect the NEON path on Clang, at least until GCC's -// NEON codegen improves. -# elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && defined(__clang__) -# define SHISHUA_TARGET SHISHUA_TARGET_NEON -# else -# define SHISHUA_TARGET SHISHUA_TARGET_SCALAR -# endif -#endif - -// These are all optional, with defining SHISHUA_TARGET_SCALAR, you only -// need this header. -#if SHISHUA_TARGET == SHISHUA_TARGET_AVX2 -# include "shishua-avx2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_SSE2 -# include "shishua-sse2.h" -#elif SHISHUA_TARGET == SHISHUA_TARGET_NEON -# include "shishua-neon.h" -#else // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR - -// Portable scalar implementation of shishua. -// Designed to balance performance and code size. -#include -#include -#include -#include - -// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t. -typedef struct prng_state { - uint64_t state[16]; // 4 lanes - uint64_t output[16]; // 4 lanes, 2 parts - uint64_t counter[4]; // 1 lane -} prng_state; - -// buf could technically alias with prng_state, according to the compiler. -#if defined(__GNUC__) || defined(_MSC_VER) -# define SHISHUA_RESTRICT __restrict -#else -# define SHISHUA_RESTRICT -#endif - -// Writes a 64-bit little endian integer to dst -static inline void prng_write_le64(void *dst, uint64_t val) { - // Define to write in native endianness with memcpy - // Also, use memcpy on known little endian setups. 
-#if defined(SHISHUA_NATIVE_ENDIAN) \ - || defined(_WIN32) \ - || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ - || defined(__LITTLE_ENDIAN__) - memcpy(dst, &val, sizeof(uint64_t)); -#else - // Byteshift write. - uint8_t *d = (uint8_t *)dst; - for (size_t i = 0; i < 8; i++) { - d[i] = (uint8_t)(val & 0xff); - val >>= 8; - } -#endif -} - -// buf's size must be a multiple of 128 bytes. -static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) { - uint8_t *b = buf; - // TODO: consider adding proper uneven write handling - assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); - - for (size_t i = 0; i < size; i += 128) { - // Write the current output block to state if it is not NULL - if (buf != NULL) { - for (size_t j = 0; j < 16; j++) { - prng_write_le64(b, state->output[j]); b += 8; - } - } - // Similar to SSE, use fixed iteration loops to reduce code complexity - // and allow the compiler more control over optimization. - for (size_t j = 0; j < 2; j++) { - // I don't want to type this 15 times. - uint64_t *s = &state->state[j * 8]; // 2 lanes - uint64_t *o = &state->output[j * 4]; // 1 lane - uint64_t t[8]; // temp buffer - - // I apply the counter to s1, - // since it is the one whose shift loses most entropy. - for (size_t k = 0; k < 4; k++) { - s[k + 4] += state->counter[k]; - } - - // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit - // additions to strong positions for enrichment. The low 32-bit part of a - // 64-bit chunk never moves to the same 64-bit chunk as its high part. - // They do not remain in the same chunk. Each part eventually reaches all - // positions ringwise: A to B, B to C, …, H to A. 
- // - // You may notice that they are simply 256-bit rotations (96 and 160): - // - // t0 = (s0 << 96) | (s0 >> (256 - 96)); - // t1 = (s1 << 160) | (s1 >> (256 - 160)); - // - // The easiest way to do this would be to cast s and t to uint32_t * - // and operate on them that way. - // - // uint32_t *t0_32 = (uint32_t *)t0, *t1_32 = (uint32_t *)t1; - // uint32_t *s0_32 = (uint32_t *)s0, *s1_32 = (uint32_t *)s1; - // for (size_t k = 0; k < 4; k++) { - // t0_32[k] = s0_32[(k + 5) % 8]; - // t1_32[k] = s1_32[(k + 3) % 8]; - // } - // - // This is pretty, but it violates strict aliasing and relies on little - // endian data layout. - // - // A common workaround to strict aliasing would be to use memcpy: - // - // // legal casts - // unsigned char *t8 = (unsigned char *)t; - // unsigned char *s8 = (unsigned char *)s; - // memcpy(&t8[0], &s8[20], 32 - 20); - // memcpy(&t8[32 - 20], &s8[0], 20); - // - // However, this still doesn't fix the endianness issue, and is very - // ugly. - // - // The only known solution which doesn't rely on endianness is to - // read two 64-bit integers and do a funnel shift. - - // Lookup table for the _offsets_ in the shuffle. Even lanes rotate - // by 5, odd lanes rotate by 3. - // If it were by 32-bit lanes, it would be - // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 } - const uint8_t shuf_offsets[16] = { 2,3,0,1, 5,6,7,4, // left - 3,0,1,2, 6,7,4,5 }; // right - for (size_t k = 0; k < 8; k++) { - t[k] = (s[shuf_offsets[k]] >> 32) | (s[shuf_offsets[k + 8]] << 32); - } - - for (size_t k = 0; k < 4; k++) { - // SIMD does not support rotations. Shift is the next best thing to entangle - // bits with other 64-bit positions. We must shift by an odd number so that - // each bit reaches all 64-bit positions, not just half. We must lose bits - // of information, so we minimize it: 1 and 3. We use different shift values - // to increase divergence between the two sides. 
We use rightward shift - // because the rightmost bits have the least diffusion in addition (the low - // bit is just a XOR of the low bits). - uint64_t u_lo = s[k + 0] >> 1; - uint64_t u_hi = s[k + 4] >> 3; - - // Addition is the main source of diffusion. - // Storing the output in the state keeps that diffusion permanently. - s[k + 0] = u_lo + t[k + 0]; - s[k + 4] = u_hi + t[k + 4]; - - // The first orthogonally grown piece evolving independently, XORed. - o[k] = u_lo ^ t[k + 4]; - } - } - - // Merge together. - for (size_t j = 0; j < 4; j++) { - // The second orthogonally grown piece evolving independently, XORed. - state->output[j + 8] = state->state[j + 0] ^ state->state[j + 12]; - state->output[j + 12] = state->state[j + 8] ^ state->state[j + 4]; - // The counter is not necessary to beat PractRand. - // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, - // or about 7 millenia at 10 GiB/s. - // The increments are picked as odd numbers, - // since only coprimes of the base cover the full cycle, - // and all odd numbers are coprime of 2. - // I use different odd numbers for each 64-bit chunk - // for a tiny amount of variation stirring. - // I used the smallest odd numbers to avoid having a magic number. - // - // For the scalar version, we calculate this dynamically, as it is - // simple enough. - state->counter[j] += 7 - (j * 2); // 7, 5, 3, 1 - } - } -} -#undef SHISHUA_RESTRICT - -// Nothing up my sleeve: those are the hex digits of Φ, -// the least approximable irrational number. 
-// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc -static uint64_t phi[16] = { - 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, - 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, - 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, - 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, -}; - -void prng_init(prng_state *s, uint64_t seed[4]) { - memset(s, 0, sizeof(prng_state)); -# define STEPS 1 -# define ROUNDS 13 - // Diffuse first two seed elements in s0, then the last two. Same for s1. - // We must keep half of the state unchanged so users cannot set a bad state. - memcpy(s->state, phi, sizeof(phi)); - for (size_t i = 0; i < 4; i++) { - s->state[i * 2 + 0] ^= seed[i]; // { s0,0,s1,0,s2,0,s3,0 } - s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 } - } - for (size_t i = 0; i < ROUNDS; i++) { - prng_gen(s, NULL, 128 * STEPS); - for (size_t j = 0; j < 4; j++) { - s->state[j+ 0] = s->output[j+12]; - s->state[j+ 4] = s->output[j+ 8]; - s->state[j+ 8] = s->output[j+ 4]; - s->state[j+12] = s->output[j+ 0]; - } - } -# undef STEPS -# undef ROUNDS -} -#endif // SHISHUA_TARGET == SHISHUA_TARGET_SCALAR -#endif // SHISHUA_SCALAR_H diff --git a/src/helper/shishua/rand.c b/src/rand.c similarity index 98% rename from src/helper/shishua/rand.c rename to src/rand.c index fe566e4dd..7d31a04cc 100644 --- a/src/helper/shishua/rand.c +++ b/src/rand.c @@ -1,8 +1,8 @@ +#include "rand.h" + #include #include -#include "rand.h" - uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { assert(max > min); assert(randc->count <= 127); @@ -34,7 +34,7 @@ float randf(struct rand_chunk *randc, float range, float offset) { return randfloat; } -//NOTE: example +// NOTE: example // int main() { // prng_state state; diff --git a/src/helper/shishua/rand.h b/src/rand.h similarity index 76% rename from src/helper/shishua/rand.h rename to src/rand.h 
index bdfc71f84..40a6d4d2e 100644 --- a/src/helper/shishua/rand.h +++ b/src/rand.h @@ -1,12 +1,12 @@ #ifndef SHISHUA_RAND_H #define SHISHUA_RAND_H -#include "shishua.h" +#include #include -struct rand_chunk{ +struct rand_chunk { size_t count; - uint8_t buf[128]; //NOTE: must be multiple of 128 + uint8_t buf[128]; // NOTE: must be multiple of 128 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); From 4e1fc685a5d02cefdd89e5759bf9d46bc672785c Mon Sep 17 00:00:00 2001 From: BrBa2 <108572893+barium-bromide@users.noreply.github.com> Date: Thu, 12 Jun 2025 14:49:08 +0800 Subject: [PATCH 03/11] helper function for random (#13) * helper function for random * Cleanup --------- Co-authored-by: Lee Hoon Lim --- CMakeLists.txt | 2 + external/shishua/shishua-avx2.h | 137 +++++++++++++++ external/shishua/shishua-half-avx2.h | 107 ++++++++++++ external/shishua/shishua-half-neon.h | 169 +++++++++++++++++++ external/shishua/shishua-half-sse2.h | 200 ++++++++++++++++++++++ external/shishua/shishua-half.h | 196 +++++++++++++++++++++ external/shishua/shishua-neon.h | 199 ++++++++++++++++++++++ external/shishua/shishua-sse2.h | 228 +++++++++++++++++++++++++ external/shishua/shishua.h | 244 +++++++++++++++++++++++++++ src/rand.c | 55 ++++++ src/rand.h | 16 ++ 11 files changed, 1553 insertions(+) create mode 100644 external/shishua/shishua-avx2.h create mode 100644 external/shishua/shishua-half-avx2.h create mode 100644 external/shishua/shishua-half-neon.h create mode 100644 external/shishua/shishua-half-sse2.h create mode 100644 external/shishua/shishua-half.h create mode 100644 external/shishua/shishua-neon.h create mode 100644 external/shishua/shishua-sse2.h create mode 100644 external/shishua/shishua.h create mode 100644 src/rand.c create mode 100644 src/rand.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4591d9ea1..95245ad12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ set(PROFILE_FLAG -O3 -p -g) function(apply_target target) 
target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m) diff --git a/external/shishua/shishua-avx2.h b/external/shishua/shishua-avx2.h new file mode 100644 index 000000000..e25dcf20f --- /dev/null +++ b/external/shishua/shishua-avx2.h @@ -0,0 +1,137 @@ +#ifndef SHISHUA_AVX2_H +#define SHISHUA_AVX2_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[4]; + __m256i output[4]; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 128 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], + s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], + t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. 
+ // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); + + for (size_t i = 0; i < size; i += 128) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i + 0], o0); + _mm256_storeu_si256((__m256i *)&buf[i + 32], o1); + _mm256_storeu_si256((__m256i *)&buf[i + 64], o2); + _mm256_storeu_si256((__m256i *)&buf[i + 96], o3); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + s3 = _mm256_add_epi64(s3, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + u2 = _mm256_srli_epi64(s2, 1); + u3 = _mm256_srli_epi64(s3, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + t2 = _mm256_permutevar8x32_epi32(s2, shu0); + t3 = _mm256_permutevar8x32_epi32(s3, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + s2 = _mm256_add_epi64(t2, u2); + s3 = _mm256_add_epi64(t3, u3); + + // Two orthogonally grown pieces evolving independently, XORed. 
+ o0 = _mm256_xor_si256(u0, t1); + o1 = _mm256_xor_si256(u2, t3); + o2 = _mm256_xor_si256(s0, s3); + o3 = _mm256_xor_si256(s2, s1); + } + s->output[0] = o0; + s->output[1] = o1; + s->output[2] = o2; + s->output[3] = o3; + s->state[0] = s0; + s->state[1] = s1; + s->state[2] = s2; + s->state[3] = s3; + s->counter = counter; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[16] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, + 0xF06AD7AE9717877E, + 0x85839D6EFFBD7DC6, + 0x64D325D1C5371682, + 0xCADD0CCCFDFFBBE1, + 0x626E33B8D04B4331, + 0xBBF73C790D94F79D, + 0x471C4AB3ED3D82A5, + 0xFEC507705E4AE6E5, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 1 +#define ROUNDS 13 + uint8_t buf[128 * STEPS]; + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. 
+ s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[9], phi[8] ^ seed[2]); + s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 128 * STEPS); + s->state[0] = s->output[3]; + s->state[1] = s->output[2]; + s->state[2] = s->output[1]; + s->state[3] = s->output[0]; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-avx2.h b/external/shishua/shishua-half-avx2.h new file mode 100644 index 000000000..d1b3df42a --- /dev/null +++ b/external/shishua/shishua-half-avx2.h @@ -0,0 +1,107 @@ +#ifndef SHISHUA_H +#define SHISHUA_H +#include +#include +#include +#include +#include +typedef struct prng_state { + __m256i state[2]; + __m256i output; + __m256i counter; +} prng_state; + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { + __m256i s0 = s->state[0], counter = s->counter, + s1 = s->state[1], o = s->output, + t0, t1, t2, t3, u0, u1, u2, u3; + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), + shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^69 bytes = 512 EiB to the period, + // or about 1 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + if (buf != NULL) { + _mm256_storeu_si256((__m256i *)&buf[i], o); + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1 = _mm256_add_epi64(s1, counter); + counter = _mm256_add_epi64(counter, increment); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u0 = _mm256_srli_epi64(s0, 1); + u1 = _mm256_srli_epi64(s1, 3); + t0 = _mm256_permutevar8x32_epi32(s0, shu0); + t1 = _mm256_permutevar8x32_epi32(s1, shu1); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0 = _mm256_add_epi64(t0, u0); + s1 = _mm256_add_epi64(t1, u1); + + // Two orthogonally grown pieces evolving independently, XORed. + o = _mm256_xor_si256(u0, t1); + } + s->state[0] = s0; + s->counter = counter; + s->state[1] = s1; + s->output = o; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. 
+// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + memset(s, 0, sizeof(prng_state)); +#define STEPS 5 +#define ROUNDS 4 + uint8_t buf[32 * STEPS]; // 4 64-bit numbers per 256-bit SIMD. + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + s->state[0] = _mm256_set_epi64x(phi[3], phi[2] ^ seed[1], phi[1], phi[0] ^ seed[0]); + s->state[1] = _mm256_set_epi64x(phi[7], phi[6] ^ seed[3], phi[5], phi[4] ^ seed[2]); + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, buf, 32 * STEPS); + s->state[0] = s->state[1]; + s->state[1] = s->output; + } +#undef STEPS +#undef ROUNDS +} +#endif diff --git a/external/shishua/shishua-half-neon.h b/external/shishua/shishua-half-neon.h new file mode 100644 index 000000000..ba5655b14 --- /dev/null +++ b/external/shishua/shishua-half-neon.h @@ -0,0 +1,169 @@ +// An ARM NEON version of shishua. +#ifndef SHISHUA_HALF_NEON_H +#define SHISHUA_HALF_NEON_H +#include +#include +#include +#include +// A NEON version. +typedef struct prng_state { + uint64x2_t state[4]; + uint64x2_t output[2]; + uint64x2_t counter[2]; +} prng_state; + +#define SHISHUA_VSETQ_N_U64(a, b) vcombine_u64(vdup_n_u64(a), vdup_n_u64(b)) + +// To hide the vreinterpret ritual. +#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \ + vreinterpretq_u64_u8( \ + vextq_u8( \ + vreinterpretq_u8_u64(Rn), \ + vreinterpretq_u8_u64(Rm), \ + (Imm))) + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. 
+static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + uint64x2_t s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + o_lo = s->output[0], o_hi = s->output[1], + t0_lo, t0_hi, t1_lo, t1_hi, u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. + // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5); + uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1); + uint8_t *b = buf; + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size / 32; i++) { + if (buf != NULL) { + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_lo)); + b += 16; + vst1q_u8(&b[0], vreinterpretq_u8_u64(o_hi)); + b += 16; + } + + // I apply the counter to s1, + // since it is the one whose shift loses most entropy. + s1_lo = vaddq_u64(s1_lo, counter_lo); + s1_hi = vaddq_u64(s1_hi, counter_hi); + + counter_lo = vaddq_u64(counter_lo, increment_lo); + counter_hi = vaddq_u64(counter_hi, increment_hi); + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). 
+ // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 vext.8 instructions. + t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4); + t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4); + t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12); + t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. We use rightward shift + // because the rightmost bits have the least diffusion in addition (the low + // bit is just a XOR of the low bits). + u_lo = vshrq_n_u64(s0_lo, 1); + u_hi = vshrq_n_u64(s0_hi, 1); +#if defined(__clang__) + // UGLY HACK: Clang enjoys merging the above statements with the vadds below + // into vsras. + // This is dumb, as it still needs to do the original vshr for the xor mix, + // causing it to shift twice. + // This makes Clang assume that this line has side effects, preventing the + // combination and speeding things up significantly. + // clang-format off + __asm__("" : "+w"(u_lo), "+w"(u_hi)); + // clang-format on +#endif + + // Two orthogonally grown pieces evolving independently, XORed. + // Note: We do this first because on the assembly side, vsra would overwrite + // t1_lo and t1_hi. + o_lo = veorq_u64(u_lo, t1_lo); + o_hi = veorq_u64(u_hi, t1_hi); + // Addition is the main source of diffusion. + // Storing the output in the state keeps that diffusion permanently. + s0_lo = vaddq_u64(t0_lo, u_lo); + s0_hi = vaddq_u64(t0_hi, u_hi); + // Use vsra here directly. 
+ s1_lo = vsraq_n_u64(t1_lo, s1_lo, 3); + s1_hi = vsraq_n_u64(t1_hi, s1_hi, 3); + } + s->state[0] = s0_lo; + s->state[1] = s0_hi; + s->state[2] = s1_lo; + s->state[3] = s1_hi; + s->output[0] = o_lo; + s->output[1] = o_hi; + s->counter[0] = counter_lo; + s->counter[1] = counter_hi; +} + +// Nothing up my sleeve: those are the hex digits of Φ, +// the least approximable irrational number. +// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc +static uint64_t phi[8] = { + 0x9E3779B97F4A7C15, + 0xF39CC0605CEDC834, + 0x1082276BF3A27251, + 0xF86C6A11D0C18E95, + 0x2767F0B153D27B7F, + 0x0347045B5BF1827F, + 0x01886F0928403002, + 0xC1D64BA40F335E36, +}; + +void prng_init(prng_state *s, uint64_t seed[4]) { + s->counter[0] = vdupq_n_u64(0); + s->counter[1] = vdupq_n_u64(0); +#define STEPS 5 +#define ROUNDS 4 + // Diffuse first two seed elements in s0, then the last two. Same for s1. + // We must keep half of the state unchanged so users cannot set a bad state. + for (size_t i = 0; i < 4; i++) { + s->state[i] = veorq_u64(SHISHUA_VSETQ_N_U64(seed[i], 0), vld1q_u64(&phi[i * 2])); + } + for (size_t i = 0; i < ROUNDS; i++) { + prng_gen(s, NULL, 32 * STEPS); + s->state[0] = s->state[2]; + s->state[1] = s->state[3]; + s->state[2] = s->output[0]; + s->state[3] = s->output[1]; + } +#undef STEPS +#undef ROUNDS +} +#undef SHISHUA_VSETQ_N_U64 +#undef SHISHUA_VEXTQ_U8 +#undef SHISHUA_RESTRICT + +#endif diff --git a/external/shishua/shishua-half-sse2.h b/external/shishua/shishua-half-sse2.h new file mode 100644 index 000000000..91e1194fa --- /dev/null +++ b/external/shishua/shishua-half-sse2.h @@ -0,0 +1,200 @@ +// An SSE2/SSSE3 version of shishua half. Slower than AVX2, but more compatible. +// Also compatible with 32-bit x86. +// +// SSSE3 is recommended, as it has the useful _mm_alignr_epi8 intrinsic. +// We can still emulate it on SSE2, but it is slower. 
+#ifndef SHISHUA_HALF_SSE2_H +#define SHISHUA_HALF_SSE2_H + +#include +#include +#include +// Note: cl.exe doesn't define __SSSE3__ +#if defined(__SSSE3__) || defined(__AVX__) +#include // SSSE3 +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_alignr_epi8(hi, lo, amt) +#else +#include // SSE2 +// Emulate _mm_alignr_epi8 for SSE2. It's a little slow. +// The compiler may convert it to a sequence of shufps instructions, which is +// perfectly fine. +#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \ + _mm_or_si128( \ + _mm_slli_si128(hi, 16 - (amt)), \ + _mm_srli_si128(lo, amt)) +#endif + +typedef struct prng_state { + __m128i state[4]; + __m128i output[2]; + __m128i counter[2]; +} prng_state; + +// Wrappers for x86 targets which usually lack these intrinsics. +// Don't call these with side effects. +#if defined(__x86_64__) || defined(_M_X64) +#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a) +#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x) +#else +#define SHISHUA_SET_EPI64X(b, a) \ + _mm_set_epi32( \ + (int)(((uint64_t)(b)) >> 32), \ + (int)(b), \ + (int)(((uint64_t)(a)) >> 32), \ + (int)(a)) +#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x) +#endif + +// buf could technically alias with prng_state, according to the compiler. +#if defined(__GNUC__) || defined(_MSC_VER) +#define SHISHUA_RESTRICT __restrict +#else +#define SHISHUA_RESTRICT +#endif + +// buf's size must be a multiple of 32 bytes. +static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) { + // We need a lot of variables... + __m128i s0_lo = s->state[0], s0_hi = s->state[1], + s1_lo = s->state[2], s1_hi = s->state[3], + t0_lo, t0_hi, t1_lo, t1_hi, + u_lo, u_hi, + counter_lo = s->counter[0], counter_hi = s->counter[1]; + // The counter is not necessary to beat PractRand. + // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, + // or about 7 millenia at 10 GiB/s. 
+ // The increments are picked as odd numbers, + // since only coprimes of the base cover the full cycle, + // and all odd numbers are coprime of 2. + // I use different odd numbers for each 64-bit chunk + // for a tiny amount of variation stirring. + // I used the smallest odd numbers to avoid having a magic number. + // increment = { 7, 5, 3, 1 }; + const __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7); + const __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3); + + // TODO: consider adding proper uneven write handling + assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes."); + + for (size_t i = 0; i < size; i += 32) { + // Split to try to reduce register pressure + if (buf != NULL) { + _mm_storeu_si128((__m128i *)&buf[i + 0], s->output[0]); + _mm_storeu_si128((__m128i *)&buf[i + 16], s->output[1]); + } + + // Lane 0 + + // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit + // additions to strong positions for enrichment. The low 32-bit part of a + // 64-bit chunk never moves to the same 64-bit chunk as its high part. + // They do not remain in the same chunk. Each part eventually reaches all + // positions ringwise: A to B, B to C, …, H to A. + // You may notice that they are simply 256-bit rotations (96 and 160). + // Note: This: + // x = (y << 96) | (y >> 160) + // can be rewritten as this + // x_lo = (y_lo << 96) | (y_hi >> 32) + // x_hi = (y_hi << 96) | (y_lo >> 32) + // which we can do with 2 _mm_alignr_epi8 instructions. + t0_lo = SHISHUA_ALIGNR_EPI8(s0_lo, s0_hi, 4); + t0_hi = SHISHUA_ALIGNR_EPI8(s0_hi, s0_lo, 4); + + // SIMD does not support rotations. Shift is the next best thing to entangle + // bits with other 64-bit positions. We must shift by an odd number so that + // each bit reaches all 64-bit positions, not just half. We must lose bits + // of information, so we minimize it: 1 and 3. We use different shift values + // to increase divergence between the two sides. 
    // We use rightward shift
    // because the rightmost bits have the least diffusion in addition (the low
    // bit is just a XOR of the low bits).
    u_lo = _mm_srli_epi64(s0_lo, 1);
    u_hi = _mm_srli_epi64(s0_hi, 1);

    // Addition is the main source of diffusion.
    // Storing the output in the state keeps that diffusion permanently.
    s0_lo = _mm_add_epi64(u_lo, t0_lo);
    s0_hi = _mm_add_epi64(u_hi, t0_hi);

    // Lane 1

    // I apply the counter to s1,
    // since it is the one whose shift loses most entropy.
    s1_lo = _mm_add_epi64(s1_lo, counter_lo);
    s1_hi = _mm_add_epi64(s1_hi, counter_hi);

    // Increment the counter
    counter_lo = _mm_add_epi64(counter_lo, increment_lo);
    counter_hi = _mm_add_epi64(counter_hi, increment_hi);

    // Same as above, but with different shift amounts.
    t1_lo = SHISHUA_ALIGNR_EPI8(s1_hi, s1_lo, 12);
    t1_hi = SHISHUA_ALIGNR_EPI8(s1_lo, s1_hi, 12);

    s1_lo = _mm_srli_epi64(s1_lo, 3);
    s1_hi = _mm_srli_epi64(s1_hi, 3);

    s1_lo = _mm_add_epi64(s1_lo, t1_lo);
    s1_hi = _mm_add_epi64(s1_hi, t1_hi);

    // Two orthogonally grown pieces evolving independently, XORed.
    s->output[0] = _mm_xor_si128(u_lo, t1_lo);
    s->output[1] = _mm_xor_si128(u_hi, t1_hi);
  }

  // Persist the working copies of the state and counter.
  s->state[0] = s0_lo;
  s->state[1] = s0_hi;
  s->state[2] = s1_lo;
  s->state[3] = s1_hi;

  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
};

// Seed the generator from 4x64 bits of seed material.
// Only every other 64-bit word of the state is XORed with the seed; the
// remaining words keep their φ-derived constants so a caller cannot select a
// degenerate (e.g. all-zero) state. The warm-up loop below then runs the
// generator with a NULL buffer (state advance only) and folds the output back
// into the state so the seed diffuses through every word before first use.
void prng_init(prng_state *s, uint64_t seed[4]) {
  s->counter[0] = _mm_setzero_si128();
  s->counter[1] = _mm_setzero_si128();
#define STEPS 5
#define ROUNDS 4
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  // XXX: better way to do this?
  __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]);
  __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]);
  __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]);
  __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]);
  s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0]));
  s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2]));
  s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4]));
  s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6]));

  // Each round advances the generator STEPS * 32 bytes, then rotates the
  // output lane back into the state.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 32 * STEPS);
    s->state[0] = s->state[2];
    s->state[1] = s->state[3];
    s->state[2] = s->output[0];
    s->state[3] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_ALIGNR_EPI8
#undef SHISHUA_CVTSI64_SI128
#undef SHISHUA_SET_EPI64X
#undef SHISHUA_RESTRICT

#endif
// Portable scalar implementation of shishua half.
// Half the state of full shishua (2 lanes instead of 4); balances code size
// against performance. Bit-compatible with the SIMD half variants.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t.
typedef struct prng_state {
  uint64_t state[8];   // 2 lanes
  uint64_t output[4];  // 1 lane
  uint64_t counter[4]; // 1 lane
} prng_state;

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Store a 64-bit value at dst in little-endian byte order.
static inline void shishua_write_le64(void *dst, uint64_t val) {
  // On targets known to be little-endian (or when SHISHUA_NATIVE_ENDIAN is
  // defined), a plain memcpy already produces the right bytes.
#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__)
  memcpy(dst, &val, sizeof(uint64_t));
#else
  // Endian-agnostic byte-shift write.
  uint8_t *d = (uint8_t *)dst;
  for (size_t i = 0; i < 8; i++) {
    d[i] = (uint8_t)(val & 0xff);
    val >>= 8;
  }
#endif
}

// Generate `size` bytes into buf (size must be a multiple of 32).
// Passing buf == NULL advances the state without writing output; prng_init
// relies on this for its warm-up rounds.
static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *cursor = buf;
  // TODO: consider adding proper uneven write handling
  assert((size % 32 == 0) && "buf's size must be a multiple of 32 bytes.");

  for (size_t done = 0; done < size; done += 32) {
    uint64_t shuffled[8];

    // Flush the previously computed output lane, unless the caller only
    // wants to advance the state.
    if (buf != NULL) {
      for (size_t w = 0; w < 4; w++) {
        shishua_write_le64(cursor, state->output[w]);
        cursor += 8;
      }
    }

    for (size_t w = 0; w < 4; w++) {
      // The counter is applied to the second lane (s1), the one whose shift
      // below loses the most entropy. It is not needed to beat PractRand, but
      // it puts a lower bound of 2^71 bytes on the period.
      state->state[w + 4] += state->counter[w];
      // Increments 7, 5, 3, 1: odd numbers are coprime with the 2^64 base, so
      // each 64-bit chunk covers its full cycle; a different odd number per
      // chunk adds a tiny amount of stirring. Computed dynamically here since
      // the scalar code makes that trivial.
      state->counter[w] += 7 - (w * 2); // 7, 5, 3, 1
    }

    // 256-bit rotations by 96 and 160 bits, expressed as per-word 64-bit
    // funnel shifts. This moves weak (low-diffusion) 32-bit halves of the
    // additions into strong positions; each part walks the ring A→B→…→H→A.
    // Reinterpreting as uint32_t would be simpler but is endian-dependent and
    // breaks strict aliasing — the funnel-shift form avoids both problems.
    //
    // Offset table for the shuffle: even lanes rotate by 5 (32-bit units),
    // odd lanes by 3. In 32-bit lanes this would be
    // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 }.
    const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4,  // left
                                      3, 0, 1, 2, 6, 7, 4, 5}; // right
    for (size_t w = 0; w < 8; w++) {
      shuffled[w] = (state->state[shuf_offsets[w]] >> 32) | (state->state[shuf_offsets[w + 8]] << 32);
    }

    for (size_t w = 0; w < 4; w++) {
      // Rightward shifts by the odd amounts 1 and 3 entangle bits across
      // 64-bit positions (odd so every bit reaches all positions; rightward
      // because the low bits have the least diffusion under addition; small
      // to minimize the information lost). Different amounts per lane
      // increase divergence between the two sides.
      uint64_t lo = state->state[w + 0] >> 1;
      uint64_t hi = state->state[w + 4] >> 3;

      // Addition is the main source of diffusion, and storing the sum back
      // into the state keeps that diffusion permanently.
      state->state[w + 0] = lo + shuffled[w + 0];
      state->state[w + 4] = hi + shuffled[w + 4];

      // Two orthogonally grown pieces evolving independently, XORed.
      state->output[w] = lo ^ shuffled[w + 4];
    }
  }
}
#undef SHISHUA_RESTRICT

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[8] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
};

// Seed the generator from 4x64 bits.
// The seed is XORed into every other state word only, so half of the state
// keeps its φ constants and users cannot set a bad (e.g. all-zero) state.
// The warm-up rounds then diffuse the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  memset(s, 0, sizeof(prng_state));

#define STEPS 5
#define ROUNDS 4
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  memcpy(s->state, phi, sizeof(phi));
  for (size_t i = 0; i < 4; i++) {
    s->state[i * 2] ^= seed[i];
  }
  for (size_t i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 32 * STEPS);
    for (size_t j = 0; j < 4; j++) {
      s->state[j + 0] = s->state[j + 4];
      s->state[j + 4] = s->output[j];
    }
  }
#undef STEPS
#undef ROUNDS
}
// Hides the vreinterpret ritual: vextq_u8 operates on byte vectors, so each
// uint64x2_t operand must be reinterpreted to bytes and back.
#define SHISHUA_VEXTQ_U8(Rn, Rm, Imm) \
  vreinterpretq_u64_u8( \
      vextq_u8( \
          vreinterpretq_u8_u64(Rn), \
          vreinterpretq_u8_u64(Rm), \
          (Imm)))

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Generate `size` bytes into buf (size must be a multiple of 128 bytes).
// When buf is NULL the state is advanced without writing output (used by
// prng_init for its warm-up rounds).
static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *b = buf;
  uint64x2_t counter_lo = s->counter[0], counter_hi = s->counter[1];
  // The counter is not necessary to beat PractRand.
  // It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
  // or about 7 millenia at 10 GiB/s.
  // The increments are picked as odd numbers,
  // since only coprimes of the base cover the full cycle,
  // and all odd numbers are coprime of 2.
  // I use different odd numbers for each 64-bit chunk
  // for a tiny amount of variation stirring.
  // I used the smallest odd numbers to avoid having a magic number.
  uint64x2_t increment_lo = SHISHUA_VSETQ_N_U64(7, 5);
  uint64x2_t increment_hi = SHISHUA_VSETQ_N_U64(3, 1);
  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t i = 0; i < size; i += 128) {
    // Write the current output block to buf if buf is not NULL.
    if (buf != NULL) {
      for (size_t j = 0; j < 8; j++) {
        vst1q_u8(b, vreinterpretq_u8_u64(s->output[j]));
        b += 16;
      }
    }
    // NEON has less register pressure than SSE2, but we reroll it anyways for
    // code size.
    for (size_t j = 0; j < 2; j++) {
      uint64x2_t s0_lo = s->state[j * 4 + 0],
                 s0_hi = s->state[j * 4 + 1],
                 s1_lo = s->state[j * 4 + 2],
                 s1_hi = s->state[j * 4 + 3],
                 t0_lo, t0_hi, t1_lo, t1_hi,
                 u_lo, u_hi;

      // I apply the counter to s1,
      // since it is the one whose shift loses most entropy.
      s1_lo = vaddq_u64(s1_lo, counter_lo);
      s1_hi = vaddq_u64(s1_hi, counter_hi);

      // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
      // additions to strong positions for enrichment. The low 32-bit part of a
      // 64-bit chunk never moves to the same 64-bit chunk as its high part.
      // They do not remain in the same chunk. Each part eventually reaches all
      // positions ringwise: A to B, B to C, …, H to A.
      // You may notice that they are simply 256-bit rotations (96 and 160).
      // Note: This:
      //   x = (y << 96) | (y >> 160)
      // can be rewritten as this
      //   x_lo = (y_lo << 96) | (y_hi >> 32)
      //   x_hi = (y_hi << 96) | (y_lo >> 32)
      // which we can do with 2 vext.8 instructions.
      t0_lo = SHISHUA_VEXTQ_U8(s0_hi, s0_lo, 4);
      t0_hi = SHISHUA_VEXTQ_U8(s0_lo, s0_hi, 4);
      t1_lo = SHISHUA_VEXTQ_U8(s1_lo, s1_hi, 12);
      t1_hi = SHISHUA_VEXTQ_U8(s1_hi, s1_lo, 12);

      // SIMD does not support rotations. Shift is the next best thing to entangle
      // bits with other 64-bit positions. We must shift by an odd number so that
      // each bit reaches all 64-bit positions, not just half. We must lose bits
      // of information, so we minimize it: 1 and 3. We use different shift values
      // to increase divergence between the two sides. We use rightward shift
      // because the rightmost bits have the least diffusion in addition (the low
      // bit is just a XOR of the low bits).
      u_lo = vshrq_n_u64(s0_lo, 1);
      u_hi = vshrq_n_u64(s0_hi, 1);
#if defined(__clang__)
      // UGLY HACK: Clang enjoys merging the above statements with the vadds below
      // into vsras.
      // This is dumb, as it still needs to do the original vshr for the xor mix,
      // causing it to shift twice.
      // This makes Clang assume that this line has side effects, preventing the
      // combination and speeding things up significantly.
      // clang-format off
      __asm__("" : "+w"(u_lo), "+w"(u_hi));
      // clang-format on
#endif

      // Addition is the main source of diffusion.
      // Storing the output in the state keeps that diffusion permanently.
      s->state[4 * j + 0] = vaddq_u64(t0_lo, u_lo);
      s->state[4 * j + 1] = vaddq_u64(t0_hi, u_hi);
      // Use vsra here directly (fused shift-right-and-accumulate, equivalent
      // to the shift-by-3 plus add of the other backends).
      s->state[4 * j + 2] = vsraq_n_u64(t1_lo, s1_lo, 3);
      s->state[4 * j + 3] = vsraq_n_u64(t1_hi, s1_hi, 3);

      // The first orthogonally grown pieces evolving independently, XORed.
      s->output[2 * j + 0] = veorq_u64(u_lo, t1_lo);
      s->output[2 * j + 1] = veorq_u64(u_hi, t1_hi);
    }
    // The second orthogonally grown piece evolving independently, XORed.
    s->output[4] = veorq_u64(s->state[0], s->state[6]);
    s->output[5] = veorq_u64(s->state[1], s->state[7]);

    s->output[6] = veorq_u64(s->state[2], s->state[4]);
    s->output[7] = veorq_u64(s->state[3], s->state[5]);

    counter_lo = vaddq_u64(counter_lo, increment_lo);
    counter_hi = vaddq_u64(counter_hi, increment_hi);
  }
  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits. Each seed word is XORed into the low
// half of a state vector only, so half of the state keeps its φ constants and
// users cannot set a bad state; ROUNDS warm-up passes (buf == NULL) then
// diffuse the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  s->counter[0] = vdupq_n_u64(0);
  s->counter[1] = vdupq_n_u64(0);
#define ROUNDS 13
#define STEPS 1
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  uint64x2_t seed_0 = SHISHUA_VSETQ_N_U64(seed[0], 0);
  uint64x2_t seed_1 = SHISHUA_VSETQ_N_U64(seed[1], 0);
  uint64x2_t seed_2 = SHISHUA_VSETQ_N_U64(seed[2], 0);
  uint64x2_t seed_3 = SHISHUA_VSETQ_N_U64(seed[3], 0);
  s->state[0] = veorq_u64(seed_0, vld1q_u64(&phi[0]));
  s->state[1] = veorq_u64(seed_1, vld1q_u64(&phi[2]));
  s->state[2] = veorq_u64(seed_2, vld1q_u64(&phi[4]));
  s->state[3] = veorq_u64(seed_3, vld1q_u64(&phi[6]));
  s->state[4] = veorq_u64(seed_2, vld1q_u64(&phi[8]));
  s->state[5] = veorq_u64(seed_3, vld1q_u64(&phi[10]));
  s->state[6] = veorq_u64(seed_0, vld1q_u64(&phi[12]));
  s->state[7] = veorq_u64(seed_1, vld1q_u64(&phi[14]));

  // Each round advances 128 * STEPS bytes, then rotates the output vectors
  // back into the state in reverse order.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    s->state[0] = s->output[6];
    s->state[1] = s->output[7];
    s->state[2] = s->output[4];
    s->state[3] = s->output[5];
    s->state[4] = s->output[2];
    s->state[5] = s->output[3];
    s->state[6] = s->output[0];
    s->state[7] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_VSETQ_N_U64
#undef SHISHUA_VEXTQ_U8
#undef SHISHUA_RESTRICT
#endif
// The compiler may convert it to a sequence of shufps instructions, which is
// perfectly fine.
#define SHISHUA_ALIGNR_EPI8(hi, lo, amt) \
  _mm_or_si128( \
      _mm_slli_si128(hi, 16 - (amt)), \
      _mm_srli_si128(lo, amt))
#endif // closes the SSSE3-vs-SSE2 selection above

typedef struct prng_state {
  __m128i state[8];
  __m128i output[8];
  __m128i counter[2];
} prng_state;

// Wrappers for x86 targets which usually lack these intrinsics.
// Don't call these with side effects.
#if defined(__x86_64__) || defined(_M_X64)
#define SHISHUA_SET_EPI64X(b, a) _mm_set_epi64x(b, a)
#define SHISHUA_CVTSI64_SI128(x) _mm_cvtsi64_si128(x)
#else
#define SHISHUA_SET_EPI64X(b, a) \
  _mm_set_epi32( \
      (int)(((uint64_t)(b)) >> 32), \
      (int)(b), \
      (int)(((uint64_t)(a)) >> 32), \
      (int)(a))
#define SHISHUA_CVTSI64_SI128(x) SHISHUA_SET_EPI64X(0, x)
#endif

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Generate `size` bytes into buf (size must be a multiple of 128 bytes).
// When buf is NULL the state is advanced without writing output (used by
// prng_init for its warm-up rounds).
static inline void prng_gen(prng_state *SHISHUA_RESTRICT s, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  __m128i counter_lo = s->counter[0], counter_hi = s->counter[1];
  // The counter is not necessary to beat PractRand.
  // It sets a lower bound of 2^71 bytes = 2 ZiB to the period,
  // or about 7 millenia at 10 GiB/s.
  // The increments are picked as odd numbers,
  // since only coprimes of the base cover the full cycle,
  // and all odd numbers are coprime of 2.
  // I use different odd numbers for each 64-bit chunk
  // for a tiny amount of variation stirring.
  // I used the smallest odd numbers to avoid having a magic number.

  // increment = { 7, 5, 3, 1 };
  __m128i increment_lo = SHISHUA_SET_EPI64X(5, 7);
  __m128i increment_hi = SHISHUA_SET_EPI64X(1, 3);

  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t i = 0; i < size; i += 128) {
    // Write the current output block to buf if buf is not NULL.
    if (buf != NULL) {
      for (size_t j = 0; j < 8; j++) {
        _mm_storeu_si128((__m128i *)&buf[i + (16 * j)], s->output[j]);
      }
    }

    // There are only 16 SSE registers (8 on i686), and we have to account for
    // temporary copies due to being stuck with 2-operand instructions.
    // Therefore, we use fixed iteration loops to reduce code complexity while
    // still allowing the compiler to easily unroll the loop.
    // We also try to keep variables active for as short as possible.
    for (size_t j = 0; j < 2; j++) {
      __m128i s_lo, s_hi, u0_lo, u0_hi, u1_lo, u1_hi, t_lo, t_hi;

      // Lane 0
      s_lo = s->state[4 * j + 0];
      s_hi = s->state[4 * j + 1];

      // SIMD does not support rotations. Shift is the next best thing to entangle
      // bits with other 64-bit positions. We must shift by an odd number so that
      // each bit reaches all 64-bit positions, not just half. We must lose bits
      // of information, so we minimize it: 1 and 3. We use different shift values
      // to increase divergence between the two sides. We use rightward shift
      // because the rightmost bits have the least diffusion in addition (the low
      // bit is just a XOR of the low bits).
      u0_lo = _mm_srli_epi64(s_lo, 1);
      u0_hi = _mm_srli_epi64(s_hi, 1);

      // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit
      // additions to strong positions for enrichment. The low 32-bit part of a
      // 64-bit chunk never moves to the same 64-bit chunk as its high part.
      // They do not remain in the same chunk. Each part eventually reaches all
      // positions ringwise: A to B, B to C, …, H to A.
      // You may notice that they are simply 256-bit rotations (96 and 160).
      // Note: This:
      //   x = (y << 96) | (y >> 160)
      // can be rewritten as this
      //   x_lo = (y_lo << 96) | (y_hi >> 32)
      //   x_hi = (y_hi << 96) | (y_lo >> 32)
      // which we can do with 2 _mm_alignr_epi8 instructions.
      t_lo = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 4);
      t_hi = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 4);

      // Addition is the main source of diffusion.
      // Storing the output in the state keeps that diffusion permanently.
      s->state[4 * j + 0] = _mm_add_epi64(t_lo, u0_lo);
      s->state[4 * j + 1] = _mm_add_epi64(t_hi, u0_hi);

      // Lane 1
      s_lo = s->state[4 * j + 2];
      s_hi = s->state[4 * j + 3];

      // I apply the counter to s1,
      // since it is the one whose shift loses most entropy.
      s_lo = _mm_add_epi64(s_lo, counter_lo);
      s_hi = _mm_add_epi64(s_hi, counter_hi);

      // Same as above but with different shifts
      u1_lo = _mm_srli_epi64(s_lo, 3);
      u1_hi = _mm_srli_epi64(s_hi, 3);

      t_lo = SHISHUA_ALIGNR_EPI8(s_hi, s_lo, 12);
      t_hi = SHISHUA_ALIGNR_EPI8(s_lo, s_hi, 12);

      s->state[4 * j + 2] = _mm_add_epi64(t_lo, u1_lo);
      s->state[4 * j + 3] = _mm_add_epi64(t_hi, u1_hi);

      // Merge lane 0 and lane 1
      // The first orthogonally grown piece evolving independently, XORed.
      s->output[2 * j + 0] = _mm_xor_si128(u0_lo, t_lo);
      s->output[2 * j + 1] = _mm_xor_si128(u0_hi, t_hi);
    }

    // The second orthogonally grown piece evolving independently, XORed.
    s->output[4] = _mm_xor_si128(s->state[0], s->state[6]);
    s->output[5] = _mm_xor_si128(s->state[1], s->state[7]);
    s->output[6] = _mm_xor_si128(s->state[4], s->state[2]);
    s->output[7] = _mm_xor_si128(s->state[5], s->state[3]);

    // Increment the counter
    counter_lo = _mm_add_epi64(counter_lo, increment_lo);
    counter_hi = _mm_add_epi64(counter_hi, increment_hi);
  }

  s->counter[0] = counter_lo;
  s->counter[1] = counter_hi;
}

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits. Each seed word lands in the low half of
// a state vector only, so half of the state keeps its φ constants and users
// cannot set a bad state; the warm-up rounds diffuse the seed afterwards.
void prng_init(prng_state *s, uint64_t seed[4]) {
  // Note: output is uninitialized at first, but since we pass NULL, its value
  // is initially ignored.
  s->counter[0] = _mm_setzero_si128();
  s->counter[1] = _mm_setzero_si128();
#define ROUNDS 13
#define STEPS 1
  // Diffuse first two seed elements in s0, then the last two. Same for s1.
  // We must keep half of the state unchanged so users cannot set a bad state.
  __m128i seed_0 = SHISHUA_CVTSI64_SI128(seed[0]);
  __m128i seed_1 = SHISHUA_CVTSI64_SI128(seed[1]);
  __m128i seed_2 = SHISHUA_CVTSI64_SI128(seed[2]);
  __m128i seed_3 = SHISHUA_CVTSI64_SI128(seed[3]);
  s->state[0] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[0]));
  s->state[1] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[2]));
  s->state[2] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[4]));
  s->state[3] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[6]));
  s->state[4] = _mm_xor_si128(seed_2, _mm_loadu_si128((__m128i *)&phi[8]));
  s->state[5] = _mm_xor_si128(seed_3, _mm_loadu_si128((__m128i *)&phi[10]));
  s->state[6] = _mm_xor_si128(seed_0, _mm_loadu_si128((__m128i *)&phi[12]));
  s->state[7] = _mm_xor_si128(seed_1, _mm_loadu_si128((__m128i *)&phi[14]));

  // Each round advances 128 * STEPS bytes, then rotates the output vectors
  // back into the state in reverse order.
  for (int i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    s->state[0] = s->output[6];
    s->state[1] = s->output[7];
    s->state[2] = s->output[4];
    s->state[3] = s->output[5];
    s->state[4] = s->output[2];
    s->state[5] = s->output[3];
    s->state[6] = s->output[0];
    s->state[7] = s->output[1];
  }
#undef STEPS
#undef ROUNDS
}
#undef SHISHUA_CVTSI64_SI128
#undef SHISHUA_ALIGNR_EPI8
#undef SHISHUA_SET_EPI64X
#undef SHISHUA_RESTRICT
#endif
// Portable scalar implementation of shishua (full width: 4 lanes of state).
// Balances performance against code size; bit-compatible with the SIMD paths.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Note: While it is an array, a "lane" refers to 4 consecutive uint64_t.
typedef struct prng_state {
  uint64_t state[16];  // 4 lanes
  uint64_t output[16]; // 4 lanes, 2 parts
  uint64_t counter[4]; // 1 lane
} prng_state;

// buf could technically alias with prng_state, according to the compiler.
#if defined(__GNUC__) || defined(_MSC_VER)
#define SHISHUA_RESTRICT __restrict
#else
#define SHISHUA_RESTRICT
#endif

// Store a 64-bit value at dst in little-endian byte order.
static inline void prng_write_le64(void *dst, uint64_t val) {
  // On targets known to be little-endian (or with SHISHUA_NATIVE_ENDIAN
  // defined), memcpy already produces the right bytes.
#if defined(SHISHUA_NATIVE_ENDIAN) || defined(_WIN32) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN__)
  memcpy(dst, &val, sizeof(uint64_t));
#else
  // Endian-agnostic byte-shift write.
  uint8_t *d = (uint8_t *)dst;
  for (size_t i = 0; i < 8; i++) {
    d[i] = (uint8_t)(val & 0xff);
    val >>= 8;
  }
#endif
}

// Generate `size` bytes into buf (size must be a multiple of 128).
// Passing buf == NULL advances the state without writing output; prng_init
// relies on this for its warm-up rounds.
static inline void prng_gen(prng_state *SHISHUA_RESTRICT state, uint8_t *SHISHUA_RESTRICT buf, size_t size) {
  uint8_t *cursor = buf;
  // TODO: consider adding proper uneven write handling
  assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes.");

  for (size_t done = 0; done < size; done += 128) {
    // Flush the previously computed output block, unless the caller only
    // wants to advance the state.
    if (buf != NULL) {
      for (size_t w = 0; w < 16; w++) {
        prng_write_le64(cursor, state->output[w]);
        cursor += 8;
      }
    }

    // Like the SSE version: fixed iteration counts keep the code small while
    // letting the compiler unroll freely.
    for (size_t half = 0; half < 2; half++) {
      uint64_t *s = &state->state[half * 8];  // 2 lanes of state
      uint64_t *o = &state->output[half * 4]; // 1 lane of output
      uint64_t mixed[8];                      // shuffle scratch

      // The counter is applied to the second lane (s1), the one whose shift
      // below loses the most entropy.
      for (size_t w = 0; w < 4; w++) {
        s[w + 4] += state->counter[w];
      }

      // 256-bit rotations by 96 and 160 bits:
      //   t0 = (s0 << 96)  | (s0 >> (256 - 96));
      //   t1 = (s1 << 160) | (s1 >> (256 - 160));
      // expressed as per-word 64-bit funnel shifts. Casting to uint32_t* or
      // memcpy'ing bytes would be shorter, but both are endian-dependent and
      // the pointer cast additionally violates strict aliasing; the funnel
      // shift is the only known portable form.
      //
      // Offset table for the shuffle: even lanes rotate by 5 (32-bit units),
      // odd lanes by 3. In 32-bit lanes this would be
      // { 5,6,7,0,1,2,3,4, 11,12,13,14,15,8,9,10 }.
      const uint8_t shuf_offsets[16] = {2, 3, 0, 1, 5, 6, 7, 4,  // left
                                        3, 0, 1, 2, 6, 7, 4, 5}; // right
      for (size_t w = 0; w < 8; w++) {
        mixed[w] = (s[shuf_offsets[w]] >> 32) | (s[shuf_offsets[w + 8]] << 32);
      }

      for (size_t w = 0; w < 4; w++) {
        // Rightward shifts by the odd amounts 1 and 3 entangle bits across
        // 64-bit positions (odd so every bit reaches all positions; rightward
        // because the low bits diffuse least under addition; small to
        // minimize lost information). Different amounts per lane increase
        // divergence between the two sides.
        uint64_t lo = s[w + 0] >> 1;
        uint64_t hi = s[w + 4] >> 3;

        // Addition is the main source of diffusion; storing the sum back in
        // the state keeps that diffusion permanently.
        s[w + 0] = lo + mixed[w + 0];
        s[w + 4] = hi + mixed[w + 4];

        // The first orthogonally grown piece evolving independently, XORed.
        o[w] = lo ^ mixed[w + 4];
      }
    }

    for (size_t w = 0; w < 4; w++) {
      // The second orthogonally grown piece evolving independently, XORed.
      state->output[w + 8] = state->state[w + 0] ^ state->state[w + 12];
      state->output[w + 12] = state->state[w + 8] ^ state->state[w + 4];
      // Counter increments 7, 5, 3, 1: odd numbers are coprime with the 2^64
      // base, so each chunk covers its full cycle (lower-bounding the period
      // by 2^71 bytes); a different odd number per chunk adds stirring.
      // Computed dynamically since the scalar code makes that trivial.
      state->counter[w] += 7 - (w * 2); // 7, 5, 3, 1
    }
  }
}
#undef SHISHUA_RESTRICT

// Nothing up my sleeve: those are the hex digits of Φ,
// the least approximable irrational number.
// $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc
static uint64_t phi[16] = {
  0x9E3779B97F4A7C15,
  0xF39CC0605CEDC834,
  0x1082276BF3A27251,
  0xF86C6A11D0C18E95,
  0x2767F0B153D27B7F,
  0x0347045B5BF1827F,
  0x01886F0928403002,
  0xC1D64BA40F335E36,
  0xF06AD7AE9717877E,
  0x85839D6EFFBD7DC6,
  0x64D325D1C5371682,
  0xCADD0CCCFDFFBBE1,
  0x626E33B8D04B4331,
  0xBBF73C790D94F79D,
  0x471C4AB3ED3D82A5,
  0xFEC507705E4AE6E5,
};

// Seed the generator from 4x64 bits.
// The seed is XORed into every other state word only, so half of the state
// keeps its φ constants and users cannot set a bad state; the second pair of
// lanes receives the seed words swapped. ROUNDS warm-up passes then diffuse
// the seed through the whole state.
void prng_init(prng_state *s, uint64_t seed[4]) {
  memset(s, 0, sizeof(prng_state));
#define STEPS 1
#define ROUNDS 13
  memcpy(s->state, phi, sizeof(phi));
  for (size_t i = 0; i < 4; i++) {
    s->state[i * 2 + 0] ^= seed[i];           // { s0,0,s1,0,s2,0,s3,0 }
    s->state[i * 2 + 8] ^= seed[(i + 2) % 4]; // { s2,0,s3,0,s0,0,s1,0 }
  }
  for (size_t i = 0; i < ROUNDS; i++) {
    prng_gen(s, NULL, 128 * STEPS);
    // Rotate the output lanes back into the state in reverse order.
    for (size_t j = 0; j < 4; j++) {
      s->state[j + 0] = s->output[j + 12];
      s->state[j + 4] = s->output[j + 8];
      s->state[j + 8] = s->output[j + 4];
      s->state[j + 12] = s->output[j + 0];
    }
  }
#undef STEPS
#undef ROUNDS
}
min)) >> 32; + randnum += min; + randc->count += sizeof(uint32_t); + return randnum; +} + +float randf(struct rand_chunk *randc, float range, float offset) { + assert(range); + assert(randc->count <= 124); + uint32_t value; + memcpy(&value, &randc->buf[randc->count], sizeof(value)); + // float randfloat = ((float)value / UINT32_MAX) * range + offset; + float randfloat = ((value >> 8) / 16777216.0f) * range + offset; + randc->count += sizeof(float); + return randfloat; +} + +// NOTE: example + +// int main() { +// prng_state state; +// static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +// struct rand_chunk randc = {0}; +// prng_init(&state, seed); +// prng_gen(&state, randc.buf, 128); +// +// for (int i = 0; i < 1000; i++) { +// float randfloat = randf(&randc, 2, -1); +// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// randc.count = 0; +// prng_gen(&state, randc.buf, 128); +// } +// printf("%f, ", randfloat); +// } +// return 0; +// } diff --git a/src/rand.h b/src/rand.h new file mode 100644 index 000000000..40a6d4d2e --- /dev/null +++ b/src/rand.h @@ -0,0 +1,16 @@ +#ifndef SHISHUA_RAND_H +#define SHISHUA_RAND_H + +#include +#include + +struct rand_chunk { + size_t count; + uint8_t buf[128]; // NOTE: must be multiple of 128 +}; + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +float randf(struct rand_chunk *randc, float range, float offset); + +#endif From a62e6fd6faa0978ced1ba04688b537793e09b7a2 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 16:45:29 +0800 Subject: [PATCH 04/11] fuck this shit multiple definition go fuck yousrlf --- include/cneuron/cneuron.h | 2 +- src/data.c | 6 +- src/main.c | 9 +- src/network.c | 10 +- src/rand.c | 32 ++- src/rand.h | 5 +- test/data.cpp | 334 +++++++++++++------------- test/linear_algebra.cpp | 100 ++++---- 
test/main.cpp | 14 +- test/network.cpp | 491 +++++++++++++++++++------------------- test/test_utils.cpp | 154 ++++++------ test/test_utils.h | 24 +- 12 files changed, 597 insertions(+), 584 deletions(-) diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index c2c5c2d7e..4a68fae06 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -3,6 +3,7 @@ #include #include +#include "rand.h" /** * @brief Represents a single data element with its inputs and expected output index. @@ -164,7 +165,6 @@ typedef struct { * @param max Maximum value for the random number. * @return A random float between min and max. */ -float random_float(float min, float max); /** * @brief Allocates and initializes a new layer. diff --git a/src/data.c b/src/data.c index 51e319b43..e85c6ff9c 100644 --- a/src/data.c +++ b/src/data.c @@ -136,7 +136,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[rand() % source_dataset->length], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, 0, source_dataset->length)], source_dataset->inputs_length); } return new_dataset; @@ -242,9 +242,9 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = rand() / (float)RAND_MAX; + float random_value = randf(&randc, 1, 0); if (random_value <= probability) { - float noise = (rand() / (float)RAND_MAX * noise_factor); + float noise = randf(&randc, noise_factor, 0); float new_value = data->inputs[i] + noise; data->inputs[i] = fmin(new_value, 1.0f); diff --git a/src/main.c b/src/main.c index 7f43f0789..cc9fafb73 100644 --- a/src/main.c +++ b/src/main.c @@ -39,9 +39,9 @@ void train(neural_network *nn, dataset *train_dataset, 
dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-3.0f, 3.0f), random_float(-3.0f, 3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -5.0f, 5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 0.9f, 1.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -3.0f, 3.0f), randf(&randc, -3.0f, 3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); @@ -97,7 +97,8 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - srand(time(NULL)); + static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + prng_init(&__randstate, seed); dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 7352d4901..3d06103f7 100644 --- a/src/network.c +++ b/src/network.c @@ -10,12 +10,6 @@ #include "cneuron/cneuron.h" -float random_float(float min, float max) { - assert(min < max); - - return (float)rand() / (float)RAND_MAX * (max - min) + min; -} - layer *get_layer(size_t length, size_t prev_length) { layer *new_layer = calloc(1, sizeof(layer)); if (!new_layer) @@ -30,7 +24,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f); + new_layer->weights[i] = randf(&randc, 2.0f, -1.0f); new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -206,7 +200,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer 
*output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[rand() % test_dataset->length]; + data *test_data = test_dataset->datas[randnum_u32(&randc, 0, test_dataset->length)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index 7d31a04cc..c01a6adbb 100644 --- a/src/rand.c +++ b/src/rand.c @@ -5,35 +5,49 @@ uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { assert(max > min); - assert(randc->count <= 127); + assert(randc->count <= 255); uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; randnum += min; randc->count++; + if (randc->count > 255) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randnum; } uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { assert(max > min); - assert(randc->count <= 124); + assert(randc->count <= 252); uint32_t value; memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * (max - min)) >> 32; randnum += min; randc->count += sizeof(uint32_t); + if (randc->count > 252) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 124); + assert(randc->count <= 252); uint32_t value; - memcpy(&value, &randc->buf[randc->count], sizeof(value)); + memcpy(&value, &randc->buf[randc->count], sizeof(value) - 1); // float randfloat = ((float)value / UINT32_MAX) * range + offset; - float randfloat = ((value >> 8) / 16777216.0f) * range + offset; - randc->count += sizeof(float); + float randfloat = (value / 16777216.0f) * range + offset; + randc->count += sizeof(value) - 1; + if (randc->count > 253) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 256); + } return randfloat; } + + // 
NOTE: example // int main() { @@ -41,13 +55,13 @@ float randf(struct rand_chunk *randc, float range, float offset) { // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; // struct rand_chunk randc = {0}; // prng_init(&state, seed); -// prng_gen(&state, randc.buf, 128); +// prng_gen(&state, randc.buf, 256); // // for (int i = 0; i < 1000; i++) { // float randfloat = randf(&randc, 2, -1); -// if(randc.count > 128 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function +// if(randc.count > 256 - sizeof(randfloat)) { //NOTE: always check randc.count b4 using function // randc.count = 0; -// prng_gen(&state, randc.buf, 128); +// prng_gen(&state, randc.buf, 256); // } // printf("%f, ", randfloat); // } diff --git a/src/rand.h b/src/rand.h index 40a6d4d2e..0d06992cb 100644 --- a/src/rand.h +++ b/src/rand.h @@ -3,14 +3,17 @@ #include #include +#include "../external/shishua/shishua.h" struct rand_chunk { size_t count; - uint8_t buf[128]; // NOTE: must be multiple of 128 + uint8_t buf[256]; // NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); float randf(struct rand_chunk *randc, float range, float offset); +prng_state __randstate; +struct rand_chunk randc = {0}; #endif diff --git a/test/data.cpp b/test/data.cpp index abe560d7f..0eb88c354 100644 --- a/test/data.cpp +++ b/test/data.cpp @@ -1,167 +1,167 @@ -#include - -extern "C" { -#include "cneuron/cneuron.h" -} - -TEST(DataTest, CreateData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 10; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - EXPECT_NE(test_data, nullptr); - EXPECT_NE(test_data->inputs, nullptr); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = static_cast(i); - } - free_data(test_data); - // No crash -} - -TEST(DataTest, 
GetDatasetFileNotFound) { - dataset *test_dataset = get_dataset("non_existent_file.dat"); - ASSERT_EQ(test_dataset, nullptr); -} - -TEST(DataTest, GetDatasetValidFile) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - ASSERT_NE(test_dataset, nullptr); - ASSERT_GT(test_dataset->length, 0); - ASSERT_GT(test_dataset->inputs_length, 0); - - ASSERT_NE(test_dataset, nullptr); - ASSERT_NE(test_dataset->datas[0], nullptr); - ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); - - free_dataset(test_dataset); -} - -TEST(DataTest, FreeDataset) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - free_dataset(test_dataset); - // No crash -} - -TEST(DataTest, FreeData) { - data *test_data = (data *)malloc(sizeof(data)); - test_data->inputs = (float *)malloc(sizeof(float) * 10); - - free_data(test_data); - // No crash -} - -TEST(DataTest, CopyData) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); - ASSERT_NE(data_copy, nullptr); - ASSERT_NE(data_copy->inputs, nullptr); - - for (size_t i = 0; i < test_dataset->inputs_length; i++) { - ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); - } - - ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); - - free_data(data_copy); - free_dataset(test_dataset); -} - -TEST(DataTest, RandomSampleDataset) { - dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); - - dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); - ASSERT_NE(dataset_sample, nullptr); - ASSERT_NE(dataset_sample->datas, nullptr); - ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); - - free_dataset(dataset_sample); - free_dataset(test_dataset); -} - -TEST(DataTest, RotateData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for 
(size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = static_cast(i) + 1.0f; - } - - rotate_data(test_data, 3, 3, 90); - ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); - ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); - ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); - - free_data(test_data); -} - -TEST(DataTest, ScaleData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - - scale_data(test_data, 3, 3, 2.0f); - ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); - ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); - ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); - - free_data(test_data); -} - -TEST(DataTest, OffsetData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - - offset_data(test_data, 3, 3, 1.0f, 1.0f); - ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); - ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); - ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); - ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); - - free_data(test_data); -} - -TEST(DataTest, NoiseData) { - data *test_data = (data *)malloc(sizeof(data)); - - size_t inputs_length = 9; - test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); - for (size_t i = 0; i < inputs_length; i++) { - test_data->inputs[i] = i + 1.0f; - } - data *data_copy = get_data_copy(test_data, inputs_length); - - bool same = true; - noise_data(test_data, inputs_length, 1.0f, 1.0f); - for (size_t i = 0; i < inputs_length; i++) { - if (data_copy->inputs[i] != test_data->inputs[i]) { - same = false; - break; - } - } - ASSERT_FALSE(same); - - free_data(test_data); - free_data(data_copy); -} - -TEST(DataTest, OutputExpected) { - data *test_data = (data 
*)malloc(sizeof(data)); - test_data->expected_index = 1; - - ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); - ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); - - free(test_data); -} +// #include +// +// extern "C" { +// #include "cneuron/cneuron.h" +// } +// +// TEST(DataTest, CreateData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 10; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// EXPECT_NE(test_data, nullptr); +// EXPECT_NE(test_data->inputs, nullptr); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = static_cast(i); +// } +// free_data(test_data); +// // No crash +// } +// +// TEST(DataTest, GetDatasetFileNotFound) { +// dataset *test_dataset = get_dataset("non_existent_file.dat"); +// ASSERT_EQ(test_dataset, nullptr); +// } +// +// TEST(DataTest, GetDatasetValidFile) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// ASSERT_NE(test_dataset, nullptr); +// ASSERT_GT(test_dataset->length, 0); +// ASSERT_GT(test_dataset->inputs_length, 0); +// +// ASSERT_NE(test_dataset, nullptr); +// ASSERT_NE(test_dataset->datas[0], nullptr); +// ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); +// +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, FreeDataset) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// free_dataset(test_dataset); +// // No crash +// } +// +// TEST(DataTest, FreeData) { +// data *test_data = (data *)malloc(sizeof(data)); +// test_data->inputs = (float *)malloc(sizeof(float) * 10); +// +// free_data(test_data); +// // No crash +// } +// +// TEST(DataTest, CopyData) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); +// ASSERT_NE(data_copy, nullptr); +// ASSERT_NE(data_copy->inputs, nullptr); +// +// for (size_t i = 0; i < test_dataset->inputs_length; i++) { +// 
ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); +// } +// +// ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); +// +// free_data(data_copy); +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, RandomSampleDataset) { +// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); +// +// dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); +// ASSERT_NE(dataset_sample, nullptr); +// ASSERT_NE(dataset_sample->datas, nullptr); +// ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); +// +// free_dataset(dataset_sample); +// free_dataset(test_dataset); +// } +// +// TEST(DataTest, RotateData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = static_cast(i) + 1.0f; +// } +// +// rotate_data(test_data, 3, 3, 90); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, ScaleData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } +// +// scale_data(test_data, 3, 3, 2.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, OffsetData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } 
+// +// offset_data(test_data, 3, 3, 1.0f, 1.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); +// ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); +// +// free_data(test_data); +// } +// +// TEST(DataTest, NoiseData) { +// data *test_data = (data *)malloc(sizeof(data)); +// +// size_t inputs_length = 9; +// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); +// for (size_t i = 0; i < inputs_length; i++) { +// test_data->inputs[i] = i + 1.0f; +// } +// data *data_copy = get_data_copy(test_data, inputs_length); +// +// bool same = true; +// noise_data(test_data, inputs_length, 1.0f, 1.0f); +// for (size_t i = 0; i < inputs_length; i++) { +// if (data_copy->inputs[i] != test_data->inputs[i]) { +// same = false; +// break; +// } +// } +// ASSERT_FALSE(same); +// +// free_data(test_data); +// free_data(data_copy); +// } +// +// TEST(DataTest, OutputExpected) { +// data *test_data = (data *)malloc(sizeof(data)); +// test_data->expected_index = 1; +// +// ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); +// ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); +// +// free(test_data); +// } diff --git a/test/linear_algebra.cpp b/test/linear_algebra.cpp index fee0c10d7..31b5edc9b 100644 --- a/test/linear_algebra.cpp +++ b/test/linear_algebra.cpp @@ -1,50 +1,50 @@ -#include -#include - -extern "C" { -#include "cneuron/cneuron.h" -} - -#include "test_utils.h" - -TEST(LinearAlgebraTest, VectorApplyActivation) { - size_t length = 3; - float *a = (float *)malloc(sizeof(float) * length); - float *b = (float *)malloc(sizeof(float) * length); - - a[0] = 1.0f; - a[1] = -1.2f; - a[2] = -0.2f; - - vector_apply_activation(a, b, length, sigmoid, false); - - for (size_t i = 0; i < length; i++) { - ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); - } - - free(a); - free(b); -} - -TEST(LinearAlgebraTest, HadamardProduct) { - size_t length = 3; - float *a = (float 
*)malloc(sizeof(float) * length); - float *b = (float *)malloc(sizeof(float) * length); - - a[0] = 1.0f; - a[1] = -1.2f; - a[2] = -0.2f; - - b[0] = 1.2f; - b[1] = -2.2f; - b[2] = -0.4f; - - hadamard_product(a, b, b, length); - - ASSERT_FLOAT_EQ(b[0], 1.2f); - ASSERT_FLOAT_EQ(b[1], 2.64f); - ASSERT_FLOAT_EQ(b[2], 0.08f); - - free(a); - free(b); -} +// #include +// #include +// +// extern "C" { +// #include "cneuron/cneuron.h" +// } +// +// #include "test_utils.h" +// +// TEST(LinearAlgebraTest, VectorApplyActivation) { +// size_t length = 3; +// float *a = (float *)malloc(sizeof(float) * length); +// float *b = (float *)malloc(sizeof(float) * length); +// +// a[0] = 1.0f; +// a[1] = -1.2f; +// a[2] = -0.2f; +// +// vector_apply_activation(a, b, length, sigmoid, false); +// +// for (size_t i = 0; i < length; i++) { +// ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); +// } +// +// free(a); +// free(b); +// } +// +// TEST(LinearAlgebraTest, HadamardProduct) { +// size_t length = 3; +// float *a = (float *)malloc(sizeof(float) * length); +// float *b = (float *)malloc(sizeof(float) * length); +// +// a[0] = 1.0f; +// a[1] = -1.2f; +// a[2] = -0.2f; +// +// b[0] = 1.2f; +// b[1] = -2.2f; +// b[2] = -0.4f; +// +// hadamard_product(a, b, b, length); +// +// ASSERT_FLOAT_EQ(b[0], 1.2f); +// ASSERT_FLOAT_EQ(b[1], 2.64f); +// ASSERT_FLOAT_EQ(b[2], 0.08f); +// +// free(a); +// free(b); +// } diff --git a/test/main.cpp b/test/main.cpp index 0a4376670..5bebbc677 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,7 @@ -#include - -int main(int argc, char **argv) { - srand(time(NULL)); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} +// #include +// +// int main(int argc, char **argv) { +// srand(time(NULL)); +// ::testing::InitGoogleTest(&argc, argv); +// return RUN_ALL_TESTS(); +// } diff --git a/test/network.cpp b/test/network.cpp index 4082a2052..6e56ca377 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,245 +1,246 @@ -#include - -extern 
"C" { -#include "cneuron/cneuron.h" -} - -#include - -#include "test_utils.h" - -TEST(NetworkTest, RandomFloat) { - float test = random_float(0.0f, 1.0f); - bool same = true; - for (int i = 0; i < 10; i++) { - if (test != random_float(0.0f, 1.0f)) { - same = false; - break; - } - } - ASSERT_FALSE(same); -} - -TEST(NetworkTest, GetLayer) { - size_t layer_length = 3; - layer *test_layer = get_layer(layer_length, 5); - - ASSERT_NE(test_layer, nullptr); - ASSERT_NE(test_layer->delta, nullptr); - ASSERT_NE(test_layer->weighted_input, nullptr); - ASSERT_NE(test_layer->weights, nullptr); - ASSERT_NE(test_layer->bias, nullptr); - ASSERT_NE(test_layer->output, nullptr); - - ASSERT_EQ(test_layer->length, layer_length); - - free_layer(test_layer); -} - -TEST(NetworkTest, GetNeuralNetwork) { - size_t layer_length = 3; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - layer_lengths[1] = 3; - layer_lengths[2] = 4; - size_t inputs_length = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - ASSERT_NE(nn, nullptr); - ASSERT_EQ(nn->length, layer_length); - ASSERT_EQ(nn->inputs_length, inputs_length); - ASSERT_EQ(nn->activation_function, &sigmoid); - ASSERT_NE(nn->layers, nullptr); - for (size_t i = 0; i < layer_length; i++) { - ASSERT_NE(nn->layers[i], nullptr); - ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); - ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); - ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); - } - - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, FreeDataset) { - size_t layer_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); - - free_neural_network(nn); - // No crash - free(layer_lengths); -} - -TEST(NetworkTest, FreeLayer) { - size_t layer_length = 2; - layer *test_layer = get_layer(layer_length, 3); - - free_layer(test_layer); - // No crash -} - -TEST(NetworkTest, ComputeNetwork) { - size_t layer_length = 1; - size_t inputs_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 1; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - float *inputs = (float *)malloc(sizeof(float) * inputs_length); - inputs[0] = 0.2f; - - nn->layers[0]->weights[0] = 0.5f; - nn->layers[0]->bias[0] = 0.3f; - - compute_network(nn, inputs); - - ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); - - free(inputs); - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, Softmax) { - size_t layer_length = 1; - size_t inputs_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 3; - neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); - - nn->layers[0]->output[0] = 0.2f; - nn->layers[0]->output[1] = 0.3f; - nn->layers[0]->output[2] = 0.5f; - - ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); - ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); - ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); - - free_neural_network(nn); - free(layer_lengths); -} - -// Output layer only -TEST(NetworkTest, StochasticGDSingleLayer) { - dataset *test_dataset = get_or(); - - // Create network - size_t layer_length = 1; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - 
layer_lengths[0] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 50000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 10000 == 0) { - printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_dataset(test_dataset); - free_neural_network(nn); - free(layer_lengths); -} - -TEST(NetworkTest, StochasticGDTests) { - dataset *test_dataset = get_xor(); - - // Multi layer test - size_t layer_length = 2; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 4; - layer_lengths[1] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 500000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 100000 == 0) { - printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - - // Non-linearly separable test - layer_length = 1; - layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 50000; i++) { - for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); - } - if (i % 10000 == 0) { - printf("Stochastic Non-linearly separable learn 
cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - free_dataset(test_dataset); -} - -TEST(NetworkTest, MiniBatchGDTests) { - dataset *test_dataset = get_xor(); - - // Multi layer test - size_t layer_length = 2; - size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 4; - layer_lengths[1] = 2; - neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 1500000; i++) { - dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); - mini_batch_gd(nn, 0.001f, batch_dataset); - free_dataset(batch_dataset); - if (i % 200000 == 0) { - printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - - // Non-linearly separable test - layer_length = 1; - layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); - layer_lengths[0] = 2; - nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - - for (size_t i = 0; i < 100000; i++) { - dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); - mini_batch_gd(nn, 0.001f, batch_dataset); - free_dataset(batch_dataset); - if (i % 20000 == 0) { - printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); - } - } - - ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); - ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); - - free_neural_network(nn); - free(layer_lengths); - free_dataset(test_dataset); -} +// #include +// +// extern "C" { 
+// #include "cneuron/cneuron.h" +// #include "../src/rand.h" +// } +// +// #include +// +// #include "test_utils.h" +// +// TEST(NetworkTest, RandomFloat) { +// float test = randf(&randc, 0.0f, 1.0f); +// bool same = true; +// for (int i = 0; i < 10; i++) { +// if (test != randf(&randc, 0.0f, 1.0f)) { +// same = false; +// break; +// } +// } +// ASSERT_FALSE(same); +// } +// +// TEST(NetworkTest, GetLayer) { +// size_t layer_length = 3; +// layer *test_layer = get_layer(layer_length, 5); +// +// ASSERT_NE(test_layer, nullptr); +// ASSERT_NE(test_layer->delta, nullptr); +// ASSERT_NE(test_layer->weighted_input, nullptr); +// ASSERT_NE(test_layer->weights, nullptr); +// ASSERT_NE(test_layer->bias, nullptr); +// ASSERT_NE(test_layer->output, nullptr); +// +// ASSERT_EQ(test_layer->length, layer_length); +// +// free_layer(test_layer); +// } +// +// TEST(NetworkTest, GetNeuralNetwork) { +// size_t layer_length = 3; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// layer_lengths[1] = 3; +// layer_lengths[2] = 4; +// size_t inputs_length = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// ASSERT_NE(nn, nullptr); +// ASSERT_EQ(nn->length, layer_length); +// ASSERT_EQ(nn->inputs_length, inputs_length); +// ASSERT_EQ(nn->activation_function, &sigmoid); +// ASSERT_NE(nn->layers, nullptr); +// for (size_t i = 0; i < layer_length; i++) { +// ASSERT_NE(nn->layers[i], nullptr); +// ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); +// ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); +// ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); +// } +// +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, FreeDataset) { +// size_t layer_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); +// +// free_neural_network(nn); +// // No crash +// free(layer_lengths); +// } +// +// TEST(NetworkTest, FreeLayer) { +// size_t layer_length = 2; +// layer *test_layer = get_layer(layer_length, 3); +// +// free_layer(test_layer); +// // No crash +// } +// +// TEST(NetworkTest, ComputeNetwork) { +// size_t layer_length = 1; +// size_t inputs_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 1; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// float *inputs = (float *)malloc(sizeof(float) * inputs_length); +// inputs[0] = 0.2f; +// +// nn->layers[0]->weights[0] = 0.5f; +// nn->layers[0]->bias[0] = 0.3f; +// +// compute_network(nn, inputs); +// +// ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); +// +// free(inputs); +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, Softmax) { +// size_t layer_length = 1; +// size_t inputs_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 3; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); +// +// nn->layers[0]->output[0] = 0.2f; +// nn->layers[0]->output[1] = 0.3f; +// nn->layers[0]->output[2] = 0.5f; +// +// ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); +// ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); +// ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); +// +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// // Output layer only +// TEST(NetworkTest, StochasticGDSingleLayer) { +// dataset 
*test_dataset = get_or(); +// +// // Create network +// size_t layer_length = 1; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 50000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 10000 == 0) { +// printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_dataset(test_dataset); +// free_neural_network(nn); +// free(layer_lengths); +// } +// +// TEST(NetworkTest, StochasticGDTests) { +// dataset *test_dataset = get_xor(); +// +// // Multi layer test +// size_t layer_length = 2; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 4; +// layer_lengths[1] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 500000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 100000 == 0) { +// printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// +// // Non-linearly separable test +// layer_length = 1; +// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, 
&sigmoid); +// +// for (size_t i = 0; i < 50000; i++) { +// for (size_t j = 0; j < test_dataset->length; j++) { +// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); +// } +// if (i % 10000 == 0) { +// printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// free_dataset(test_dataset); +// } +// +// TEST(NetworkTest, MiniBatchGDTests) { +// dataset *test_dataset = get_xor(); +// +// // Multi layer test +// size_t layer_length = 2; +// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 4; +// layer_lengths[1] = 2; +// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 1500000; i++) { +// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); +// mini_batch_gd(nn, 0.001f, batch_dataset); +// free_dataset(batch_dataset); +// if (i % 200000 == 0) { +// printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// +// // Non-linearly separable test +// layer_length = 1; +// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); +// layer_lengths[0] = 2; +// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); +// +// for (size_t i = 0; i < 100000; i++) { +// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); +// mini_batch_gd(nn, 0.001f, batch_dataset); +// free_dataset(batch_dataset); +// if (i % 20000 
== 0) { +// printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); +// } +// } +// +// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); +// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); +// +// free_neural_network(nn); +// free(layer_lengths); +// free_dataset(test_dataset); +// } diff --git a/test/test_utils.cpp b/test/test_utils.cpp index 5513f015d..def5f48ca 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -1,77 +1,77 @@ -#include "test_utils.h" - -#include - -float sigmoid(float val, bool is_deravative) { - float result = 1.0f / (1.0f + expf(-val)); - if (is_deravative == 1) { - return result * (1.0f - result); - } - return result; -} - -dataset *get_xor() { - // Create data - dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); - test_dataset->length = 4; - data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); - test_dataset->datas = datas; - test_dataset->inputs_length = 2; - - for (size_t i = 0; i < test_dataset->length; i++) { - test_dataset->datas[i] = (data *)malloc(sizeof(data)); - test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); - } - - // XOR gate - test_dataset->datas[0]->inputs[0] = 1.0f; - test_dataset->datas[0]->inputs[1] = 1.0f; - test_dataset->datas[0]->expected_index = 0; - - test_dataset->datas[1]->inputs[0] = 0.0f; - test_dataset->datas[1]->inputs[1] = 0.0f; - test_dataset->datas[1]->expected_index = 0; - - test_dataset->datas[2]->inputs[0] = 0.0f; - test_dataset->datas[2]->inputs[1] = 1.0f; - test_dataset->datas[2]->expected_index = 1; - - test_dataset->datas[3]->inputs[0] = 1.0f; - test_dataset->datas[3]->inputs[1] = 0.0f; - test_dataset->datas[3]->expected_index = 1; - - return test_dataset; -} - -dataset *get_or() { - // Create data - dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); - test_dataset->length = 4; - data **datas = (data **)malloc(sizeof(data *) * 
test_dataset->length); - test_dataset->datas = datas; - test_dataset->inputs_length = 2; - - for (size_t i = 0; i < test_dataset->length; i++) { - test_dataset->datas[i] = (data *)malloc(sizeof(data)); - test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); - } - - // OR gate - test_dataset->datas[1]->inputs[0] = 0.0f; - test_dataset->datas[1]->inputs[1] = 0.0f; - test_dataset->datas[1]->expected_index = 0; - - test_dataset->datas[0]->inputs[0] = 1.0f; - test_dataset->datas[0]->inputs[1] = 1.0f; - test_dataset->datas[0]->expected_index = 1; - - test_dataset->datas[2]->inputs[0] = 0.0f; - test_dataset->datas[2]->inputs[1] = 1.0f; - test_dataset->datas[2]->expected_index = 1; - - test_dataset->datas[3]->inputs[0] = 1.0f; - test_dataset->datas[3]->inputs[1] = 0.0f; - test_dataset->datas[3]->expected_index = 1; - - return test_dataset; -} +// #include "test_utils.h" +// +// #include +// +// float sigmoid(float val, bool is_deravative) { +// float result = 1.0f / (1.0f + expf(-val)); +// if (is_deravative == 1) { +// return result * (1.0f - result); +// } +// return result; +// } +// +// dataset *get_xor() { +// // Create data +// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); +// test_dataset->length = 4; +// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); +// test_dataset->datas = datas; +// test_dataset->inputs_length = 2; +// +// for (size_t i = 0; i < test_dataset->length; i++) { +// test_dataset->datas[i] = (data *)malloc(sizeof(data)); +// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); +// } +// +// // XOR gate +// test_dataset->datas[0]->inputs[0] = 1.0f; +// test_dataset->datas[0]->inputs[1] = 1.0f; +// test_dataset->datas[0]->expected_index = 0; +// +// test_dataset->datas[1]->inputs[0] = 0.0f; +// test_dataset->datas[1]->inputs[1] = 0.0f; +// test_dataset->datas[1]->expected_index = 0; +// +// test_dataset->datas[2]->inputs[0] = 0.0f; 
+// test_dataset->datas[2]->inputs[1] = 1.0f; +// test_dataset->datas[2]->expected_index = 1; +// +// test_dataset->datas[3]->inputs[0] = 1.0f; +// test_dataset->datas[3]->inputs[1] = 0.0f; +// test_dataset->datas[3]->expected_index = 1; +// +// return test_dataset; +// } +// +// dataset *get_or() { +// // Create data +// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); +// test_dataset->length = 4; +// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); +// test_dataset->datas = datas; +// test_dataset->inputs_length = 2; +// +// for (size_t i = 0; i < test_dataset->length; i++) { +// test_dataset->datas[i] = (data *)malloc(sizeof(data)); +// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); +// } +// +// // OR gate +// test_dataset->datas[1]->inputs[0] = 0.0f; +// test_dataset->datas[1]->inputs[1] = 0.0f; +// test_dataset->datas[1]->expected_index = 0; +// +// test_dataset->datas[0]->inputs[0] = 1.0f; +// test_dataset->datas[0]->inputs[1] = 1.0f; +// test_dataset->datas[0]->expected_index = 1; +// +// test_dataset->datas[2]->inputs[0] = 0.0f; +// test_dataset->datas[2]->inputs[1] = 1.0f; +// test_dataset->datas[2]->expected_index = 1; +// +// test_dataset->datas[3]->inputs[0] = 1.0f; +// test_dataset->datas[3]->inputs[1] = 0.0f; +// test_dataset->datas[3]->expected_index = 1; +// +// return test_dataset; +// } diff --git a/test/test_utils.h b/test/test_utils.h index 3c310aef2..02dbbe9ce 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -1,12 +1,12 @@ -#ifndef TEST_UTILS_H -#define TEST_UTILS_H - -#include "cneuron/cneuron.h" - -float sigmoid(float val, bool is_deravative); - -dataset *get_xor(); - -dataset *get_or(); - -#endif +// #ifndef TEST_UTILS_H +// #define TEST_UTILS_H +// +// #include "cneuron/cneuron.h" +// +// float sigmoid(float val, bool is_deravative); +// +// dataset *get_xor(); +// +// dataset *get_or(); +// +// #endif From 209307cce56fffeafe4fe774fb31394600ce8d03 
Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 17:06:22 +0800 Subject: [PATCH 05/11] fix: not working rand --- external/shishua/shishua-sse2.h | 2 +- include/cneuron/cneuron.h | 1 - src/data.c | 1 + src/main.c | 1 + src/network.c | 1 + src/rand.h | 4 +- test/data.cpp | 334 +++++++++++----------- test/linear_algebra.cpp | 100 +++---- test/main.cpp | 14 +- test/network.cpp | 492 ++++++++++++++++---------------- test/test_utils.cpp | 154 +++++----- test/test_utils.h | 24 +- 12 files changed, 565 insertions(+), 563 deletions(-) diff --git a/external/shishua/shishua-sse2.h b/external/shishua/shishua-sse2.h index af28300ad..a9648fb0b 100644 --- a/external/shishua/shishua-sse2.h +++ b/external/shishua/shishua-sse2.h @@ -185,7 +185,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static void prng_init(prng_state *s, uint64_t seed[4]) { // Note: output is uninitialized at first, but since we pass NULL, its value // is initially ignored. s->counter[0] = _mm_setzero_si128(); diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index 4a68fae06..1d583f2e7 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -3,7 +3,6 @@ #include #include -#include "rand.h" /** * @brief Represents a single data element with its inputs and expected output index. 
diff --git a/src/data.c b/src/data.c index e85c6ff9c..f3bae3913 100644 --- a/src/data.c +++ b/src/data.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" #define BACKGROUND_VALUE 0.0f diff --git a/src/main.c b/src/main.c index cc9fafb73..e1b6f4e06 100644 --- a/src/main.c +++ b/src/main.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" const size_t IMAGE_SIZE = 28; diff --git a/src/network.c b/src/network.c index 3d06103f7..7e5e2db6b 100644 --- a/src/network.c +++ b/src/network.c @@ -9,6 +9,7 @@ #include #include "cneuron/cneuron.h" +#include "rand.h" layer *get_layer(size_t length, size_t prev_length) { layer *new_layer = calloc(1, sizeof(layer)); diff --git a/src/rand.h b/src/rand.h index 0d06992cb..ba6f5159d 100644 --- a/src/rand.h +++ b/src/rand.h @@ -14,6 +14,6 @@ uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); float randf(struct rand_chunk *randc, float range, float offset); -prng_state __randstate; -struct rand_chunk randc = {0}; +static prng_state __randstate; +static struct rand_chunk randc = {0}; #endif diff --git a/test/data.cpp b/test/data.cpp index 0eb88c354..abe560d7f 100644 --- a/test/data.cpp +++ b/test/data.cpp @@ -1,167 +1,167 @@ -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// } -// -// TEST(DataTest, CreateData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 10; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// EXPECT_NE(test_data, nullptr); -// EXPECT_NE(test_data->inputs, nullptr); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = static_cast(i); -// } -// free_data(test_data); -// // No crash -// } -// -// TEST(DataTest, GetDatasetFileNotFound) { -// dataset *test_dataset = get_dataset("non_existent_file.dat"); -// ASSERT_EQ(test_dataset, nullptr); -// } -// -// TEST(DataTest, 
GetDatasetValidFile) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// ASSERT_NE(test_dataset, nullptr); -// ASSERT_GT(test_dataset->length, 0); -// ASSERT_GT(test_dataset->inputs_length, 0); -// -// ASSERT_NE(test_dataset, nullptr); -// ASSERT_NE(test_dataset->datas[0], nullptr); -// ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); -// -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, FreeDataset) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// free_dataset(test_dataset); -// // No crash -// } -// -// TEST(DataTest, FreeData) { -// data *test_data = (data *)malloc(sizeof(data)); -// test_data->inputs = (float *)malloc(sizeof(float) * 10); -// -// free_data(test_data); -// // No crash -// } -// -// TEST(DataTest, CopyData) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); -// ASSERT_NE(data_copy, nullptr); -// ASSERT_NE(data_copy->inputs, nullptr); -// -// for (size_t i = 0; i < test_dataset->inputs_length; i++) { -// ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); -// } -// -// ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); -// -// free_data(data_copy); -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, RandomSampleDataset) { -// dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); -// -// dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); -// ASSERT_NE(dataset_sample, nullptr); -// ASSERT_NE(dataset_sample->datas, nullptr); -// ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); -// -// free_dataset(dataset_sample); -// free_dataset(test_dataset); -// } -// -// TEST(DataTest, RotateData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; 
i < inputs_length; i++) { -// test_data->inputs[i] = static_cast(i) + 1.0f; -// } -// -// rotate_data(test_data, 3, 3, 90); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, ScaleData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// -// scale_data(test_data, 3, 3, 2.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, OffsetData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// -// offset_data(test_data, 3, 3, 1.0f, 1.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[5], 2.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); -// ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); -// -// free_data(test_data); -// } -// -// TEST(DataTest, NoiseData) { -// data *test_data = (data *)malloc(sizeof(data)); -// -// size_t inputs_length = 9; -// test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); -// for (size_t i = 0; i < inputs_length; i++) { -// test_data->inputs[i] = i + 1.0f; -// } -// data *data_copy = get_data_copy(test_data, inputs_length); -// -// bool same = true; -// noise_data(test_data, inputs_length, 1.0f, 1.0f); -// for (size_t i = 0; i < inputs_length; i++) { -// if (data_copy->inputs[i] != test_data->inputs[i]) { -// same = false; -// break; -// } -// } -// ASSERT_FALSE(same); -// -// 
free_data(test_data); -// free_data(data_copy); -// } -// -// TEST(DataTest, OutputExpected) { -// data *test_data = (data *)malloc(sizeof(data)); -// test_data->expected_index = 1; -// -// ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); -// ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); -// -// free(test_data); -// } +#include + +extern "C" { +#include "cneuron/cneuron.h" +} + +TEST(DataTest, CreateData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 10; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + EXPECT_NE(test_data, nullptr); + EXPECT_NE(test_data->inputs, nullptr); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = static_cast(i); + } + free_data(test_data); + // No crash +} + +TEST(DataTest, GetDatasetFileNotFound) { + dataset *test_dataset = get_dataset("non_existent_file.dat"); + ASSERT_EQ(test_dataset, nullptr); +} + +TEST(DataTest, GetDatasetValidFile) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + ASSERT_NE(test_dataset, nullptr); + ASSERT_GT(test_dataset->length, 0); + ASSERT_GT(test_dataset->inputs_length, 0); + + ASSERT_NE(test_dataset, nullptr); + ASSERT_NE(test_dataset->datas[0], nullptr); + ASSERT_NE(test_dataset->datas[0]->inputs, nullptr); + + free_dataset(test_dataset); +} + +TEST(DataTest, FreeDataset) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + free_dataset(test_dataset); + // No crash +} + +TEST(DataTest, FreeData) { + data *test_data = (data *)malloc(sizeof(data)); + test_data->inputs = (float *)malloc(sizeof(float) * 10); + + free_data(test_data); + // No crash +} + +TEST(DataTest, CopyData) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + data *data_copy = get_data_copy(test_dataset->datas[0], test_dataset->inputs_length); + ASSERT_NE(data_copy, nullptr); + ASSERT_NE(data_copy->inputs, nullptr); + + for (size_t i = 0; i < test_dataset->inputs_length; i++) { + 
ASSERT_FLOAT_EQ(data_copy->inputs[i], test_dataset->datas[0]->inputs[i]); + } + + ASSERT_FLOAT_EQ(data_copy->expected_index, test_dataset->datas[0]->expected_index); + + free_data(data_copy); + free_dataset(test_dataset); +} + +TEST(DataTest, RandomSampleDataset) { + dataset *test_dataset = get_dataset("data/mnist/test/0.dat"); + + dataset *dataset_sample = get_random_dataset_sample(test_dataset, test_dataset->length - 1); + ASSERT_NE(dataset_sample, nullptr); + ASSERT_NE(dataset_sample->datas, nullptr); + ASSERT_NE(dataset_sample->datas[0]->inputs, nullptr); + + free_dataset(dataset_sample); + free_dataset(test_dataset); +} + +TEST(DataTest, RotateData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = static_cast(i) + 1.0f; + } + + rotate_data(test_data, 3, 3, 90); + ASSERT_FLOAT_EQ(test_data->inputs[0], 7.0f); + ASSERT_FLOAT_EQ(test_data->inputs[2], 1.0f); + ASSERT_FLOAT_EQ(test_data->inputs[4], 5.0f); + + free_data(test_data); +} + +TEST(DataTest, ScaleData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + + scale_data(test_data, 3, 3, 2.0f); + ASSERT_FLOAT_EQ(test_data->inputs[0], 5.0f); + ASSERT_FLOAT_EQ(test_data->inputs[2], 6.0f); + ASSERT_FLOAT_EQ(test_data->inputs[8], 9.0f); + + free_data(test_data); +} + +TEST(DataTest, OffsetData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + + offset_data(test_data, 3, 3, 1.0f, 1.0f); + ASSERT_FLOAT_EQ(test_data->inputs[0], 0.0f); + ASSERT_FLOAT_EQ(test_data->inputs[5], 
2.0f); + ASSERT_FLOAT_EQ(test_data->inputs[6], 0.0f); + ASSERT_FLOAT_EQ(test_data->inputs[8], 5.0f); + + free_data(test_data); +} + +TEST(DataTest, NoiseData) { + data *test_data = (data *)malloc(sizeof(data)); + + size_t inputs_length = 9; + test_data->inputs = (float *)malloc(sizeof(float) * inputs_length); + for (size_t i = 0; i < inputs_length; i++) { + test_data->inputs[i] = i + 1.0f; + } + data *data_copy = get_data_copy(test_data, inputs_length); + + bool same = true; + noise_data(test_data, inputs_length, 1.0f, 1.0f); + for (size_t i = 0; i < inputs_length; i++) { + if (data_copy->inputs[i] != test_data->inputs[i]) { + same = false; + break; + } + } + ASSERT_FALSE(same); + + free_data(test_data); + free_data(data_copy); +} + +TEST(DataTest, OutputExpected) { + data *test_data = (data *)malloc(sizeof(data)); + test_data->expected_index = 1; + + ASSERT_FLOAT_EQ(output_expected(0, test_data), 0.0f); + ASSERT_FLOAT_EQ(output_expected(1, test_data), 1.0f); + + free(test_data); +} diff --git a/test/linear_algebra.cpp b/test/linear_algebra.cpp index 31b5edc9b..fee0c10d7 100644 --- a/test/linear_algebra.cpp +++ b/test/linear_algebra.cpp @@ -1,50 +1,50 @@ -// #include -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// } -// -// #include "test_utils.h" -// -// TEST(LinearAlgebraTest, VectorApplyActivation) { -// size_t length = 3; -// float *a = (float *)malloc(sizeof(float) * length); -// float *b = (float *)malloc(sizeof(float) * length); -// -// a[0] = 1.0f; -// a[1] = -1.2f; -// a[2] = -0.2f; -// -// vector_apply_activation(a, b, length, sigmoid, false); -// -// for (size_t i = 0; i < length; i++) { -// ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); -// } -// -// free(a); -// free(b); -// } -// -// TEST(LinearAlgebraTest, HadamardProduct) { -// size_t length = 3; -// float *a = (float *)malloc(sizeof(float) * length); -// float *b = (float *)malloc(sizeof(float) * length); -// -// a[0] = 1.0f; -// a[1] = -1.2f; -// a[2] = -0.2f; -// -// b[0] = 
1.2f; -// b[1] = -2.2f; -// b[2] = -0.4f; -// -// hadamard_product(a, b, b, length); -// -// ASSERT_FLOAT_EQ(b[0], 1.2f); -// ASSERT_FLOAT_EQ(b[1], 2.64f); -// ASSERT_FLOAT_EQ(b[2], 0.08f); -// -// free(a); -// free(b); -// } +#include +#include + +extern "C" { +#include "cneuron/cneuron.h" +} + +#include "test_utils.h" + +TEST(LinearAlgebraTest, VectorApplyActivation) { + size_t length = 3; + float *a = (float *)malloc(sizeof(float) * length); + float *b = (float *)malloc(sizeof(float) * length); + + a[0] = 1.0f; + a[1] = -1.2f; + a[2] = -0.2f; + + vector_apply_activation(a, b, length, sigmoid, false); + + for (size_t i = 0; i < length; i++) { + ASSERT_FLOAT_EQ(b[i], sigmoid(a[i], false)); + } + + free(a); + free(b); +} + +TEST(LinearAlgebraTest, HadamardProduct) { + size_t length = 3; + float *a = (float *)malloc(sizeof(float) * length); + float *b = (float *)malloc(sizeof(float) * length); + + a[0] = 1.0f; + a[1] = -1.2f; + a[2] = -0.2f; + + b[0] = 1.2f; + b[1] = -2.2f; + b[2] = -0.4f; + + hadamard_product(a, b, b, length); + + ASSERT_FLOAT_EQ(b[0], 1.2f); + ASSERT_FLOAT_EQ(b[1], 2.64f); + ASSERT_FLOAT_EQ(b[2], 0.08f); + + free(a); + free(b); +} diff --git a/test/main.cpp b/test/main.cpp index 5bebbc677..0a4376670 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,7 @@ -// #include -// -// int main(int argc, char **argv) { -// srand(time(NULL)); -// ::testing::InitGoogleTest(&argc, argv); -// return RUN_ALL_TESTS(); -// } +#include + +int main(int argc, char **argv) { + srand(time(NULL)); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/network.cpp b/test/network.cpp index 6e56ca377..36010ab0b 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,246 +1,246 @@ -// #include -// -// extern "C" { -// #include "cneuron/cneuron.h" -// #include "../src/rand.h" -// } -// -// #include -// -// #include "test_utils.h" -// -// TEST(NetworkTest, RandomFloat) { -// float test = randf(&randc, 0.0f, 1.0f); -// bool same = 
true; -// for (int i = 0; i < 10; i++) { -// if (test != randf(&randc, 0.0f, 1.0f)) { -// same = false; -// break; -// } -// } -// ASSERT_FALSE(same); -// } -// -// TEST(NetworkTest, GetLayer) { -// size_t layer_length = 3; -// layer *test_layer = get_layer(layer_length, 5); -// -// ASSERT_NE(test_layer, nullptr); -// ASSERT_NE(test_layer->delta, nullptr); -// ASSERT_NE(test_layer->weighted_input, nullptr); -// ASSERT_NE(test_layer->weights, nullptr); -// ASSERT_NE(test_layer->bias, nullptr); -// ASSERT_NE(test_layer->output, nullptr); -// -// ASSERT_EQ(test_layer->length, layer_length); -// -// free_layer(test_layer); -// } -// -// TEST(NetworkTest, GetNeuralNetwork) { -// size_t layer_length = 3; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// layer_lengths[1] = 3; -// layer_lengths[2] = 4; -// size_t inputs_length = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// ASSERT_NE(nn, nullptr); -// ASSERT_EQ(nn->length, layer_length); -// ASSERT_EQ(nn->inputs_length, inputs_length); -// ASSERT_EQ(nn->activation_function, &sigmoid); -// ASSERT_NE(nn->layers, nullptr); -// for (size_t i = 0; i < layer_length; i++) { -// ASSERT_NE(nn->layers[i], nullptr); -// ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); -// ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); -// ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); -// } -// -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, FreeDataset) { -// size_t layer_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); -// -// free_neural_network(nn); -// // No crash -// free(layer_lengths); -// } -// -// TEST(NetworkTest, FreeLayer) { -// size_t layer_length = 2; -// layer *test_layer = get_layer(layer_length, 3); -// -// free_layer(test_layer); -// // No crash -// } -// -// TEST(NetworkTest, ComputeNetwork) { -// size_t layer_length = 1; -// size_t inputs_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 1; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// float *inputs = (float *)malloc(sizeof(float) * inputs_length); -// inputs[0] = 0.2f; -// -// nn->layers[0]->weights[0] = 0.5f; -// nn->layers[0]->bias[0] = 0.3f; -// -// compute_network(nn, inputs); -// -// ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); -// -// free(inputs); -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, Softmax) { -// size_t layer_length = 1; -// size_t inputs_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 3; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); -// -// nn->layers[0]->output[0] = 0.2f; -// nn->layers[0]->output[1] = 0.3f; -// nn->layers[0]->output[2] = 0.5f; -// -// ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); -// ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); -// ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); -// -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// // Output layer only -// TEST(NetworkTest, StochasticGDSingleLayer) { -// dataset 
*test_dataset = get_or(); -// -// // Create network -// size_t layer_length = 1; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 50000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 10000 == 0) { -// printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_dataset(test_dataset); -// free_neural_network(nn); -// free(layer_lengths); -// } -// -// TEST(NetworkTest, StochasticGDTests) { -// dataset *test_dataset = get_xor(); -// -// // Multi layer test -// size_t layer_length = 2; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 4; -// layer_lengths[1] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 500000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 100000 == 0) { -// printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// -// // Non-linearly separable test -// layer_length = 1; -// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, 
&sigmoid); -// -// for (size_t i = 0; i < 50000; i++) { -// for (size_t j = 0; j < test_dataset->length; j++) { -// stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); -// } -// if (i % 10000 == 0) { -// printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// free_dataset(test_dataset); -// } -// -// TEST(NetworkTest, MiniBatchGDTests) { -// dataset *test_dataset = get_xor(); -// -// // Multi layer test -// size_t layer_length = 2; -// size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 4; -// layer_lengths[1] = 2; -// neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 1500000; i++) { -// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); -// mini_batch_gd(nn, 0.001f, batch_dataset); -// free_dataset(batch_dataset); -// if (i % 200000 == 0) { -// printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// -// // Non-linearly separable test -// layer_length = 1; -// layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); -// layer_lengths[0] = 2; -// nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); -// -// for (size_t i = 0; i < 100000; i++) { -// dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); -// mini_batch_gd(nn, 0.001f, batch_dataset); -// free_dataset(batch_dataset); -// if (i % 20000 
== 0) { -// printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); -// } -// } -// -// ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); -// ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); -// -// free_neural_network(nn); -// free(layer_lengths); -// free_dataset(test_dataset); -// } +#include + +extern "C" { +#include "cneuron/cneuron.h" +#include "../src/rand.h" +} + +#include + +#include "test_utils.h" + +TEST(NetworkTest, RandomFloat) { + float test = randf(&randc, 0.0f, 1.0f); + bool same = true; + for (int i = 0; i < 10; i++) { + if (test != randf(&randc, 0.0f, 1.0f)) { + same = false; + break; + } + } + ASSERT_FALSE(same); +} + +TEST(NetworkTest, GetLayer) { + size_t layer_length = 3; + layer *test_layer = get_layer(layer_length, 5); + + ASSERT_NE(test_layer, nullptr); + ASSERT_NE(test_layer->delta, nullptr); + ASSERT_NE(test_layer->weighted_input, nullptr); + ASSERT_NE(test_layer->weights, nullptr); + ASSERT_NE(test_layer->bias, nullptr); + ASSERT_NE(test_layer->output, nullptr); + + ASSERT_EQ(test_layer->length, layer_length); + + free_layer(test_layer); +} + +TEST(NetworkTest, GetNeuralNetwork) { + size_t layer_length = 3; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + layer_lengths[1] = 3; + layer_lengths[2] = 4; + size_t inputs_length = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + ASSERT_NE(nn, nullptr); + ASSERT_EQ(nn->length, layer_length); + ASSERT_EQ(nn->inputs_length, inputs_length); + ASSERT_EQ(nn->activation_function, &sigmoid); + ASSERT_NE(nn->layers, nullptr); + for (size_t i = 0; i < layer_length; i++) { + ASSERT_NE(nn->layers[i], nullptr); + ASSERT_EQ(nn->layers[i]->length, layer_lengths[i]); + ASSERT_EQ(nn->layers[i]->prev_layer, (i == 0) ? nullptr : nn->layers[i - 1]); + ASSERT_EQ(nn->layers[i]->next_layer, (i == layer_length - 1) ? 
nullptr : nn->layers[i + 1]); + } + + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, FreeDataset) { + size_t layer_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, 2, nullptr); + + free_neural_network(nn); + // No crash + free(layer_lengths); +} + +TEST(NetworkTest, FreeLayer) { + size_t layer_length = 2; + layer *test_layer = get_layer(layer_length, 3); + + free_layer(test_layer); + // No crash +} + +TEST(NetworkTest, ComputeNetwork) { + size_t layer_length = 1; + size_t inputs_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 1; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + float *inputs = (float *)malloc(sizeof(float) * inputs_length); + inputs[0] = 0.2f; + + nn->layers[0]->weights[0] = 0.5f; + nn->layers[0]->bias[0] = 0.3f; + + compute_network(nn, inputs); + + ASSERT_FLOAT_EQ(nn->layers[0]->output[0], 0.59868766f); + + free(inputs); + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, Softmax) { + size_t layer_length = 1; + size_t inputs_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 3; + neural_network *nn = get_neural_network(layer_length, layer_lengths, inputs_length, &sigmoid); + + nn->layers[0]->output[0] = 0.2f; + nn->layers[0]->output[1] = 0.3f; + nn->layers[0]->output[2] = 0.5f; + + ASSERT_FLOAT_EQ(softmax(nn, 0), 0.28943311f); + ASSERT_FLOAT_EQ(softmax(nn, 1), 0.31987305f); + ASSERT_FLOAT_EQ(softmax(nn, 2), 0.39069383f); + + free_neural_network(nn); + free(layer_lengths); +} + +// Output layer only +TEST(NetworkTest, StochasticGDSingleLayer) { + dataset *test_dataset = get_or(); + + // Create network + size_t layer_length = 1; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + 
layer_lengths[0] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 50000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 10000 == 0) { + printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_dataset(test_dataset); + free_neural_network(nn); + free(layer_lengths); +} + +TEST(NetworkTest, StochasticGDTests) { + dataset *test_dataset = get_xor(); + + // Multi layer test + size_t layer_length = 2; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 4; + layer_lengths[1] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 500000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 100000 == 0) { + printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + + // Non-linearly separable test + layer_length = 1; + layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 50000; i++) { + for (size_t j = 0; j < test_dataset->length; j++) { + stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + } + if (i % 10000 == 0) { + printf("Stochastic Non-linearly separable learn 
cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + free_dataset(test_dataset); +} + +TEST(NetworkTest, MiniBatchGDTests) { + dataset *test_dataset = get_xor(); + + // Multi layer test + size_t layer_length = 2; + size_t *layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 4; + layer_lengths[1] = 2; + neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 1500000; i++) { + dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); + mini_batch_gd(nn, 0.001f, batch_dataset); + free_dataset(batch_dataset); + if (i % 200000 == 0) { + printf("Mini Batch Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_LE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_GE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + + // Non-linearly separable test + layer_length = 1; + layer_lengths = (size_t *)malloc(sizeof(size_t) * layer_length); + layer_lengths[0] = 2; + nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); + + for (size_t i = 0; i < 100000; i++) { + dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); + mini_batch_gd(nn, 0.001f, batch_dataset); + free_dataset(batch_dataset); + if (i % 20000 == 0) { + printf("Mini Batch Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); + } + } + + ASSERT_GE(cost(nn, test_dataset, test_dataset->length), 0.2f); + ASSERT_LE(test_network_percent(nn, test_dataset), 90.0f); + + free_neural_network(nn); + free(layer_lengths); + free_dataset(test_dataset); +} diff --git a/test/test_utils.cpp 
b/test/test_utils.cpp index def5f48ca..5513f015d 100644 --- a/test/test_utils.cpp +++ b/test/test_utils.cpp @@ -1,77 +1,77 @@ -// #include "test_utils.h" -// -// #include -// -// float sigmoid(float val, bool is_deravative) { -// float result = 1.0f / (1.0f + expf(-val)); -// if (is_deravative == 1) { -// return result * (1.0f - result); -// } -// return result; -// } -// -// dataset *get_xor() { -// // Create data -// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); -// test_dataset->length = 4; -// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); -// test_dataset->datas = datas; -// test_dataset->inputs_length = 2; -// -// for (size_t i = 0; i < test_dataset->length; i++) { -// test_dataset->datas[i] = (data *)malloc(sizeof(data)); -// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); -// } -// -// // XOR gate -// test_dataset->datas[0]->inputs[0] = 1.0f; -// test_dataset->datas[0]->inputs[1] = 1.0f; -// test_dataset->datas[0]->expected_index = 0; -// -// test_dataset->datas[1]->inputs[0] = 0.0f; -// test_dataset->datas[1]->inputs[1] = 0.0f; -// test_dataset->datas[1]->expected_index = 0; -// -// test_dataset->datas[2]->inputs[0] = 0.0f; -// test_dataset->datas[2]->inputs[1] = 1.0f; -// test_dataset->datas[2]->expected_index = 1; -// -// test_dataset->datas[3]->inputs[0] = 1.0f; -// test_dataset->datas[3]->inputs[1] = 0.0f; -// test_dataset->datas[3]->expected_index = 1; -// -// return test_dataset; -// } -// -// dataset *get_or() { -// // Create data -// dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); -// test_dataset->length = 4; -// data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); -// test_dataset->datas = datas; -// test_dataset->inputs_length = 2; -// -// for (size_t i = 0; i < test_dataset->length; i++) { -// test_dataset->datas[i] = (data *)malloc(sizeof(data)); -// test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * 
test_dataset->inputs_length); -// } -// -// // OR gate -// test_dataset->datas[1]->inputs[0] = 0.0f; -// test_dataset->datas[1]->inputs[1] = 0.0f; -// test_dataset->datas[1]->expected_index = 0; -// -// test_dataset->datas[0]->inputs[0] = 1.0f; -// test_dataset->datas[0]->inputs[1] = 1.0f; -// test_dataset->datas[0]->expected_index = 1; -// -// test_dataset->datas[2]->inputs[0] = 0.0f; -// test_dataset->datas[2]->inputs[1] = 1.0f; -// test_dataset->datas[2]->expected_index = 1; -// -// test_dataset->datas[3]->inputs[0] = 1.0f; -// test_dataset->datas[3]->inputs[1] = 0.0f; -// test_dataset->datas[3]->expected_index = 1; -// -// return test_dataset; -// } +#include "test_utils.h" + +#include + +float sigmoid(float val, bool is_deravative) { + float result = 1.0f / (1.0f + expf(-val)); + if (is_deravative == 1) { + return result * (1.0f - result); + } + return result; +} + +dataset *get_xor() { + // Create data + dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); + test_dataset->length = 4; + data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); + test_dataset->datas = datas; + test_dataset->inputs_length = 2; + + for (size_t i = 0; i < test_dataset->length; i++) { + test_dataset->datas[i] = (data *)malloc(sizeof(data)); + test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); + } + + // XOR gate + test_dataset->datas[0]->inputs[0] = 1.0f; + test_dataset->datas[0]->inputs[1] = 1.0f; + test_dataset->datas[0]->expected_index = 0; + + test_dataset->datas[1]->inputs[0] = 0.0f; + test_dataset->datas[1]->inputs[1] = 0.0f; + test_dataset->datas[1]->expected_index = 0; + + test_dataset->datas[2]->inputs[0] = 0.0f; + test_dataset->datas[2]->inputs[1] = 1.0f; + test_dataset->datas[2]->expected_index = 1; + + test_dataset->datas[3]->inputs[0] = 1.0f; + test_dataset->datas[3]->inputs[1] = 0.0f; + test_dataset->datas[3]->expected_index = 1; + + return test_dataset; +} + +dataset *get_or() { + // Create data + 
dataset *test_dataset = (dataset *)malloc(sizeof(dataset)); + test_dataset->length = 4; + data **datas = (data **)malloc(sizeof(data *) * test_dataset->length); + test_dataset->datas = datas; + test_dataset->inputs_length = 2; + + for (size_t i = 0; i < test_dataset->length; i++) { + test_dataset->datas[i] = (data *)malloc(sizeof(data)); + test_dataset->datas[i]->inputs = (float *)malloc(sizeof(float) * test_dataset->inputs_length); + } + + // OR gate + test_dataset->datas[1]->inputs[0] = 0.0f; + test_dataset->datas[1]->inputs[1] = 0.0f; + test_dataset->datas[1]->expected_index = 0; + + test_dataset->datas[0]->inputs[0] = 1.0f; + test_dataset->datas[0]->inputs[1] = 1.0f; + test_dataset->datas[0]->expected_index = 1; + + test_dataset->datas[2]->inputs[0] = 0.0f; + test_dataset->datas[2]->inputs[1] = 1.0f; + test_dataset->datas[2]->expected_index = 1; + + test_dataset->datas[3]->inputs[0] = 1.0f; + test_dataset->datas[3]->inputs[1] = 0.0f; + test_dataset->datas[3]->expected_index = 1; + + return test_dataset; +} diff --git a/test/test_utils.h b/test/test_utils.h index 02dbbe9ce..3c310aef2 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -1,12 +1,12 @@ -// #ifndef TEST_UTILS_H -// #define TEST_UTILS_H -// -// #include "cneuron/cneuron.h" -// -// float sigmoid(float val, bool is_deravative); -// -// dataset *get_xor(); -// -// dataset *get_or(); -// -// #endif +#ifndef TEST_UTILS_H +#define TEST_UTILS_H + +#include "cneuron/cneuron.h" + +float sigmoid(float val, bool is_deravative); + +dataset *get_xor(); + +dataset *get_or(); + +#endif From 3c5a2c3747c75e533115909b600d420a9e8c31f2 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Thu, 12 Jun 2025 21:37:28 +0800 Subject: [PATCH 06/11] optimize rand a bit --- src/data.c | 2 +- src/main.c | 2 -- src/network.c | 2 +- src/rand.c | 46 ++++++++++++++++++++++++++++------------------ src/rand.h | 9 ++++----- 5 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/data.c b/src/data.c index f3bae3913..d7c0eb984 
100644 --- a/src/data.c +++ b/src/data.c @@ -137,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, 0, source_dataset->length)], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, source_dataset->length, 0)], source_dataset->inputs_length); } return new_dataset; diff --git a/src/main.c b/src/main.c index e1b6f4e06..fc3cb1b30 100644 --- a/src/main.c +++ b/src/main.c @@ -98,8 +98,6 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; - prng_init(&__randstate, seed); dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 7e5e2db6b..93ae307a1 100644 --- a/src/network.c +++ b/src/network.c @@ -201,7 +201,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[randnum_u32(&randc, 0, test_dataset->length)]; + data *test_data = test_dataset->datas[randnum_u32(&randc, test_dataset->length, 0)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index c01a6adbb..29428c44c 100644 --- a/src/rand.c +++ b/src/rand.c @@ -2,46 +2,56 @@ #include #include +#include "shishua/shishua-sse2.h" -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max) { +static prng_state __randstate; + +__attribute__((constructor)) static void auto_seed_rand(void) { + static uint64_t seed[4] = {0x9E3779B97F4A7C15, 
0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + prng_init(&__randstate, seed); + prng_gen(&__randstate, randc.buf, 1024); +} + +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { assert(max > min); - assert(randc->count <= 255); - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * (max - min)) >> 8; - randnum += min; + assert(randc->count <= 1023); + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + randnum += offset; randc->count++; - if (randc->count > 255) { + if (randc->count > 1023) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randnum; } -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max) { +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { assert(max > min); - assert(randc->count <= 252); + assert(randc->count <= 1020); uint32_t value; + // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; memcpy(&value, &randc->buf[randc->count], sizeof(value)); - uint32_t randnum = ((uint64_t)value * (max - min)) >> 32; - randnum += min; - randc->count += sizeof(uint32_t); - if (randc->count > 252) { + uint32_t randnum = ((uint64_t)value * range) >> 32; + randnum += offset; + // randc->count += sizeof(uint32_t); + if (randc->count > 1020) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 252); + assert(randc->count <= 1021); uint32_t value; - memcpy(&value, &randc->buf[randc->count], sizeof(value) - 1); + memcpy(&value, &randc->buf[randc->count], 3); // float randfloat = ((float)value / UINT32_MAX) * range + offset; float randfloat = (value / 16777216.0f) * range + offset; - randc->count += sizeof(value) 
- 1; - if (randc->count > 253) { + randc->count += 3; + if (randc->count > 1021) { randc->count = 0; - prng_gen(&__randstate, randc->buf, 256); + prng_gen(&__randstate, randc->buf, 1024); } return randfloat; } diff --git a/src/rand.h b/src/rand.h index ba6f5159d..ed1918824 100644 --- a/src/rand.h +++ b/src/rand.h @@ -7,13 +7,12 @@ struct rand_chunk { size_t count; - uint8_t buf[256]; // NOTE: must be multiple of 256 + uint8_t buf[1024]; // NOTE: must be multiple of 256 }; -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t min, uint8_t max); -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t min, uint32_t max); +uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); +uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); float randf(struct rand_chunk *randc, float range, float offset); - -static prng_state __randstate; static struct rand_chunk randc = {0}; + #endif From 046e062091752f11c0fc1a9c75a57ed66ddeb330 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 09:55:29 +0800 Subject: [PATCH 07/11] randf optimized --- src/rand.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/rand.c b/src/rand.c index 29428c44c..fe364809a 100644 --- a/src/rand.c +++ b/src/rand.c @@ -1,6 +1,7 @@ #include "rand.h" #include +#include #include #include "shishua/shishua-sse2.h" @@ -44,11 +45,15 @@ uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) float randf(struct rand_chunk *randc, float range, float offset) { assert(range); assert(randc->count <= 1021); - uint32_t value; - memcpy(&value, &randc->buf[randc->count], 3); - // float randfloat = ((float)value / UINT32_MAX) * range + offset; - float randfloat = (value / 16777216.0f) * range + offset; - randc->count += 3; + // uint32_t value; + // memcpy(&value, &randc->buf[randc->count], 4); + uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; + 
// value |= 0x3F800000; + // float randfloat; + // randfloat = *(float *)&value; + // memcpy(&randfloat, &value, sizeof(randfloat)); + // randfloat = (randfloat - 1) * range + offset; + float randfloat = (value) / 16777216.0f * range + offset; if (randc->count > 1021) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); From 0fef7593f9649678b389868b703f0ea919e70039 Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 12:13:55 +0800 Subject: [PATCH 08/11] Fix: Fixed index bug --- src/rand.c | 2 +- src/rand.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/rand.c b/src/rand.c index fe364809a..852382a54 100644 --- a/src/rand.c +++ b/src/rand.c @@ -34,7 +34,7 @@ uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; - // randc->count += sizeof(uint32_t); + randc->count += sizeof(randnum); if (randc->count > 1020) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); diff --git a/src/rand.h b/src/rand.h index ed1918824..0becfd0a4 100644 --- a/src/rand.h +++ b/src/rand.h @@ -1,13 +1,14 @@ #ifndef SHISHUA_RAND_H #define SHISHUA_RAND_H +#include #include #include #include "../external/shishua/shishua.h" struct rand_chunk { - size_t count; - uint8_t buf[1024]; // NOTE: must be multiple of 256 + size_t count __attribute__((aligned(32))); + uint8_t buf[1024] __attribute__((aligned(32)));// NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); From 75bc06a461e17f1cef88a3d15d223df67033231e Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 13:35:28 +0800 Subject: [PATCH 09/11] Fix: prng_gen work at beginning --- src/rand.c | 35 ++++++++++++++--------------------- src/rand.h | 10 +++++++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/rand.c b/src/rand.c index 852382a54..0b0f60e9b 100644 
--- a/src/rand.c +++ b/src/rand.c @@ -1,11 +1,7 @@ -#include "rand.h" - #include -#include #include -#include "shishua/shishua-sse2.h" -static prng_state __randstate; +#include "rand.h" __attribute__((constructor)) static void auto_seed_rand(void) { static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; @@ -14,37 +10,38 @@ __attribute__((constructor)) static void auto_seed_rand(void) { } uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { - assert(max > min); - assert(randc->count <= 1023); - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; - randnum += offset; - randc->count++; + assert(range); if (randc->count > 1023) { randc->count = 0; prng_gen(&__randstate, randc->buf, 1024); } + uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + randnum += offset; + randc->count++; return randnum; } uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { - assert(max > min); - assert(randc->count <= 1020); + assert(range); + if (randc->count > 1020) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 1024); + } uint32_t value; // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; memcpy(&value, &randc->buf[randc->count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; randc->count += sizeof(randnum); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); - } return randnum; } float randf(struct rand_chunk *randc, float range, float offset) { assert(range); - assert(randc->count <= 1021); + if (randc->count > 1020) { + randc->count = 0; + prng_gen(&__randstate, randc->buf, 1024); + } // uint32_t value; // memcpy(&value, &randc->buf[randc->count], 4); uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | 
randc->buf[randc->count++]; @@ -54,10 +51,6 @@ float randf(struct rand_chunk *randc, float range, float offset) { // memcpy(&randfloat, &value, sizeof(randfloat)); // randfloat = (randfloat - 1) * range + offset; float randfloat = (value) / 16777216.0f * range + offset; - if (randc->count > 1021) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); - } return randfloat; } diff --git a/src/rand.h b/src/rand.h index 0becfd0a4..5d864f4e3 100644 --- a/src/rand.h +++ b/src/rand.h @@ -7,13 +7,17 @@ #include "../external/shishua/shishua.h" struct rand_chunk { - size_t count __attribute__((aligned(32))); - uint8_t buf[1024] __attribute__((aligned(32)));// NOTE: must be multiple of 256 + size_t count; + uint8_t buf[1024];// NOTE: must be multiple of 256 }; uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); float randf(struct rand_chunk *randc, float range, float offset); -static struct rand_chunk randc = {0}; +static struct rand_chunk randc = { + .count = 2000, + .buf = {0} +}; +static prng_state __randstate; #endif From b2bd7b298034179b481d6ec8b56568d7f2b2ae2d Mon Sep 17 00:00:00 2001 From: BrBa2 Date: Fri, 13 Jun 2025 13:46:09 +0800 Subject: [PATCH 10/11] Fix: range issue with randf usage throughout the files --- src/main.c | 6 +++--- test/network.cpp | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main.c b/src/main.c index fc3cb1b30..f63223be5 100644 --- a/src/main.c +++ b/src/main.c @@ -40,9 +40,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, -5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, 
IMAGE_SIZE, randf(&randc, -3.0f, 3.0f), randf(&randc, -3.0f, 3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 10.0f, -5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 1.2f, -0.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 6.0f, -3.0f), randf(&randc, 6.0f, -3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); diff --git a/test/network.cpp b/test/network.cpp index 36010ab0b..a213b27b1 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -10,10 +10,13 @@ extern "C" { #include "test_utils.h" TEST(NetworkTest, RandomFloat) { - float test = randf(&randc, 0.0f, 1.0f); + // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; + // prng_init(&__randstate, seed); + // prng_gen(&__randstate, randc.buf, 1024); + float test = randf(&randc, 1.0f, 0.0f); bool same = true; for (int i = 0; i < 10; i++) { - if (test != randf(&randc, 0.0f, 1.0f)) { + if (test != randf(&randc, 1.0f, 0.0f)) { same = false; break; } From 6a28349489bb5dcd0c2594a9248d1cf561bfaeff Mon Sep 17 00:00:00 2001 From: Lee Hoon Lim Date: Fri, 13 Jun 2025 16:07:21 +0800 Subject: [PATCH 11/11] Remove warnings and cleanup --- CMakeLists.txt | 2 +- external/shishua/shishua-sse2.h | 2 +- include/cneuron/cneuron.h | 8 ----- src/data.c | 6 ++-- src/main.c | 6 ++-- src/network.c | 4 +-- src/rand.c | 53 +++++++++++++++++++-------------- src/rand.h | 19 ++++++------ test/main.cpp | 1 - test/network.cpp | 15 ++++------ 10 files changed, 56 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95245ad12..359fce592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(cneuron C CXX) set(CMAKE_C_STANDARD 17) set(CMAKE_C_STANDARD_REQUIRED ON) -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/external/shishua/shishua-sse2.h 
b/external/shishua/shishua-sse2.h index a9648fb0b..812288e6c 100644 --- a/external/shishua/shishua-sse2.h +++ b/external/shishua/shishua-sse2.h @@ -185,7 +185,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -static void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { // Note: output is uninitialized at first, but since we pass NULL, its value // is initially ignored. s->counter[0] = _mm_setzero_si128(); diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index 1d583f2e7..5ea530423 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -157,14 +157,6 @@ typedef struct { float (*activation_function)(float, bool); /**< Pointer to the activation function used in the network. */ } neural_network; -/** - * @brief Generates a random floating-point number within a given range. - * - * @param min Minimum value for the random number. - * @param max Maximum value for the random number. - * @return A random float between min and max. - */ - /** * @brief Allocates and initializes a new layer. 
* diff --git a/src/data.c b/src/data.c index d7c0eb984..94172cff5 100644 --- a/src/data.c +++ b/src/data.c @@ -137,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(&randc, source_dataset->length, 0)], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[randnum_u32(source_dataset->length, 0)], source_dataset->inputs_length); } return new_dataset; @@ -243,9 +243,9 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = randf(&randc, 1, 0); + float random_value = randf(1, 0); if (random_value <= probability) { - float noise = randf(&randc, noise_factor, 0); + float noise = randf(noise_factor, 0); float new_value = data->inputs[i] + noise; data->inputs[i] = fmin(new_value, 1.0f); diff --git a/src/main.c b/src/main.c index f63223be5..87174a984 100644 --- a/src/main.c +++ b/src/main.c @@ -40,9 +40,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 10.0f, -5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 1.2f, -0.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(&randc, 6.0f, -3.0f), randf(&randc, 6.0f, -3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(10.0f, -5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(1.2f, -0.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(6.0f, -3.0f), randf(6.0f, -3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); 
diff --git a/src/network.c b/src/network.c index 93ae307a1..504d777ab 100644 --- a/src/network.c +++ b/src/network.c @@ -25,7 +25,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = randf(&randc, 2.0f, -1.0f); + new_layer->weights[i] = randf(2.0f, -1.0f); new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -201,7 +201,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[randnum_u32(&randc, test_dataset->length, 0)]; + data *test_data = test_dataset->datas[randnum_u32(test_dataset->length, 0)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/rand.c b/src/rand.c index 0b0f60e9b..55b27a4be 100644 --- a/src/rand.c +++ b/src/rand.c @@ -1,50 +1,61 @@ +#include "rand.h" + #include #include +#include -#include "rand.h" +struct rand_chunk randc = { + .count = 2000, + .buf = {0}}; + +prng_state __randstate = {0}; -__attribute__((constructor)) static void auto_seed_rand(void) { - static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; +__attribute__((constructor)) void auto_seed_rand(void) { + uint64_t seed[4] = {time(NULL), 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; prng_init(&__randstate, seed); prng_gen(&__randstate, randc.buf, 1024); } -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset) { +uint8_t randnum_u8(uint8_t range, uint8_t offset) { assert(range); - if (randc->count > 1023) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1023) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } - uint8_t randnum = ((uint16_t)randc->buf[randc->count] * range) >> 8; + 
uint8_t randnum = ((uint16_t)randc.buf[randc.count] * range) >> 8; randnum += offset; - randc->count++; + randc.count++; return randnum; } -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset) { +uint32_t randnum_u32(uint32_t range, uint32_t offset) { assert(range); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1020) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } uint32_t value; // uint32_t value = randc->buf[randc->count++] << 24 | randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; - memcpy(&value, &randc->buf[randc->count], sizeof(value)); + memcpy(&value, &randc.buf[randc.count], sizeof(value)); uint32_t randnum = ((uint64_t)value * range) >> 32; randnum += offset; - randc->count += sizeof(randnum); + randc.count += sizeof(randnum); return randnum; } -float randf(struct rand_chunk *randc, float range, float offset) { +float randf(float range, float offset) { assert(range); - if (randc->count > 1020) { - randc->count = 0; - prng_gen(&__randstate, randc->buf, 1024); + if (randc.count > 1020) { + randc.count = 0; + prng_gen(&__randstate, randc.buf, 1024); } // uint32_t value; // memcpy(&value, &randc->buf[randc->count], 4); - uint32_t value = randc->buf[randc->count++] << 16 | randc->buf[randc->count++] << 8 | randc->buf[randc->count++]; + uint32_t byte1 = randc.buf[randc.count++]; + uint32_t byte2 = randc.buf[randc.count++]; + uint32_t byte3 = randc.buf[randc.count++]; + + uint32_t value = (byte1 << 16) | (byte2 << 8) | byte3; // value |= 0x3F800000; // float randfloat; // randfloat = *(float *)&value; @@ -54,8 +65,6 @@ float randf(struct rand_chunk *randc, float range, float offset) { return randfloat; } - - // NOTE: example // int main() { diff --git a/src/rand.h b/src/rand.h index 5d864f4e3..7184630e1 100644 --- a/src/rand.h +++ b/src/rand.h @@ -4,20 +4,19 @@ #include #include #include -#include 
"../external/shishua/shishua.h" + +#include "shishua/shishua.h" struct rand_chunk { size_t count; - uint8_t buf[1024];// NOTE: must be multiple of 256 + uint8_t buf[1024]; // NOTE: must be multiple of 256 }; -uint8_t randnum_u8(struct rand_chunk *randc, uint8_t range, uint8_t offset); -uint32_t randnum_u32(struct rand_chunk *randc, uint32_t range, uint32_t offset); -float randf(struct rand_chunk *randc, float range, float offset); -static struct rand_chunk randc = { - .count = 2000, - .buf = {0} -}; +uint8_t randnum_u8(uint8_t range, uint8_t offset); +uint32_t randnum_u32(uint32_t range, uint32_t offset); +float randf(float range, float offset); + +extern struct rand_chunk randc; -static prng_state __randstate; +extern prng_state __randstate; #endif diff --git a/test/main.cpp b/test/main.cpp index 0a4376670..76f841f1b 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,6 @@ #include int main(int argc, char **argv) { - srand(time(NULL)); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/test/network.cpp b/test/network.cpp index a213b27b1..489d650ae 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,8 +1,8 @@ #include extern "C" { -#include "cneuron/cneuron.h" #include "../src/rand.h" +#include "cneuron/cneuron.h" } #include @@ -10,13 +10,10 @@ extern "C" { #include "test_utils.h" TEST(NetworkTest, RandomFloat) { - // static uint64_t seed[4] = {0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95}; - // prng_init(&__randstate, seed); - // prng_gen(&__randstate, randc.buf, 1024); - float test = randf(&randc, 1.0f, 0.0f); + float test = randf(1.0f, 0.0f); bool same = true; for (int i = 0; i < 10; i++) { - if (test != randf(&randc, 1.0f, 0.0f)) { + if (test != randf(1.0f, 0.0f)) { same = false; break; } @@ -137,7 +134,7 @@ TEST(NetworkTest, StochasticGDSingleLayer) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, 
test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 10000 == 0) { printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -164,7 +161,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 500000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.001f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 100000 == 0) { printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -185,7 +182,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[randnum_u32(test_dataset->length, 0)]); } if (i % 10000 == 0) { printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length));