diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 67f1986c9..9ac1885f3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: sudo apt-get install -y cmake libgtest-dev clang-format libopenblas-dev - name: Configure CMake - run: cmake -S . -B build + run: cmake -S . -B build -DUSE_THREADING=OFF - name: Build run: cmake --build build diff --git a/CMakeLists.txt b/CMakeLists.txt index 359fce592..a06911f5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,10 +27,15 @@ file(GLOB_RECURSE TEST_FILES test/*.cpp) set(ASAN_FLAGS -fsanitize=address -fno-omit-frame-pointer) set(WARNING_FLAGS -Wall -Wextra -Wpedantic) -set(RELEASE_FLAGS -O3 -DNDEBUG) +set(RELEASE_FLAGS -O3 -DNDEBUG -march=native) set(PROFILE_FLAG -O3 -p -g) +option(USE_THREADING "Enable multithreading support for batch training" ON) function(apply_target target) + if(USE_THREADING) + target_compile_definitions(${target} PRIVATE USE_THREADING) + endif() + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/external) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) diff --git a/external/shishua/shishua-avx2.h b/external/shishua/shishua-avx2.h index e25dcf20f..099a70256 100644 --- a/external/shishua/shishua-avx2.h +++ b/external/shishua/shishua-avx2.h @@ -113,7 +113,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { memset(s, 0, sizeof(prng_state)); #define STEPS 1 #define ROUNDS 13 diff --git a/external/shishua/shishua-half-avx2.h b/external/shishua/shishua-half-avx2.h index d1b3df42a..099e0b3d3 100644 --- a/external/shishua/shishua-half-avx2.h +++ b/external/shishua/shishua-half-avx2.h @@ -87,7 +87,7 @@ static uint64_t phi[8] = { 0xC1D64BA40F335E36, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { memset(s, 0, sizeof(prng_state)); #define STEPS 5 #define ROUNDS 4 diff --git a/external/shishua/shishua-half-neon.h b/external/shishua/shishua-half-neon.h index ba5655b14..ce568aceb 100644 --- a/external/shishua/shishua-half-neon.h +++ b/external/shishua/shishua-half-neon.h @@ -142,7 +142,7 @@ static uint64_t phi[8] = { 0xC1D64BA40F335E36, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { s->counter[0] = vdupq_n_u64(0); s->counter[1] = vdupq_n_u64(0); #define STEPS 5 diff --git a/external/shishua/shishua-half-sse2.h b/external/shishua/shishua-half-sse2.h index 91e1194fa..560b246f5 100644 --- a/external/shishua/shishua-half-sse2.h +++ b/external/shishua/shishua-half-sse2.h @@ -165,7 +165,7 @@ static uint64_t phi[8] = { 0xC1D64BA40F335E36, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { s->counter[0] = _mm_setzero_si128(); s->counter[1] = _mm_setzero_si128(); #define STEPS 5 diff --git a/external/shishua/shishua-half.h b/external/shishua/shishua-half.h index 70a9f4be4..95481d406 100644 --- a/external/shishua/shishua-half.h +++ b/external/shishua/shishua-half.h @@ -171,7 +171,7 @@ static uint64_t phi[8] = { 0xC1D64BA40F335E36, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { memset(s, 0, sizeof(prng_state)); #define STEPS 5 diff --git a/external/shishua/shishua-neon.h b/external/shishua/shishua-neon.h index 6cd5124bd..74d8d8bae 100644 --- a/external/shishua/shishua-neon.h +++ b/external/shishua/shishua-neon.h @@ -159,7 +159,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { s->counter[0] = vdupq_n_u64(0); s->counter[1] = vdupq_n_u64(0); #define ROUNDS 13 diff --git a/external/shishua/shishua.h b/external/shishua/shishua.h index a0bb9248d..c3f1cca99 100644 --- a/external/shishua/shishua.h +++ b/external/shishua/shishua.h @@ -217,7 +217,7 @@ static uint64_t phi[16] = { 0xFEC507705E4AE6E5, }; -void prng_init(prng_state *s, uint64_t seed[4]) { +static inline void prng_init(prng_state *s, uint64_t seed[4]) { memset(s, 0, sizeof(prng_state)); #define STEPS 1 #define ROUNDS 13 diff --git a/src/data.c b/src/data.c index 94172cff5..088e38d2e 100644 --- a/src/data.c +++ b/src/data.c @@ -243,8 +243,7 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = randf(1, 0); - if (random_value <= probability) { + if (randf(1.0f, 0.0f) <= probability) { float noise = randf(noise_factor, 0); float new_value = data->inputs[i] + noise; diff --git a/src/main.c b/src/main.c index 87174a984..65679bf1c 100644 --- a/src/main.c +++ b/src/main.c @@ -6,6 +6,10 @@ #include #include +#ifdef USE_THREADING +#include +#endif + #include "cneuron/cneuron.h" #include "rand.h" @@ -26,8 +30,30 @@ float relu(float val, bool is_deravative) { return fmax(0.0f, val); } +typedef struct { + dataset *train_dataset; + size_t batch_size; +} generator_args; + +dataset *dataset_generator(generator_args *args) { + dataset *batch_dataset = get_random_dataset_sample(args->train_dataset, args->batch_size); + for (size_t i = 0; i < batch_dataset->length; i++) { + data *data = batch_dataset->datas[i]; + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(10.0f, -5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(1.2f, -0.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(6.0f, -3.0f), randf(6.0f, -3.0f)); + noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); + } + return batch_dataset; +} + void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, float learn_rate, int batch_amount, int log_amount, size_t batch_size) { +#ifdef USE_THREADING + pthread_t thread; +#endif + generator_args args = (generator_args){.train_dataset = train_dataset, .batch_size = batch_size}; clock_t start_time = clock(); + dataset *batch_dataset = dataset_generator(&args); for (int i = 0; i < batch_amount; i++) { if (i % log_amount == 0 && i != 0) { float new_cost = cost(nn, test_dataset, 100); @@ -37,17 +63,22 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl printf("Learned: %zu, cost: %f, elapsed time: %.2fs, speed: %.2f Data/s\n", i * batch_size, new_cost, elapsed_s, speed); start_time = clock(); } - dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); - for (size_t i = 0; i < batch_dataset->length; i++) { - data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(10.0f, -5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(1.2f, -0.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, randf(6.0f, -3.0f), randf(6.0f, -3.0f)); - noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); - } + +#ifdef USE_THREADING + pthread_create(&thread, NULL, (void *(*)(void *))dataset_generator, &args); + mini_batch_gd(nn, learn_rate, batch_dataset); + free_dataset(batch_dataset); + void *result = NULL; + pthread_join(thread, &result); + batch_dataset = (dataset *)result; +#else mini_batch_gd(nn, learn_rate, batch_dataset); free_dataset(batch_dataset); + batch_dataset = dataset_generator(&args); +#endif } + // Last dataset not used + free_dataset(batch_dataset); } dataset *get_mnist(bool is_test) { @@ -111,7 +142,7 @@ int main(int argc, char **argv) { // Parameters float learn_rate = 1.5f; size_t batch_size = 30; - int learn_amount = 4800000; + int learn_amount = 48000000; int batch_amount = learn_amount / batch_size; int log_amount = 200; // Log once reached a number of batch diff --git a/src/network.c b/src/network.c index 504d777ab..dabe45b7e 100644 --- a/src/network.c +++ b/src/network.c @@ -8,6 +8,10 @@ #include #include +#ifdef USE_THREADING +#include +#endif + #include "cneuron/cneuron.h" #include "rand.h" @@ -24,8 +28,9 @@ layer *get_layer(size_t length, size_t prev_length) { return NULL; } - for (size_t i = 0; i < length * prev_length; i++) + for (size_t i = 0; i < length * prev_length; i++) { new_layer->weights[i] = randf(2.0f, -1.0f); + } new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -293,36 +298,76 @@ void stochastic_gd(neural_network *nn, float learn_rate, const data *data) { } } -void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { - assert(nn && data_batch); +typedef struct { + neural_network *nn; + const dataset *data_batch; + size_t start; + size_t end; + float **weights_gradients; + float **bias_gradients; + int thread_index; +} ThreadArgs; + +void *thread_worker(void *arg) { + ThreadArgs *args = (ThreadArgs *)arg; + neural_network *nn = args->nn; - float **weights_gradients = malloc(sizeof(float *) * nn->length); - float **bias_gradients = malloc(sizeof(float *) * nn->length); + float **weights_gradients = args->weights_gradients; + float **bias_gradients = args->bias_gradients; for (size_t i = 0; i < nn->length; i++) { - weights_gradients[i] = calloc(nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length), sizeof(float)); + size_t weights_size = nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length); + weights_gradients[i] = calloc(weights_size, sizeof(float)); bias_gradients[i] = calloc(nn->layers[i]->length, sizeof(float)); } - for (size_t i = 0; i < data_batch->length; i++) { - data *data = data_batch->datas[i]; + for (size_t i = 0; i < args->data_batch->length; i++) { + data *data = args->data_batch->datas[i]; compute_network(nn, data->inputs); + for (size_t j = 0; j < nn->length; j++) { size_t layer_index = nn->length - j - 1; layer_learn_collect_gradient(nn, weights_gradients[layer_index], bias_gradients[layer_index], layer_index, data); } } + return NULL; +} + +void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { + assert(nn && data_batch); + + ThreadArgs args; + + float **weights_gradients = malloc(nn->length * sizeof(float *)); + float **bias_gradients = malloc(nn->length * sizeof(float *)); + + args = (ThreadArgs){.nn = nn, .data_batch = data_batch, .weights_gradients = weights_gradients, .bias_gradients = bias_gradients}; + +#ifdef USE_THREADING + pthread_t thread; + pthread_create(&thread, NULL, thread_worker, &args); + pthread_join(thread, NULL); +#else + thread_worker(&args); +#endif + for (size_t i = 0; i < nn->length; i++) { - for (size_t j = 0; j < nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length); j++) + size_t weights_size = nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length); + + for (size_t j = 0; j < weights_size; j++) { nn->layers[i]->weights[j] -= weights_gradients[i][j] / data_batch->length * learn_rate; - for (size_t j = 0; j < nn->layers[i]->length; j++) - nn->layers[i]->bias[j] -= bias_gradients[i][j] / data_batch->length * learn_rate; + } + for (size_t j = 0; j < nn->layers[i]->length; j++) { + nn->layers[i]->bias[j] -= (bias_gradients[i][j] / data_batch->length) * learn_rate; + } + } + + for (size_t i = 0; i < nn->length; i++) { free(weights_gradients[i]); free(bias_gradients[i]); } - free(weights_gradients); free(bias_gradients); } diff --git a/test/network.cpp b/test/network.cpp index 489d650ae..965b344c8 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -1,8 +1,8 @@ #include extern "C" { -#include "../src/rand.h" #include "cneuron/cneuron.h" +#include "rand.h" } #include @@ -207,7 +207,7 @@ TEST(NetworkTest, MiniBatchGDTests) { layer_lengths[1] = 2; neural_network *nn = get_neural_network(layer_length, layer_lengths, test_dataset->inputs_length, &sigmoid); - for (size_t i = 0; i < 1500000; i++) { + for (size_t i = 0; i < 2000000; i++) { dataset *batch_dataset = get_random_dataset_sample(test_dataset, test_dataset->length); mini_batch_gd(nn, 0.001f, batch_dataset); free_dataset(batch_dataset);