From e276a545f781078b71b877de6694b089f7800cd5 Mon Sep 17 00:00:00 2001 From: Lee Hoon Lim Date: Thu, 12 Jun 2025 13:35:21 +0800 Subject: [PATCH 1/6] Added xorshift implementation --- CMakeLists.txt | 1 + src/data.c | 7 ++++--- src/main.c | 4 +++- src/network.c | 7 ++++--- src/prand32.h | 24 ++++++++++++++++++++++++ test/main.cpp | 6 +++++- test/network.cpp | 7 ++++--- 7 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 src/prand32.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4591d9ea1..a354ce47b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ set(PROFILE_FLAG -O3 -p -g) function(apply_target target) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) target_link_libraries(libcneuron PRIVATE ${BLAS_LIBRARIES} m) diff --git a/src/data.c b/src/data.c index 51e319b43..5e71b8ddb 100644 --- a/src/data.c +++ b/src/data.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "prand32.h" #define BACKGROUND_VALUE 0.0f @@ -136,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[rand() % source_dataset->length], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[prand32() % source_dataset->length], source_dataset->inputs_length); } return new_dataset; @@ -242,9 +243,9 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = rand() / (float)RAND_MAX; + float random_value = prand32f(); if (random_value <= probability) { - float noise = (rand() / (float)RAND_MAX * noise_factor); + float noise = prand32f() * noise_factor; 
float new_value = data->inputs[i] + noise; data->inputs[i] = fmin(new_value, 1.0f); diff --git a/src/main.c b/src/main.c index 7f43f0789..e2ac79133 100644 --- a/src/main.c +++ b/src/main.c @@ -7,6 +7,7 @@ #include #include "cneuron/cneuron.h" +#include "prand32.h" const size_t IMAGE_SIZE = 28; @@ -97,7 +98,8 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - srand(time(NULL)); + sprand32(time(NULL)); + dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 7352d4901..4e4c5d463 100644 --- a/src/network.c +++ b/src/network.c @@ -9,11 +9,12 @@ #include #include "cneuron/cneuron.h" +#include "prand32.h" float random_float(float min, float max) { assert(min < max); - return (float)rand() / (float)RAND_MAX * (max - min) + min; + return prand32f() * (max - min) + min; } layer *get_layer(size_t length, size_t prev_length) { @@ -30,7 +31,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = ((float)rand() / (float)RAND_MAX * 2.0f - 1.0f); + new_layer->weights[i] = prand32f() * 2.0f - 1.0f; new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -206,7 +207,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[rand() % test_dataset->length]; + data *test_data = test_dataset->datas[prand32() % test_dataset->length]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/prand32.h b/src/prand32.h new file mode 100644 index 000000000..fc83de1bd --- /dev/null +++ b/src/prand32.h @@ -0,0 +1,24 @@ +#ifndef PRAND32_H +#define PRAND32_H + +#include + +static uint32_t state = 123456; + +static inline 
uint32_t prand32(void) { + state ^= state << 13; + state ^= state >> 17; + state ^= state << 5; + return state; +} + +static inline void sprand32(uint32_t seed) { + state = seed; +} + +// 0.0f - 1.0f +static inline float prand32f(void) { + return (prand32() >> 8) * (1.0f / 16777216.0f); +} + +#endif diff --git a/test/main.cpp b/test/main.cpp index 0a4376670..a245b23ee 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -1,7 +1,11 @@ #include +extern "C" { +#include "prand32.h" +} + int main(int argc, char **argv) { - srand(time(NULL)); + sprand32(time(NULL)); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/test/network.cpp b/test/network.cpp index 4082a2052..da18df588 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -2,6 +2,7 @@ extern "C" { #include "cneuron/cneuron.h" +#include "prand32.h" } #include @@ -133,7 +134,7 @@ TEST(NetworkTest, StochasticGDSingleLayer) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[prand32() % test_dataset->length]); } if (i % 10000 == 0) { printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -160,7 +161,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 500000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.001f, test_dataset->datas[prand32() % test_dataset->length]); } if (i % 100000 == 0) { printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -181,7 +182,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[rand() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[prand32() 
% test_dataset->length]); } if (i % 10000 == 0) { printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); From c47ccf927e7697ef07fc7ce0ec389ffb61c4c5c4 Mon Sep 17 00:00:00 2001 From: LegendaryLHL Date: Thu, 12 Jun 2025 17:44:03 +0800 Subject: [PATCH 2/6] Added auto seed --- include/cneuron/cneuron.h | 9 --------- src/main.c | 8 +++----- src/network.c | 6 ------ src/prand32.c | 27 +++++++++++++++++++++++++++ src/prand32.h | 20 ++++++-------------- test/main.cpp | 1 - test/network.cpp | 12 ------------ 7 files changed, 36 insertions(+), 47 deletions(-) create mode 100644 src/prand32.c diff --git a/include/cneuron/cneuron.h b/include/cneuron/cneuron.h index c2c5c2d7e..5ea530423 100644 --- a/include/cneuron/cneuron.h +++ b/include/cneuron/cneuron.h @@ -157,15 +157,6 @@ typedef struct { float (*activation_function)(float, bool); /**< Pointer to the activation function used in the network. */ } neural_network; -/** - * @brief Generates a random floating-point number within a given range. - * - * @param min Minimum value for the random number. - * @param max Maximum value for the random number. - * @return A random float between min and max. - */ -float random_float(float min, float max); - /** * @brief Allocates and initializes a new layer. 
* diff --git a/src/main.c b/src/main.c index e2ac79133..79376d457 100644 --- a/src/main.c +++ b/src/main.c @@ -40,9 +40,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl dataset *batch_dataset = get_random_dataset_sample(train_dataset, batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, random_float(-3.0f, 3.0f), random_float(-3.0f, 3.0f)); + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-5.0f, 5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(0.9f, 1.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-3.0f, 3.0f), prand32f_range(-3.0f, 3.0f)); noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); } mini_batch_gd(nn, learn_rate, batch_dataset); @@ -98,8 +98,6 @@ dataset *get_mnist(bool is_test) { } int main(int argc, char **argv) { - sprand32(time(NULL)); - dataset *train_dataset = get_mnist(false); dataset *test_dataset = get_mnist(true); size_t network_length = 3; diff --git a/src/network.c b/src/network.c index 4e4c5d463..7a4b291ae 100644 --- a/src/network.c +++ b/src/network.c @@ -11,12 +11,6 @@ #include "cneuron/cneuron.h" #include "prand32.h" -float random_float(float min, float max) { - assert(min < max); - - return prand32f() * (max - min) + min; -} - layer *get_layer(size_t length, size_t prev_length) { layer *new_layer = calloc(1, sizeof(layer)); if (!new_layer) diff --git a/src/prand32.c b/src/prand32.c new file mode 100644 index 000000000..da920c8e0 --- /dev/null +++ b/src/prand32.c @@ -0,0 +1,27 @@ +#include "prand32.h" + +#include "time.h" + +uint32_t state = 123456; + +__attribute__((constructor)) static void auto_seed_prand32(void) { + state = (uint32_t)time(NULL); +} + +inline uint32_t prand32(void) { + state ^= state << 13; + state ^= 
state >> 17; + state ^= state << 5; + return state; +} + +// 0.0f - 1.0f +inline float prand32f(void) { + return (prand32() >> 8) * (1.0f / 16777216.0f); +} + +inline float prand32f_range(float min, float max) { + assert(min < max); + + return prand32f() * (max - min) + min; +} diff --git a/src/prand32.h b/src/prand32.h index fc83de1bd..fd8df24c5 100644 --- a/src/prand32.h +++ b/src/prand32.h @@ -1,24 +1,16 @@ #ifndef PRAND32_H #define PRAND32_H +#include #include -static uint32_t state = 123456; +extern uint32_t state; -static inline uint32_t prand32(void) { - state ^= state << 13; - state ^= state >> 17; - state ^= state << 5; - return state; -} - -static inline void sprand32(uint32_t seed) { - state = seed; -} +uint32_t prand32(void); // 0.0f - 1.0f -static inline float prand32f(void) { - return (prand32() >> 8) * (1.0f / 16777216.0f); -} +float prand32f(void); + +float prand32f_range(float min, float max); #endif diff --git a/test/main.cpp b/test/main.cpp index a245b23ee..b1ee5d525 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -5,7 +5,6 @@ extern "C" { } int main(int argc, char **argv) { - sprand32(time(NULL)); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/test/network.cpp b/test/network.cpp index da18df588..5a2127ebd 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -9,18 +9,6 @@ extern "C" { #include "test_utils.h" -TEST(NetworkTest, RandomFloat) { - float test = random_float(0.0f, 1.0f); - bool same = true; - for (int i = 0; i < 10; i++) { - if (test != random_float(0.0f, 1.0f)) { - same = false; - break; - } - } - ASSERT_FALSE(same); -} - TEST(NetworkTest, GetLayer) { size_t layer_length = 3; layer *test_layer = get_layer(layer_length, 5); From 39280c930ad45743891d10172042e9fcf823f879 Mon Sep 17 00:00:00 2001 From: LegendaryLHL Date: Thu, 12 Jun 2025 18:51:28 +0800 Subject: [PATCH 3/6] Faster random array index --- src/data.c | 5 ++--- src/network.c | 4 ++-- src/prand32.c | 5 ++++- src/prand32.h | 5 ++++- 
test/network.cpp | 6 +++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/data.c b/src/data.c index 5e71b8ddb..3a6d0c5ec 100644 --- a/src/data.c +++ b/src/data.c @@ -137,7 +137,7 @@ dataset *get_random_dataset_sample(const dataset *source_dataset, size_t amount) new_dataset->datas = malloc(sizeof(data) * amount); for (size_t i = 0; i < amount; i++) { - new_dataset->datas[i] = get_data_copy(source_dataset->datas[prand32() % source_dataset->length], source_dataset->inputs_length); + new_dataset->datas[i] = get_data_copy(source_dataset->datas[prand32_index(source_dataset->length)], source_dataset->inputs_length); } return new_dataset; @@ -243,8 +243,7 @@ void noise_data(data *data, size_t inputs_length, float noise_factor, float prob assert(inputs_length > 0); for (size_t i = 0; i < inputs_length; i++) { - float random_value = prand32f(); - if (random_value <= probability) { + if (prand32f() <= probability) { float noise = prand32f() * noise_factor; float new_value = data->inputs[i] + noise; diff --git a/src/network.c b/src/network.c index 7a4b291ae..910f317e5 100644 --- a/src/network.c +++ b/src/network.c @@ -25,7 +25,7 @@ layer *get_layer(size_t length, size_t prev_length) { } for (size_t i = 0; i < length * prev_length; i++) - new_layer->weights[i] = prand32f() * 2.0f - 1.0f; + new_layer->weights[i] = prand32f_range(-1.0, 1.0); new_layer->delta = calloc(length, sizeof(float)); if (!new_layer->delta) { @@ -201,7 +201,7 @@ float cost(neural_network *nn, const dataset *test_dataset, size_t num_test) { layer *output_layer = nn->layers[nn->length - 1]; for (size_t i = 0; i < num_test; i++) { - data *test_data = test_dataset->datas[prand32() % test_dataset->length]; + data *test_data = test_dataset->datas[prand32_index(test_dataset->length)]; compute_network(nn, test_data->inputs); for (size_t j = 0; j < output_layer->length; j++) { float output = output_layer->output[j]; diff --git a/src/prand32.c b/src/prand32.c index da920c8e0..92cba633f 100644 
--- a/src/prand32.c +++ b/src/prand32.c @@ -15,7 +15,10 @@ inline uint32_t prand32(void) { return state; } -// 0.0f - 1.0f +inline uint32_t prand32_index(uint32_t length) { + return ((uint64_t)prand32() * length) >> 32; +} + inline float prand32f(void) { return (prand32() >> 8) * (1.0f / 16777216.0f); } diff --git a/src/prand32.h b/src/prand32.h index fd8df24c5..30a549a42 100644 --- a/src/prand32.h +++ b/src/prand32.h @@ -4,10 +4,13 @@ #include #include -extern uint32_t state; +void ensure_seeded(void); uint32_t prand32(void); +// Select random index [0, length) +uint32_t prand32_index(uint32_t length); + // 0.0f - 1.0f float prand32f(void); diff --git a/test/network.cpp b/test/network.cpp index 5a2127ebd..c5e765c73 100644 --- a/test/network.cpp +++ b/test/network.cpp @@ -122,7 +122,7 @@ TEST(NetworkTest, StochasticGDSingleLayer) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[prand32() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[prand32_index(test_dataset->length)]); } if (i % 10000 == 0) { printf("Single layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -149,7 +149,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 500000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.001f, test_dataset->datas[prand32() % test_dataset->length]); + stochastic_gd(nn, 0.001f, test_dataset->datas[prand32_index(test_dataset->length)]); } if (i % 100000 == 0) { printf("Stochastic Multi layer learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); @@ -170,7 +170,7 @@ TEST(NetworkTest, StochasticGDTests) { for (size_t i = 0; i < 50000; i++) { for (size_t j = 0; j < test_dataset->length; j++) { - stochastic_gd(nn, 0.03f, test_dataset->datas[prand32() % test_dataset->length]); + stochastic_gd(nn, 0.03f, test_dataset->datas[prand32_index(test_dataset->length)]); } if (i % 10000 == 0) { 
printf("Stochastic Non-linearly separable learn cost: %f\n", cost(nn, test_dataset, test_dataset->length)); From 9414471f7f0bc82d38f3363b8319fc94230f2385 Mon Sep 17 00:00:00 2001 From: LegendaryLHL Date: Thu, 12 Jun 2025 21:00:47 +0800 Subject: [PATCH 4/6] What is this --- src/main.c | 33 +++++++++++---- src/network.c | 109 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 119 insertions(+), 23 deletions(-) diff --git a/src/main.c b/src/main.c index 79376d457..895a5214e 100644 --- a/src/main.c +++ b/src/main.c @@ -26,8 +26,30 @@ float relu(float val, bool is_deravative) { return fmax(0.0f, val); } +#include + +typedef struct { + dataset *train_dataset; + size_t batch_size; +} churn; + +dataset *churn_dataset(churn *churn) { + dataset *batch_dataset = get_random_dataset_sample(churn->train_dataset, churn->batch_size); + for (size_t i = 0; i < batch_dataset->length; i++) { + data *data = batch_dataset->datas[i]; + rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-5.0f, 5.0f)); + scale_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(0.9f, 1.1f)); + offset_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-3.0f, 3.0f), prand32f_range(-3.0f, 3.0f)); + noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); + } + return batch_dataset; +} + void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, float learn_rate, int batch_amount, int log_amount, size_t batch_size) { + pthread_t thread; + churn churner = (churn) {.train_dataset=train_dataset, .batch_size=batch_size}; clock_t start_time = clock(); + dataset *batch_dataset = churn_dataset(&churner); for (int i = 0; i < batch_amount; i++) { if (i % log_amount == 0 && i != 0) { float new_cost = cost(nn, test_dataset, 100); @@ -37,16 +59,11 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl printf("Learned: %zu, cost: %f, elapsed time: %.2fs, speed: %.2f Data/s\n", i * batch_size, new_cost, elapsed_s, speed); start_time = clock(); } - dataset 
*batch_dataset = get_random_dataset_sample(train_dataset, batch_size); - for (size_t i = 0; i < batch_dataset->length; i++) { - data *data = batch_dataset->datas[i]; - rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-5.0f, 5.0f)); - scale_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(0.9f, 1.1f)); - offset_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-3.0f, 3.0f), prand32f_range(-3.0f, 3.0f)); - noise_data(data, IMAGE_SIZE * IMAGE_SIZE, 0.3f, 0.08f); - } + + pthread_create(&thread, NULL, (void *(*)(void *))churn_dataset, &churner); mini_batch_gd(nn, learn_rate, batch_dataset); free_dataset(batch_dataset); + pthread_join(thread, (void **)&batch_dataset); } } diff --git a/src/network.c b/src/network.c index 910f317e5..950a8fb58 100644 --- a/src/network.c +++ b/src/network.c @@ -293,38 +293,117 @@ void stochastic_gd(neural_network *nn, float learn_rate, const data *data) { } } -void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { - assert(nn && data_batch); +#include +#include +#include + +#define NUM_THREADS 1 // You can tweak this as needed + +typedef struct { + neural_network *nn; + const dataset *data_batch; + size_t start; + size_t end; + float ***weights_gradients; + float ***bias_gradients; + int thread_index; +} ThreadArgs; + +void *thread_worker(void *arg) { + ThreadArgs *args = (ThreadArgs *)arg; + neural_network *nn = args->nn; - float **weights_gradients = malloc(sizeof(float *) * nn->length); - float **bias_gradients = malloc(sizeof(float *) * nn->length); + float **weights_gradients = args->weights_gradients[args->thread_index]; + float **bias_gradients = args->bias_gradients[args->thread_index]; for (size_t i = 0; i < nn->length; i++) { - weights_gradients[i] = calloc(nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length), sizeof(float)); + size_t weights_size = nn->layers[i]->length * ((i == 0) ? 
nn->inputs_length : nn->layers[i - 1]->length); + weights_gradients[i] = calloc(weights_size, sizeof(float)); bias_gradients[i] = calloc(nn->layers[i]->length, sizeof(float)); } - for (size_t i = 0; i < data_batch->length; i++) { - data *data = data_batch->datas[i]; + for (size_t i = args->start; i < args->end; i++) { + data *data = args->data_batch->datas[i]; compute_network(nn, data->inputs); + for (size_t j = 0; j < nn->length; j++) { size_t layer_index = nn->length - j - 1; layer_learn_collect_gradient(nn, weights_gradients[layer_index], bias_gradients[layer_index], layer_index, data); } } + return NULL; +} + +void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { + assert(nn && data_batch); + + pthread_t threads[NUM_THREADS]; + ThreadArgs args[NUM_THREADS]; + + // Allocate gradient accumulators per thread + float ***all_weights_gradients = malloc(NUM_THREADS * sizeof(float **)); + float ***all_bias_gradients = malloc(NUM_THREADS * sizeof(float **)); + + for (int i = 0; i < NUM_THREADS; i++) { + all_weights_gradients[i] = malloc(nn->length * sizeof(float *)); + all_bias_gradients[i] = malloc(nn->length * sizeof(float *)); + } + + size_t chunk_size = (data_batch->length + NUM_THREADS - 1) / NUM_THREADS; + + // Launch threads + for (int t = 0; t < NUM_THREADS; t++) { + args[t] = (ThreadArgs){ + .nn = nn, + .data_batch = data_batch, + .start = t * chunk_size, + .end = (t + 1) * chunk_size > data_batch->length ? data_batch->length : (t + 1) * chunk_size, + .weights_gradients = all_weights_gradients, + .bias_gradients = all_bias_gradients, + .thread_index = t + }; + pthread_create(&threads[t], NULL, thread_worker, &args[t]); + } + + // Join threads + for (int t = 0; t < NUM_THREADS; t++) { + pthread_join(threads[t], NULL); + } + + // Merge gradients for (size_t i = 0; i < nn->length; i++) { - for (size_t j = 0; j < nn->layers[i]->length * ((i == 0) ? 
nn->inputs_length : nn->layers[i - 1]->length); j++) - nn->layers[i]->weights[j] -= weights_gradients[i][j] / data_batch->length * learn_rate; - for (size_t j = 0; j < nn->layers[i]->length; j++) - nn->layers[i]->bias[j] -= bias_gradients[i][j] / data_batch->length * learn_rate; + size_t weights_size = nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length); + + for (size_t j = 0; j < weights_size; j++) { + float sum = 0.0f; + for (int t = 0; t < NUM_THREADS; t++) { + sum += all_weights_gradients[t][i][j]; + } + nn->layers[i]->weights[j] -= (sum / data_batch->length) * learn_rate; + } + + for (size_t j = 0; j < nn->layers[i]->length; j++) { + float sum = 0.0f; + for (int t = 0; t < NUM_THREADS; t++) { + sum += all_bias_gradients[t][i][j]; + } + nn->layers[i]->bias[j] -= (sum / data_batch->length) * learn_rate; + } + } - free(weights_gradients[i]); - free(bias_gradients[i]); + // Free memory + for (int t = 0; t < NUM_THREADS; t++) { + for (size_t i = 0; i < nn->length; i++) { + free(all_weights_gradients[t][i]); + free(all_bias_gradients[t][i]); + } + free(all_weights_gradients[t]); + free(all_bias_gradients[t]); } - free(weights_gradients); - free(bias_gradients); + free(all_weights_gradients); + free(all_bias_gradients); } bool save_network(const char *filename, neural_network *nn) { From 82fa48060e8d7b93f4da136bd123c841898f20cc Mon Sep 17 00:00:00 2001 From: LegendaryLHL Date: Fri, 13 Jun 2025 09:20:26 +0800 Subject: [PATCH 5/6] Fix multithreading issues by using only one thread --- CMakeLists.txt | 7 +++- src/main.c | 14 ++++++-- src/network.c | 89 +++++++++++++++----------------------------------- 3 files changed, 45 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a354ce47b..892da562f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,10 +27,15 @@ file(GLOB_RECURSE TEST_FILES test/*.cpp) set(ASAN_FLAGS -fsanitize=address -fno-omit-frame-pointer) set(WARNING_FLAGS -Wall -Wextra -Wpedantic) 
-set(RELEASE_FLAGS -O3 -DNDEBUG) +set(RELEASE_FLAGS -O3 -DNDEBUG -march=native) set(PROFILE_FLAG -O3 -p -g) +option(USE_THREADING "Enable multithreading support for batch training" ON) function(apply_target target) + if(USE_THREADING) + target_compile_definitions(${target} PRIVATE USE_THREADING) + endif() + target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/include) target_include_directories(${target} PRIVATE ${CMAKE_SOURCE_DIR}/src) target_compile_options(${target} PRIVATE ${WARNING_FLAGS}) diff --git a/src/main.c b/src/main.c index 895a5214e..ca8efed69 100644 --- a/src/main.c +++ b/src/main.c @@ -6,6 +6,10 @@ #include #include +#ifdef USE_THREADING +#include +#endif + #include "cneuron/cneuron.h" #include "prand32.h" @@ -26,8 +30,6 @@ float relu(float val, bool is_deravative) { return fmax(0.0f, val); } -#include - typedef struct { dataset *train_dataset; size_t batch_size; @@ -46,7 +48,9 @@ dataset *churn_dataset(churn *churn) { } void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, float learn_rate, int batch_amount, int log_amount, size_t batch_size) { +#ifdef USE_THREADING pthread_t thread; +#endif churn churner = (churn) {.train_dataset=train_dataset, .batch_size=batch_size}; clock_t start_time = clock(); dataset *batch_dataset = churn_dataset(&churner); @@ -60,10 +64,16 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl start_time = clock(); } +#ifdef USE_THREADING pthread_create(&thread, NULL, (void *(*)(void *))churn_dataset, &churner); mini_batch_gd(nn, learn_rate, batch_dataset); free_dataset(batch_dataset); pthread_join(thread, (void **)&batch_dataset); +#else + mini_batch_gd(nn, learn_rate, batch_dataset); + free_dataset(batch_dataset); + batch_dataset = churn_dataset(&churner); +#endif } } diff --git a/src/network.c b/src/network.c index 950a8fb58..8ff0df924 100644 --- a/src/network.c +++ b/src/network.c @@ -8,6 +8,10 @@ #include #include +#ifdef USE_THREADING +#include +#endif 
+ #include "cneuron/cneuron.h" #include "prand32.h" @@ -293,19 +297,13 @@ void stochastic_gd(neural_network *nn, float learn_rate, const data *data) { } } -#include -#include -#include - -#define NUM_THREADS 1 // You can tweak this as needed - typedef struct { neural_network *nn; const dataset *data_batch; size_t start; size_t end; - float ***weights_gradients; - float ***bias_gradients; + float **weights_gradients; + float **bias_gradients; int thread_index; } ThreadArgs; @@ -313,8 +311,8 @@ void *thread_worker(void *arg) { ThreadArgs *args = (ThreadArgs *)arg; neural_network *nn = args->nn; - float **weights_gradients = args->weights_gradients[args->thread_index]; - float **bias_gradients = args->bias_gradients[args->thread_index]; + float **weights_gradients = args->weights_gradients; + float **bias_gradients = args->bias_gradients; for (size_t i = 0; i < nn->length; i++) { size_t weights_size = nn->layers[i]->length * ((i == 0) ? nn->inputs_length : nn->layers[i - 1]->length); @@ -322,7 +320,7 @@ void *thread_worker(void *arg) { bias_gradients[i] = calloc(nn->layers[i]->length, sizeof(float)); } - for (size_t i = args->start; i < args->end; i++) { + for (size_t i = 0; i < args->data_batch->length; i++) { data *data = args->data_batch->datas[i]; compute_network(nn, data->inputs); @@ -338,72 +336,39 @@ void *thread_worker(void *arg) { void mini_batch_gd(neural_network *nn, float learn_rate, const dataset *data_batch) { assert(nn && data_batch); - pthread_t threads[NUM_THREADS]; - ThreadArgs args[NUM_THREADS]; + ThreadArgs args; - // Allocate gradient accumulators per thread - float ***all_weights_gradients = malloc(NUM_THREADS * sizeof(float **)); - float ***all_bias_gradients = malloc(NUM_THREADS * sizeof(float **)); + float **weights_gradients = malloc(nn->length * sizeof(float *)); + float **bias_gradients = malloc(nn->length * sizeof(float *)); - for (int i = 0; i < NUM_THREADS; i++) { - all_weights_gradients[i] = malloc(nn->length * sizeof(float *)); - 
all_bias_gradients[i] = malloc(nn->length * sizeof(float *)); - } + args = (ThreadArgs){.nn = nn, .data_batch = data_batch, .weights_gradients = weights_gradients, .bias_gradients = bias_gradients}; - size_t chunk_size = (data_batch->length + NUM_THREADS - 1) / NUM_THREADS; +#ifdef USE_THREADING + pthread_t thread; + pthread_create(&thread, NULL, thread_worker, &args); + pthread_join(thread, NULL); +#else + thread_worker(&args); +#endif - // Launch threads - for (int t = 0; t < NUM_THREADS; t++) { - args[t] = (ThreadArgs){ - .nn = nn, - .data_batch = data_batch, - .start = t * chunk_size, - .end = (t + 1) * chunk_size > data_batch->length ? data_batch->length : (t + 1) * chunk_size, - .weights_gradients = all_weights_gradients, - .bias_gradients = all_bias_gradients, - .thread_index = t - }; - pthread_create(&threads[t], NULL, thread_worker, &args[t]); - } - - // Join threads - for (int t = 0; t < NUM_THREADS; t++) { - pthread_join(threads[t], NULL); - } - - // Merge gradients for (size_t i = 0; i < nn->length; i++) { size_t weights_size = nn->layers[i]->length * ((i == 0) ? 
nn->inputs_length : nn->layers[i - 1]->length); for (size_t j = 0; j < weights_size; j++) { - float sum = 0.0f; - for (int t = 0; t < NUM_THREADS; t++) { - sum += all_weights_gradients[t][i][j]; - } - nn->layers[i]->weights[j] -= (sum / data_batch->length) * learn_rate; + nn->layers[i]->weights[j] -= weights_gradients[i][j] / data_batch->length * learn_rate; } for (size_t j = 0; j < nn->layers[i]->length; j++) { - float sum = 0.0f; - for (int t = 0; t < NUM_THREADS; t++) { - sum += all_bias_gradients[t][i][j]; - } - nn->layers[i]->bias[j] -= (sum / data_batch->length) * learn_rate; + nn->layers[i]->bias[j] -= (bias_gradients[i][j] / data_batch->length) * learn_rate; } } - // Free memory - for (int t = 0; t < NUM_THREADS; t++) { - for (size_t i = 0; i < nn->length; i++) { - free(all_weights_gradients[t][i]); - free(all_bias_gradients[t][i]); - } - free(all_weights_gradients[t]); - free(all_bias_gradients[t]); + for (size_t i = 0; i < nn->length; i++) { + free(weights_gradients[i]); + free(bias_gradients[i]); } - - free(all_weights_gradients); - free(all_bias_gradients); + free(weights_gradients); + free(bias_gradients); } bool save_network(const char *filename, neural_network *nn) { From 764a5479fad5cac1296402202de304e719802577 Mon Sep 17 00:00:00 2001 From: LegendaryLHL Date: Fri, 13 Jun 2025 11:00:49 +0800 Subject: [PATCH 6/6] Fix leak and workflow --- .github/workflows/ci.yml | 2 +- src/main.c | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 67f1986c9..9ac1885f3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: sudo apt-get install -y cmake libgtest-dev clang-format libopenblas-dev - name: Configure CMake - run: cmake -S . -B build + run: cmake -S . 
-B build -DUSE_THREADING=OFF - name: Build run: cmake --build build diff --git a/src/main.c b/src/main.c index ca8efed69..233635d90 100644 --- a/src/main.c +++ b/src/main.c @@ -33,10 +33,10 @@ float relu(float val, bool is_deravative) { typedef struct { dataset *train_dataset; size_t batch_size; -} churn; +} generator_args; -dataset *churn_dataset(churn *churn) { - dataset *batch_dataset = get_random_dataset_sample(churn->train_dataset, churn->batch_size); +dataset *dataset_generator(generator_args *args) { + dataset *batch_dataset = get_random_dataset_sample(args->train_dataset, args->batch_size); for (size_t i = 0; i < batch_dataset->length; i++) { data *data = batch_dataset->datas[i]; rotate_data(data, IMAGE_SIZE, IMAGE_SIZE, prand32f_range(-5.0f, 5.0f)); @@ -51,9 +51,9 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl #ifdef USE_THREADING pthread_t thread; #endif - churn churner = (churn) {.train_dataset=train_dataset, .batch_size=batch_size}; + generator_args args = (generator_args){.train_dataset = train_dataset, .batch_size = batch_size}; clock_t start_time = clock(); - dataset *batch_dataset = churn_dataset(&churner); + dataset *batch_dataset = dataset_generator(&args); for (int i = 0; i < batch_amount; i++) { if (i % log_amount == 0 && i != 0) { float new_cost = cost(nn, test_dataset, 100); @@ -65,16 +65,20 @@ void train(neural_network *nn, dataset *train_dataset, dataset *test_dataset, fl } #ifdef USE_THREADING - pthread_create(&thread, NULL, (void *(*)(void *))churn_dataset, &churner); + pthread_create(&thread, NULL, (void *(*)(void *))dataset_generator, &args); mini_batch_gd(nn, learn_rate, batch_dataset); free_dataset(batch_dataset); - pthread_join(thread, (void **)&batch_dataset); + void *result = NULL; + pthread_join(thread, &result); + batch_dataset = (dataset *)result; #else mini_batch_gd(nn, learn_rate, batch_dataset); free_dataset(batch_dataset); - batch_dataset = churn_dataset(&churner); + batch_dataset = 
dataset_generator(&args); #endif } + // Last dataset not used + free_dataset(batch_dataset); } dataset *get_mnist(bool is_test) { @@ -138,7 +142,7 @@ int main(int argc, char **argv) { // Parameters float learn_rate = 1.5f; size_t batch_size = 30; - int learn_amount = 4800000; + int learn_amount = 48000000; int batch_amount = learn_amount / batch_size; int log_amount = 200; // Log once reached a number of batch