From b507b44b2ea88db83f6ba4060573f0897d60e2ed Mon Sep 17 00:00:00 2001 From: Leo Buron Date: Thu, 9 Apr 2026 15:39:12 +0200 Subject: [PATCH] fix(#69): fix tensorInitWithDistribution buffer overflow and add rank check to doDimensionsMatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tensorInitWithDistribution computed the element count by summing dimensions instead of multiplying them (+= instead of *=, init 0 instead of 1). This caused two simultaneous failures: 1. Buffer overflow for bias tensors with shape [1, N]: sum = 1+N > product = N, so memset wrote one float past the end of the caller's data buffer. 2. Under-initialization for weight tensors with shape [M, N] where M,N > 1: sum = M+N << product = M*N, so only a fraction of values received their intended distribution (e.g. Xavier), with the rest staying at zero. The overflow alone would not explain the crash, but this codebase uses zero-copy shape storage: initTensorWithQFloat calls setShape(shape, dims, ...) which stores shape->dimensions = dims — a raw pointer into the caller's stack frame, not a deep copy. This is a deliberate design choice to minimize heap allocations on MCU targets where memory is scarce. The consequence is that the 4-byte memset overflow from bias0Data into the adjacent bias0Dims array on main()'s stack corrupted dims[0] from 1 to 0. Because the tensor's shape points directly at that array, the bias permanently read shape [0, N] instead of [1, N]. The forward pass then hit doDimensionsMatch comparing output [1, N] against bias [0, N], producing the "Dim 1: 0, Dims 2: 20" error from the issue. Additionally, doDimensionsMatch did not verify that both tensors have the same rank (numberOfDimensions). When ranks differed, the loop iterated using the first tensor's rank and read out of bounds from the second tensor's VLA. This is now a hard error (exit) rather than silent undefined behavior. 
Also: - MnistExperiment: extracted model construction into buildModel(), added CSV header row. Data and dims arrays are declared static so they outlive the function — required by the zero-copy shape/data pointer design (setShape keeps a raw pointer to both). Tests added: - UnitTestTensorApi: 4 tests verifying tensorInitWithDistribution initializes the correct number of elements (product, not sum) for ZEROS, ONES, NORMAL - UnitTestArithmetic: 2 tests for doDimensionsMatch (same shape, different dims) - UnitTestMultiLayerTraining: 3 integration tests exercising a 4-layer model (Linear→ReLU→Linear→Softmax) with CrossEntropy loss — the exact architecture that crashed in MnistExperiment --- experiments/MnistExperiment.c | 104 ++++---- src/arithmetic/Arithmetic.c | 7 +- src/userApi/tensor/TensorApi.c | 4 +- test/unit/arithmetic/UnitTestArithmetic.c | 34 +++ test/unit/tensor/CMakeLists.txt | 9 + test/unit/tensor/UnitTestTensorApi.c | 99 ++++++++ test/unit/userAPI/CMakeLists.txt | 26 +- .../unit/userAPI/UnitTestMultiLayerTraining.c | 235 ++++++++++++++++++ 8 files changed, 466 insertions(+), 52 deletions(-) create mode 100644 test/unit/tensor/UnitTestTensorApi.c create mode 100644 test/unit/userAPI/UnitTestMultiLayerTraining.c diff --git a/experiments/MnistExperiment.c b/experiments/MnistExperiment.c index d665f51..02e57ab 100644 --- a/experiments/MnistExperiment.c +++ b/experiments/MnistExperiment.c @@ -176,12 +176,60 @@ static void epochCallback(size_t epoch, float trainLoss, float evalLoss) { writeCsvRow(LOG, epoch, 0, trainLoss, evalLoss); } +static void writeCsvHeader(char *filePath) { + char *header = "epoch, batch, train_loss, eval_loss\n"; + char *row[] = {header}; + size_t entriesInRow[] = {4}; + csvData_t csvData; + setCSVData(&csvData, row, 1, entriesInRow); + csvWriteRowsByBufferSize(filePath, &csvData, "w"); +} + +#define MODEL_SIZE 4 + +static void buildModel(layer_t **model) { + quantization_t *q = quantizationInitFloat(); + + // Linear 784→20 + static float weight0Data[20 * 28 * 28] = {0}; + static size_t 
weight0Dims[] = {20, 28 * 28}; + tensor_t *weight0Param = tensorInitWithDistribution(XAVIER_UNIFORM, weight0Data, weight0Dims, 2, q, NULL, 28*28, 20); + tensor_t *weight0Grad = gradInitFloat(weight0Param, NULL); + parameter_t *weight0 = parameterInit(weight0Param, weight0Grad); + + static float bias0Data[20] = {0}; + static size_t bias0Dims[] = {1, 20}; + tensor_t *bias0Param = tensorInitWithDistribution(ZEROS, bias0Data, bias0Dims, 2, q, NULL, 1, 20); + tensor_t *bias0Grad = gradInitFloat(bias0Param, NULL); + parameter_t *bias0 = parameterInit(bias0Param, bias0Grad); + + model[0] = linearLayerInit(weight0, bias0, q, q, q, q); + + // ReLU + model[1] = reluLayerInit(q, q); + + // Linear 20→10 + static float weight1Data[10 * 20] = {0}; + static size_t weight1Dims[] = {10, 20}; + tensor_t *weight1Param = tensorInitWithDistribution(XAVIER_UNIFORM, weight1Data, weight1Dims, 2, q, NULL, 20, 10); + tensor_t *weight1Grad = gradInitFloat(weight1Param, NULL); + parameter_t *weight1 = parameterInit(weight1Param, weight1Grad); + + static float bias1Data[10] = {0}; + static size_t bias1Dims[] = {1, 10}; + tensor_t *bias1Param = tensorInitWithDistribution(ZEROS, bias1Data, bias1Dims, 2, q, NULL, 1, 10); + tensor_t *bias1Grad = gradInitFloat(bias1Param, NULL); + parameter_t *bias1 = parameterInit(bias1Param, bias1Grad); + + model[2] = linearLayerInit(weight1, bias1, q, q, q, q); + + // Softmax + model[3] = softmaxLayerInit(q, q); +} + int main(void) { - // this clears the old file - // also creates file if non-existent - FILE *fp = fopen(LOG, "w"); - fclose(fp); + writeCsvHeader(LOG); size_t numberOfEpochs = 10; initDataSets(); @@ -205,52 +253,14 @@ int main(void) { 0, true); - quantization_t *q = quantizationInitFloat(); - - float weight0Data[20 * 28 * 28] = {0}; - size_t weight0Dims[] = {20, 28 * 28}; - size_t weight0NumberOfDims = 2; - tensor_t *weight0Param = tensorInitWithDistribution(XAVIER_UNIFORM, weight0Data, weight0Dims, weight0NumberOfDims, q, NULL, 28*28, 20); - tensor_t *weight0Grad = 
gradInitFloat(weight0Param, NULL); - parameter_t *weight0 = parameterInit(weight0Param, weight0Grad); - - float bias0Data[20] = {0}; - size_t bias0Dims[] = {1, 20}; - size_t bias0NumberOfDims = 2; - tensor_t *bias0Param = tensorInitWithDistribution(ZEROS, bias0Data, bias0Dims, bias0NumberOfDims, q, NULL, 1, 20); - tensor_t *bias0Grad = gradInitFloat(bias0Param, NULL); - parameter_t *bias0 = parameterInit(bias0Param, bias0Grad); - - layer_t *linear0 = linearLayerInit(weight0, bias0, q, q, q, q); - - layer_t *relu = reluLayerInit(q, q); - - float weight1Data[10 * 20] = {0}; - size_t weight1Dims[] = {10, 20}; - size_t weight1NumberOfDims = 2; - tensor_t *weight1Param = tensorInitWithDistribution(XAVIER_UNIFORM, weight1Data, weight1Dims, weight1NumberOfDims, q, NULL, 20, 10); - tensor_t *weight1Grad = gradInitFloat(weight1Param, NULL); - parameter_t *weight1 = parameterInit(weight1Param, weight1Grad); - - float bias1Data[10] = {0}; - size_t bias1Dims[] = {1, 10}; - size_t bias1NumberOfDims = 2; - tensor_t *bias1Param = tensorInitWithDistribution(ZEROS, bias1Data, bias1Dims, bias1NumberOfDims, q, NULL, 1, 10); - tensor_t *bias1Grad = gradInitFloat(bias1Param, NULL); - parameter_t *bias1 = parameterInit(bias1Param, bias1Grad); - - layer_t *linear1 = linearLayerInit(weight1, bias1, q, q, q, q); - - layer_t *softmax = softmaxLayerInit(q, q); - - layer_t *model[] = {linear0, relu, linear1, softmax}; - size_t sizeModel = 4; + layer_t *model[MODEL_SIZE]; + buildModel(model); - optimizer_t *sgd = sgdMCreateOptim(0.001f, 0.f, 0.f, model, sizeModel, FLOAT32); + optimizer_t *sgd = sgdMCreateOptim(0.001f, 0.f, 0.f, model, MODEL_SIZE, FLOAT32); clock_t start = clock(); - trainingRunResult_t result = trainingRun(model, sizeModel, CROSS_ENTROPY, + trainingRunResult_t result = trainingRun(model, MODEL_SIZE, CROSS_ENTROPY, trainDataloader, testDataloader, sgd, numberOfEpochs, calculateGradsSequential, inferenceWithLoss, epochCallback); @@ -261,7 +271,7 @@ int main(void) { 
PRINT_INFO("Training finished in %f seconds\n", duration_sec); PRINT_INFO("Final train loss: %f, eval loss: %f\n", result.finalTrainLoss, result.finalEvalLoss); - float accuracy = evaluationEpochAccuracy(model, sizeModel, testDataloader, 10, inference); + float accuracy = evaluationEpochAccuracy(model, MODEL_SIZE, testDataloader, 10, inference); PRINT_INFO("Integration test accuracy: %.2f%%\n", accuracy * 100.0f); } diff --git a/src/arithmetic/Arithmetic.c b/src/arithmetic/Arithmetic.c index 213761a..6271385 100644 --- a/src/arithmetic/Arithmetic.c +++ b/src/arithmetic/Arithmetic.c @@ -30,6 +30,11 @@ bool doDimensionsMatch(tensor_t *a, tensor_t *b) { size_t aNumberOfDims = a->shape->numberOfDimensions; size_t bNumberOfDims = b->shape->numberOfDimensions; + if (aNumberOfDims != bNumberOfDims) { + PRINT_ERROR("Rank mismatch: %zu vs %zu\n", aNumberOfDims, bNumberOfDims); + exit(1); + } + size_t aOrderedDims[aNumberOfDims]; size_t bOrderedDims[bNumberOfDims]; @@ -43,7 +48,7 @@ bool doDimensionsMatch(tensor_t *a, tensor_t *b) { } } return true; -}; +} size_t calcTensorIndexByIndices(size_t numberOfDimensions, size_t *dimensions, size_t *indices) { size_t index = indices[numberOfDimensions - 1]; diff --git a/src/userApi/tensor/TensorApi.c b/src/userApi/tensor/TensorApi.c index e9d5033..0ea8d57 100644 --- a/src/userApi/tensor/TensorApi.c +++ b/src/userApi/tensor/TensorApi.c @@ -83,9 +83,9 @@ tensor_t *tensorInitWithDistribution(distributionType_t distributionType, float size_t numberOfDims, quantization_t *quantization, sparsity_t *sparsity, size_t inputFeatures, size_t outputFeatures) { - size_t numberOfValues = 0; + size_t numberOfValues = 1; for (size_t i = 0; i < numberOfDims; i++) { - numberOfValues += dims[i]; + numberOfValues *= dims[i]; } switch (distributionType) { diff --git a/test/unit/arithmetic/UnitTestArithmetic.c b/test/unit/arithmetic/UnitTestArithmetic.c index e28cadd..aca38b7 100644 --- a/test/unit/arithmetic/UnitTestArithmetic.c +++ 
b/test/unit/arithmetic/UnitTestArithmetic.c @@ -168,6 +168,37 @@ void testFloat32ElementWithTensorArithmetic() { } +void testDoDimensionsMatch_SameShape_ReturnsTrue() { + size_t aDims[] = {2, 3}; + size_t aOrder[] = {0, 1}; + shape_t aShape = {.dimensions = aDims, .orderOfDimensions = aOrder, .numberOfDimensions = 2}; + tensor_t a = {.shape = &aShape}; + + size_t bDims[] = {2, 3}; + size_t bOrder[] = {0, 1}; + shape_t bShape = {.dimensions = bDims, .orderOfDimensions = bOrder, .numberOfDimensions = 2}; + tensor_t b = {.shape = &bShape}; + + TEST_ASSERT_TRUE(doDimensionsMatch(&a, &b)); +} + +void testDoDimensionsMatch_DifferentDims_ReturnsFalse() { + size_t aDims[] = {2, 3}; + size_t aOrder[] = {0, 1}; + shape_t aShape = {.dimensions = aDims, .orderOfDimensions = aOrder, .numberOfDimensions = 2}; + tensor_t a = {.shape = &aShape}; + + size_t bDims[] = {2, 4}; + size_t bOrder[] = {0, 1}; + shape_t bShape = {.dimensions = bDims, .orderOfDimensions = bOrder, .numberOfDimensions = 2}; + tensor_t b = {.shape = &bShape}; + + TEST_ASSERT_FALSE(doDimensionsMatch(&a, &b)); +} + +// NOTE: doDimensionsMatch now calls exit(1) on rank mismatch — cannot test with Unity. +// The fix is verified by: different-rank inputs no longer silently read out of bounds. 
+ void setUp() {} void tearDown() {} @@ -178,6 +209,9 @@ int main(void) { RUN_TEST(testCalcIndexByRawIndex); RUN_TEST(testInt32PointWiseArithmetic); RUN_TEST(testFloat32ElementWithTensorArithmetic); + RUN_TEST(testDoDimensionsMatch_SameShape_ReturnsTrue); + RUN_TEST(testDoDimensionsMatch_DifferentDims_ReturnsFalse); + // testDoDimensionsMatch_DifferentRank — now exit(1)s, verified by code review return UNITY_END(); } diff --git a/test/unit/tensor/CMakeLists.txt b/test/unit/tensor/CMakeLists.txt index 25752e6..f31cb9a 100644 --- a/test/unit/tensor/CMakeLists.txt +++ b/test/unit/tensor/CMakeLists.txt @@ -20,4 +20,13 @@ add_elastic_ai_unit_test( Tensor Rounding Quantization +) +add_elastic_ai_unit_test( + LIB_UNDER_TEST + TensorApi + MORE_LIBS + Tensor + Rounding + Quantization + StorageApi ) \ No newline at end of file diff --git a/test/unit/tensor/UnitTestTensorApi.c b/test/unit/tensor/UnitTestTensorApi.c new file mode 100644 index 0000000..53ee284 --- /dev/null +++ b/test/unit/tensor/UnitTestTensorApi.c @@ -0,0 +1,99 @@ +#define SOURCE_FILE "UNIT_TEST_TENSOR_API" + +#include <stddef.h> +#include <string.h> + +#include "TensorApi.h" +#include "Tensor.h" +#include "Quantization.h" +#include "unity.h" + +void setUp() {} +void tearDown() {} + +void testTensorInitWithDistribution_Zeros_InitializesProductOfDimsValues() { + // dims = {2, 5} → product = 10, sum = 7 + // Bug: += gives 7, *= gives 10 + // Fill data with sentinel 42.0f, then ZEROS should overwrite exactly 10 values + float data[10]; + for (size_t i = 0; i < 10; i++) { + data[i] = 42.0f; + } + size_t dims[] = {2, 5}; + quantization_t q; + initFloat32Quantization(&q); + + tensor_t *t = tensorInitWithDistribution(ZEROS, data, dims, 2, &q, NULL, 2, 5); + + // All 10 values should be zero + float *values = (float *)t->data; + for (size_t i = 0; i < 10; i++) { + TEST_ASSERT_FLOAT_WITHIN(1e-9f, 0.0f, values[i]); + } +} + +void testTensorInitWithDistribution_Ones_InitializesAllValues() { + // dims = {3, 4} → product = 12, sum = 7 + // 
Fill data with 0.0f, then ONES should set exactly 12 values to 1.0f + float data[12]; + memset(data, 0, sizeof(data)); + size_t dims[] = {3, 4}; + quantization_t q; + initFloat32Quantization(&q); + + tensor_t *t = tensorInitWithDistribution(ONES, data, dims, 2, &q, NULL, 3, 4); + + float *values = (float *)t->data; + for (size_t i = 0; i < 12; i++) { + TEST_ASSERT_FLOAT_WITHIN(1e-9f, 1.0f, values[i]); + } +} + +void testTensorInitWithDistribution_Normal_InitializesAllValues() { + // dims = {4, 5} → product = 20, sum = 9 + // If only 9 values are initialized, remaining 11 stay at sentinel + float data[20]; + float sentinel = -999.0f; + for (size_t i = 0; i < 20; i++) { + data[i] = sentinel; + } + size_t dims[] = {4, 5}; + quantization_t q; + initFloat32Quantization(&q); + + tensor_t *t = tensorInitWithDistribution(NORMAL, data, dims, 2, &q, NULL, 4, 5); + + // With NORMAL distribution, values should NOT be the sentinel + float *values = (float *)t->data; + size_t sentinelCount = 0; + for (size_t i = 0; i < 20; i++) { + if (values[i] == sentinel) { + sentinelCount++; + } + } + // All 20 values should have been overwritten — none should remain as sentinel + TEST_ASSERT_EQUAL_UINT(0, sentinelCount); +} + +void testTensorInitWithDistribution_ShapeIsCorrect() { + // Verify the resulting tensor has the correct shape dimensions + float data[6] = {0}; + size_t dims[] = {2, 3}; + quantization_t q; + initFloat32Quantization(&q); + + tensor_t *t = tensorInitWithDistribution(ZEROS, data, dims, 2, &q, NULL, 2, 3); + + TEST_ASSERT_EQUAL_UINT(2, t->shape->numberOfDimensions); + size_t numElements = calcNumberOfElementsByTensor(t); + TEST_ASSERT_EQUAL_UINT(6, numElements); +} + +int main(void) { + UNITY_BEGIN(); + RUN_TEST(testTensorInitWithDistribution_Zeros_InitializesProductOfDimsValues); + RUN_TEST(testTensorInitWithDistribution_Ones_InitializesAllValues); + RUN_TEST(testTensorInitWithDistribution_Normal_InitializesAllValues); + 
RUN_TEST(testTensorInitWithDistribution_ShapeIsCorrect); + return UNITY_END(); +} diff --git a/test/unit/userAPI/CMakeLists.txt b/test/unit/userAPI/CMakeLists.txt index 1cc38b2..178afb7 100644 --- a/test/unit/userAPI/CMakeLists.txt +++ b/test/unit/userAPI/CMakeLists.txt @@ -16,13 +16,34 @@ add_elastic_ai_unit_test( LIB_UNDER_TEST TrainingLoopApi MORE_LIBS - CalculateGradsSequential + TrainingEpochDefault TrainingBatchDefault + CalculateGradsSequential + CommonLayerLibs + TensorApi + LinearApi + ReluApi + SgdApi + QuantizationApi + LossFunction + InferenceApi + DataLoader + DataLoaderApi + StorageApi +) + +add_executable(UnitTestMultiLayerTraining UnitTestMultiLayerTraining.c) +target_link_libraries(UnitTestMultiLayerTraining PRIVATE + unity + TrainingLoopApi TrainingEpochDefault + TrainingBatchDefault + CalculateGradsSequential CommonLayerLibs TensorApi LinearApi ReluApi + SoftmaxApi SgdApi QuantizationApi LossFunction @@ -30,4 +51,5 @@ add_elastic_ai_unit_test( DataLoader DataLoaderApi StorageApi -) \ No newline at end of file +) +__register_target_as_unit_test(UnitTestMultiLayerTraining) \ No newline at end of file diff --git a/test/unit/userAPI/UnitTestMultiLayerTraining.c b/test/unit/userAPI/UnitTestMultiLayerTraining.c new file mode 100644 index 0000000..770d8cd --- /dev/null +++ b/test/unit/userAPI/UnitTestMultiLayerTraining.c @@ -0,0 +1,235 @@ +#define SOURCE_FILE "UNIT_TEST_MULTI_LAYER_TRAINING" + +#include <stdbool.h> + +#include "LossFunction.h" +#include "TensorApi.h" +#include "LinearApi.h" +#include "ReluApi.h" +#include "SoftmaxApi.h" +#include "SgdApi.h" +#include "unity.h" +#include "TrainingLoopApi.h" +#include "CalculateGradsSequential.h" +#include "TrainingBatchDefault.h" +#include "QuantizationApi.h" +#include "Tensor.h" +#include "StorageApi.h" +#include "InferenceApi.h" +#include "DataLoaderApi.h" +#include "Dataset.h" + +void setUp() {} +void tearDown() {} + +/*! Integration test: multi-layer model (Linear→ReLU→Linear→Softmax) with CrossEntropy. 
+ * Reproduces the MnistExperiment structure at small scale (3→4→2). + * Uses tensorInitWithDistribution to init bias with ZEROS — exposes the += vs *= bug. + */ +void testMultiLayerBackward_WithCrossEntropy_DoesNotCrash() { + quantization_t *q = quantizationInitFloat(); + + // Layer 0: Linear 3→4 + float w0Data[4 * 3] = {0}; + size_t w0Dims[] = {4, 3}; + tensor_t *w0Param = tensorInitWithDistribution(ZEROS, w0Data, w0Dims, 2, q, NULL, 3, 4); + tensor_t *w0Grad = gradInitFloat(w0Param, NULL); + parameter_t *w0 = parameterInit(w0Param, w0Grad); + + float b0Data[4] = {0}; + size_t b0Dims[] = {1, 4}; + tensor_t *b0Param = tensorInitWithDistribution(ZEROS, b0Data, b0Dims, 2, q, NULL, 1, 4); + tensor_t *b0Grad = gradInitFloat(b0Param, NULL); + parameter_t *b0 = parameterInit(b0Param, b0Grad); + + layer_t *linear0 = linearLayerInit(w0, b0, q, q, q, q); + layer_t *relu = reluLayerInit(q, q); + + // Layer 1: Linear 4→2 + float w1Data[2 * 4] = {0}; + size_t w1Dims[] = {2, 4}; + tensor_t *w1Param = tensorInitWithDistribution(ZEROS, w1Data, w1Dims, 2, q, NULL, 4, 2); + tensor_t *w1Grad = gradInitFloat(w1Param, NULL); + parameter_t *w1 = parameterInit(w1Param, w1Grad); + + float b1Data[2] = {0}; + size_t b1Dims[] = {1, 2}; + tensor_t *b1Param = tensorInitWithDistribution(ZEROS, b1Data, b1Dims, 2, q, NULL, 1, 2); + tensor_t *b1Grad = gradInitFloat(b1Param, NULL); + parameter_t *b1 = parameterInit(b1Param, b1Grad); + + layer_t *linear1 = linearLayerInit(w1, b1, q, q, q, q); + layer_t *softmax = softmaxLayerInit(q, q); + + layer_t *model[] = {linear0, relu, linear1, softmax}; + size_t sizeModel = 4; + + // Input: [1, 3], Label: [1, 2] (one-hot) + float inputData[] = {1.0f, 2.0f, 3.0f}; + size_t inputDims[] = {1, 3}; + tensor_t *input = tensorInitFloat(inputData, inputDims, 2, NULL); + + float labelData[] = {1.0f, 0.0f}; + size_t labelDims[] = {1, 2}; + tensor_t *label = tensorInitFloat(labelData, labelDims, 2, NULL); + + // This is the call that crashes in the MnistExperiment + 
trainingStats_t *stats = calculateGradsSequential(model, sizeModel, CROSS_ENTROPY, + input, label); + + TEST_ASSERT_NOT_NULL(stats); + // Loss should be finite and non-negative + TEST_ASSERT_TRUE(stats->loss >= 0.0f); + + freeTrainingStats(stats); +} + +/*! Integration test: same as above but using tensorInitFloat (no distribution). + * This should always work — validates the backward pass logic itself is correct. + */ +void testMultiLayerBackward_WithManualInit_DoesNotCrash() { + quantization_t testQ; + initFloat32Quantization(&testQ); + + // Layer 0: Linear 3→4 + float w0Data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, + 0.7f, 0.8f, 0.9f, 1.0f, 1.1f, 1.2f}; + size_t w0Dims[] = {4, 3}; + tensor_t *w0Param = tensorInitFloat(w0Data, w0Dims, 2, NULL); + float w0GradData[12] = {0}; + tensor_t *w0Grad = tensorInitFloat(w0GradData, w0Dims, 2, NULL); + parameter_t *w0 = parameterInit(w0Param, w0Grad); + + float b0Data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + size_t b0Dims[] = {1, 4}; + tensor_t *b0Param = tensorInitFloat(b0Data, b0Dims, 2, NULL); + float b0GradData[] = {0.0f, 0.0f, 0.0f, 0.0f}; + tensor_t *b0Grad = tensorInitFloat(b0GradData, b0Dims, 2, NULL); + parameter_t *b0 = parameterInit(b0Param, b0Grad); + + layer_t *linear0 = linearLayerInit(w0, b0, &testQ, &testQ, &testQ, &testQ); + layer_t *relu = reluLayerInit(&testQ, &testQ); + + // Layer 1: Linear 4→2 + float w1Data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}; + size_t w1Dims[] = {2, 4}; + tensor_t *w1Param = tensorInitFloat(w1Data, w1Dims, 2, NULL); + float w1GradData[8] = {0}; + tensor_t *w1Grad = tensorInitFloat(w1GradData, w1Dims, 2, NULL); + parameter_t *w1 = parameterInit(w1Param, w1Grad); + + float b1Data[] = {0.0f, 0.0f}; + size_t b1Dims[] = {1, 2}; + tensor_t *b1Param = tensorInitFloat(b1Data, b1Dims, 2, NULL); + float b1GradData[] = {0.0f, 0.0f}; + tensor_t *b1Grad = tensorInitFloat(b1GradData, b1Dims, 2, NULL); + parameter_t *b1 = parameterInit(b1Param, b1Grad); + + layer_t *linear1 = 
linearLayerInit(w1, b1, &testQ, &testQ, &testQ, &testQ); + layer_t *softmax = softmaxLayerInit(&testQ, &testQ); + + layer_t *model[] = {linear0, relu, linear1, softmax}; + size_t sizeModel = 4; + + float inputData[] = {1.0f, 2.0f, 3.0f}; + size_t inputDims[] = {1, 3}; + tensor_t *input = tensorInitFloat(inputData, inputDims, 2, NULL); + + float labelData[] = {1.0f, 0.0f}; + size_t labelDims[] = {1, 2}; + tensor_t *label = tensorInitFloat(labelData, labelDims, 2, NULL); + + trainingStats_t *stats = calculateGradsSequential(model, sizeModel, CROSS_ENTROPY, + input, label); + + TEST_ASSERT_NOT_NULL(stats); + TEST_ASSERT_TRUE(stats->loss >= 0.0f); + + // Verify bias grads were accumulated (not zero after backward) + float *b1GradValues = (float *)b1Grad->data; + bool anyNonZero = false; + for (size_t i = 0; i < 2; i++) { + if (b1GradValues[i] != 0.0f) { + anyNonZero = true; + break; + } + } + TEST_ASSERT_TRUE(anyNonZero); + + freeTrainingStats(stats); +} + +/*! Integration test: run multiple training steps to verify grad accumulation is stable. 
*/ +void testMultiLayerTraining_MultipleSteps_GradsAccumulate() { + quantization_t testQ; + initFloat32Quantization(&testQ); + + float w0Data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, + 0.7f, 0.8f, 0.9f, 1.0f, 1.1f, 1.2f}; + size_t w0Dims[] = {4, 3}; + tensor_t *w0Param = tensorInitFloat(w0Data, w0Dims, 2, NULL); + float w0GradData[12] = {0}; + tensor_t *w0Grad = tensorInitFloat(w0GradData, w0Dims, 2, NULL); + parameter_t *w0 = parameterInit(w0Param, w0Grad); + + float b0Data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + size_t b0Dims[] = {1, 4}; + tensor_t *b0Param = tensorInitFloat(b0Data, b0Dims, 2, NULL); + float b0GradData[] = {0.0f, 0.0f, 0.0f, 0.0f}; + tensor_t *b0Grad = tensorInitFloat(b0GradData, b0Dims, 2, NULL); + parameter_t *b0 = parameterInit(b0Param, b0Grad); + + layer_t *linear0 = linearLayerInit(w0, b0, &testQ, &testQ, &testQ, &testQ); + layer_t *relu = reluLayerInit(&testQ, &testQ); + + float w1Data[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f}; + size_t w1Dims[] = {2, 4}; + tensor_t *w1Param = tensorInitFloat(w1Data, w1Dims, 2, NULL); + float w1GradData[8] = {0}; + tensor_t *w1Grad = tensorInitFloat(w1GradData, w1Dims, 2, NULL); + parameter_t *w1 = parameterInit(w1Param, w1Grad); + + float b1Data[] = {0.0f, 0.0f}; + size_t b1Dims[] = {1, 2}; + tensor_t *b1Param = tensorInitFloat(b1Data, b1Dims, 2, NULL); + float b1GradData[] = {0.0f, 0.0f}; + tensor_t *b1Grad = tensorInitFloat(b1GradData, b1Dims, 2, NULL); + parameter_t *b1 = parameterInit(b1Param, b1Grad); + + layer_t *linear1 = linearLayerInit(w1, b1, &testQ, &testQ, &testQ, &testQ); + layer_t *softmax = softmaxLayerInit(&testQ, &testQ); + + layer_t *model[] = {linear0, relu, linear1, softmax}; + size_t sizeModel = 4; + + optimizer_t *sgd = sgdMCreateOptim(0.01f, 0.f, 0.f, model, sizeModel, FLOAT32); + optimizerFunctions_t sgdFns = optimizerFunctions[SGD_M]; + + float inputData[] = {1.0f, 2.0f, 3.0f}; + size_t inputDims[] = {1, 3}; + tensor_t *input = tensorInitFloat(inputData, inputDims, 2, NULL); + + 
float labelData[] = {1.0f, 0.0f}; + size_t labelDims[] = {1, 2}; + tensor_t *label = tensorInitFloat(labelData, labelDims, 2, NULL); + + // Run 3 training steps + for (size_t step = 0; step < 3; step++) { + trainingStats_t *stats = calculateGradsSequential(model, sizeModel, CROSS_ENTROPY, + input, label); + TEST_ASSERT_NOT_NULL(stats); + TEST_ASSERT_TRUE(stats->loss >= 0.0f); + freeTrainingStats(stats); + + sgdFns.step(sgd); + sgdFns.zero(sgd); + } +} + +int main(void) { + UNITY_BEGIN(); + RUN_TEST(testMultiLayerBackward_WithCrossEntropy_DoesNotCrash); + RUN_TEST(testMultiLayerBackward_WithManualInit_DoesNotCrash); + RUN_TEST(testMultiLayerTraining_MultipleSteps_GradsAccumulate); + return UNITY_END(); +}