From 5f17daf14427233ba449a1cdd54e6cf7824e1ddf Mon Sep 17 00:00:00 2001
From: sourcepirate <sathyanarrayanan@yandex.com>
Date: Sun, 31 May 2026 10:40:47 +0530
Subject: [PATCH 1/2] Updated tests and no autograd docs

---
 Agents.md                                     |   3 +-
 README.md                                     |   6 +
 docs/layers/base.md                           |  36 ++++
 neutro/layers/core/dropout.py                 |   3 +
 tests/engine/test_node.py                     |  46 +++++
 tests/layers/core/test_dropout.py             | 135 +++++++++++++-
 tests/layers/core/test_input_layer.py         |  52 ++++++
 tests/layers/core/test_merging_coverage.py    | 123 +++++++++++++
 tests/layers/core/test_reparameterization.py  |  36 ++++
 tests/layers/embedding/test_time_embedding.py |  22 +++
 tests/layers/normalization/test_batchnorm.py  |  14 ++
 tests/layers/pooling/test_global_pooling.py   |  18 ++
 tests/models/test_model_coverage.py           | 166 ++++++++++++++++++
 tests/models/test_model_coverage2.py          | 156 ++++++++++++++++
 tests/test_preprocessing.py                   |  44 +++++
 tests/test_preprocessing_sequence.py          | 106 +++++++++++
 tests/test_preprocessing_text.py              | 114 ++++++++++++
 tests/tokenizers/test_tiktoken_compat.py      |  43 +++++
 tests/utils/test_data_utils.py                |  42 +++++
 19 files changed, 1156 insertions(+), 9 deletions(-)
 create mode 100644 tests/engine/test_node.py
 create mode 100644 tests/layers/core/test_input_layer.py
 create mode 100644 tests/layers/core/test_merging_coverage.py
 create mode 100644 tests/models/test_model_coverage.py
 create mode 100644 tests/models/test_model_coverage2.py
 create mode 100644 tests/test_preprocessing_sequence.py
 create mode 100644 tests/test_preprocessing_text.py
 create mode 100644 tests/utils/test_data_utils.py

diff --git a/Agents.md b/Agents.md
index 90652e4..bf4dea6 100644
--- a/Agents.md
+++ b/Agents.md
@@ -8,7 +8,8 @@ You are an agent working on `neutro`, an "intentionally naive" and educational i
 2.  **Keras API Fidelity**: Maintain strict compatibility with Keras/TensorFlow APIs (`compile`, `fit`, `predict`, `evaluate`, `summary`, `Sequential`, `Model`).
 3.  **Educational Clarity**: Code should be readable and reflect the underlying mathematical algorithms (e.g., FlashAttention, MoE routing, RoPE). Use clear variable names and minimal but impactful comments.
 4.  **No Magic**: Avoid complex meta-programming or obscure libraries. If a layer needs a backward pass, implement it explicitly.
-5.  **Nested Training**: Ensure that nested layers (layers within blocks) are discovered and updated by the optimizer. Use `Layer.sublayers` to traverse the hierarchy.
+5.  **No Autograd**: `neutro` has no automatic differentiation engine. There is no equivalent of PyTorch's `autograd` or JAX's `grad`. Every layer MUST implement its own `backward(grad_output)` that manually computes gradients using the chain rule. This is the defining educational feature of the library — you *are* the autograd engine.
+6.  **Nested Training**: Ensure that nested layers (layers within blocks) are discovered and updated by the optimizer. Use `Layer.sublayers` to traverse the hierarchy.
 
 ## Implementation Details
 
diff --git a/README.md b/README.md
index e037a17..aec670f 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,12 @@ Let's be honest: modern DL frameworks are black boxes. You pip install 4GB of bi
 - **A Toy, not a Tool**: This isn't meant for production. It's a playground for learning advanced algorithms (MHA, GQA, FlashAttention, LSTM) in their purest form.
 - **For the Wisdom-Rich**: If you remember when 64MB of RAM was a flex and "vectorization" meant loop unrolling, this is for you. It's a fun way to play with cutting-edge 2024 algorithms using 1990s-era clarity.
 
+## 🚫 No Autograd
+
+Unlike PyTorch or TensorFlow, `neutro` has **zero automatic differentiation**. You will not find an `autograd` engine here. Every gradient is computed by hand — each layer implements its own `backward` method using explicit matrix multiplications and the chain rule.
+
+This is not a bug; it's the feature. Writing `self.grads['W'] = self.inputs.T @ grad_output` is how you *learn* what backpropagation actually does.
+
 ---
 
 ## 🚀 What's Inside?
diff --git a/docs/layers/base.md b/docs/layers/base.md
index 6cbf5a2..c4415ef 100644
--- a/docs/layers/base.md
+++ b/docs/layers/base.md
@@ -247,6 +247,42 @@ y = layer(x)
 8. `Dense.forward` computes `np.dot(x, W) + b`, applies ReLU, caches `self.inputs` and `self.z`, returns the output.
 9. Later, `layer.backward(grad_output)` uses those cached values to compute weight gradients.
 
+## 🚫 No Autograd — You Write the Gradients
+
+This is the single most important thing to understand about `neutro`:
+
+**There is no automatic differentiation engine.**
+
+In PyTorch, you write:
+
+```python
+y = x @ W + b      # PyTorch traces this into a graph
+y.sum().backward() # PyTorch automatically computes gradients for W and b
+```
+
+In `neutro`, you write both `forward` AND `backward`:
+
+```python
+def forward(self, x):
+    self.inputs = x
+    return x @ self.params['W'] + self.params['b']
+
+def backward(self, grad_output):
+    self.grads['W'] = self.inputs.T @ grad_output
+    self.grads['b'] = np.sum(grad_output, axis=0)
+    return grad_output @ self.params['W'].T
+```
+
+Why? Because every operation you write in `backward` — every `@`, every `np.sum`, every `reshape` — is an explicit application of the **chain rule**. You are not calling `loss.backward()`. You *are* the autograd engine.
+
+This means:
+- **If you add a new layer**, you must implement `backward` yourself — no framework will do it for you.
+- **If you change the forward pass**, you must update backward to match. Every new line in `forward` probably needs a corresponding line in `backward`.
+- **If backward gives wrong shapes**, you'll get a NumPy shape mismatch error — not a cryptic autograd graph error. You'll learn to think in shapes.
+- **Every value you cache on `self` in `forward`** (like `self.inputs` or `self.z`) is cached for one reason: `backward` needs it. There is no tape, no graph, no magic — just stored NumPy arrays and chain rule math.
+
+This is the defining educational feature of the library. You can't hand-wave through gradient descent here. You must understand where gradients come from.
+
 ## Try it yourself
 
 Here's how you'd create a custom `MyDense` layer from scratch:
diff --git a/neutro/layers/core/dropout.py b/neutro/layers/core/dropout.py
index 53a82ac..abbdf63 100644
--- a/neutro/layers/core/dropout.py
+++ b/neutro/layers/core/dropout.py
@@ -13,6 +13,9 @@ def forward(self, inputs, training=False):
         self.mask = np.random.binomial(1, 1 - self.rate, size=inputs.shape) / (1 - self.rate)
         return inputs * self.mask
 
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
     def backward(self, grad_output):
         if self.mask is None:
             return grad_output
diff --git a/tests/engine/test_node.py b/tests/engine/test_node.py
new file mode 100644
index 0000000..43c73b5
--- /dev/null
+++ b/tests/engine/test_node.py
@@ -0,0 +1,46 @@
+import numpy as np
+from neutro.engine.node import KerasTensor, Node
+
+
+class FakeLayer:
+    def __init__(self):
+        self.name = "fake_layer"
+
+
+def test_keras_tensor_repr():
+    t = KerasTensor(shape=(None, 32, 32, 3), name="input")
+    r = repr(t)
+    assert "KerasTensor" in r
+    assert "(None, 32, 32, 3)" in r
+    assert "input" in r
+
+
+def test_node_single_output():
+    layer = FakeLayer()
+    output = KerasTensor(shape=(None, 10), name="output")
+    node = Node(layer, input_tensors=[], output_tensors=output)
+
+    assert node.layer is layer
+    assert node.output_tensors is output
+    assert output.node is node
+    assert layer._inbound_nodes == [node]
+
+
+def test_node_list_output():
+    layer = FakeLayer()
+    out1 = KerasTensor(shape=(None, 5))
+    out2 = KerasTensor(shape=(None, 3))
+    node = Node(layer, input_tensors=[], output_tensors=[out1, out2])
+
+    assert node.output_tensors == [out1, out2]
+    assert out1.node is node
+    assert out2.node is node
+
+
+def test_node_repr():
+    layer = FakeLayer()
+    output = KerasTensor(shape=(None, 10))
+    node = Node(layer, input_tensors=[], output_tensors=output)
+    r = repr(node)
+    assert "Node" in r
+    assert "fake_layer" in r
diff --git a/tests/layers/core/test_dropout.py b/tests/layers/core/test_dropout.py
index 037f2d3..64418e8 100644
--- a/tests/layers/core/test_dropout.py
+++ b/tests/layers/core/test_dropout.py
@@ -1,17 +1,136 @@
 import numpy as np
+import pytest
 from neutro.layers.core.dropout import Dropout
+from neutro.models.base_model import Sequential
 
-def test_dropout():
+
+def test_dropout_inference():
     layer = Dropout(0.5)
     x = np.random.rand(10, 10)
-    
-    # Inference
+
     out_inf = layer.forward(x, training=False)
     assert np.all(out_inf == x)
-    
-    # Training
+
+
+def test_dropout_training():
+    layer = Dropout(0.5)
+    x = np.random.rand(10, 10)
+
     out_train = layer.forward(x, training=True)
     assert not np.all(out_train == x)
-    
-    grad = layer.backward(np.random.rand(10, 10))
-    assert grad.shape == (10, 10)
+
+
+def test_dropout_rate_zero():
+    layer = Dropout(0.0)
+    x = np.random.rand(10, 10)
+
+    out = layer.forward(x, training=True)
+    assert np.all(out == x)
+
+    grad = np.random.rand(10, 10)
+    dx = layer.backward(grad)
+    assert np.all(dx == grad)
+
+
+def test_dropout_1d_input():
+    layer = Dropout(0.5)
+    x = np.random.rand(20)
+
+    out = layer.forward(x, training=True)
+    assert out.shape == (20,)
+    assert not np.all(out == x)
+
+    grad = np.random.rand(20)
+    dx = layer.backward(grad)
+    assert dx.shape == (20,)
+
+
+def test_dropout_3d_input():
+    layer = Dropout(0.3)
+    x = np.random.rand(4, 16, 64)
+
+    out = layer.forward(x, training=True)
+    assert out.shape == (4, 16, 64)
+
+    grad = np.random.rand(4, 16, 64)
+    dx = layer.backward(grad)
+    assert dx.shape == (4, 16, 64)
+
+
+def test_dropout_statistics():
+    layer = Dropout(0.5)
+    x = np.ones((1000, 100))
+
+    out = layer.forward(x, training=True)
+    zero_fraction = np.mean(out == 0)
+    assert 0.45 < zero_fraction < 0.55
+
+
+def test_dropout_backward_inference():
+    layer = Dropout(0.5)
+    x = np.random.rand(10, 10)
+    grad = np.random.rand(10, 10)
+
+    layer.forward(x, training=False)
+    dx = layer.backward(grad)
+    assert np.all(dx == grad)
+
+
+def test_dropout_backward_values():
+    layer = Dropout(0.5)
+    x = np.ones((10, 10))
+    grad = np.ones((10, 10))
+
+    layer.forward(x, training=True)
+
+    dx = layer.backward(grad)
+    expected_dx = grad * layer.mask
+    np.testing.assert_allclose(dx, expected_dx)
+
+
+def test_dropout_backward_no_forward():
+    layer = Dropout(0.5)
+
+    grad = np.random.rand(10, 10)
+    dx = layer.backward(grad)
+    assert np.all(dx == grad)
+
+
+def test_dropout_compute_output_shape():
+    layer = Dropout(0.5)
+
+    shape = layer.compute_output_shape((None, 32))
+    assert shape == (None, 32)
+
+    shape = layer.compute_output_shape((16, 32))
+    assert shape == (16, 32)
+
+    shape = layer.compute_output_shape((None, 16, 64))
+    assert shape == (None, 16, 64)
+
+
+def test_dropout_in_sequential_model():
+    model = Sequential([
+        Dropout(0.5),
+        Dropout(0.3),
+        Dropout(0.0),
+    ])
+    x = np.random.rand(8, 32)
+
+    out = model.forward(x, training=True)
+    assert out.shape == (8, 32)
+
+    out_inf = model.forward(x, training=False)
+    assert np.all(out_inf == x)
+
+
+def test_dropout_mask_recreated_each_forward():
+    layer = Dropout(0.5)
+    x = np.ones((100, 100))
+
+    out1 = layer.forward(x, training=True)
+    mask1 = (out1 != 0).astype(float)
+    out2 = layer.forward(x, training=True)
+    mask2 = (out2 != 0).astype(float)
+
+    assert not np.all(mask1 == mask2)
diff --git a/tests/layers/core/test_input_layer.py b/tests/layers/core/test_input_layer.py
new file mode 100644
index 0000000..d3f382f
--- /dev/null
+++ b/tests/layers/core/test_input_layer.py
@@ -0,0 +1,52 @@
+import numpy as np
+import pytest
+from neutro.layers.core.input_layer import InputLayer, Input
+from neutro.engine.node import KerasTensor
+
+
+def test_input_layer_forward():
+    layer = InputLayer(input_shape=(4,))
+    out = layer.forward(np.array([1, 2, 3, 4]))
+    assert np.array_equal(out, np.array([1, 2, 3, 4]))
+
+
+def test_input_layer_backward():
+    layer = InputLayer(input_shape=(4,))
+    grad = layer.backward(np.array([0.1, 0.2, 0.3, 0.4]))
+    assert np.array_equal(grad, np.array([0.1, 0.2, 0.3, 0.4]))
+
+
+def test_input_layer_build_immediate():
+    layer = InputLayer(input_shape=(28, 28, 1))
+    assert layer.built
+    assert layer.input_shape == (28, 28, 1)
+
+
+def test_input_layer_build_explicit():
+    layer = InputLayer()
+    layer.build((None, 28, 28, 1))
+    assert layer.built
+    assert layer.input_shape == (None, 28, 28, 1)
+
+
+def test_input_no_shape_raises():
+    with pytest.raises(ValueError, match="Please provide a shape"):
+        Input(shape=None)
+
+
+def test_input_with_list_shape():
+    tensor = Input(shape=[28, 28, 1])
+    assert isinstance(tensor, KerasTensor)
+    assert tensor.shape == (None, 28, 28, 1)
+
+
+def test_input_with_tuple_shape():
+    tensor = Input(shape=(28, 28, 1))
+    assert isinstance(tensor, KerasTensor)
+    assert tensor.shape == (None, 28, 28, 1)
+
+
+def test_input_with_batch_shape():
+    tensor = Input(shape=(None, 28, 28, 1))
+    assert isinstance(tensor, KerasTensor)
+    assert tensor.shape == (None, 28, 28, 1)
diff --git a/tests/layers/core/test_merging_coverage.py b/tests/layers/core/test_merging_coverage.py
new file mode 100644
index 0000000..e28fd47
--- /dev/null
+++ b/tests/layers/core/test_merging_coverage.py
@@ -0,0 +1,123 @@
+import numpy as np
+import pytest
+from neutro.layers.core.merging import Add, Concatenate, Multiply, Average, Maximum, Minimum
+
+
+class TestMultiply:
+    def test_forward(self):
+        layer = Multiply()
+        a = np.array([[1, 2], [3, 4]])
+        b = np.array([[5, 6], [7, 8]])
+        out = layer.forward([a, b])
+        expected = a * b
+        np.testing.assert_array_equal(out, expected)
+
+    def test_backward(self):
+        layer = Multiply()
+        a = np.array([[1.0, 2.0], [3.0, 4.0]])
+        b = np.array([[5.0, 6.0], [7.0, 8.0]])
+        layer.forward([a, b])
+        grad = np.array([[1.0, 1.0], [1.0, 1.0]])
+        grads = layer.backward(grad)
+        assert len(grads) == 2
+        np.testing.assert_array_equal(grads[0], b)
+        np.testing.assert_array_equal(grads[1], a)
+
+    def test_compute_output_shape(self):
+        layer = Multiply()
+        shape = layer.compute_output_shape([(None, 32), (None, 32)])
+        assert shape == (None, 32)
+
+    def test_compute_output_shape_single(self):
+        layer = Multiply()
+        shape = layer.compute_output_shape((16, 32))
+        assert shape == (16, 32)
+
+
+class TestAverage:
+    def test_forward_and_backward(self):
+        layer = Average()
+        a = np.array([[1.0, 3.0], [5.0, 7.0]])
+        b = np.array([[2.0, 4.0], [6.0, 8.0]])
+        out = layer.forward([a, b])
+        expected = (a + b) / 2
+        np.testing.assert_array_equal(out, expected)
+
+        grad = np.array([[1.0, 1.0], [1.0, 1.0]])
+        grads = layer.backward(grad)
+        assert len(grads) == 2
+        np.testing.assert_array_equal(grads[0], grad / 2)
+        np.testing.assert_array_equal(grads[1], grad / 2)
+
+    def test_compute_output_shape(self):
+        layer = Average()
+        assert layer.compute_output_shape([(None, 32), (None, 32)]) == (None, 32)
+        assert layer.compute_output_shape((16, 32)) == (16, 32)
+
+
+class TestMaximum:
+    def test_forward_maximum(self):
+        layer = Maximum()
+        a = np.array([[1.0, 5.0], [3.0, 2.0]])
+        b = np.array([[4.0, 2.0], [1.0, 6.0]])
+        out = layer.forward([a, b])
+        expected = np.maximum(a, b)
+        np.testing.assert_array_equal(out, expected)
+
+    def test_backward_maximum(self):
+        layer = Maximum()
+        a = np.array([[1.0, 5.0], [3.0, 2.0]])
+        b = np.array([[4.0, 2.0], [1.0, 6.0]])
+        layer.forward([a, b])
+        grad = np.array([[1.0, 1.0], [1.0, 1.0]])
+        grads = layer.backward(grad)
+        assert len(grads) == 2
+        expected_grad_a = np.array([[0.0, 1.0], [1.0, 0.0]])
+        expected_grad_b = np.array([[1.0, 0.0], [0.0, 1.0]])
+        np.testing.assert_array_equal(grads[0], expected_grad_a)
+        np.testing.assert_array_equal(grads[1], expected_grad_b)
+
+    def test_compute_output_shape(self):
+        layer = Maximum()
+        assert layer.compute_output_shape([(None, 32), (None, 32)]) == (None, 32)
+        assert layer.compute_output_shape((16, 32)) == (16, 32)
+
+
+class TestMinimum:
+    def test_forward_minimum(self):
+        layer = Minimum()
+        a = np.array([[1.0, 5.0], [3.0, 2.0]])
+        b = np.array([[4.0, 2.0], [1.0, 6.0]])
+        out = layer.forward([a, b])
+        expected = np.minimum(a, b)
+        np.testing.assert_array_equal(out, expected)
+
+    def test_backward_minimum(self):
+        layer = Minimum()
+        a = np.array([[1.0, 5.0], [3.0, 2.0]])
+        b = np.array([[4.0, 2.0], [1.0, 6.0]])
+        layer.forward([a, b])
+        grad = np.array([[1.0, 1.0], [1.0, 1.0]])
+        grads = layer.backward(grad)
+        assert len(grads) == 2
+        expected_grad_a = np.array([[1.0, 0.0], [0.0, 1.0]])
+        expected_grad_b = np.array([[0.0, 1.0], [1.0, 0.0]])
+        np.testing.assert_array_equal(grads[0], expected_grad_a)
+        np.testing.assert_array_equal(grads[1], expected_grad_b)
+
+    def test_compute_output_shape(self):
+        layer = Minimum()
+        assert layer.compute_output_shape([(None, 32), (None, 32)]) == (None, 32)
+        assert layer.compute_output_shape((16, 32)) == (16, 32)
+
+
+class TestAddComputeOutputShape:
+    def test_compute_output_shape_non_list(self):
+        layer = Add()
+        assert layer.compute_output_shape((16, 32)) == (16, 32)
+
+
+class TestConcatenateComputeOutputShape:
+    def test_compute_output_shape_non_list(self):
+        layer = Concatenate()
+        assert layer.compute_output_shape((16, 32)) == (16, 32)
diff --git a/tests/layers/core/test_reparameterization.py b/tests/layers/core/test_reparameterization.py
index 2245d4f..5e18d37 100644
--- a/tests/layers/core/test_reparameterization.py
+++ b/tests/layers/core/test_reparameterization.py
@@ -23,3 +23,39 @@ def test_reparameterization():
     assert len(grads) == 2
     assert grads[0].shape == (10, 5) # grad_mean
     assert grads[1].shape == (10, 5) # grad_log_var
+
+def test_reparameterization_compute_output_shape():
+    layer = Reparameterization()
+    shape = layer.compute_output_shape([(10, 5), (10, 5)])
+    assert shape == (10, 5)
+
+def test_reparameterization_compute_output_shape_single():
+    layer = Reparameterization()
+    shape = layer.compute_output_shape((10, 5))
+    assert shape == (10, 5)
+
+def test_reparameterization_backward_shapes():
+    layer = Reparameterization()
+    mean = np.random.randn(4, 8)
+    log_var = np.random.randn(4, 8)
+    layer.forward([mean, log_var], training=True)
+
+    grad_output = np.random.randn(4, 8)
+    grads = layer.backward(grad_output)
+
+    assert len(grads) == 2
+    assert grads[0].shape == (4, 8)
+    assert grads[1].shape == (4, 8)
+
+def test_reparameterization_backward_values():
+    layer = Reparameterization()
+    mean = np.zeros((3, 2))
+    log_var = np.ones((3, 2))  # var = exp(log_var) = e
+
+    layer.forward([mean, log_var], training=True)
+    grad_output = np.ones((3, 2))
+
+    grads = layer.backward(grad_output)
+    assert np.allclose(grads[0], np.ones((3, 2)))
+    expected_log_var = np.ones((3, 2)) * np.exp(0.5) * 0.5 * layer.epsilon
+    assert np.allclose(grads[1], expected_log_var)
diff --git a/tests/layers/embedding/test_time_embedding.py b/tests/layers/embedding/test_time_embedding.py
index da3bd5e..c40ed4d 100644
--- a/tests/layers/embedding/test_time_embedding.py
+++ b/tests/layers/embedding/test_time_embedding.py
@@ -22,3 +22,25 @@ def test_time_embedding_backward_shape():
 
     assert grad.shape == t.shape
     assert np.all(grad == 0)
+
+def test_time_embedding_build():
+    layer = TimeEmbedding(dim=128)
+    layer.build((4,))
+    assert layer.built
+
+def test_time_embedding_compute_output_shape():
+    layer = TimeEmbedding(dim=256)
+    shape = layer.compute_output_shape((4,))
+    assert shape == (4, 256)
+
+def test_time_embedding_2d_input():
+    layer = TimeEmbedding(dim=64)
+    t = np.array([[0], [10], [50], [100]])
+    out = layer.forward(t)
+    assert out.shape == (4, 64)
+
+def test_time_embedding_odd_dim():
+    layer = TimeEmbedding(dim=129)
+    t = np.array([0, 10, 50])
+    out = layer.forward(t)
+    assert out.shape == (3, 129)
diff --git a/tests/layers/normalization/test_batchnorm.py b/tests/layers/normalization/test_batchnorm.py
index 04264ea..7a0ff0f 100644
--- a/tests/layers/normalization/test_batchnorm.py
+++ b/tests/layers/normalization/test_batchnorm.py
@@ -29,3 +29,17 @@ def test_batch_norm_backward():
     assert grad_input.shape == (batch, c)
     assert layer.grads['gamma'].shape == (c,)
     assert layer.grads['beta'].shape == (c,)
+
+def test_batch_norm_inference():
+    batch, c = 4, 3
+    layer = BatchNormalization(momentum=0.5)
+    layer.build((batch, c))
+
+    inputs = np.random.randn(batch, c) * 10 + 5
+    out_train = layer.forward(inputs, training=True)
+    assert out_train.shape == (batch, c)
+
+    out_infer = layer.forward(inputs, training=False)
+    assert out_infer.shape == (batch, c)
+    assert np.allclose(layer.running_mean, 0.5 * 0 + 0.5 * np.mean(inputs, axis=0), atol=1e-6)
+    assert np.allclose(layer.running_var, 0.5 * 1 + 0.5 * np.var(inputs, axis=0), atol=1e-6)
diff --git a/tests/layers/pooling/test_global_pooling.py b/tests/layers/pooling/test_global_pooling.py
index 5c66984..48ec46b 100644
--- a/tests/layers/pooling/test_global_pooling.py
+++ b/tests/layers/pooling/test_global_pooling.py
@@ -53,3 +53,21 @@ def test_global_pooling_channels_first():
     assert max_out.shape == (2, 3)
     assert max_out[0, 0] == np.max(inputs[0, 0, :, :])
     assert max_layer.backward(np.random.randn(2, 3)).shape == inputs.shape
+
+def test_global_avg_pooling_invalid_data_format():
+    with pytest.raises(ValueError, match="data_format must be"):
+        GlobalAveragePooling2D(data_format='invalid')
+
+def test_global_max_pooling_invalid_data_format():
+    with pytest.raises(ValueError, match="data_format must be"):
+        GlobalMaxPooling2D(data_format='invalid')
+
+def test_global_avg_pooling_compute_output_shape():
+    layer = GlobalAveragePooling2D(data_format='channels_last')
+    shape = layer.compute_output_shape((2, 8, 8, 3))
+    assert shape == (2, 3)
+
+def test_global_max_pooling_compute_output_shape():
+    layer = GlobalMaxPooling2D(data_format='channels_last')
+    shape = layer.compute_output_shape((2, 8, 8, 3))
+    assert shape == (2, 3)
diff --git a/tests/models/test_model_coverage.py b/tests/models/test_model_coverage.py
new file mode 100644
index 0000000..f63e4bc
--- /dev/null
+++ b/tests/models/test_model_coverage.py
@@ -0,0 +1,166 @@
+import numpy as np
+import inspect
+from neutro.models import Model, Sequential
+from neutro.layers import Dense, Input, ReLU, Dropout
+from neutro.layers.base import Layer
+from neutro.layers.transformer.transformer_block import TransformerBlock
+from neutro.layers.attention.kv_cache import KVCache
+from neutro.optimizers import SGD
+
+
+class SubclassedModel(Model):
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.dense1 = Dense(8)
+        self.relu = ReLU()
+        self.dense2 = Dense(4)
+
+    def forward(self, inputs, training=False):
+        x = self.dense1(inputs)
+        x = self.relu(x)
+        return self.dense2(x)
+
+    def build(self, input_shape):
+        self.dense1.build(input_shape)
+        shape = self.dense1.compute_output_shape(input_shape)
+        self.relu.build(shape)
+        shape = self.relu.compute_output_shape(shape)
+        self.dense2.build(shape)
+        self.built = True
+
+
+# 1. _init_graph with single output (not a list) — lines 48, 55
+def test_init_graph_single_output():
+    inputs = Input(shape=(10,))
+    x = Dense(5, activation='relu')(inputs)
+    outputs = Dense(3)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    assert len(model._nodes_ordered) > 0
+    assert len(model.layers) > 0
+
+
+# 2. _get_all_layers without arguments — lines 72-73
+def test_get_all_layers_no_args():
+    model = Sequential([Dense(10), Dense(5)])
+    all_layers = model._get_all_layers()
+    assert len(all_layers) == 2
+
+
+# 3. evaluate with metrics — lines 531-536
+def test_evaluate_with_metrics():
+    model = Sequential([Dense(8, input_shape=(4,)), Dense(2)])
+    model.compile(optimizer=SGD(0.01), loss='mse', metrics=['accuracy'])
+    x = np.random.rand(10, 4)
+    y = np.random.rand(10, 2)
+    model.fit(x, y, epochs=1, batch_size=4, verbose=0)
+    results = model.evaluate(x, y)
+    assert 'loss' in results
+    assert 'accuracy' in results
+
+
+# 4. fit with validation_data — validation loss/metrics path
+def test_fit_with_validation_data():
+    model = Sequential([Dense(8, input_shape=(4,)), Dense(2)])
+    model.compile(optimizer=SGD(0.01), loss='mse', metrics=['accuracy'])
+    x = np.random.rand(10, 4)
+    y = np.random.rand(10, 2)
+    val_data = (np.random.rand(5, 4), np.random.rand(5, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, verbose=0, validation_data=val_data)
+    assert 'val_loss' in history.history
+    assert 'val_accuracy' in history.history
+
+
+# 5. backward functional path with single input/output — lines 426, 440, 465-470
+def test_backward_functional_single_io():
+    inputs = Input(shape=(4,))
+    x = Dense(8, activation='relu')(inputs)
+    outputs = Dense(2)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    model.compile(optimizer=SGD(0.01), loss='mse')
+    x_data = np.random.rand(10, 4)
+    y_data = np.random.rand(10, 2)
+    model.fit(x_data, y_data, epochs=1, batch_size=4, verbose=0)
+    for layer in model.layers:
+        if layer.grads:
+            for k, v in layer.grads.items():
+                assert not np.allclose(v, 0)
+
+
+# 6. build for subclassed model — lines 507-510
+def test_build_subclassed_model():
+    model = SubclassedModel()
+    assert not model.built
+    model.build((None, 6))
+    assert model.built
+    x = np.random.rand(5, 6)
+    y = model.forward(x)
+    assert y.shape == (5, 4)
+
+
+# 7. summary on functional model — "Connected to" column
+def test_summary_functional_model(capsys):
+    inputs = Input(shape=(10,))
+    x = Dense(5, activation='relu')(inputs)
+    outputs = Dense(3)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    model.summary()
+    captured = capsys.readouterr()
+    assert 'Connected to' in captured.out
+    assert 'Total params' in captured.out
+
+
+# 8. clear_layer_grads — static method clears recursively
+def test_clear_layer_grads():
+    model = Sequential([Dense(5, input_shape=(3,)), Dense(2)])
+    model.build((None, 3))
+    for layer in model.layers:
+        for k in layer.params:
+            layer.grads[k] = np.random.randn(*layer.params[k].shape)
+    Model._clear_layer_grads(model)
+    for layer in model.layers:
+        assert layer.grads == {}
+
+
+# 9a. _accumulate_layer_grads — new key path (line 157)
+def test_accumulate_layer_grads_new_key():
+    layer = Dense(5, input_shape=(3,))
+    layer.build((None, 3))
+    layer.grads['W'] = np.ones(layer.params['W'].shape)
+    layer.grads['b'] = np.ones(layer.params['b'].shape)
+    accumulator = {}
+    Model._accumulate_layer_grads(layer, accumulator)
+    l_id = id(layer)
+    assert l_id in accumulator
+    assert np.all(accumulator[l_id]['W'] == 1.0)
+    assert np.all(accumulator[l_id]['b'] == 1.0)
+
+
+# 9b. _accumulate_layer_grads — existing key path (line 154-155)
+def test_accumulate_layer_grads_existing_key():
+    layer = Dense(5, input_shape=(3,))
+    layer.build((None, 3))
+
+    layer.grads['W'] = np.ones(layer.params['W'].shape) * 2
+    layer.grads['b'] = np.ones(layer.params['b'].shape) * 2
+
+    accumulator = {}
+    l_id = id(layer)
+    accumulator[l_id] = {
+        'W': np.ones(layer.params['W'].shape),
+        'b': np.ones(layer.params['b'].shape)
+    }
+
+    Model._accumulate_layer_grads(layer, accumulator)
+
+    assert np.all(accumulator[l_id]['W'] == 3.0)
+    assert np.all(accumulator[l_id]['b'] == 3.0)
+
+
+# 10. forward sequential with kv_cache — lines 370-377
+def test_forward_sequential_with_kv_cache():
+    block = TransformerBlock(embed_dim=8, num_heads=2, ff_dim=16, use_flash=True, causal=True, pre_norm=True)
+    model = Sequential([block])
+    x = np.random.rand(2, 4, 8)
+    cache = KVCache()
+    output = model.forward(x, training=False, kv_cache=cache)
+    assert output.shape == (2, 4, 8)
diff --git a/tests/models/test_model_coverage2.py b/tests/models/test_model_coverage2.py
new file mode 100644
index 0000000..8ebc76f
--- /dev/null
+++ b/tests/models/test_model_coverage2.py
@@ -0,0 +1,156 @@
+import numpy as np
+from neutro.models import Model, Sequential
+from neutro.layers import Dense, Input
+from neutro.layers.base import Layer
+
+
+class ContainerLayer(Layer):
+    def __init__(self, units=5):
+        super().__init__()
+        self.dense = Dense(units)
+
+    def forward(self, x, training=False):
+        return self.dense(x, training=training)
+
+    def backward(self, grad_output):
+        return self.dense.backward(grad_output)
+
+
+class DoubleRefLayer(Layer):
+    def __init__(self, units=5):
+        super().__init__()
+        self.inner = Dense(units)
+        self.inner_copy = self.inner
+
+    def forward(self, x, training=False):
+        return self.inner(x, training=training)
+
+    def backward(self, grad_output):
+        return self.inner.backward(grad_output)
+
+
+class SubclassNoBuild(Model):
+    def __init__(self):
+        super().__init__()
+        self.dense = Dense(10)
+
+    def forward(self, x, training=False):
+        return self.dense(x, training=training)
+
+
+class BrokenLayer(Layer):
+    def compute_output_shape(self, input_shape):
+        raise ValueError("broken")
+
+
+def test_clear_layer_grads_with_sublayers():
+    model = Sequential([ContainerLayer(4), Dense(3)])
+    model.build((None, 4))
+    x = np.random.rand(2, 4)
+    out = model.forward(x, training=True)
+    grad = np.random.rand(2, 3)
+    model.backward(grad)
+    container = model.layers[0]
+    assert len(container.dense.grads) > 0
+    Model._clear_layer_grads(model)
+    assert len(container.dense.grads) == 0
+
+
+def test_accumulate_layer_grads_visited_check():
+    layer = DoubleRefLayer(5)
+    layer.build((None, 4))
+    layer.inner.build((None, 4))
+    layer.grads['W'] = np.ones((4, 5))
+    layer.inner.grads['W'] = np.ones((4, 5)) * 2
+    accumulator = {}
+    Model._accumulate_layer_grads(layer, accumulator)
+    double_ref_id = id(layer)
+    inner_id = id(layer.inner)
+    assert double_ref_id in accumulator
+    assert inner_id in accumulator
+    assert np.all(accumulator[double_ref_id]['W'] == 1.0)
+    assert np.all(accumulator[inner_id]['W'] == 2.0)
+
+
+def test_restore_layer_state_with_sublayers():
+    container = ContainerLayer(5)
+    container.build((None, 4))
+    container.dense.build((None, 4))
+    container.dense.custom_attr = "original"
+    state = Model._capture_layer_state(container)
+    container.dense.custom_attr = "modified"
+    assert container.dense.custom_attr == "modified"
+    Model._restore_layer_state(container, state)
+    assert container.dense.custom_attr == "original"
+
+
+def test_functional_compute_output_shape():
+    inputs = Input(shape=(10,))
+    x = Dense(5)(inputs)
+    outputs = Dense(3)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    shape = model.compute_output_shape((None, 10))
+    assert shape == (None, 3)
+
+
+def test_functional_build():
+    inputs = Input(shape=(10,))
+    x = Dense(5)(inputs)
+    outputs = Dense(3)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    model.build((None, 10))
+    assert model.built is True
+
+
+def test_backward_functional_single_output():
+    inputs = Input(shape=(10,))
+    x = Dense(5, activation='relu')(inputs)
+    outputs = Dense(3)(x)
+    model = Model(inputs=inputs, outputs=outputs)
+    x_data = np.random.rand(4, 10)
+    y = model.forward(x_data, training=True)
+    grad = np.random.rand(4, 3)
+    grad_inputs = model.backward(grad)
+    assert grad_inputs.shape == (4, 10)
+
+
+def test_subclassed_model_build_no_override():
+    model = SubclassNoBuild()
+    model.build((None, 5))
+    assert model.built is True
+    assert model.input_shape == (None, 5)
+
+
+def test_sequential_forward_without_kv_cache():
+    model = Sequential([Dense(5, input_shape=(10,)), Dense(3)])
+    x = np.random.rand(4, 10)
+    out = model.forward(x, training=False, kv_cache=None)
+    assert out.shape == (4, 3)
+
+
+def test_sequential_add_with_input_shape():
+    model = Sequential()
+    dense = Dense(5, input_shape=(10,))
+    model.add(dense)
+    assert len(model.layers) == 1
+    assert model.layers[0].built
+
+
+def test_summary_unbuilt_layer(capsys):
+    model = Sequential()
+    layer = Dense(5)
+    model.layers.append(layer)
+    model.summary()
+    captured = capsys.readouterr()
+    assert "unbuilt" in captured.out
+
+
+def test_summary_exception_built(capsys):
+    model = Sequential()
+    layer = BrokenLayer()
+    layer.built = True
+    layer.input_shape = (None, 10)
+    model.layers.append(layer)
+    model.summary()
+    captured = capsys.readouterr()
+    assert "multiple" in captured.out
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 948771e..cc7287b 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -58,3 +58,47 @@ def test_image_data_generator_channels_first():
     assert batch_x.shape == (2, 3, 8, 8)
     assert batch_y.shape == (2,)
     np.testing.assert_allclose(batch_x, x[:2] / 255.0, atol=1e-5)
+
+def test_image_data_generator_invalid_data_format():
+    with pytest.raises(ValueError, match="data_format must be"):
+        ImageDataGenerator(data_format='invalid')
+
+def test_image_data_generator_vertical_flip():
+    img = np.zeros((32, 32, 3))
+    img[0, :, 0] = 1.0
+
+    datagen = ImageDataGenerator(vertical_flip=True)
+    np.random.seed(42)
+
+    flipped = False
+    for _ in range(10):
+        transformed = datagen.apply_transform(img)
+        if np.all(transformed[-1, :, 0] == 1.0):
+            flipped = True
+            break
+    assert flipped
+
+def test_image_data_generator_width_height_shift():
+    img = np.zeros((32, 32, 3))
+    img[16, 16, :] = 1.0
+
+    datagen = ImageDataGenerator(width_shift_range=0.5, height_shift_range=0.5)
+    np.random.seed(0)
+    transformed = datagen.apply_transform(img)
+
+    assert transformed.shape == (32, 32, 3)
+
+def test_image_data_generator_channels_first_vertical_flip():
+    img = np.zeros((3, 32, 32))
+    img[:, 0, :] = 1.0
+
+    datagen = ImageDataGenerator(vertical_flip=True, data_format='channels_first')
+    np.random.seed(42)
+
+    flipped = False
+    for _ in range(10):
+        transformed = datagen.apply_transform(img)
+        if np.all(transformed[:, -1, :] == 1.0):
+            flipped = True
+            break
+    assert flipped
diff --git a/tests/test_preprocessing_sequence.py b/tests/test_preprocessing_sequence.py
new file mode 100644
index 0000000..9561690
--- /dev/null
+++ b/tests/test_preprocessing_sequence.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pytest
+from neutro.preprocessing.sequence import pad_sequences
+
+
+class TestPadSequencesBasic:
+    def test_padding_pre_default(self):
+        sequences = [[1, 2], [3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3)
+        expected = np.array([[0, 1, 2], [3, 4, 5]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_padding_post(self):
+        sequences = [[1, 2], [3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3, padding="post")
+        expected = np.array([[1, 2, 0], [3, 4, 5]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+
+class TestPadSequencesTruncating:
+    def test_truncating_pre(self):
+        sequences = [[1, 2, 3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3, truncating="pre")
+        expected = np.array([[3, 4, 5]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_truncating_post(self):
+        sequences = [[1, 2, 3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3, truncating="post")
+        expected = np.array([[1, 2, 3]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+
+class TestPadSequencesMaxlen:
+    def test_custom_maxlen_shorter_than_longest(self):
+        sequences = [[1, 2, 3, 4, 5], [1, 2]]
+        result = pad_sequences(sequences, maxlen=3)
+        expected = np.array([[3, 4, 5], [0, 1, 2]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_maxlen_none_auto_detect(self):
+        sequences = [[1, 2], [3, 4, 5, 6], [7]]
+        result = pad_sequences(sequences, maxlen=None)
+        expected = np.array(
+            [[0, 0, 1, 2], [3, 4, 5, 6], [0, 0, 0, 7]], dtype="int32"
+        )
+        np.testing.assert_array_equal(result, expected)
+
+
+class TestPadSequencesDtypeAndValue:
+    def test_custom_dtype(self):
+        sequences = [[1, 2], [3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3, dtype="float32")
+        assert result.dtype == np.float32
+        expected = np.array([[0, 1, 2], [3, 4, 5]], dtype="float32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_custom_padding_value(self):
+        sequences = [[1, 2], [3, 4, 5]]
+        result = pad_sequences(sequences, maxlen=3, value=99)
+        expected = np.array([[99, 1, 2], [3, 4, 5]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+
+class TestPadSequencesEdgeCases:
+    def test_empty_sequence_in_list(self):
+        sequences = [[1, 2, 3], [], [4, 5]]
+        result = pad_sequences(sequences, maxlen=3)
+        expected = np.array([[1, 2, 3], [0, 0, 0], [0, 4, 5]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+
+class TestPadSequencesErrors:
+    def test_invalid_truncating_type(self):
+        sequences = [[1, 2, 3]]
+        with pytest.raises(ValueError, match='Truncating type "middle" not understood'):
+            pad_sequences(sequences, maxlen=2, truncating="middle")
+
+    def test_invalid_padding_type(self):
+        sequences = [[1, 2, 3]]
+        with pytest.raises(ValueError, match='Padding type "middle" not understood'):
+            pad_sequences(sequences, maxlen=2, padding="middle")
+
+
+class TestPadSequencesMixed:
+    def test_padding_pre_with_truncating_post(self):
+        sequences = [[1, 2, 3, 4, 5], [1, 2]]
+        result = pad_sequences(sequences, maxlen=3, truncating="post")
+        expected = np.array([[1, 2, 3], [0, 1, 2]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_padding_post_with_truncating_pre(self):
+        sequences = [[1, 2, 3, 4, 5], [1, 2]]
+        result = pad_sequences(
+            sequences, maxlen=3, padding="post", truncating="pre"
+        )
+        expected = np.array([[3, 4, 5], [1, 2, 0]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
+
+    def test_padding_post_with_truncating_post(self):
+        sequences = [[1, 2, 3, 4, 5], [1, 2]]
+        result = pad_sequences(
+            sequences, maxlen=3, padding="post", truncating="post"
+        )
+        expected = np.array([[1, 2, 3], [1, 2, 0]], dtype="int32")
+        np.testing.assert_array_equal(result, expected)
diff --git a/tests/test_preprocessing_text.py b/tests/test_preprocessing_text.py
new file mode 100644
index 0000000..ca29d50
--- /dev/null
+++ b/tests/test_preprocessing_text.py
@@ -0,0 +1,114 @@
+import pytest
+from neutro.preprocessing.text import Tokenizer
+
+
+class TestTokenizerFitOnTexts:
+    def test_fit_on_texts_basic_lowercase(self):
+        t = Tokenizer()
+        t.fit_on_texts(["Hello World", "Hello Keras"])
+        assert t.word_index == {"hello": 1, "world": 2, "keras": 3}
+        assert t.index_word[1] == "hello"
+        assert t.word_counts == {"hello": 2, "world": 1, "keras": 1}
+
+    def test_fit_on_texts_with_oov_token(self):
+        t = Tokenizer(oov_token="<OOV>")
+        t.fit_on_texts(["cat dog", "cat fish"])
+        assert t.word_index["<OOV>"] == 1
+        assert t.index_word[1] == "<OOV>"
+        assert t.word_index["cat"] == 2
+        assert t.word_index["dog"] == 3
+
+    def test_fit_on_texts_num_words_limit(self):
+        t = Tokenizer(num_words=2, oov_token="<OOV>")
+        t.fit_on_texts(["apple banana cherry", "apple banana date"])
+        # The OOV token is assigned index 1, ahead of every vocabulary word.
+        # num_words=2 trims the frequency-sorted vocabulary to its top two
+        # entries (sorted_words[:2] == [apple, banana]); the OOV token is added
+        # outside that limit, so it does not count against num_words.
+        # Resulting word_index: <OOV>:1, apple:2, banana:3
+        assert "<OOV>" in t.word_index
+        assert "apple" in t.word_index
+        assert "banana" in t.word_index
+        assert "cherry" not in t.word_index
+        assert "date" not in t.word_index
+
+    def test_fit_on_texts_empty_list(self):
+        t = Tokenizer()
+        t.fit_on_texts([])
+        assert t.word_index == {}
+        assert t.index_word == {}
+
+
+class TestTokenizerTextsToSequences:
+    def test_texts_to_sequences_basic(self):
+        t = Tokenizer()
+        t.fit_on_texts(["the cat sat", "the dog ran"])
+        seqs = t.texts_to_sequences(["the cat sat"])
+        assert seqs == [[1, 2, 3]]
+
+    def test_texts_to_sequences_with_oov(self):
+        t = Tokenizer(oov_token="<OOV>")
+        t.fit_on_texts(["cat dog fish"])
+        seqs = t.texts_to_sequences(["cat bird dog"])
+        # cat=2, dog=3, bird is unknown -> oov=1
+        assert seqs == [[2, 1, 3]]
+
+    def test_texts_to_sequences_num_words_filters_to_oov(self):
+        t = Tokenizer(num_words=2, oov_token="<OOV>")
+        t.fit_on_texts(["apple banana cherry", "apple banana date"])
+        seqs = t.texts_to_sequences(["apple banana cherry"])
+        # <OOV>=1, apple=2, banana=3
+        # num_words=2, so indices >= 2 are filtered to OOV
+        # apple (2) >= 2 -> OOV, banana (3) >= 2 -> OOV, cherry unknown -> OOV
+        assert seqs == [[1, 1, 1]]
+
+    def test_texts_to_sequences_empty_words(self):
+        t = Tokenizer()
+        t.fit_on_texts(["hello world"])
+        seqs = t.texts_to_sequences(["  hello   world  "])
+        assert seqs == [[1, 2]]
+
+
+class TestTokenizerSequencesToTexts:
+    def test_sequences_to_texts_basic(self):
+        t = Tokenizer()
+        t.fit_on_texts(["hello world"])
+        texts = t.sequences_to_texts([[1, 2]])
+        assert texts == ["hello world"]
+
+    def test_sequences_to_texts_unknown_index(self):
+        t = Tokenizer()
+        t.fit_on_texts(["hello world"])
+        texts = t.sequences_to_texts([[1, 999]])
+        assert texts == ["hello ?"]
+
+
+class TestTokenizerGetConfig:
+    def test_get_config_returns_all_keys(self):
+        t = Tokenizer(num_words=10, oov_token="<OOV>", lower=False)
+        t.fit_on_texts(["hello world"])
+        config = t.get_config()
+        assert config["num_words"] == 10
+        assert config["oov_token"] == "<OOV>"
+        assert config["lower"] is False
+        assert config["split"] == " "
+        assert "filters" in config
+        assert "word_index" in config
+        assert "index_word" in config
+
+
+class TestTokenizerNoLowercase:
+    def test_fit_on_texts_without_lowercase(self):
+        t = Tokenizer(lower=False)
+        t.fit_on_texts(["Hello World"])
+        assert "Hello" in t.word_index
+        assert "hello" not in t.word_index
+        assert "World" in t.word_index
+
+    def test_texts_to_sequences_without_lowercase(self):
+        t = Tokenizer(lower=False)
+        t.fit_on_texts(["Hello World"])
+        seqs = t.texts_to_sequences(["Hello World"])
+        assert seqs == [[1, 2]]
+        seqs_mismatch = t.texts_to_sequences(["hello world"])
+        assert seqs_mismatch == [[]]
diff --git a/tests/tokenizers/test_tiktoken_compat.py b/tests/tokenizers/test_tiktoken_compat.py
index 6f94136..b4c2208 100644
--- a/tests/tokenizers/test_tiktoken_compat.py
+++ b/tests/tokenizers/test_tiktoken_compat.py
@@ -51,3 +51,46 @@ def test_tiktoken_compatible_tokenizer():
     encoded_special = tokenizer.encode(text_with_special, allowed_special="all")
     assert 1000 in encoded_special
     assert tokenizer.decode(encoded_special) == text_with_special
+
+
+from unittest.mock import patch, MagicMock
+
+
+@patch('urllib.request.urlopen')
+@patch('os.path.exists')
+@patch('os.makedirs')
+@patch('tempfile.gettempdir')
+@patch('builtins.open', new_callable=MagicMock)
+def test_load_tiktoken_bpe_url(mock_file_open, mock_gettempdir, mock_makedirs, mock_exists, mock_urlopen):
+    import base64
+    
+    mock_gettempdir.return_value = "/tmp"
+    mock_exists.return_value = False
+    
+    content = base64.b64encode(b"hello") + b" 258\n" + base64.b64encode(b"world") + b" 259\n"
+    
+    mock_response = MagicMock()
+    mock_response.read.return_value = content
+    mock_response.__enter__.return_value = mock_response
+    mock_urlopen.return_value = mock_response
+    
+    mock_file = MagicMock()
+    mock_file.__enter__.return_value.read.return_value = content
+    mock_file_open.return_value = mock_file
+    
+    from neutro.tokenizers.tiktoken_compat import load_tiktoken_bpe
+    ranks = load_tiktoken_bpe("https://example.com/test.tiktoken")
+    assert len(ranks) == 2
+    assert ranks[b"hello"] == 258
+    assert ranks[b"world"] == 259
+
+
+@patch('neutro.tokenizers.tiktoken_compat.load_tiktoken_bpe')
+def test_get_gpt2_tokenizer(mock_load_bpe):
+    mock_load_bpe.return_value = {}
+    
+    from neutro.tokenizers.tiktoken_compat import get_gpt2_tokenizer
+    tokenizer = get_gpt2_tokenizer()
+    assert tokenizer is not None
+    assert tokenizer.special_tokens["<|endoftext|>"] == 50256
+    mock_load_bpe.assert_called_once()
diff --git a/tests/utils/test_data_utils.py b/tests/utils/test_data_utils.py
new file mode 100644
index 0000000..1e7ed54
--- /dev/null
+++ b/tests/utils/test_data_utils.py
@@ -0,0 +1,42 @@
+import numpy as np
+import pytest
+import json
+from unittest.mock import patch, MagicMock
+from neutro.utils.data_utils import load_imdb, get_imdb_word_index
+
+
+@patch('neutro.utils.data_utils.download_file')
+@patch('numpy.load')
+@patch('os.path.expanduser')
+def test_load_imdb(mock_expanduser, mock_load, mock_download):
+    mock_expanduser.return_value = "/tmp"
+    mock_data = MagicMock()
+    mock_data.__enter__.return_value = {
+        'x_train': np.zeros((100,)),
+        'y_train': np.zeros(100),
+        'x_test': np.zeros((20,)),
+        'y_test': np.zeros(20)
+    }
+    mock_load.return_value = mock_data
+    
+    (x_train, y_train), (x_test, y_test) = load_imdb()
+    assert x_train.shape == (100,)
+    assert y_train.shape == (100,)
+    assert x_test.shape == (20,)
+    mock_download.assert_called_once()
+
+
+@patch('neutro.utils.data_utils.download_file')
+@patch('builtins.open', new_callable=MagicMock)
+@patch('json.load')
+@patch('os.path.expanduser')
+def test_get_imdb_word_index(mock_expanduser, mock_json_load, mock_open, mock_download):
+    mock_expanduser.return_value = "/tmp"
+    mock_json_load.return_value = {"the": 1, "and": 2, "a": 3}
+    mock_file = MagicMock()
+    mock_file.__enter__.return_value = mock_file
+    mock_open.return_value = mock_file
+    
+    word_index = get_imdb_word_index()
+    assert word_index == {"the": 1, "and": 2, "a": 3}
+    mock_download.assert_called_once()

From d71d3d6bfefe17109cda4bd487df6c7584a1747c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 31 May 2026 05:20:34 +0000
Subject: [PATCH 2/2] Fix dropout inference mask regression coverage and docs

---
 docs/layers/core/core_utility_layers.md | 9 +++++++++
 neutro/layers/core/dropout.py           | 1 +
 tests/layers/core/test_dropout.py       | 1 +
 3 files changed, 11 insertions(+)

diff --git a/docs/layers/core/core_utility_layers.md b/docs/layers/core/core_utility_layers.md
index 44065f3..16c43f3 100644
--- a/docs/layers/core/core_utility_layers.md
+++ b/docs/layers/core/core_utility_layers.md
@@ -60,6 +60,15 @@ def backward(self, grad_output):
 
 🔍 **Line `if self.mask is None`**: If we never called forward (or called it with `training=False`), there's no mask. In that case, the gradient passes through unchanged — just like the forward pass.
 
+#### `compute_output_shape`
+
+```python
+def compute_output_shape(self, input_shape):
+    return input_shape
+```
+
+Dropout does not change tensor rank or dimensions; during training it only zeroes some values and rescales the surviving ones by `1 / (1 - rate)`. So the output shape is always identical to the input shape.
+
 ---
 
 ## Flatten — `neutro/layers/core/flatten.py`
diff --git a/neutro/layers/core/dropout.py b/neutro/layers/core/dropout.py
index abbdf63..975ae29 100644
--- a/neutro/layers/core/dropout.py
+++ b/neutro/layers/core/dropout.py
@@ -9,6 +9,7 @@ def __init__(self, rate, **kwargs):
 
     def forward(self, inputs, training=False):
         if not training or self.rate == 0:
+            self.mask = None
             return inputs
         self.mask = np.random.binomial(1, 1 - self.rate, size=inputs.shape) / (1 - self.rate)
         return inputs * self.mask
diff --git a/tests/layers/core/test_dropout.py b/tests/layers/core/test_dropout.py
index 64418e8..c1ef38d 100644
--- a/tests/layers/core/test_dropout.py
+++ b/tests/layers/core/test_dropout.py
@@ -71,6 +71,7 @@ def test_dropout_backward_inference():
     x = np.random.rand(10, 10)
     grad = np.random.rand(10, 10)
 
+    layer.forward(x, training=True)
     layer.forward(x, training=False)
     dx = layer.backward(grad)
     assert np.all(dx == grad)