Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions neutro/activations/softmax.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,19 @@ def __call__(self, x):
self.last_output = exps / np.sum(exps, axis=-1, keepdims=True)
return self.last_output
def gradient(self, x):
return self.last_output * (1 - self.last_output)
# Softmax does not have a valid element-wise gradient because its
# Jacobian is a full matrix (diag(s) - s*s^T), not a diagonal one.
# s*(1-s) is only the diagonal and produces incorrect gradients.
# Always use gradient_fast(x, grad_output) instead.
raise NotImplementedError(
"Softmax.gradient() is not implemented because the softmax Jacobian "
"is not diagonal. Use gradient_fast(x, grad_output) for correct "
"chain-rule gradients: dL/dx = s * (g - dot(s, g))."
)
def gradient_fast(self, x, grad_output):
orig_shape = grad_output.shape
grad_flat = grad_output.reshape(-1, orig_shape[-1])
out_flat = self.last_output.reshape(-1, orig_shape[-1])

n_samples, units = grad_flat.shape
res = np.zeros_like(grad_flat)
for i in range(n_samples):
s = out_flat[i].reshape(-1, 1)
jacobian = np.diagflat(s) - np.dot(s, s.T)
res[i] = np.dot(grad_flat[i], jacobian)
return res.reshape(orig_shape)
# Vectorized softmax backward: dL/dx = s * (g - dot(s, g))
# Derived from the full Jacobian J = diag(s) - s*s^T:
# dL/dx_k = s_k * (g_k - sum_i(s_i * g_i))
s = self.last_output
dot = np.sum(s * grad_output, axis=-1, keepdims=True)
return s * (grad_output - dot)
22 changes: 13 additions & 9 deletions neutro/layers/normalization/rmsnorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,23 @@ def forward(self, x, training=False):
return self.x_norm * self.params['weight']

def backward(self, grad_output):
# Naive but educational implementation of RMSNorm backward
# dW
self.grads['weight'] = np.sum(grad_output * self.x_norm, axis=(0, 1))

# dW: sum over all axes except the last (feature) axis so the result
# has the same shape as params['weight'] regardless of input rank
# (works for 2-D (batch, dim), 3-D (batch, seq, dim), etc.)
feature_axes = tuple(range(len(grad_output.shape) - 1))
self.grads['weight'] = np.sum(grad_output * self.x_norm, axis=feature_axes)

# dX
N = self.dim
grad_x_norm = grad_output * self.params['weight']

# Backward through: x / sqrt(mean(x^2) + eps)
# Detailed derivation for the old folks:
# dx = (grad_x_norm / rms) - (x * sum(grad_x_norm * x) / (N * rms^3))

# Derivation:
# rms = sqrt(mean(x^2) + eps), d_rms/dx_j = x_j / (N * rms)
# d_xnorm_i/dx_j = delta_ij/rms - x_i*x_j / (N * rms^3)
# dL/dx_j = grad_x_norm_j/rms - x_j * sum(grad_x_norm*x) / (N * rms^3)

sum_grad_x = np.sum(grad_x_norm * self.x, axis=-1, keepdims=True)
dx = (grad_x_norm / self.rms) - (self.x * sum_grad_x / (N * self.rms**3))

return dx
8 changes: 7 additions & 1 deletion neutro/layers/recurrent/simple_rnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,13 @@ def backward(self, grad_output):

for t in range(timesteps - 1, -1, -1):
dh = (grad_output[:, t, :] if self.return_sequences else (grad_output if t == timesteps - 1 else 0)) + dh_next
dz = dh * (1 - self.h_states[:, t+1, :]**2)
# Apply the derivative of the hidden-state activation.
# tanh: d(tanh(z))/dz = 1 - tanh(z)^2 = 1 - h_t^2
# linear: d(z)/dz = 1, so dz = dh
if self.activation_name == 'tanh':
dz = dh * (1 - self.h_states[:, t+1, :]**2)
else:
dz = dh
d_Wx += np.dot(self.inputs[:, t, :].T, dz)
d_Wh += np.dot(self.h_states[:, t, :].T, dz)
d_b += np.sum(dz, axis=0)
Expand Down
34 changes: 28 additions & 6 deletions tests/activations/test_softmax.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,42 @@ def test_softmax_forward():
assert np.allclose(np.sum(out), 1.0)
assert out[0, 2] > out[0, 1] > out[0, 0]

def test_softmax_gradient():
def test_softmax_gradient_raises():
# Softmax.gradient() is intentionally not implemented because s*(1-s) is
# only the diagonal of the Jacobian and gives incorrect chain-rule results.
softmax = Softmax()
x = np.array([[1, 2, 3]], dtype=float)
softmax(x)
grad = softmax.gradient(x)
assert grad.shape == x.shape
with pytest.raises(NotImplementedError):
softmax.gradient(x)

def test_softmax_gradient_fast():
# gradient_fast implements the correct vectorised backward:
# dL/dx = s * (g - dot(s, g))
# which is equivalent to the full Jacobian J = diag(s) - s*s^T applied to g.
softmax = Softmax()
x = np.array([[1, 2]], dtype=float)
out = softmax(x)
grad_output = np.array([[1, 0]], dtype=float)
x = np.array([[1.0, 2.0, 3.0]], dtype=float)
s = softmax(x) # populates last_output

grad_output = np.array([[1.0, 0.0, 0.0]], dtype=float)

# Expected: s * (g - dot(s, g)) for each sample
dot = np.sum(s * grad_output, axis=-1, keepdims=True)
expected = s * (grad_output - dot)

grad = softmax.gradient_fast(x, grad_output)
assert grad.shape == x.shape
assert np.allclose(grad, expected)

def test_softmax_gradient_fast_sum_zero():
# A key property: the gradient of softmax sums to zero along the last axis
# (because softmax output sums to 1, the Jacobian rows sum to 0).
softmax = Softmax()
x = np.array([[0.5, 1.5, -0.5, 2.0]], dtype=float)
softmax(x)
grad_output = np.random.default_rng(42).standard_normal(x.shape)
grad = softmax.gradient_fast(x, grad_output)
assert np.allclose(grad.sum(axis=-1), 0.0, atol=1e-12)

def test_get_activation():
assert isinstance(get('relu'), ReLU)
Expand Down
Loading