diff --git a/mlkem/src/poly.c b/mlkem/src/poly.c
index 4a7316718..725dcb509 100644
--- a/mlkem/src/poly.c
+++ b/mlkem/src/poly.c
@@ -29,33 +29,61 @@
 #include "verify.h"
 
 /**
- * Montgomery multiplication modulo MLKEM_Q.
+ * Montgomery multiplication modulo MLKEM_Q with precomputed twist.
  *
  * @reference{`fqmul()` in the reference implementation @[REF].}
  *
- * @param a First factor. Can be any int16_t.
- * @param b Second factor. Must be signed canonical
- *          (abs value < (MLKEM_Q+1)/2).
+ * @param a         First factor. Can be any int16_t.
+ * @param b         Second factor. Must be signed canonical
+ *                  (abs value < (MLKEM_Q+1)/2).
+ * @param b_twisted Precomputed `b * MLKEM_Q^{-1} mod 2^16` (signed canonical).
  *
  * @return 16-bit integer congruent to a*b*R^{-1} mod MLKEM_Q, and
  *         smaller than MLKEM_Q in absolute value.
  */
-static MLK_INLINE int16_t mlk_fqmul(int16_t a, int16_t b)
+static MLK_INLINE int16_t mlk_fqmul(int16_t a, int16_t b, int16_t b_twisted)
 __contract__(
   requires(b > -MLKEM_Q_HALF && b < MLKEM_Q_HALF)
   ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
 )
 {
-  int16_t res;
+  uint16_t lo0;
+  int16_t res, hi, lo, correction;
   mlk_assert_abs_bound(&b, 1, MLKEM_Q_HALF);
 
-  res = mlk_montgomery_reduce((int32_t)a * (int32_t)b);
+  /* High 16 bits of the signed product a * b. */
+  hi = (int16_t)(((int32_t)a * b) >> 16);
+  /* Low 16 bits of a * b_twisted (== a * b * MLKEM_Q^{-1} mod 2^16).
+   * This is just a 16x16->16 bit low multiplication, but we express
+   * it as a 16x16->32 widening multiplication with an explicit truncation
+   * to avoid unsigned overflow errors from CBMC. The compiler will be smart
+   * enough to realize to optimize this away. */
+  lo0 = (uint16_t)(((uint32_t)mlk_cast_int16_to_uint16(a) *
+                    mlk_cast_int16_to_uint16(b_twisted)) &
+                   UINT16_MAX);
+  /* Lift low product to signed (!) 16-bit integer */
+  lo = mlk_cast_uint16_to_int16(lo0);
+
+  /* 16x16->16 bit high multiplication
+   *
+   * PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
+   * implementation-defined for negative left argument. Here,
+   * we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
+   *
+   * Bounds: |correction| <= ceil(2^15 * MLKEM_Q / 2^16) = (MLKEM_Q + 1)/2
+   *
+   * Safety: The bounds argument above demonstrates that the truncation is safe.
+   */
+  correction = (int16_t)(((int32_t)lo * MLKEM_Q) >> 16);
+
   /* Bounds:
-   * |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
+   * |res| <= |hi| + |correction|
+   *       <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
    *       <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
    *       <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
    *        < MLKEM_Q
    */
+  res = (int16_t)(hi - correction);
 
   mlk_assert_abs_bound(&res, 1, MLKEM_Q);
   return res;
@@ -115,13 +143,16 @@ __contract__(
 {
   unsigned i;
   const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
+  /* check-magic:
+       20553 == signed_mod(1353 * pow(MLKEM_Q, -1, 2^16), 2^16) */
+  const int16_t f_twisted = 20553;
   for (i = 0; i < MLKEM_N; i++)
   __loop__(
     invariant(i <= MLKEM_N)
     invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q))
     decreases(MLKEM_N - i))
   {
-    r->coeffs[i] = mlk_fqmul(r->coeffs[i], f);
+    r->coeffs[i] = mlk_fqmul(r->coeffs[i], f, f_twisted);
   }
 
   mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
@@ -281,11 +312,13 @@ __contract__(
     invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q))
     decreases(MLKEM_N / 4 - i))
   {
-    x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
+    const int16_t z = mlk_zetas[64 + i][0];
+    const int16_t z_t = mlk_zetas[64 + i][1];
+    x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], z, z_t);
     /* The values in zeta table are <= MLKEM_Q in absolute value,
      * so the negation in int16_t is safe. */
     x->coeffs[2 * i + 1] =
-        mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-mlk_zetas[64 + i]));
+        mlk_fqmul(a->coeffs[4 * i + 3], (int16_t)(-z), (int16_t)(-z_t));
   }
 
   /*
@@ -312,37 +345,133 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
   mlk_poly_mulcache_compute_c(x, a);
 }
 
+/* Cooley-Tukey butterfly */
+static MLK_INLINE void mlk_ct_butterfly(int16_t r[MLKEM_N], unsigned i,
+                                        unsigned j, int16_t z, int16_t zt)
+{
+  const int16_t t = mlk_fqmul(r[j], z, zt);
+  r[j] = (int16_t)(r[i] - t);
+  r[i] = (int16_t)(r[i] + t);
+}
+
+/* Gentleman-Sande butterfly without reduction.
+ *
+ * The twiddles `z`, `zt` are implicitly negated: we compute `b - a` instead
+ * of `a - b`, which is equivalent to multiplying by `-z`.
+ */
+static MLK_INLINE void mlk_gs_butterfly(int16_t r[MLKEM_N], unsigned i,
+                                        unsigned j, int16_t z, int16_t zt)
+{
+  const int16_t a = r[i];
+  const int16_t b = r[j];
+  r[i] = (int16_t)(a + b);
+  r[j] = mlk_fqmul((int16_t)(b - a), z, zt);
+}
+
 /*
- * Computes a block CT butterflies with a fixed twiddle factor,
- * using Montgomery multiplication.
- * Parameters:
- * - r: Pointer to base of polynomial (_not_ the base of butterfly block)
- * - root: Twiddle factor to use for the butterfly. This must be in
- *         Montgomery form and signed canonical.
- * - start: Offset to the beginning of the butterfly block
- * - len: Index difference between coefficients subject to a butterfly
- * - bound: Ghost variable describing coefficient bound: Prior to `start`,
- *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
- *          they must be bound by `bound`.
- * When this function returns, output coefficients in the index range
- * [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
- * Example:
- * - start=8, len=4
- *   This would compute the following four butterflies
- *          8     --    12
- *             9    --     13
- *                10   --     14
- *                   11   --     15
- * - start=4, len=2
- *   This would compute the following two butterflies
- *          4 -- 6
- *             5 -- 7
+ * Two merged forward-NTT layers, applied to one outer block.
+ */
+static MLK_INLINE void mlk_ntt_2_layers_block(
+    int16_t r[MLKEM_N], unsigned start, unsigned len, int16_t z0, int16_t z0t,
+    int16_t z1, int16_t z1t, int16_t z2, int16_t z2t, const int16_t bound)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(0 < bound && bound < INT16_MAX - 2 * MLKEM_Q)
+  requires(1 <= len && len <= MLKEM_N / 4)
+  requires(start <= MLKEM_N - 4 * len)
+  requires(z0 > -MLKEM_Q_HALF && z0 < MLKEM_Q_HALF)
+  requires(z1 > -MLKEM_Q_HALF && z1 < MLKEM_Q_HALF)
+  requires(z2 > -MLKEM_Q_HALF && z2 < MLKEM_Q_HALF)
+  requires(array_abs_bound(r, start, MLKEM_N, bound))
+  requires(array_abs_bound(r, 0, start, bound + 2 * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, start + 4 * len, MLKEM_N, bound))
+  ensures(array_abs_bound(r, 0, start + 4 * len, bound + 2 * MLKEM_Q))
+)
+{
+  unsigned j = 0;
+  /* `bound` is a ghost variable referenced only in the CBMC contract. */
+  ((void)bound);
+  for (j = 0; j < len; j++)
+  __loop__(
+    assigns(j, memory_slice(r, sizeof(int16_t) * MLKEM_N))
+    invariant(j <= len)
+    /* Static bounds */
+    invariant(array_abs_bound(r, 0, start, bound + 2 * MLKEM_Q))
+    invariant(array_abs_bound(r, start + 4 * len, MLKEM_N, bound))
+    /* Dynamic bounds */
+    invariant(array_abs_bound(r, start + 0 * len,     start + 0 * len + j, bound + 2 * MLKEM_Q))
+    invariant(array_abs_bound(r, start + 1 * len,     start + 1 * len + j, bound + 2 * MLKEM_Q))
+    invariant(array_abs_bound(r, start + 2 * len,     start + 2 * len + j, bound + 2 * MLKEM_Q))
+    invariant(array_abs_bound(r, start + 3 * len,     start + 3 * len + j, bound + 2 * MLKEM_Q))
+    invariant(array_abs_bound(r, start + 0 * len + j, start + 1 * len,     bound))
+    invariant(array_abs_bound(r, start + 1 * len + j, start + 2 * len,     bound))
+    invariant(array_abs_bound(r, start + 2 * len + j, start + 3 * len,     bound))
+    invariant(array_abs_bound(r, start + 3 * len + j, start + 4 * len,     bound))
+    decreases(len - j))
+  {
+    const unsigned i0 = start + j;
+    const unsigned i1 = i0 + 1 * len;
+    const unsigned i2 = i0 + 2 * len;
+    const unsigned i3 = i0 + 3 * len;
+
+    mlk_ct_butterfly(r, i0, i2, z0, z0t);
+    mlk_ct_butterfly(r, i1, i3, z0, z0t);
+    mlk_ct_butterfly(r, i0, i1, z1, z1t);
+    mlk_ct_butterfly(r, i2, i3, z2, z2t);
+  }
+}
+
+/*
+ * Two merged forward-NTT layers.
+ */
+static MLK_INLINE void mlk_ntt_2_layers(int16_t r[MLKEM_N],
+                                        const unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(layer == 1 || layer == 3 || layer == 5)
+  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 2) * MLKEM_Q)))
+{
+  const unsigned len_outer = (unsigned)MLKEM_N >> layer;
+  const unsigned len = len_outer >> 1;
+  unsigned start, k;
+  k = 1u << (layer - 1);
+  for (start = 0; start < MLKEM_N; start += 2 * len_outer)
+  __loop__(
+    invariant(start <= MLKEM_N)
+    invariant((1u << (layer - 1)) <= k && k <= (1u << layer))
+    invariant(2 * len_outer * k == start + MLKEM_N)
+    invariant(array_abs_bound(r, 0, start, (layer + 2) * MLKEM_Q))
+    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))
+    decreases(MLKEM_N - start))
+  {
+    /* Negation of the zetas is embedded in
+     * mlk_ntt_2_layers_blocks -> mlk_gs_butterfly */
+    const int16_t z0 = mlk_zetas[k][0];
+    const int16_t z1 = mlk_zetas[2 * k][0];
+    const int16_t z2 = mlk_zetas[2 * k + 1][0];
+
+    const int16_t z0t = mlk_zetas[k][1];
+    const int16_t z1t = mlk_zetas[2 * k][1];
+    const int16_t z2t = mlk_zetas[2 * k + 1][1];
+
+    k++;
+    mlk_ntt_2_layers_block(r, start, len, z0, z0t, z1, z1t, z2, z2t,
+                           (int16_t)(layer * MLKEM_Q));
+  }
+}
+
+/*
+ * Single-layer forward NTT, used for the leftover layer 7 (len=2) after the
+ * three merged 2-layer calls.
  */
 
 /* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
 static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                    unsigned start, unsigned len,
-                                    unsigned bound)
+                                    int16_t zeta_t, unsigned start,
+                                    unsigned len, unsigned bound)
 __contract__(
   requires(start < MLKEM_N)
   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
@@ -372,20 +501,13 @@ __contract__(
     decreases(start + len - j))
   {
     int16_t t;
-    t = mlk_fqmul(r[j + len], zeta);
+    t = mlk_fqmul(r[j + len], zeta, zeta_t);
     /* The precondition implies that the arithmetic does not overflow. */
     r[j + len] = (int16_t)(r[j] - t);
     r[j] = (int16_t)(r[j] + t);
   }
 }
 
-/*
- * Compute one layer of forward NTT
- * Parameters:
- * - r: Pointer to base of polynomial
- * - layer: Variable indicating which layer is being applied.
- */
-
 /* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
 static void mlk_ntt_layer(int16_t r[MLKEM_N], unsigned layer)
 __contract__(
@@ -407,8 +529,10 @@ __contract__(
     invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q))
     decreases(MLKEM_N - start))
   {
-    int16_t zeta = mlk_zetas[k++];
-    mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+    const int16_t zeta = mlk_zetas[k][0];
+    const int16_t zeta_t = mlk_zetas[k][1];
+    k++;
+    mlk_ntt_butterfly_block(r, zeta, zeta_t, start, len, layer * MLKEM_Q);
   }
 }
 
@@ -432,21 +556,17 @@ __contract__(
   ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
 )
 {
-  unsigned layer;
   int16_t *r;
 
   mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
-
   r = p->coeffs;
 
-  for (layer = 1; layer <= 7; layer++)
-  __loop__(
-    invariant(1 <= layer && layer <= 8)
-    invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
-    decreases(8 - layer))
-  {
-    mlk_ntt_layer(r, layer);
-  }
+  mlk_ntt_2_layers(r, 1);
+  mlk_ntt_2_layers(r, 3);
+  mlk_ntt_2_layers(r, 5);
+  /* Layer 7 is left as a single layer because the 2-layer merge only covers
+   * an even number of layers (ML-KEM has 7). */
+  mlk_ntt_layer(r, 7);
 
   /* Check the stronger bound */
   mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
@@ -470,7 +590,96 @@ void mlk_poly_ntt(mlk_poly *p)
 }
 
 
-/* Compute one layer of inverse NTT */
+/*
+ * Two merged inverse-NTT layers, applied to one outer block.
+ *
+ * Bound discipline (int16_t): on entry every coefficient is <MLKEM_Q. After
+ * the first pair of GS butterflies the additive outputs are <2*MLKEM_Q; the
+ * second pair leaves the multiplicative outputs <MLKEM_Q via fqmul, and we
+ * Barrett-reduce the additive outputs explicitly so all four coefficients are
+ * again <MLKEM_Q.
+ */
+static MLK_INLINE void mlk_invntt_2_layers_block(int16_t r[MLKEM_N],
+                                                 unsigned start, unsigned len,
+                                                 int16_t z0, int16_t z0t,
+                                                 int16_t z1, int16_t z1t,
+                                                 int16_t z2, int16_t z2t)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(1 <= len && len <= MLKEM_N / 4)
+  requires(start <= MLKEM_N - 4 * len)
+  requires(z0 > -MLKEM_Q_HALF && z0 < MLKEM_Q_HALF)
+  requires(z1 > -MLKEM_Q_HALF && z1 < MLKEM_Q_HALF)
+  requires(z2 > -MLKEM_Q_HALF && z2 < MLKEM_Q_HALF)
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+)
+{
+  unsigned j = 0;
+  for (j = 0; j < len; j++)
+  __loop__(
+    assigns(j, memory_slice(r, sizeof(int16_t) * MLKEM_N))
+    invariant(j <= len)
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    decreases(len - j))
+  {
+    const unsigned i0 = start + j;
+    const unsigned i1 = i0 + 1 * len;
+    const unsigned i2 = i0 + 2 * len;
+    const unsigned i3 = i0 + 3 * len;
+    /* Bounds: MLKEM_Q */
+    mlk_gs_butterfly(r, i0, i1, z1, z1t);
+    mlk_gs_butterfly(r, i2, i3, z2, z2t);
+    /* Bounds: 2 * MLKEM_Q */
+    mlk_gs_butterfly(r, i0, i2, z0, z0t);
+    mlk_gs_butterfly(r, i1, i3, z0, z0t);
+    /* Barrett-reduce the additive outputs back to <MLKEM_Q. */
+    r[i0] = mlk_barrett_reduce(r[i0]);
+    r[i1] = mlk_barrett_reduce(r[i1]);
+  }
+}
+
+/*
+ * Two merged inverse-NTT layers.
+ */
+static MLK_INLINE void mlk_invntt_2_layers(int16_t r[MLKEM_N],
+                                           const unsigned layer)
+__contract__(
+  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+  requires(layer == 2 || layer == 4 || layer == 6)
+  requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
+{
+  const unsigned len = (unsigned)MLKEM_N >> layer;
+  const unsigned len_outer = len << 1;
+  unsigned start, k;
+  k = (1u << (layer - 1)) - 1u;
+  for (start = 0; start < MLKEM_N; start += 2 * len_outer)
+  __loop__(
+    invariant(start <= MLKEM_N)
+    invariant(k < (1u << (layer - 1)))
+    invariant(2 * len_outer * k + 2 * len_outer == 2 * MLKEM_N - start)
+    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
+    decreases(MLKEM_N - start))
+  {
+    /* Zetas are passed un-negated; mlk_gs_butterfly absorbs the negation. */
+    const int16_t z0 = mlk_zetas[k][0];
+    const int16_t z1 = mlk_zetas[2 * k + 1][0];
+    const int16_t z2 = mlk_zetas[2 * k][0];
+
+    const int16_t z0t = mlk_zetas[k][1];
+    const int16_t z1t = mlk_zetas[2 * k + 1][1];
+    const int16_t z2t = mlk_zetas[2 * k][1];
+
+    k--;
+    mlk_invntt_2_layers_block(r, start, len, z0, z0t, z1, z1t, z2, z2t);
+  }
+}
+
+/* Single-layer inverse NTT, used for the leftover layer 7 (len=1) before
+ * the three merged 2-layer calls. */
 
 /* Reference: Embedded into `invntt()` in the reference implementation @[REF] */
 static void mlk_invntt_layer(int16_t *r, unsigned layer)
@@ -493,7 +702,9 @@ __contract__(
     decreases(MLKEM_N - start))
   {
     unsigned j;
-    int16_t zeta = mlk_zetas[k--];
+    const int16_t zeta = mlk_zetas[k][0];
+    const int16_t zeta_t = mlk_zetas[k][1];
+    k--;
     for (j = start; j < start + len; j++)
     __loop__(
       invariant(start <= j && j <= start + len)
@@ -505,7 +716,7 @@ __contract__(
       /* The preconditions imply that the arithmetic does not overflow. */
       r[j] = mlk_barrett_reduce((int16_t)(t + r[j + len]));
       r[j + len] = (int16_t)(r[j + len] - t);
-      r[j + len] = mlk_fqmul(r[j + len], zeta);
+      r[j + len] = mlk_fqmul(r[j + len], zeta, zeta_t);
     }
   }
 }
@@ -522,8 +733,11 @@ __contract__(
   ensures(array_abs_bound(p->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
 )
 {
-  unsigned j, layer;
+  unsigned j;
   const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
+  /* check-magic:
+       -10079 == signed_mod(1441 * pow(MLKEM_Q, -1, 2^16), 2^16) */
+  const int16_t f_twisted = -10079;
   int16_t *r = p->coeffs;
 
   /*
@@ -537,18 +751,15 @@ __contract__(
     invariant(array_abs_bound(r, 0, j, MLKEM_Q))
     decreases(MLKEM_N - j))
   {
-    r[j] = mlk_fqmul(r[j], f);
+    r[j] = mlk_fqmul(r[j], f, f_twisted);
   }
 
-  /* Run the invNTT layers */
-  for (layer = 7; layer > 0; layer--)
-  __loop__(
-    invariant(0 <= layer && layer < 8)
-    invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
-    decreases(layer))
-  {
-    mlk_invntt_layer(r, layer);
-  }
+  /* Layer 7 is left as a single layer because the 2-layer merge only covers
+   * an even number of layers (ML-KEM has 7). */
+  mlk_invntt_layer(r, 7);
+  mlk_invntt_2_layers(r, 6);
+  mlk_invntt_2_layers(r, 4);
+  mlk_invntt_2_layers(r, 2);
 
   mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
 }
diff --git a/mlkem/src/zetas.inc b/mlkem/src/zetas.inc
index 00316daf6..f16fafe3a 100644
--- a/mlkem/src/zetas.inc
+++ b/mlkem/src/zetas.inc
@@ -12,19 +12,39 @@
 
 /*
  * Table of zeta values used in the reference NTT and inverse NTT.
- * See autogen for details.
+ * Each row is (zeta_mont, zeta_twisted); see autogen for details.
  */
-static MLK_ALIGN const int16_t mlk_zetas[128] = {
-    -1044, -758,  -359,  -1517, 1493,  1422,  287,   202,  -171,  622,   1577,
-    182,   962,   -1202, -1474, 1468,  573,   -1325, 264,  383,   -829,  1458,
-    -1602, -130,  -681,  1017,  732,   608,   -1542, 411,  -205,  -1571, 1223,
-    652,   -552,  1015,  -1293, 1491,  -282,  -1544, 516,  -8,    -320,  -666,
-    -1618, -1162, 126,   1469,  -853,  -90,   -271,  830,  107,   -1421, -247,
-    -951,  -398,  961,   -1508, -725,  448,   -1065, 677,  -1275, -1103, 430,
-    555,   843,   -1251, 871,   1550,  105,   422,   587,  177,   -235,  -291,
-    -460,  1574,  1653,  -246,  778,   1159,  -147,  -777, 1483,  -602,  1119,
-    -1590, 644,   -872,  349,   418,   329,   -156,  -75,  817,   1097,  603,
-    610,   1322,  -1285, -1465, 384,   -1215, -136,  1218, -1335, -874,  220,
-    -1187, -1659, -1185, -1530, -1278, 794,   -1510, -854, -870,  478,   -108,
-    -308,  996,   991,   958,   -1460, 1522,  1628,
+static MLK_ALIGN const int16_t mlk_zetas[128][2] = {
+    {-1044, -20},    {-758, 31498},   {-359, 14745},   {-1517, 787},
+    {1493, 13525},   {1422, -12402},  {287, 28191},    {202, -16694},
+    {-171, -20907},  {622, 27758},    {1577, -3799},   {182, -15690},
+    {962, 10690},    {-1202, 1358},   {-1474, -11202}, {1468, 31164},
+    {573, -5827},    {-1325, 17363},  {264, -26360},   {383, -29057},
+    {-829, 5571},    {1458, -1102},   {-1602, 21438},  {-130, -26242},
+    {-681, -28073},  {1017, 24313},   {732, -10532},   {608, 8800},
+    {-1542, 18426},  {411, 8859},     {-205, 26675},   {-1571, -16163},
+    {1223, -5689},   {652, -6516},    {-552, 1496},    {1015, 30967},
+    {-1293, -23565}, {1491, 20179},   {-282, 20710},   {-1544, 25080},
+    {516, -12796},   {-8, 26616},     {-320, 16064},   {-666, -12442},
+    {-1618, 9134},   {-1162, -650},   {126, -25986},   {1469, 27837},
+    {-853, 19883},   {-90, -28250},   {-271, -15887},  {830, -8898},
+    {107, -28309},   {-1421, 9075},   {-247, -30199},  {-951, 18249},
+    {-398, 13426},   {961, 14017},    {-1508, -29156}, {-725, -12757},
+    {448, 16832},    {-1065, 4311},   {677, -24155},   {-1275, -17915},
+    {-1103, -335},   {430, 11182},    {555, -11477},   {843, 13387},
+    {-1251, -32227}, {871, -14233},   {1550, 20494},   {105, -21655},
+    {422, -27738},   {587, 13131},    {177, 945},      {-235, -4587},
+    {-291, -14883},  {-460, 23092},   {1574, 6182},    {1653, 5493},
+    {-246, 32010},   {778, -32502},   {1159, 10631},   {-147, 30317},
+    {-777, 29175},   {1483, -18741},  {-602, -28762},  {1119, 12639},
+    {-1590, -18486}, {644, 20100},    {-872, 17560},   {349, 18525},
+    {418, -14430},   {329, 19529},    {-156, -5276},   {-75, -12619},
+    {817, -31183},   {1097, 20297},   {603, 25435},    {610, 2146},
+    {1322, -7382},   {-1285, 15355},  {-1465, 24391},  {384, -32384},
+    {-1215, -20927}, {-136, -6280},   {1218, 10946},   {-1335, -14903},
+    {-874, 24214},   {220, -11044},   {-1187, 16989},  {-1659, 14469},
+    {-1185, 10335},  {-1530, -21498}, {-1278, -7934},  {794, -20198},
+    {-1510, -22502}, {-854, 23210},   {-870, 10906},   {478, -17442},
+    {-108, 31636},   {-308, -23860},  {996, 28644},    {991, -20257},
+    {958, 23998},    {-1460, 7756},   {1522, -17422},  {1628, 23132},
 };
diff --git a/proofs/cbmc/fqmul/fqmul_harness.c b/proofs/cbmc/fqmul/fqmul_harness.c
index 570e508af..ff7e04478 100644
--- a/proofs/cbmc/fqmul/fqmul_harness.c
+++ b/proofs/cbmc/fqmul/fqmul_harness.c
@@ -6,11 +6,11 @@
 #include "common.h"
 
 
-int16_t mlk_fqmul(int16_t a, int16_t b);
+int16_t mlk_fqmul(int16_t a, int16_t b, int16_t b_twisted);
 
 void harness(void)
 {
-  int16_t a, b, r;
+  int16_t a, b, b_twisted, r;
 
-  r = mlk_fqmul(a, b);
+  r = mlk_fqmul(a, b, b_twisted);
 }
diff --git a/proofs/cbmc/invntt_2_layers/Makefile b/proofs/cbmc/invntt_2_layers/Makefile
new file mode 100644
index 000000000..cb3063704
--- /dev/null
+++ b/proofs/cbmc/invntt_2_layers/Makefile
@@ -0,0 +1,34 @@
+# Copyright (c) The mlkem-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = invntt_2_layers_harness
+
+# This should be a unique identifier for this proof, and will appear on the
+# Litani dashboard. It can be human-readable and contain spaces if you wish.
+PROOF_UID = mlk_invntt_2_layers
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
+
+CHECK_FUNCTION_CONTRACTS=mlk_invntt_2_layers
+USE_FUNCTION_CONTRACTS=mlk_invntt_2_layers_block
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--smt2
+
+FUNCTION_NAME = mlk_invntt_2_layers
+
+CBMC_OBJECT_BITS = 8
+
+include ../Makefile.common
diff --git a/proofs/cbmc/invntt_2_layers/invntt_2_layers_harness.c b/proofs/cbmc/invntt_2_layers/invntt_2_layers_harness.c
new file mode 100644
index 000000000..014bb8957
--- /dev/null
+++ b/proofs/cbmc/invntt_2_layers/invntt_2_layers_harness.c
@@ -0,0 +1,15 @@
+// Copyright (c) The mlkem-native project authors
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+#include "poly.h"
+
+
+void mlk_invntt_2_layers(int16_t *r, unsigned layer);
+
+void harness(void)
+{
+  int16_t *r;
+  unsigned layer;
+  mlk_invntt_2_layers(r, layer);
+}
diff --git a/proofs/cbmc/invntt_2_layers_block/Makefile b/proofs/cbmc/invntt_2_layers_block/Makefile
new file mode 100644
index 000000000..884687706
--- /dev/null
+++ b/proofs/cbmc/invntt_2_layers_block/Makefile
@@ -0,0 +1,33 @@
+# Copyright (c) The mlkem-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = invntt_2_layers_block_harness
+
+PROOF_UID = mlk_invntt_2_layers_block
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
+
+CHECK_FUNCTION_CONTRACTS=mlk_invntt_2_layers_block
+USE_FUNCTION_CONTRACTS = mlk_fqmul
+USE_FUNCTION_CONTRACTS += mlk_barrett_reduce
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--smt2
+
+FUNCTION_NAME = mlk_invntt_2_layers_block
+
+CBMC_OBJECT_BITS = 9
+
+include ../Makefile.common
diff --git a/proofs/cbmc/invntt_2_layers_block/invntt_2_layers_block_harness.c b/proofs/cbmc/invntt_2_layers_block/invntt_2_layers_block_harness.c
new file mode 100644
index 000000000..d4968af7b
--- /dev/null
+++ b/proofs/cbmc/invntt_2_layers_block/invntt_2_layers_block_harness.c
@@ -0,0 +1,20 @@
+// Copyright (c) The mlkem-native project authors
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+#include <stdint.h>
+#include "common.h"
+
+
+void mlk_invntt_2_layers_block(int16_t *r, unsigned start, unsigned len,
+                               int16_t z0, int16_t z0_twst, int16_t z1,
+                               int16_t z1_twst, int16_t z2, int16_t z2_twst);
+
+void harness(void)
+{
+  int16_t *r;
+  unsigned start, len;
+  int16_t z0, z0_twst, z1, z1_twst, z2, z2_twst;
+  mlk_invntt_2_layers_block(r, start, len, z0, z0_twst, z1, z1_twst, z2,
+                            z2_twst);
+}
diff --git a/proofs/cbmc/ntt_2_layers/Makefile b/proofs/cbmc/ntt_2_layers/Makefile
new file mode 100644
index 000000000..32360d139
--- /dev/null
+++ b/proofs/cbmc/ntt_2_layers/Makefile
@@ -0,0 +1,42 @@
+# Copyright (c) The mlkem-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = ntt_2_layers_harness
+
+# This should be a unique identifier for this proof, and will appear on the
+# Litani dashboard. It can be human-readable and contain spaces if you wish.
+PROOF_UID = mlk_ntt_2_layers
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
+
+CHECK_FUNCTION_CONTRACTS=mlk_ntt_2_layers
+USE_FUNCTION_CONTRACTS=mlk_ntt_2_layers_block
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--bitwuzla
+
+FUNCTION_NAME = mlk_ntt_2_layers
+
+# If this proof is found to consume huge amounts of RAM, you can set the
+# EXPENSIVE variable. With new enough versions of the proof tools, this will
+# restrict the number of EXPENSIVE CBMC jobs running at once. See the
+# documentation in Makefile.common under the "Job Pools" heading for details.
+# EXPENSIVE = true
+
+# This function is large enough to need...
+CBMC_OBJECT_BITS = 8
+
+include ../Makefile.common
diff --git a/proofs/cbmc/ntt_2_layers/ntt_2_layers_harness.c b/proofs/cbmc/ntt_2_layers/ntt_2_layers_harness.c
new file mode 100644
index 000000000..46a2a2f95
--- /dev/null
+++ b/proofs/cbmc/ntt_2_layers/ntt_2_layers_harness.c
@@ -0,0 +1,15 @@
+// Copyright (c) The mlkem-native project authors
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+#include "poly.h"
+
+
+void mlk_ntt_2_layers(int16_t *r, unsigned layer);
+
+void harness(void)
+{
+  int16_t *r;
+  unsigned layer;
+  mlk_ntt_2_layers(r, layer);
+}
diff --git a/proofs/cbmc/ntt_2_layers_block/Makefile b/proofs/cbmc/ntt_2_layers_block/Makefile
new file mode 100644
index 000000000..b3b6f5e4a
--- /dev/null
+++ b/proofs/cbmc/ntt_2_layers_block/Makefile
@@ -0,0 +1,32 @@
+# Copyright (c) The mlkem-native project authors
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = ntt_2_layers_block_harness
+
+PROOF_UID = mlk_ntt_2_layers_block
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
+
+CHECK_FUNCTION_CONTRACTS=mlk_ntt_2_layers_block
+USE_FUNCTION_CONTRACTS=mlk_fqmul
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--bitwuzla
+
+FUNCTION_NAME = mlk_ntt_2_layers_block
+
+CBMC_OBJECT_BITS = 9
+
+include ../Makefile.common
diff --git a/proofs/cbmc/ntt_2_layers_block/ntt_2_layers_block_harness.c b/proofs/cbmc/ntt_2_layers_block/ntt_2_layers_block_harness.c
new file mode 100644
index 000000000..d823f72b3
--- /dev/null
+++ b/proofs/cbmc/ntt_2_layers_block/ntt_2_layers_block_harness.c
@@ -0,0 +1,21 @@
+// Copyright (c) The mlkem-native project authors
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+#include <stdint.h>
+#include "common.h"
+
+
+void mlk_ntt_2_layers_block(int16_t *r, unsigned start, unsigned len,
+                            int16_t z0, int16_t z0_twst, int16_t z1,
+                            int16_t z1_twst, int16_t z2, int16_t z2_twst,
+                            const int16_t bound);
+
+void harness(void)
+{
+  int16_t *r;
+  unsigned start, len;
+  int16_t z0, z0_twst, z1, z1_twst, z2, z2_twst, bound;
+  mlk_ntt_2_layers_block(r, start, len, z0, z0_twst, z1, z1_twst, z2, z2_twst,
+                         bound);
+}
diff --git a/proofs/cbmc/ntt_butterfly_block/ntt_butterfly_block_harness.c b/proofs/cbmc/ntt_butterfly_block/ntt_butterfly_block_harness.c
index 4434ffe1a..aa03d39fd 100644
--- a/proofs/cbmc/ntt_butterfly_block/ntt_butterfly_block_harness.c
+++ b/proofs/cbmc/ntt_butterfly_block/ntt_butterfly_block_harness.c
@@ -6,12 +6,12 @@
 #include "common.h"
 
 
-void mlk_ntt_butterfly_block(int16_t *r, int16_t root, unsigned start,
-                             unsigned len, unsigned bound);
+void mlk_ntt_butterfly_block(int16_t *r, int16_t root, int16_t root_twisted,
+                             unsigned start, unsigned len, unsigned bound);
 
 void harness(void)
 {
-  int16_t *r, root;
+  int16_t *r, root, root_twisted;
   unsigned start, stride, bound;
-  mlk_ntt_butterfly_block(r, root, start, stride, bound);
+  mlk_ntt_butterfly_block(r, root, root_twisted, start, stride, bound);
 }
diff --git a/proofs/cbmc/poly_invntt_tomont_c/Makefile b/proofs/cbmc/poly_invntt_tomont_c/Makefile
index ae66bf139..180e94a4c 100644
--- a/proofs/cbmc/poly_invntt_tomont_c/Makefile
+++ b/proofs/cbmc/poly_invntt_tomont_c/Makefile
@@ -22,6 +22,7 @@ PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
 CHECK_FUNCTION_CONTRACTS=mlk_poly_invntt_tomont_c
 USE_FUNCTION_CONTRACTS = mlk_fqmul
 USE_FUNCTION_CONTRACTS += mlk_invntt_layer
+USE_FUNCTION_CONTRACTS += mlk_invntt_2_layers
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
diff --git a/proofs/cbmc/poly_ntt_c/Makefile b/proofs/cbmc/poly_ntt_c/Makefile
index 11ed7098b..5704a2565 100644
--- a/proofs/cbmc/poly_ntt_c/Makefile
+++ b/proofs/cbmc/poly_ntt_c/Makefile
@@ -21,6 +21,7 @@ PROJECT_SOURCES += $(SRCDIR)/mlkem/src/poly.c
 
 CHECK_FUNCTION_CONTRACTS=mlk_poly_ntt_c
 USE_FUNCTION_CONTRACTS=mlk_ntt_layer
+USE_FUNCTION_CONTRACTS+=mlk_ntt_2_layers
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
diff --git a/scripts/autogen b/scripts/autogen
index ba7801fb5..9cbc1bccf 100755
--- a/scripts/autogen
+++ b/scripts/autogen
@@ -714,17 +714,25 @@ def signed_reduce(a):
 
 def gen_c_zetas():
     """Generate source and header file for zeta values used in
-    the reference NTT and invNTT"""
+    the reference NTT and invNTT.
+
+    Each yielded entry is a pair (zeta_mont, zeta_twisted) where:
+      - zeta_mont    = signed canonical representative of zeta * 2^16 mod q
+      - zeta_twisted = signed canonical 16-bit representative of
+                       zeta_mont * q^{-1} mod 2^16
+    The twisted form lets the C-reference fqmul skip the extra QINV
+    multiply that mlk_montgomery_reduce would otherwise perform."""
 
     # The zeta values are the powers of the chosen root of unity (17),
-    # converted to Montgomery form.
+    # converted to Montgomery form (and to its twisted-Montgomery counterpart).
 
-    zeta = []
+    zeta_pairs = []
     for i in range(128):
-        zeta.append(signed_reduce(pow(root_of_unity, i, modulus) * montgomery_factor))
+        raw = pow(root_of_unity, i, modulus)
+        zeta_pairs.append(prepare_root_for_montmul(raw))
 
     # The source code stores the zeta table in bit reversed form
-    yield from (zeta[bitreverse(i, 7)] for i in range(128))
+    yield from (zeta_pairs[bitreverse(i, 7)] for i in range(128))
 
 
 def gen_c_zeta_file():
@@ -733,10 +741,11 @@ def gen_c_zeta_file():
         yield ""
         yield "/*"
         yield " * Table of zeta values used in the reference NTT and inverse NTT."
-        yield " * See autogen for details."
+        yield " * Each row is (zeta_mont, zeta_twisted); see autogen for details."
         yield " */"
-        yield "static MLK_ALIGN const int16_t mlk_zetas[128] = {"
-        yield from map(lambda t: str(t) + ",", gen_c_zetas())
+        yield "static MLK_ALIGN const int16_t mlk_zetas[128][2] = {"
+        for z_mont, z_twisted in gen_c_zetas():
+            yield f"{{ {z_mont}, {z_twisted} }},"
         yield "};"
         yield ""
 
diff --git a/scripts/check-magic b/scripts/check-magic
index 42b50815d..29105374a 100755
--- a/scripts/check-magic
+++ b/scripts/check-magic
@@ -37,6 +37,37 @@ FAIL = f"{RED}✗{NORMAL}"
 REMEMBERED = f"{BLUE}⊢{NORMAL}"
 
 
+def join_block_comments(lines):
+    """Collapse multi-line C block comments onto their opening line.
+
+    Each continuation line is replaced with an empty placeholder so the
+    overall line count - and therefore line numbers in diagnostics - is
+    unchanged. Only handles block comments that start and end with the
+    standard /* ... */ delimiters; line comments and strings are left
+    alone, which is sufficient for this script's input.
+    """
+    out = list(lines)
+    i = 0
+    n = len(out)
+    while i < n:
+        line = out[i]
+        if "/*" in line and "*/" not in line:
+            j = i + 1
+            merged = line
+            while j < n and "*/" not in out[j]:
+                merged += " " + out[j].strip()
+                out[j] = ""
+                j += 1
+            if j < n:
+                merged += " " + out[j].strip()
+                out[j] = ""
+                out[i] = merged
+                i = j + 1
+                continue
+        i += 1
+    return out
+
+
 def check_magic_numbers():
     mlkem_q = 3329
     exceptions = [
@@ -129,6 +160,11 @@ def check_magic_numbers():
         if autogen_marker in content:
             continue
         content = content.split("\n")
+        # Pre-pass: collapse multi-line block comments onto their opening
+        # line so wrapped check-magic annotations are recognized. Consumed
+        # continuation lines are replaced with empty placeholders to preserve
+        # line numbers in diagnostics.
+        content = join_block_comments(content)
         # Use negative lookbefore and lookahead to exclude numbers
         # that occur as part of identifiers (e.g. layer12345 or 199901L)
         pattern = r"(?<![0-9a-zA-Z/_-])([-]?\d{4,})(?![0-9a-zA-Z_-])"