From 30f89a6c3dd5194c6dc9dea8db3f0fe7ce0e7be6 Mon Sep 17 00:00:00 2001
From: Greg von Nessi <greg.vonnessi@entrolution.ai>
Date: Thu, 26 Feb 2026 00:42:49 +0000
Subject: [PATCH 1/5] Deduplicate reverse sweep in gradient_with_buf and
 sparse_jacobian_par

Both methods inlined the same reverse sweep loop that reverse_sweep_core
already implements. Replace with calls to the shared function, which also
picks up the zero-adjoint skip optimization in gradient_with_buf.
---
 src/bytecode_tape.rs | 88 +-------------------------------------------
 1 file changed, 2 insertions(+), 86 deletions(-)
diff --git a/src/bytecode_tape.rs b/src/bytecode_tape.rs
index 6dbbb65..27dbbdd 100644
--- a/src/bytecode_tape.rs
+++ b/src/bytecode_tape.rs
@@ -898,50 +898,7 @@ impl<F: Float> BytecodeTape<F> {
         adjoint_buf.resize(n, F::zero());
         adjoint_buf[self.output_index as usize] = F::one();
 
-        for i in (0..self.opcodes.len()).rev() {
-            let adj = adjoint_buf[i];
-            if adj == F::zero() {
-                continue;
-            }
-
-            match self.opcodes[i] {
-                OpCode::Input | OpCode::Const => continue,
-                OpCode::Custom => {
-                    adjoint_buf[i] = F::zero();
-                    let [a_idx, cb_idx] = self.arg_indices[i];
-                    let a = self.values[a_idx as usize];
-                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-                    let b = b_idx_opt
-                        .map(|bi| self.values[bi as usize])
-                        .unwrap_or(F::zero());
-                    let r = self.values[i];
-                    let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r);
-                    adjoint_buf[a_idx as usize] = adjoint_buf[a_idx as usize] + da * adj;
-                    if let Some(bi) = b_idx_opt {
-                        adjoint_buf[bi as usize] = adjoint_buf[bi as usize] + db * adj;
-                    }
-                }
-                op => {
-                    adjoint_buf[i] = F::zero();
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = self.values[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        self.values[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        F::from(b_idx).unwrap_or_else(|| F::zero())
-                    } else {
-                        F::zero()
-                    };
-                    let r = self.values[i];
-                    let (da, db) = opcode::reverse_partials(op, a, b, r);
-
-                    adjoint_buf[a_idx as usize] = adjoint_buf[a_idx as usize] + da * adj;
-                    if b_idx != UNUSED && op != OpCode::Powi {
-                        adjoint_buf[b_idx as usize] = adjoint_buf[b_idx as usize] + db * adj;
-                    }
-                }
-            }
-        }
+        self.reverse_sweep_core(adjoint_buf, &self.values, None);
         adjoint_buf[..self.num_inputs as usize].to_vec()
     }
 
@@ -2410,48 +2367,7 @@ impl<F: Float> BytecodeTape<F> {
                         }
                     }
 
-                    for idx in (0..self.opcodes.len()).rev() {
-                        let adj = adjoints[idx];
-                        if adj == F::zero() {
-                            continue;
-                        }
-                        match self.opcodes[idx] {
-                            OpCode::Input | OpCode::Const => continue,
-                            OpCode::Custom => {
-                                adjoints[idx] = F::zero();
-                                let [a_idx, cb_idx] = self.arg_indices[idx];
-                                let a = values_buf[a_idx as usize];
-                                let b_idx_opt = self.custom_second_args.get(&(idx as u32)).copied();
-                                let b = b_idx_opt
-                                    .map(|bi| values_buf[bi as usize])
-                                    .unwrap_or(F::zero());
-                                let r = values_buf[idx];
-                                let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r);
-                                adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
-                                if let Some(bi) = b_idx_opt {
-                                    adjoints[bi as usize] = adjoints[bi as usize] + db * adj;
-                                }
-                            }
-                            op => {
-                                adjoints[idx] = F::zero();
-                                let [a_idx, b_idx] = self.arg_indices[idx];
-                                let a = values_buf[a_idx as usize];
-                                let b = if b_idx != UNUSED && op != OpCode::Powi {
-                                    values_buf[b_idx as usize]
-                                } else if op == OpCode::Powi {
-                                    F::from(b_idx).unwrap_or_else(|| F::zero())
-                                } else {
-                                    F::zero()
-                                };
-                                let r = values_buf[idx];
-                                let (da, db) = opcode::reverse_partials(op, a, b, r);
-                                adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
-                                if b_idx != UNUSED && op != OpCode::Powi {
-                                    adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj;
-                                }
-                            }
-                        }
-                    }
+                    self.reverse_sweep_core(&mut adjoints, &values_buf, None);
                     adjoints[..ni].to_vec()
                 })
                 .collect();

From e5cf808c18e9967781a8ad64f8220a488504279a Mon Sep 17 00:00:00 2001
From: Greg von Nessi <greg.vonnessi@entrolution.ai>
Date: Thu, 26 Feb 2026 00:43:27 +0000
Subject: [PATCH 2/5] Move bytecode_tape.rs to bytecode_tape/mod.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prepare for submodule decomposition. Pure rename via git mv,
zero content changes — preserves full blame history.
---
 src/{bytecode_tape.rs => bytecode_tape/mod.rs} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/{bytecode_tape.rs => bytecode_tape/mod.rs} (100%)

diff --git a/src/bytecode_tape.rs b/src/bytecode_tape/mod.rs
similarity index 100%
rename from src/bytecode_tape.rs
rename to src/bytecode_tape/mod.rs

From c5153696404f33d1ec9f9e328e57bcbc15a04c6b Mon Sep 17 00:00:00 2001
From: Greg von Nessi <greg.vonnessi@entrolution.ai>
Date: Thu, 26 Feb 2026 00:56:56 +0000
Subject: [PATCH 3/5] Decompose BytecodeTape into directory module with 10
 submodules

Split the 2,689-line monolith into logical submodules for navigability:
mod.rs (struct + construction + accessors), forward.rs, reverse.rs,
tangent.rs, jacobian.rs, sparse.rs, optimize.rs, taylor.rs, parallel.rs,
serde_support.rs, thread_local.rs.

Struct fields changed to pub(crate) for cross-module impl blocks.
Seven internal methods upgraded to pub(super) for cross-module calls.
Zero public API or behavioral changes.
---
 src/bytecode_tape/forward.rs       |  184 +++
 src/bytecode_tape/jacobian.rs      |  183 +++
 src/bytecode_tape/mod.rs           | 2174 +---------------------------
 src/bytecode_tape/optimize.rs      |  271 ++++
 src/bytecode_tape/parallel.rs      |  263 ++++
 src/bytecode_tape/reverse.rs       |  170 +++
 src/bytecode_tape/serde_support.rs |   61 +
 src/bytecode_tape/sparse.rs        |  435 ++++++
 src/bytecode_tape/tangent.rs       |  380 +++++
 src/bytecode_tape/taylor.rs        |  118 ++
 src/bytecode_tape/thread_local.rs  |   87 ++
 11 files changed, 2177 insertions(+), 2149 deletions(-)
 create mode 100644 src/bytecode_tape/forward.rs
 create mode 100644 src/bytecode_tape/jacobian.rs
 create mode 100644 src/bytecode_tape/optimize.rs
 create mode 100644 src/bytecode_tape/parallel.rs
 create mode 100644 src/bytecode_tape/reverse.rs
 create mode 100644 src/bytecode_tape/serde_support.rs
 create mode 100644 src/bytecode_tape/sparse.rs
 create mode 100644 src/bytecode_tape/tangent.rs
 create mode 100644 src/bytecode_tape/taylor.rs
 create mode 100644 src/bytecode_tape/thread_local.rs

diff --git a/src/bytecode_tape/forward.rs b/src/bytecode_tape/forward.rs
new file mode 100644
index 0000000..e17ebe4
--- /dev/null
+++ b/src/bytecode_tape/forward.rs
@@ -0,0 +1,184 @@
+use crate::float::Float;
+use crate::opcode::{self, OpCode, UNUSED};
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Re-evaluate the tape at new inputs (forward sweep).
+    ///
+    /// Overwrites `values` in-place — no allocation.
+    pub fn forward(&mut self, inputs: &[F]) {
+        assert_eq!(
+            inputs.len(),
+            self.num_inputs as usize,
+            "wrong number of inputs"
+        );
+
+        // Overwrite input values.
+        for (i, &v) in inputs.iter().enumerate() {
+            self.values[i] = v;
+        }
+
+        // Re-evaluate all non-Input, non-Const ops.
+        for i in 0..self.opcodes.len() {
+            match self.opcodes[i] {
+                OpCode::Input | OpCode::Const => continue,
+                OpCode::Custom => {
+                    let [a_idx, cb_idx] = self.arg_indices[i];
+                    let a = self.values[a_idx as usize];
+                    let b = self
+                        .custom_second_args
+                        .get(&(i as u32))
+                        .map(|&bi| self.values[bi as usize])
+                        .unwrap_or(F::zero());
+                    self.values[i] = self.custom_ops[cb_idx as usize].eval(a, b);
+                }
+                op => {
+                    let [a_idx, b_idx] = self.arg_indices[i];
+                    let a = self.values[a_idx as usize];
+                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                        self.values[b_idx as usize]
+                    } else if op == OpCode::Powi {
+                        F::from(b_idx).unwrap_or_else(|| F::zero())
+                    } else {
+                        F::zero()
+                    };
+                    self.values[i] = opcode::eval_forward(op, a, b);
+                }
+            }
+        }
+    }
+
+    /// Forward sweep with nonsmooth branch tracking.
+    ///
+    /// Calls [`forward`](Self::forward) to evaluate the tape, then scans for
+    /// nonsmooth operations and records which branch was taken at each one.
+    ///
+    /// Tracked operations:
+    /// - `Abs`, `Min`, `Max` — kinks with nontrivial subdifferentials
+    /// - `Signum`, `Floor`, `Ceil`, `Round`, `Trunc` — step-function
+    ///   discontinuities (zero derivative on both sides, tracked for proximity
+    ///   detection only)
+    ///
+    /// Returns [`crate::NonsmoothInfo`] containing all kink entries in tape order.
+    pub fn forward_nonsmooth(&mut self, inputs: &[F]) -> crate::nonsmooth::NonsmoothInfo<F> {
+        self.forward(inputs);
+
+        let mut kinks = Vec::new();
+        for i in 0..self.opcodes.len() {
+            let op = self.opcodes[i];
+            if !opcode::is_nonsmooth(op) {
+                continue;
+            }
+
+            let [a_idx, b_idx] = self.arg_indices[i];
+            let a = self.values[a_idx as usize];
+
+            match op {
+                OpCode::Abs => {
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: a,
+                        branch: if a >= F::zero() { 1 } else { -1 },
+                    });
+                }
+                OpCode::Max => {
+                    let b = self.values[b_idx as usize];
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: a - b,
+                        branch: if a >= b { 1 } else { -1 },
+                    });
+                }
+                OpCode::Min => {
+                    let b = self.values[b_idx as usize];
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: a - b,
+                        branch: if a <= b { 1 } else { -1 },
+                    });
+                }
+                OpCode::Signum => {
+                    // Kink at x = 0 (same as Abs).
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: a,
+                        branch: if a >= F::zero() { 1 } else { -1 },
+                    });
+                }
+                OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => {
+                    // Kink at integer values. switching_value = distance to
+                    // nearest integer: zero exactly at kink points, works
+                    // symmetrically for both approach directions.
+                    kinks.push(crate::nonsmooth::KinkEntry {
+                        tape_index: i as u32,
+                        opcode: op,
+                        switching_value: a - a.round(),
+                        branch: if a - a.floor() < F::from(0.5).unwrap() {
+                            1
+                        } else {
+                            -1
+                        },
+                    });
+                }
+                _ => unreachable!(),
+            }
+        }
+
+        crate::nonsmooth::NonsmoothInfo { kinks }
+    }
+
+    /// Forward evaluation into an external buffer.
+    ///
+    /// Reads opcodes, constants, and argument indices from `self`, but writes
+    /// computed values into `values_buf` instead of `self.values`. This allows
+    /// parallel evaluation of the same tape at different inputs without cloning.
+    pub fn forward_into(&self, inputs: &[F], values_buf: &mut Vec<F>) {
+        assert_eq!(
+            inputs.len(),
+            self.num_inputs as usize,
+            "wrong number of inputs"
+        );
+
+        let n = self.num_variables as usize;
+        values_buf.clear();
+        values_buf.resize(n, F::zero());
+
+        // Copy constant values from the tape, then overwrite inputs.
+        values_buf.copy_from_slice(&self.values[..n]);
+        for (i, &v) in inputs.iter().enumerate() {
+            values_buf[i] = v;
+        }
+
+        // Re-evaluate all non-Input, non-Const ops.
+        for i in 0..self.opcodes.len() {
+            match self.opcodes[i] {
+                OpCode::Input | OpCode::Const => continue,
+                OpCode::Custom => {
+                    let [a_idx, cb_idx] = self.arg_indices[i];
+                    let a = values_buf[a_idx as usize];
+                    let b = self
+                        .custom_second_args
+                        .get(&(i as u32))
+                        .map(|&bi| values_buf[bi as usize])
+                        .unwrap_or(F::zero());
+                    values_buf[i] = self.custom_ops[cb_idx as usize].eval(a, b);
+                }
+                op => {
+                    let [a_idx, b_idx] = self.arg_indices[i];
+                    let a = values_buf[a_idx as usize];
+                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                        values_buf[b_idx as usize]
+                    } else if op == OpCode::Powi {
+                        F::from(b_idx).unwrap_or_else(|| F::zero())
+                    } else {
+                        F::zero()
+                    };
+                    values_buf[i] = opcode::eval_forward(op, a, b);
+                }
+            }
+        }
+    }
+}
diff --git a/src/bytecode_tape/jacobian.rs b/src/bytecode_tape/jacobian.rs
new file mode 100644
index 0000000..f698902
--- /dev/null
+++ b/src/bytecode_tape/jacobian.rs
@@ -0,0 +1,183 @@
+use std::collections::HashMap;
+
+use crate::dual::Dual;
+use crate::float::Float;
+use crate::opcode;
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Compute the full Jacobian of a multi-output tape via reverse mode.
+    ///
+    /// Performs `m` reverse sweeps (one per output). Returns `J[i][j] = ∂f_i/∂x_j`.
+    pub fn jacobian(&mut self, inputs: &[F]) -> Vec<Vec<F>> {
+        self.forward(inputs);
+
+        let out_indices = self.all_output_indices();
+
+        let ni = self.num_inputs as usize;
+        let mut jac = Vec::with_capacity(out_indices.len());
+
+        for &out_idx in out_indices {
+            let adjoints = self.reverse(out_idx);
+            jac.push(adjoints[..ni].to_vec());
+        }
+
+        jac
+    }
+
+    /// Vector-Jacobian product for a multi-output tape.
+    ///
+    /// Computes `wᵀ · J` where `J` is the Jacobian. More efficient than
+    /// computing the full Jacobian when only the weighted combination is needed.
+    pub fn vjp_multi(&mut self, inputs: &[F], weights: &[F]) -> Vec<F> {
+        self.forward(inputs);
+        self.reverse_seeded(weights)
+    }
+
+    /// Compute a Jacobian with forced branch choices at specified tape indices.
+    ///
+    /// For each `(tape_index, sign)` in `forced_signs`, the reverse sweep uses
+    /// [`forced_reverse_partials`](opcode::forced_reverse_partials) instead of the
+    /// standard partials at that index.
+    ///
+    /// This is the building block for Clarke subdifferential enumeration.
+    pub fn jacobian_limiting(&mut self, inputs: &[F], forced_signs: &[(u32, i8)]) -> Vec<Vec<F>> {
+        self.forward(inputs);
+
+        let sign_map: HashMap<u32, i8> = forced_signs.iter().copied().collect();
+        let out_indices = self.all_output_indices();
+
+        let ni = self.num_inputs as usize;
+        let mut jac = Vec::with_capacity(out_indices.len());
+
+        for &out_idx in out_indices {
+            let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map);
+            jac.push(adjoints[..ni].to_vec());
+        }
+
+        jac
+    }
+
+    /// Compute the Clarke generalized Jacobian via limiting Jacobian enumeration.
+    ///
+    /// 1. Runs `forward_nonsmooth` to detect all kink operations and their branches.
+    /// 2. Identifies "active" kinks (|switching_value| < `tol`).
+    /// 3. Enumerates all 2^k sign combinations for the k active kinks.
+    /// 4. For each combination, computes a limiting Jacobian via forced reverse sweeps.
+    ///
+    /// Returns the nonsmooth info and a vector of limiting Jacobians.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`crate::ClarkeError::TooManyKinks`] if the number of active kinks exceeds
+    /// the limit (default 20, overridden by `max_active_kinks`).
+    pub fn clarke_jacobian(
+        &mut self,
+        inputs: &[F],
+        tol: F,
+        max_active_kinks: Option<usize>,
+    ) -> Result<(crate::nonsmooth::NonsmoothInfo<F>, Vec<Vec<Vec<F>>>), crate::nonsmooth::ClarkeError>
+    {
+        #![allow(clippy::type_complexity)]
+        let info = self.forward_nonsmooth(inputs);
+        let active: Vec<&crate::nonsmooth::KinkEntry<F>> = info
+            .active_kinks(tol)
+            .into_iter()
+            .filter(|k| opcode::has_nontrivial_subdifferential(k.opcode))
+            .collect();
+        let k = active.len();
+        let limit = max_active_kinks.unwrap_or(20);
+
+        if k > limit {
+            return Err(crate::nonsmooth::ClarkeError::TooManyKinks { count: k, limit });
+        }
+
+        let active_indices: Vec<u32> = active.iter().map(|e| e.tape_index).collect();
+
+        // Build sign_map from all (non-active) kinks using their natural branches,
+        // then override active kinks per combination.
+        let base_signs: HashMap<u32, i8> = info
+            .kinks
+            .iter()
+            .map(|e| (e.tape_index, e.branch))
+            .collect();
+
+        let out_indices = self.all_output_indices();
+        let ni = self.num_inputs as usize;
+
+        let num_combos = 1usize << k;
+        let mut jacobians = Vec::with_capacity(num_combos);
+
+        for combo in 0..num_combos {
+            let mut sign_map = base_signs.clone();
+            for (bit, &idx) in active_indices.iter().enumerate() {
+                let sign: i8 = if (combo >> bit) & 1 == 0 { 1 } else { -1 };
+                sign_map.insert(idx, sign);
+            }
+
+            let mut jac = Vec::with_capacity(out_indices.len());
+            for &out_idx in out_indices {
+                let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map);
+                jac.push(adjoints[..ni].to_vec());
+            }
+            jacobians.push(jac);
+        }
+
+        Ok((info, jacobians))
+    }
+
+    /// Dense Jacobian via forward mode (one forward-tangent pass per input).
+    ///
+    /// More efficient than reverse mode when `num_inputs < num_outputs`.
+    pub fn jacobian_forward(&self, x: &[F]) -> Vec<Vec<F>> {
+        let n = self.num_inputs as usize;
+
+        let out_indices = self.all_output_indices();
+        let m = out_indices.len();
+
+        let mut jac = vec![vec![F::zero(); n]; m];
+        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
+        let mut dual_vals_buf: Vec<Dual<F>> = Vec::new();
+
+        // Indexing by `col` is clearer than enumerate here: col seeds the tangent direction
+        #[allow(clippy::needless_range_loop)]
+        for col in 0..n {
+            dual_input_buf.clear();
+            dual_input_buf.extend(
+                (0..n).map(|i| Dual::new(x[i], if i == col { F::one() } else { F::zero() })),
+            );
+
+            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
+
+            for (row_idx, &out_idx) in out_indices.iter().enumerate() {
+                jac[row_idx][col] = dual_vals_buf[out_idx as usize].eps;
+            }
+        }
+
+        jac
+    }
+
+    /// Dense Jacobian via cross-country (vertex) elimination.
+    ///
+    /// Builds a linearized DAG from the tape, then eliminates intermediate
+    /// vertices in Markowitz order. For functions where `m ≈ n` and the
+    /// graph has moderate connectivity, this can require fewer operations
+    /// than either pure forward mode (`n` passes) or reverse mode (`m` passes).
+    pub fn jacobian_cross_country(&mut self, inputs: &[F]) -> Vec<Vec<F>> {
+        self.forward(inputs);
+
+        let out_indices = self.all_output_indices();
+
+        let mut graph = crate::cross_country::LinearizedGraph::from_tape(
+            &self.opcodes,
+            &self.arg_indices,
+            &self.values,
+            self.num_inputs as usize,
+            out_indices,
+            &self.custom_ops,
+            &self.custom_second_args,
+        );
+
+        graph.eliminate_all();
+        graph.extract_jacobian()
+    }
+}
diff --git a/src/bytecode_tape/mod.rs b/src/bytecode_tape/mod.rs
index 27dbbdd..7b4666c 100644
--- a/src/bytecode_tape/mod.rs
+++ b/src/bytecode_tape/mod.rs
@@ -11,20 +11,30 @@
 //! branches (`if x > 0 { ... } else { ... }`), re-evaluating at inputs that
 //! take a different branch produces incorrect results.
 
-use std::cell::Cell;
 use std::collections::HashMap;
 use std::sync::Arc;
 
 use crate::dual::Dual;
-use crate::dual_vec::DualVec;
 use crate::float::Float;
 use crate::opcode::{self, OpCode, UNUSED};
 
-use crate::float::IsAllZero;
-use num_traits::Float as NumFloat;
+// Submodules — each adds impl blocks to BytecodeTape<F>
+mod forward;
+mod jacobian;
+mod optimize;
+mod reverse;
+mod sparse;
+mod tangent;
 
 #[cfg(feature = "taylor")]
-use crate::taylor::Taylor;
+mod taylor;
+#[cfg(feature = "parallel")]
+mod parallel;
+#[cfg(feature = "serde")]
+mod serde_support;
+
+mod thread_local;
+pub use self::thread_local::{with_active_btape, BtapeGuard, BtapeThreadLocal};
 
 /// Sentinel index for constant entries (not tracked).
 pub const CONSTANT: u32 = u32::MAX;
@@ -93,19 +103,19 @@ pub struct CustomOpHandle(pub(crate) u16);
 /// Created via [`crate::api::record`]. After recording, call [`forward`](Self::forward)
 /// to re-evaluate and [`reverse`](Self::reverse) to compute adjoints.
 pub struct BytecodeTape<F: Float> {
-    opcodes: Vec<OpCode>,
-    arg_indices: Vec<[u32; 2]>,
-    values: Vec<F>,
-    num_inputs: u32,
-    num_variables: u32,
-    output_index: u32,
+    pub(crate) opcodes: Vec<OpCode>,
+    pub(crate) arg_indices: Vec<[u32; 2]>,
+    pub(crate) values: Vec<F>,
+    pub(crate) num_inputs: u32,
+    pub(crate) num_variables: u32,
+    pub(crate) output_index: u32,
     /// Indices of multiple output variables (empty = single-output mode).
-    output_indices: Vec<u32>,
+    pub(crate) output_indices: Vec<u32>,
     /// Registered custom operations (callback table).
-    custom_ops: Vec<Arc<dyn CustomOp<F>>>,
+    pub(crate) custom_ops: Vec<Arc<dyn CustomOp<F>>>,
     /// Second operand index for binary custom ops (sparse side table).
     /// Maps tape index → second operand index.
-    custom_second_args: HashMap<u32, u32>,
+    pub(crate) custom_second_args: HashMap<u32, u32>,
 }
 
 impl<F: Float> BytecodeTape<F> {
@@ -462,2144 +472,10 @@ impl<F: Float> BytecodeTape<F> {
     pub fn has_custom_ops(&self) -> bool {
         !self.custom_ops.is_empty()
     }
-
-    /// Reverse sweep with weighted seeds for multiple outputs.
-    ///
-    /// Computes `∑_i weights[i] * ∂output_i/∂x` — a vector-Jacobian product.
-    ///
-    /// Returns the gradient with respect to all inputs (length [`num_inputs`](Self::num_inputs)).
-    pub fn reverse_seeded(&self, seeds: &[F]) -> Vec<F> {
-        let out_indices = self.all_output_indices();
-
-        assert_eq!(
-            seeds.len(),
-            out_indices.len(),
-            "seeds length must match number of outputs"
-        );
-
-        let ni = self.num_inputs as usize;
-        let adjoints = self.reverse_seeded_full(seeds, out_indices);
-        adjoints[..ni].to_vec()
-    }
-
-    /// Compute the full Jacobian of a multi-output tape via reverse mode.
-    ///
-    /// Performs `m` reverse sweeps (one per output). Returns `J[i][j] = ∂f_i/∂x_j`.
-    pub fn jacobian(&mut self, inputs: &[F]) -> Vec<Vec<F>> {
-        self.forward(inputs);
-
-        let out_indices = self.all_output_indices();
-
-        let ni = self.num_inputs as usize;
-        let mut jac = Vec::with_capacity(out_indices.len());
-
-        for &out_idx in out_indices {
-            let adjoints = self.reverse(out_idx);
-            jac.push(adjoints[..ni].to_vec());
-        }
-
-        jac
-    }
-
-    /// Vector-Jacobian product for a multi-output tape.
-    ///
-    /// Computes `wᵀ · J` where `J` is the Jacobian. More efficient than
-    /// computing the full Jacobian when only the weighted combination is needed.
-    pub fn vjp_multi(&mut self, inputs: &[F], weights: &[F]) -> Vec<F> {
-        self.forward(inputs);
-        self.reverse_seeded(weights)
-    }
-
-    /// Compute a Jacobian with forced branch choices at specified tape indices.
-    ///
-    /// For each `(tape_index, sign)` in `forced_signs`, the reverse sweep uses
-    /// [`forced_reverse_partials`](opcode::forced_reverse_partials) instead of the
-    /// standard partials at that index.
-    ///
-    /// This is the building block for Clarke subdifferential enumeration.
-    pub fn jacobian_limiting(&mut self, inputs: &[F], forced_signs: &[(u32, i8)]) -> Vec<Vec<F>> {
-        self.forward(inputs);
-
-        let sign_map: HashMap<u32, i8> = forced_signs.iter().copied().collect();
-        let out_indices = self.all_output_indices();
-
-        let ni = self.num_inputs as usize;
-        let mut jac = Vec::with_capacity(out_indices.len());
-
-        for &out_idx in out_indices {
-            let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map);
-            jac.push(adjoints[..ni].to_vec());
-        }
-
-        jac
-    }
-
-    /// Compute the Clarke generalized Jacobian via limiting Jacobian enumeration.
-    ///
-    /// 1. Runs `forward_nonsmooth` to detect all kink operations and their branches.
-    /// 2. Identifies "active" kinks (|switching_value| < `tol`).
-    /// 3. Enumerates all 2^k sign combinations for the k active kinks.
-    /// 4. For each combination, computes a limiting Jacobian via forced reverse sweeps.
-    ///
-    /// Returns the nonsmooth info and a vector of limiting Jacobians.
-    ///
-    /// # Errors
-    ///
-    /// Returns [`crate::ClarkeError::TooManyKinks`] if the number of active kinks exceeds
-    /// the limit (default 20, overridden by `max_active_kinks`).
-    pub fn clarke_jacobian(
-        &mut self,
-        inputs: &[F],
-        tol: F,
-        max_active_kinks: Option<usize>,
-    ) -> Result<(crate::nonsmooth::NonsmoothInfo<F>, Vec<Vec<Vec<F>>>), crate::nonsmooth::ClarkeError>
-    {
-        #![allow(clippy::type_complexity)]
-        let info = self.forward_nonsmooth(inputs);
-        let active: Vec<&crate::nonsmooth::KinkEntry<F>> = info
-            .active_kinks(tol)
-            .into_iter()
-            .filter(|k| opcode::has_nontrivial_subdifferential(k.opcode))
-            .collect();
-        let k = active.len();
-        let limit = max_active_kinks.unwrap_or(20);
-
-        if k > limit {
-            return Err(crate::nonsmooth::ClarkeError::TooManyKinks { count: k, limit });
-        }
-
-        let active_indices: Vec<u32> = active.iter().map(|e| e.tape_index).collect();
-
-        // Build sign_map from all (non-active) kinks using their natural branches,
-        // then override active kinks per combination.
-        let base_signs: HashMap<u32, i8> = info
-            .kinks
-            .iter()
-            .map(|e| (e.tape_index, e.branch))
-            .collect();
-
-        let out_indices = self.all_output_indices();
-        let ni = self.num_inputs as usize;
-
-        let num_combos = 1usize << k;
-        let mut jacobians = Vec::with_capacity(num_combos);
-
-        for combo in 0..num_combos {
-            let mut sign_map = base_signs.clone();
-            for (bit, &idx) in active_indices.iter().enumerate() {
-                let sign: i8 = if (combo >> bit) & 1 == 0 { 1 } else { -1 };
-                sign_map.insert(idx, sign);
-            }
-
-            let mut jac = Vec::with_capacity(out_indices.len());
-            for &out_idx in out_indices {
-                let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map);
-                jac.push(adjoints[..ni].to_vec());
-            }
-            jacobians.push(jac);
-        }
-
-        Ok((info, jacobians))
-    }
-
-    /// Re-evaluate the tape at new inputs (forward sweep).
-    ///
-    /// Overwrites `values` in-place — no allocation.
-    pub fn forward(&mut self, inputs: &[F]) {
-        assert_eq!(
-            inputs.len(),
-            self.num_inputs as usize,
-            "wrong number of inputs"
-        );
-
-        // Overwrite input values.
-        for (i, &v) in inputs.iter().enumerate() {
-            self.values[i] = v;
-        }
-
-        // Re-evaluate all non-Input, non-Const ops.
-        for i in 0..self.opcodes.len() {
-            match self.opcodes[i] {
-                OpCode::Input | OpCode::Const => continue,
-                OpCode::Custom => {
-                    let [a_idx, cb_idx] = self.arg_indices[i];
-                    let a = self.values[a_idx as usize];
-                    let b = self
-                        .custom_second_args
-                        .get(&(i as u32))
-                        .map(|&bi| self.values[bi as usize])
-                        .unwrap_or(F::zero());
-                    self.values[i] = self.custom_ops[cb_idx as usize].eval(a, b);
-                }
-                op => {
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = self.values[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        self.values[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        F::from(b_idx).unwrap_or_else(|| F::zero())
-                    } else {
-                        F::zero()
-                    };
-                    self.values[i] = opcode::eval_forward(op, a, b);
-                }
-            }
-        }
-    }
-
-    /// Forward sweep with nonsmooth branch tracking.
-    ///
-    /// Calls [`forward`](Self::forward) to evaluate the tape, then scans for
-    /// nonsmooth operations and records which branch was taken at each one.
-    ///
-    /// Tracked operations:
-    /// - `Abs`, `Min`, `Max` — kinks with nontrivial subdifferentials
-    /// - `Signum`, `Floor`, `Ceil`, `Round`, `Trunc` — step-function
-    ///   discontinuities (zero derivative on both sides, tracked for proximity
-    ///   detection only)
-    ///
-    /// Returns [`crate::NonsmoothInfo`] containing all kink entries in tape order.
-    pub fn forward_nonsmooth(&mut self, inputs: &[F]) -> crate::nonsmooth::NonsmoothInfo<F> {
-        self.forward(inputs);
-
-        let mut kinks = Vec::new();
-        for i in 0..self.opcodes.len() {
-            let op = self.opcodes[i];
-            if !opcode::is_nonsmooth(op) {
-                continue;
-            }
-
-            let [a_idx, b_idx] = self.arg_indices[i];
-            let a = self.values[a_idx as usize];
-
-            match op {
-                OpCode::Abs => {
-                    kinks.push(crate::nonsmooth::KinkEntry {
-                        tape_index: i as u32,
-                        opcode: op,
-                        switching_value: a,
-                        branch: if a >= F::zero() { 1 } else { -1 },
-                    });
-                }
-                OpCode::Max => {
-                    let b = self.values[b_idx as usize];
-                    kinks.push(crate::nonsmooth::KinkEntry {
-                        tape_index: i as u32,
-                        opcode: op,
-                        switching_value: a - b,
-                        branch: if a >= b { 1 } else { -1 },
-                    });
-                }
-                OpCode::Min => {
-                    let b = self.values[b_idx as usize];
-                    kinks.push(crate::nonsmooth::KinkEntry {
-                        tape_index: i as u32,
-                        opcode: op,
-                        switching_value: a - b,
-                        branch: if a <= b { 1 } else { -1 },
-                    });
-                }
-                OpCode::Signum => {
-                    // Kink at x = 0 (same as Abs).
-                    kinks.push(crate::nonsmooth::KinkEntry {
-                        tape_index: i as u32,
-                        opcode: op,
-                        switching_value: a,
-                        branch: if a >= F::zero() { 1 } else { -1 },
-                    });
-                }
-                OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => {
-                    // Kink at integer values. switching_value = distance to
-                    // nearest integer: zero exactly at kink points, works
-                    // symmetrically for both approach directions.
-                    kinks.push(crate::nonsmooth::KinkEntry {
-                        tape_index: i as u32,
-                        opcode: op,
-                        switching_value: a - a.round(),
-                        branch: if a - a.floor() < F::from(0.5).unwrap() {
-                            1
-                        } else {
-                            -1
-                        },
-                    });
-                }
-                _ => unreachable!(),
-            }
-        }
-
-        crate::nonsmooth::NonsmoothInfo { kinks }
-    }
-
-    /// Core reverse sweep loop shared by all scalar reverse sweep variants.
-    ///
-    /// Expects `adjoints` to be pre-seeded by the caller (length = `num_variables`).
-    /// Reads primal values from `values` (either `self.values` or an external buffer).
-    /// When `forced_signs` is `Some`, uses forced partials at matching tape indices.
-    fn reverse_sweep_core(
-        &self,
-        adjoints: &mut [F],
-        values: &[F],
-        forced_signs: Option<&HashMap<u32, i8>>,
-    ) {
-        for i in (0..self.opcodes.len()).rev() {
-            let adj = adjoints[i];
-            if adj == F::zero() {
-                continue;
-            }
-
-            match self.opcodes[i] {
-                OpCode::Input | OpCode::Const => continue,
-                OpCode::Custom => {
-                    adjoints[i] = F::zero();
-                    let [a_idx, cb_idx] = self.arg_indices[i];
-                    let a = values[a_idx as usize];
-                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-                    let b = b_idx_opt.map(|bi| values[bi as usize]).unwrap_or(F::zero());
-                    let r = values[i];
-                    let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r);
-                    adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
-                    if let Some(bi) = b_idx_opt {
-                        adjoints[bi as usize] = adjoints[bi as usize] + db * adj;
-                    }
-                }
-                op => {
-                    adjoints[i] = F::zero();
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = values[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        values[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        F::from(b_idx).unwrap_or_else(|| F::zero())
-                    } else {
-                        F::zero()
-                    };
-                    let r = values[i];
-
-                    let (da, db) = match forced_signs.and_then(|fs| fs.get(&(i as u32))) {
-                        Some(&sign) => opcode::forced_reverse_partials(op, a, b, r, sign),
-                        None => opcode::reverse_partials(op, a, b, r),
-                    };
-
-                    adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
-                    if b_idx != UNUSED && op != OpCode::Powi {
-                        adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj;
-                    }
-                }
-            }
-        }
-    }
-
-    /// Reverse sweep: compute adjoints seeded at the output.
-    ///
-    /// Returns the full adjoint vector (length = `num_variables`).
-    pub fn reverse(&self, seed_index: u32) -> Vec<F> {
-        let n = self.num_variables as usize;
-        let mut adjoints = vec![F::zero(); n];
-        adjoints[seed_index as usize] = F::one();
-        self.reverse_sweep_core(&mut adjoints, &self.values, None);
-        adjoints
-    }
-
-    /// Reverse sweep with forced branch choices at specified tape indices.
-    fn reverse_with_forced_signs(
-        &self,
-        seed_index: u32,
-        forced_signs: &HashMap<u32, i8>,
-    ) -> Vec<F> {
-        let n = self.num_variables as usize;
-        let mut adjoints = vec![F::zero(); n];
-        adjoints[seed_index as usize] = F::one();
-        self.reverse_sweep_core(&mut adjoints, &self.values, Some(forced_signs));
-        adjoints
-    }
-
-    /// Forward evaluation into an external buffer.
-    ///
-    /// Reads opcodes, constants, and argument indices from `self`, but writes
-    /// computed values into `values_buf` instead of `self.values`. This allows
-    /// parallel evaluation of the same tape at different inputs without cloning.
-    pub fn forward_into(&self, inputs: &[F], values_buf: &mut Vec<F>) {
-        assert_eq!(
-            inputs.len(),
-            self.num_inputs as usize,
-            "wrong number of inputs"
-        );
-
-        let n = self.num_variables as usize;
-        values_buf.clear();
-        values_buf.resize(n, F::zero());
-
-        // Copy constant values from the tape, then overwrite inputs.
-        values_buf.copy_from_slice(&self.values[..n]);
-        for (i, &v) in inputs.iter().enumerate() {
-            values_buf[i] = v;
-        }
-
-        // Re-evaluate all non-Input, non-Const ops.
-        for i in 0..self.opcodes.len() {
-            match self.opcodes[i] {
-                OpCode::Input | OpCode::Const => continue,
-                OpCode::Custom => {
-                    let [a_idx, cb_idx] = self.arg_indices[i];
-                    let a = values_buf[a_idx as usize];
-                    let b = self
-                        .custom_second_args
-                        .get(&(i as u32))
-                        .map(|&bi| values_buf[bi as usize])
-                        .unwrap_or(F::zero());
-                    values_buf[i] = self.custom_ops[cb_idx as usize].eval(a, b);
-                }
-                op => {
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = values_buf[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        values_buf[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        F::from(b_idx).unwrap_or_else(|| F::zero())
-                    } else {
-                        F::zero()
-                    };
-                    values_buf[i] = opcode::eval_forward(op, a, b);
-                }
-            }
-        }
-    }
-
-    /// Reverse sweep reading from an external values buffer.
-    ///
-    /// Like [`reverse`](Self::reverse) but reads primal values from `values`
-    /// instead of `self.values`. Pair with [`forward_into`](Self::forward_into)
-    /// for parallel evaluation.
-    pub fn reverse_from(&self, values: &[F], seed_index: u32) -> Vec<F> {
-        let n = self.num_variables as usize;
-        assert_eq!(values.len(), n, "values buffer has wrong length");
-        let mut adjoints = vec![F::zero(); n];
-        adjoints[seed_index as usize] = F::one();
-        self.reverse_sweep_core(&mut adjoints, values, None);
-        adjoints
-    }
-
-    /// Forward + reverse: compute the gradient at new inputs.
-    ///
-    /// Returns only the input adjoints (indices `0..num_inputs`).
-    pub fn gradient(&mut self, inputs: &[F]) -> Vec<F> {
-        self.forward(inputs);
-        let adjoints = self.reverse(self.output_index);
-        adjoints[..self.num_inputs as usize].to_vec()
-    }
-
-    /// Like [`gradient`](Self::gradient) but reuses a caller-provided buffer
-    /// for the adjoint vector, avoiding allocation on repeated calls.
-    pub fn gradient_with_buf(&mut self, inputs: &[F], adjoint_buf: &mut Vec<F>) -> Vec<F> {
-        self.forward(inputs);
-
-        let n = self.num_variables as usize;
-        adjoint_buf.clear();
-        adjoint_buf.resize(n, F::zero());
-        adjoint_buf[self.output_index as usize] = F::one();
-
-        self.reverse_sweep_core(adjoint_buf, &self.values, None);
-        adjoint_buf[..self.num_inputs as usize].to_vec()
-    }
-
-    // ── Forward-over-reverse (second-order) ──
-
-    /// Forward sweep with tangent-carrying numbers. Reads opcodes and constants
-    /// from `self`, writing results into `buf`. Does not mutate the tape.
-    ///
-    /// Generic over `T: NumFloat` so it works with both `Dual<F>` and
-    /// `DualVec<F, N>`.
-    pub fn forward_tangent<T: NumFloat>(&self, inputs: &[T], buf: &mut Vec<T>) {
-        self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| {
-            // First-order chain rule: evaluate on primals, convert partials to T.
-            let [a_idx, cb_idx] = self.arg_indices[i];
-            let a_primal = self.values[a_idx as usize];
-            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-            let b_primal = b_idx_opt
-                .map(|bi| self.values[bi as usize])
-                .unwrap_or(F::zero());
-            let result = self.custom_ops[cb_idx as usize].eval(a_primal, b_primal);
-            let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, result);
-            let result_t = T::from(result).unwrap();
-            let da_t = T::from(da).unwrap();
-            let db_t = T::from(db).unwrap();
-            let a_re_t = T::from(a_primal).unwrap();
-            let b_re_t = T::from(b_primal).unwrap();
-            result_t + da_t * (a_t - a_re_t) + db_t * (b_t - b_re_t)
-        });
-    }
-
-    /// Forward sweep specialized for `Dual<F>`, calling [`CustomOp::eval_dual`]
-    /// so that custom ops propagate tangent information for second-order derivatives.
-    fn forward_tangent_dual(&self, inputs: &[Dual<F>], buf: &mut Vec<Dual<F>>) {
-        self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| {
-            let [_a_idx, cb_idx] = self.arg_indices[i];
-            self.custom_ops[cb_idx as usize].eval_dual(a_t, b_t)
-        });
-    }
-
-    /// Common forward-tangent loop. The `handle_custom` closure receives
-    /// `(tape_index, a_value, b_value)` for custom op slots and returns
-    /// the result to store.
-    fn forward_tangent_inner<T: NumFloat>(
-        &self,
-        inputs: &[T],
-        buf: &mut Vec<T>,
-        handle_custom: impl Fn(usize, T, T) -> T,
-    ) {
-        assert_eq!(
-            inputs.len(),
-            self.num_inputs as usize,
-            "wrong number of inputs"
-        );
-
-        let n = self.num_variables as usize;
-        buf.clear();
-        buf.resize(n, T::zero());
-
-        let mut input_idx = 0usize;
-        for i in 0..self.opcodes.len() {
-            match self.opcodes[i] {
-                OpCode::Input => {
-                    buf[i] = inputs[input_idx];
-                    input_idx += 1;
-                }
-                OpCode::Const => {
-                    buf[i] = T::from(self.values[i]).unwrap();
-                }
-                OpCode::Custom => {
-                    let [a_idx, _cb_idx] = self.arg_indices[i];
-                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-                    let a_t = buf[a_idx as usize];
-                    let b_t = b_idx_opt.map(|bi| buf[bi as usize]).unwrap_or(T::zero());
-                    buf[i] = handle_custom(i, a_t, b_t);
-                }
-                op => {
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = buf[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        buf[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        T::from(b_idx).unwrap_or_else(|| T::zero())
-                    } else {
-                        T::zero()
-                    };
-                    buf[i] = opcode::eval_forward(op, a, b);
-                }
-            }
-        }
-    }
-
-    /// Reverse sweep with tangent-carrying adjoints. Uses values from
-    /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to
-    /// safely skip zero adjoints without dropping tangent contributions.
-    fn reverse_tangent<T: NumFloat + IsAllZero>(&self, tangent_vals: &[T], buf: &mut Vec<T>) {
-        self.reverse_tangent_inner(tangent_vals, buf, |i| {
-            // First-order: convert primal-float partials to T.
-            let [a_idx, cb_idx] = self.arg_indices[i];
-            let a_primal = self.values[a_idx as usize];
-            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-            let b_primal = b_idx_opt
-                .map(|bi| self.values[bi as usize])
-                .unwrap_or(F::zero());
-            let r_primal = self.values[i];
-            let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, r_primal);
-            (T::from(da).unwrap(), T::from(db).unwrap())
-        });
-    }
-
-    /// Reverse sweep specialized for `Dual<F>`, calling [`CustomOp::partials_dual`]
-    /// so that custom op partials carry tangent information for second-order derivatives.
-    fn reverse_tangent_dual(&self, tangent_vals: &[Dual<F>], buf: &mut Vec<Dual<F>>) {
-        self.reverse_tangent_inner(tangent_vals, buf, |i| {
-            let [a_idx, cb_idx] = self.arg_indices[i];
-            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-            let a_dual = tangent_vals[a_idx as usize];
-            let b_dual = b_idx_opt
-                .map(|bi| tangent_vals[bi as usize])
-                .unwrap_or(Dual::constant(F::zero()));
-            let r_dual = tangent_vals[i];
-            self.custom_ops[cb_idx as usize].partials_dual(a_dual, b_dual, r_dual)
-        });
-    }
-
-    /// Common reverse-tangent loop. The `custom_partials` closure receives
-    /// `tape_index` for custom op slots and returns `(da, db)` as T-valued partials.
-    fn reverse_tangent_inner<T: NumFloat + IsAllZero>(
-        &self,
-        tangent_vals: &[T],
-        buf: &mut Vec<T>,
-        custom_partials: impl Fn(usize) -> (T, T),
-    ) {
-        let n = self.num_variables as usize;
-        buf.clear();
-        buf.resize(n, T::zero());
-        buf[self.output_index as usize] = T::one();
-
-        for i in (0..self.opcodes.len()).rev() {
-            match self.opcodes[i] {
-                OpCode::Input | OpCode::Const => continue,
-                OpCode::Custom => {
-                    let adj = buf[i];
-                    if adj.is_all_zero() {
-                        continue;
-                    }
-                    buf[i] = T::zero();
-
-                    let [a_idx, _cb_idx] = self.arg_indices[i];
-                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
-                    let (da_t, db_t) = custom_partials(i);
-                    buf[a_idx as usize] = buf[a_idx as usize] + da_t * adj;
-                    if let Some(bi) = b_idx_opt {
-                        buf[bi as usize] = buf[bi as usize] + db_t * adj;
-                    }
-                }
-                op => {
-                    let adj = buf[i];
-                    if adj.is_all_zero() {
-                        continue;
-                    }
-                    buf[i] = T::zero();
-
-                    let [a_idx, b_idx] = self.arg_indices[i];
-                    let a = tangent_vals[a_idx as usize];
-                    let b = if b_idx != UNUSED && op != OpCode::Powi {
-                        tangent_vals[b_idx as usize]
-                    } else if op == OpCode::Powi {
-                        T::from(b_idx).unwrap_or_else(|| T::zero())
-                    } else {
-                        T::zero()
-                    };
-                    let r = tangent_vals[i];
-                    let (da, db) = opcode::reverse_partials(op, a, b, r);
-
-                    buf[a_idx as usize] = buf[a_idx as usize] + da * adj;
-                    if b_idx != UNUSED && op != OpCode::Powi {
-                        buf[b_idx as usize] = buf[b_idx as usize] + db * adj;
-                    }
-                }
-            }
-        }
-    }
-
-    /// Hessian-vector product via forward-over-reverse.
-    ///
-    /// Returns `(gradient, H·v)` where both are `Vec<F>` of length
-    /// [`num_inputs`](Self::num_inputs). The tape is not mutated.
-    pub fn hvp(&self, x: &[F], v: &[F]) -> (Vec<F>, Vec<F>) {
-        let mut dual_vals = Vec::new();
-        let mut adjoint_buf = Vec::new();
-        self.hvp_with_buf(x, v, &mut dual_vals, &mut adjoint_buf)
-    }
-
-    /// Like [`hvp`](Self::hvp) but reuses caller-provided buffers to avoid
-    /// allocation on repeated calls (e.g. inside [`hessian`](Self::hessian)).
-    pub fn hvp_with_buf(
-        &self,
-        x: &[F],
-        v: &[F],
-        dual_vals_buf: &mut Vec<Dual<F>>,
-        adjoint_buf: &mut Vec<Dual<F>>,
-    ) -> (Vec<F>, Vec<F>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-        assert_eq!(v.len(), n, "wrong number of directions");
-
-        let dual_inputs: Vec<Dual<F>> = x
-            .iter()
-            .zip(v.iter())
-            .map(|(&xi, &vi)| Dual::new(xi, vi))
-            .collect();
-
-        self.forward_tangent_dual(&dual_inputs, dual_vals_buf);
-        self.reverse_tangent_dual(dual_vals_buf, adjoint_buf);
-
-        let gradient: Vec<F> = (0..n).map(|i| adjoint_buf[i].re).collect();
-        let hvp: Vec<F> = (0..n).map(|i| adjoint_buf[i].eps).collect();
-        (gradient, hvp)
-    }
-
-    /// Like [`hvp_with_buf`](Self::hvp_with_buf) but also reuses a caller-provided
-    /// input buffer, eliminating all allocations on repeated calls.
-    fn hvp_with_all_bufs(
-        &self,
-        x: &[F],
-        v: &[F],
-        dual_input_buf: &mut Vec<Dual<F>>,
-        dual_vals_buf: &mut Vec<Dual<F>>,
-        adjoint_buf: &mut Vec<Dual<F>>,
-    ) {
-        // Reuse the input buffer instead of allocating
-        dual_input_buf.clear();
-        dual_input_buf.extend(x.iter().zip(v.iter()).map(|(&xi, &vi)| Dual::new(xi, vi)));
-
-        self.forward_tangent_dual(dual_input_buf, dual_vals_buf);
-        self.reverse_tangent_dual(dual_vals_buf, adjoint_buf);
-    }
-
-    /// Full Hessian matrix via `n` Hessian-vector products.
-    ///
-    /// Returns `(value, gradient, hessian)` where `hessian[i][j] = ∂²f/∂x_i∂x_j`.
-    /// The tape is not mutated.
-    pub fn hessian(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
-        let mut dual_vals_buf = Vec::new();
-        let mut adjoint_buf = Vec::new();
-        let mut hessian = vec![vec![F::zero(); n]; n];
-        let mut gradient = vec![F::zero(); n];
-        let mut value = F::zero();
-
-        for j in 0..n {
-            // Reuse input buffer
-            dual_input_buf.clear();
-            dual_input_buf
-                .extend((0..n).map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() })));
-
-            self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf);
-            self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf);
-
-            if j == 0 {
-                value = dual_vals_buf[self.output_index as usize].re;
-                for i in 0..n {
-                    gradient[i] = adjoint_buf[i].re;
-                }
-            }
-
-            for (row, adj) in hessian.iter_mut().zip(adjoint_buf.iter()) {
-                row[j] = adj.eps;
-            }
-        }
-
-        (value, gradient, hessian)
-    }
-
-    /// Full Hessian matrix via batched forward-over-reverse.
-    ///
-    /// Processes `ceil(n/N)` batches instead of `n` individual HVPs,
-    /// computing N Hessian columns simultaneously.
-    pub fn hessian_vec<const N: usize>(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
-        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
-        let mut adjoint_buf: Vec<DualVec<F, N>> = Vec::new();
-        let mut hessian = vec![vec![F::zero(); n]; n];
-        let mut gradient = vec![F::zero(); n];
-        let mut value = F::zero();
-
-        let num_batches = n.div_ceil(N);
-        for batch in 0..num_batches {
-            let base = batch * N;
-
-            // Reuse input buffer
-            dual_input_buf.clear();
-            dual_input_buf.extend((0..n).map(|i| {
-                let eps = std::array::from_fn(|lane| {
-                    let col = base + lane;
-                    if col < n && i == col {
-                        F::one()
-                    } else {
-                        F::zero()
-                    }
-                });
-                DualVec::new(x[i], eps)
-            }));
-
-            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
-            self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf);
-
-            if batch == 0 {
-                value = dual_vals_buf[self.output_index as usize].re;
-                for i in 0..n {
-                    gradient[i] = adjoint_buf[i].re;
-                }
-            }
-
-            for lane in 0..N {
-                let col = base + lane;
-                if col >= n {
-                    break;
-                }
-                for i in 0..n {
-                    hessian[i][col] = adjoint_buf[i].eps[lane];
-                }
-            }
-        }
-
-        (value, gradient, hessian)
-    }
-
-    /// Detect the structural sparsity pattern of the Hessian.
-    ///
-    /// Walks the tape forward propagating input-dependency bitsets.
-    /// At nonlinear operations, marks cross-pairs as potential Hessian interactions.
-    pub fn detect_sparsity(&self) -> crate::sparse::SparsityPattern {
-        crate::sparse::detect_sparsity_impl(
-            &self.opcodes,
-            &self.arg_indices,
-            self.num_inputs as usize,
-            self.num_variables as usize,
-        )
-    }
-
-    /// Detect the structural sparsity pattern of the Jacobian.
-    ///
-    /// Walks the tape forward propagating input-dependency bitsets (first-order).
-    /// For each output, determines which inputs it depends on.
-    pub fn detect_jacobian_sparsity(&self) -> crate::sparse::JacobianSparsityPattern {
-        let out_indices = self.all_output_indices();
-        crate::sparse::detect_jacobian_sparsity_impl(
-            &self.opcodes,
-            &self.arg_indices,
-            self.num_inputs as usize,
-            self.num_variables as usize,
-            out_indices,
-        )
-    }
-
-    /// Compute a sparse Hessian using structural sparsity detection and graph coloring.
-    ///
-    /// Returns `(value, gradient, pattern, hessian_values)` where
-    /// `hessian_values[k]` corresponds to `(pattern.rows[k], pattern.cols[k])`.
-    ///
-    /// For problems with sparse Hessians, this requires only `chromatic_number`
-    /// HVP calls instead of `n`, which can be dramatically fewer for banded
-    /// or sparse interaction structures.
-    pub fn sparse_hessian(&self, x: &[F]) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let pattern = self.detect_sparsity();
-        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
-
-        let mut hessian_values = vec![F::zero(); pattern.nnz()];
-        let mut gradient = vec![F::zero(); n];
-        let mut value = F::zero();
-
-        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
-        let mut dual_vals_buf = Vec::new();
-        let mut adjoint_buf = Vec::new();
-        let mut v = vec![F::zero(); n];
-
-        for color in 0..num_colors {
-            // Form direction vector: v[i] = 1 if colors[i] == color, else 0
-            for i in 0..n {
-                v[i] = if colors[i] == color {
-                    F::one()
-                } else {
-                    F::zero()
-                };
-            }
-
-            self.hvp_with_all_bufs(
-                x,
-                &v,
-                &mut dual_input_buf,
-                &mut dual_vals_buf,
-                &mut adjoint_buf,
-            );
-
-            if color == 0 {
-                value = dual_vals_buf[self.output_index as usize].re;
-                for i in 0..n {
-                    gradient[i] = adjoint_buf[i].re;
-                }
-            }
-
-            // Extract Hessian entries for this color.
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                if colors[col as usize] == color {
-                    hessian_values[k] = adjoint_buf[row as usize].eps;
-                }
-            }
-        }
-
-        (value, gradient, pattern, hessian_values)
-    }
-
-    /// Batched sparse Hessian: packs N colors per sweep using DualVec.
-    ///
-    /// Reduces the number of forward+reverse sweeps from `num_colors` to
-    /// `ceil(num_colors / N)`. Each sweep processes N colors simultaneously.
-    ///
-    /// Returns `(value, gradient, pattern, hessian_values)`.
-    pub fn sparse_hessian_vec<const N: usize>(
-        &self,
-        x: &[F],
-    ) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let pattern = self.detect_sparsity();
-        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
-
-        let mut hessian_values = vec![F::zero(); pattern.nnz()];
-        let mut gradient = vec![F::zero(); n];
-        let mut value = F::zero();
-
-        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
-        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
-        let mut adjoint_buf: Vec<DualVec<F, N>> = Vec::new();
-
-        let num_batches = (num_colors as usize).div_ceil(N);
-        for batch in 0..num_batches {
-            let base_color = (batch * N) as u32;
-
-            // Build DualVec inputs: lane k has v[i]=1 if colors[i] == base_color+k
-            dual_input_buf.clear();
-            dual_input_buf.extend((0..n).map(|i| {
-                let eps = std::array::from_fn(|lane| {
-                    let target_color = base_color + lane as u32;
-                    if target_color < num_colors && colors[i] == target_color {
-                        F::one()
-                    } else {
-                        F::zero()
-                    }
-                });
-                DualVec::new(x[i], eps)
-            }));
-
-            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
-            self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf);
-
-            if batch == 0 {
-                value = dual_vals_buf[self.output_index as usize].re;
-                for i in 0..n {
-                    gradient[i] = adjoint_buf[i].re;
-                }
-            }
-
-            // Extract Hessian entries: for entry (row, col) with colors[col] == base_color+lane,
-            // read adjoint_buf[row].eps[lane]
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                let col_color = colors[col as usize];
-                if col_color >= base_color && col_color < base_color + N as u32 {
-                    let lane = (col_color - base_color) as usize;
-                    hessian_values[k] = adjoint_buf[row as usize].eps[lane];
-                }
-            }
-        }
-
-        (value, gradient, pattern, hessian_values)
-    }
-
-    // ── Batch evaluation ──
-
-    /// Evaluate the gradient at multiple input points.
-    ///
-    /// Returns one gradient vector per input point.
-    pub fn gradient_batch(&mut self, inputs: &[&[F]]) -> Vec<Vec<F>> {
-        inputs.iter().map(|x| self.gradient(x)).collect()
-    }
-
-    // ── Higher-order derivatives ──
-
-    /// Third-order directional derivative: `∑_{jk} (∂³f/∂x_i∂x_j∂x_k) v1_j v2_k`.
-    ///
-    /// Given directions `v1` and `v2`, computes:
-    /// - `gradient`: `∇f(x)`
-    /// - `hvp`: `H(x) · v1` (Hessian-vector product)
-    /// - `third`: `(∂/∂v2)(H · v1)` (third-order tensor contracted with v1 and v2)
-    ///
-    /// Uses `Dual<Dual<F>>` (nested dual numbers): inner tangent for `v1`,
-    /// outer tangent for `v2`.
-    pub fn third_order_hvvp(&self, x: &[F], v1: &[F], v2: &[F]) -> (Vec<F>, Vec<F>, Vec<F>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-        assert_eq!(v1.len(), n, "wrong v1 length");
-        assert_eq!(v2.len(), n, "wrong v2 length");
-
-        // Build Dual<Dual<F>> inputs:
-        //   inner.re = x[i], inner.eps = v1[i]  (for HVP direction)
-        //   outer.re = inner, outer.eps = v2[i]  (for third-order direction)
-        //
-        // outer = Dual { re: Dual(x[i], v1[i]), eps: Dual(v2[i], 0) }
-        let dd_inputs: Vec<Dual<Dual<F>>> = (0..n)
-            .map(|i| Dual {
-                re: Dual::new(x[i], v1[i]),
-                eps: Dual::new(v2[i], F::zero()),
-            })
-            .collect();
-
-        let mut dd_vals: Vec<Dual<Dual<F>>> = Vec::new();
-        let mut dd_adj: Vec<Dual<Dual<F>>> = Vec::new();
-
-        self.forward_tangent(&dd_inputs, &mut dd_vals);
-        self.reverse_tangent(&dd_vals, &mut dd_adj);
-
-        let gradient: Vec<F> = (0..n).map(|i| dd_adj[i].re.re).collect();
-        let hvp: Vec<F> = (0..n).map(|i| dd_adj[i].re.eps).collect();
-        let third: Vec<F> = (0..n).map(|i| dd_adj[i].eps.eps).collect();
-
-        (gradient, hvp, third)
-    }
-
-    /// Forward-reverse Taylor pass for gradient + higher-order directional adjoints.
-    ///
-    /// Builds Taylor inputs `x_i(t) = x_i + v_i * t` (with zero higher coefficients),
-    /// runs `forward_tangent`, then `reverse_tangent` to get Taylor-valued adjoints.
-    ///
-    /// Returns `(output, adjoints)` where:
-    /// - `output` is the Taylor expansion of `f` along direction `v`
-    /// - `adjoints[i].coeff(0)` = `∂f/∂x_i` (gradient)
-    /// - `adjoints[i].coeff(1)` = `Σ_j (∂²f/∂x_i∂x_j) v_j` (HVP)
-    /// - `adjoints[i].derivative(k)` = k-th order directional adjoint
-    ///
-    /// For K=2, the HVP component is equivalent to [`hvp`](Self::hvp).
-    /// For K≥3, yields additional higher-order information in the same pass.
-    ///
-    /// Like [`hvp`](Self::hvp), takes `&self` and does not call `forward(x)`
-    /// before the Taylor pass. Custom ops will use primal values from recording time.
-    #[cfg(feature = "taylor")]
-    pub fn taylor_grad<const K: usize>(
-        &self,
-        x: &[F],
-        v: &[F],
-    ) -> (Taylor<F, K>, Vec<Taylor<F, K>>) {
-        let mut fwd_buf = Vec::new();
-        let mut adj_buf = Vec::new();
-        self.taylor_grad_with_buf(x, v, &mut fwd_buf, &mut adj_buf)
-    }
-
-    /// Like [`taylor_grad`](Self::taylor_grad) but reuses caller-provided buffers
-    /// to avoid allocation on repeated calls.
-    #[cfg(feature = "taylor")]
-    pub fn taylor_grad_with_buf<const K: usize>(
-        &self,
-        x: &[F],
-        v: &[F],
-        fwd_buf: &mut Vec<Taylor<F, K>>,
-        adj_buf: &mut Vec<Taylor<F, K>>,
-    ) -> (Taylor<F, K>, Vec<Taylor<F, K>>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-        assert_eq!(v.len(), n, "wrong number of directions");
-
-        // Build Taylor inputs: x_i(t) = x_i + v_i * t
-        let taylor_inputs: Vec<Taylor<F, K>> = x
-            .iter()
-            .zip(v.iter())
-            .map(|(&xi, &vi)| {
-                let mut coeffs = [F::zero(); K];
-                coeffs[0] = xi;
-                if K > 1 {
-                    coeffs[1] = vi;
-                }
-                Taylor::new(coeffs)
-            })
-            .collect();
-
-        self.forward_tangent(&taylor_inputs, fwd_buf);
-        let output = fwd_buf[self.output_index as usize];
-        self.reverse_tangent(fwd_buf, adj_buf);
-
-        (output, adj_buf[..n].to_vec())
-    }
-
-    // ── ODE Taylor integration ──
-
-    /// Compute the Taylor expansion of the ODE solution `y(t)` to order K.
-    ///
-    /// Given a tape representing the right-hand side `f: R^n → R^n` of the ODE
-    /// `y' = f(y)`, and an initial condition `y(0) = y0`, computes the Taylor
-    /// coefficients `y_0, y_1, ..., y_{K-1}` such that
-    /// `y(t) ≈ y_0 + y_1·t + y_2·t² + ... + y_{K-1}·t^{K-1}`.
-    ///
-    /// The tape must have `num_outputs == num_inputs` (autonomous ODE: f maps R^n → R^n).
-    ///
-    /// Returns one `Taylor<F, K>` per state variable. Use [`Taylor::eval_at`] to
-    /// evaluate at a step size `h`, or inspect coefficients for error estimation.
-    #[cfg(feature = "taylor")]
-    pub fn ode_taylor_step<const K: usize>(&self, y0: &[F]) -> Vec<Taylor<F, K>> {
-        let mut buf = Vec::new();
-        self.ode_taylor_step_with_buf(y0, &mut buf)
-    }
-
-    /// Like [`ode_taylor_step`](Self::ode_taylor_step) but reuses a caller-provided
-    /// buffer to avoid allocation on repeated calls.
-    #[cfg(feature = "taylor")]
-    pub fn ode_taylor_step_with_buf<const K: usize>(
-        &self,
-        y0: &[F],
-        buf: &mut Vec<Taylor<F, K>>,
-    ) -> Vec<Taylor<F, K>> {
-        let n = self.num_inputs as usize;
-        assert_eq!(y0.len(), n, "y0 length must match num_inputs");
-        assert_eq!(
-            self.num_outputs(),
-            n,
-            "ODE tape must have num_outputs == num_inputs (f: R^n -> R^n)"
-        );
-
-        let out_indices = self.all_output_indices();
-
-        let mut y_coeffs = vec![[F::zero(); K]; n];
-        for i in 0..n {
-            y_coeffs[i][0] = y0[i];
-        }
-
-        for k in 0..K - 1 {
-            let inputs: Vec<Taylor<F, K>> = (0..n).map(|i| Taylor::new(y_coeffs[i])).collect();
-
-            self.forward_tangent(&inputs, buf);
-
-            let divisor = F::from(k + 1).unwrap();
-            for i in 0..n {
-                y_coeffs[i][k + 1] = buf[out_indices[i] as usize].coeff(k) / divisor;
-            }
-        }
-
-        (0..n).map(|i| Taylor::new(y_coeffs[i])).collect()
-    }
-
-    // ── Tape optimizations ──
-
-    /// Eliminate dead (unreachable) entries from the tape.
-    ///
-    /// Walks backward from all outputs, marks reachable entries, then compacts
-    /// the tape in-place with an index remap. Inputs are never removed.
-    /// Core DCE: reachability walk from `seeds`, compact tape, return index remap.
-    ///
-    /// Shared by [`dead_code_elimination`](Self::dead_code_elimination) and
-    /// [`dead_code_elimination_for_outputs`](Self::dead_code_elimination_for_outputs).
-    /// Callers handle output index updates after compaction.
-    fn dce_compact(&mut self, seeds: &[u32]) -> Vec<u32> {
-        let n = self.opcodes.len();
-        let mut reachable = vec![false; n];
-
-        // Mark all inputs as reachable.
-        for flag in reachable.iter_mut().take(self.num_inputs as usize) {
-            *flag = true;
-        }
-
-        let mut stack: Vec<u32> = seeds.to_vec();
-
-        while let Some(idx) = stack.pop() {
-            let i = idx as usize;
-            if reachable[i] {
-                continue;
-            }
-            reachable[i] = true;
-            let [a, b] = self.arg_indices[i];
-            if a != UNUSED {
-                stack.push(a);
-            }
-            if b != UNUSED && self.opcodes[i] != OpCode::Powi {
-                stack.push(b);
-            }
-        }
-
-        // Build remap: old index -> new index.
-        let mut remap = vec![0u32; n];
-        let mut new_idx = 0u32;
-        for i in 0..n {
-            if reachable[i] {
-                remap[i] = new_idx;
-                new_idx += 1;
-            }
-        }
-        let new_len = new_idx as usize;
-
-        // Compact in-place.
-        let mut write = 0;
-        for (read, &is_reachable) in reachable.iter().enumerate().take(n) {
-            if is_reachable {
-                self.opcodes[write] = self.opcodes[read];
-                self.values[write] = self.values[read];
-                let [a, b] = self.arg_indices[read];
-                let ra = if a != UNUSED {
-                    remap[a as usize]
-                } else {
-                    UNUSED
-                };
-                let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi {
-                    remap[b as usize]
-                } else {
-                    b
-                };
-                self.arg_indices[write] = [ra, rb];
-                write += 1;
-            }
-        }
-
-        self.opcodes.truncate(new_len);
-        self.arg_indices.truncate(new_len);
-        self.values.truncate(new_len);
-        self.num_variables = new_len as u32;
-
-        remap
-    }
-
-    pub fn dead_code_elimination(&mut self) {
-        let mut seeds = vec![self.output_index];
-        seeds.extend_from_slice(&self.output_indices);
-        let remap = self.dce_compact(&seeds);
-        self.output_index = remap[self.output_index as usize];
-        for oi in &mut self.output_indices {
-            *oi = remap[*oi as usize];
-        }
-    }
-
-    /// Eliminate dead code, keeping only the specified outputs alive.
-    ///
-    /// Like [`dead_code_elimination`](Self::dead_code_elimination) but seeds
-    /// reachability only from `active_outputs`. After compaction,
-    /// `output_indices` contains only the active outputs (remapped), and
-    /// `output_index` is set to the first active output.
-    ///
-    /// **Note**: Like `dead_code_elimination`, this does not remap
-    /// `custom_second_args` (pre-existing limitation).
-    ///
-    /// # Panics
-    /// Panics if `active_outputs` is empty.
-    pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) {
-        assert!(
-            !active_outputs.is_empty(),
-            "active_outputs must not be empty"
-        );
-        let remap = self.dce_compact(active_outputs);
-        self.output_indices = active_outputs
-            .iter()
-            .map(|&oi| remap[oi as usize])
-            .collect();
-        self.output_index = self.output_indices[0];
-    }
-
-    /// Common subexpression elimination.
-    ///
-    /// Deduplicates identical `(OpCode, arg0, arg1)` triples, normalising
-    /// argument order for commutative ops. Finishes with a DCE pass to
-    /// remove the now-dead duplicates.
-    pub fn cse(&mut self) {
-        use std::collections::HashMap;
-
-        let n = self.opcodes.len();
-        // Maps canonical (op, arg0, arg1) -> first index that computed it.
-        let mut seen: HashMap<(OpCode, u32, u32), u32> = HashMap::new();
-        // remap[i] = canonical index for entry i (identity by default).
-        let mut remap: Vec<u32> = (0..n as u32).collect();
-
-        let is_commutative = |op: OpCode| -> bool {
-            matches!(
-                op,
-                OpCode::Add | OpCode::Mul | OpCode::Max | OpCode::Min | OpCode::Hypot
-            )
-        };
-
-        for i in 0..n {
-            let op = self.opcodes[i];
-            match op {
-                OpCode::Input | OpCode::Const => continue,
-                _ => {}
-            }
-
-            let [mut a, mut b] = self.arg_indices[i];
-            // Apply remap to args (except Powi exponent in b).
-            a = remap[a as usize];
-            if b != UNUSED && op != OpCode::Powi {
-                b = remap[b as usize];
-            }
-            // Update arg_indices with remapped values.
-            self.arg_indices[i] = [a, b];
-
-            // Build the canonical key.
-            let key = if b == UNUSED {
-                // Unary: hash (op, arg0) only; use UNUSED as placeholder.
-                (op, a, UNUSED)
-            } else if is_commutative(op) {
-                let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
-                (op, lo, hi)
-            } else {
-                (op, a, b)
-            };
-
-            if let Some(&canonical) = seen.get(&key) {
-                remap[i] = canonical;
-            } else {
-                seen.insert(key, i as u32);
-            }
-        }
-
-        // Apply remap to all arg_indices (for entries that reference CSE'd nodes).
-        for i in 0..n {
-            let op = self.opcodes[i];
-            if matches!(op, OpCode::Input | OpCode::Const) {
-                continue;
-            }
-            let [a, b] = self.arg_indices[i];
-            let ra = remap[a as usize];
-            let rb = if b != UNUSED && op != OpCode::Powi {
-                remap[b as usize]
-            } else {
-                b
-            };
-            self.arg_indices[i] = [ra, rb];
-        }
-
-        // Update output indices.
-        self.output_index = remap[self.output_index as usize];
-        for oi in &mut self.output_indices {
-            *oi = remap[*oi as usize];
-        }
-
-        // DCE removes the now-unreachable duplicate entries.
-        self.dead_code_elimination();
-    }
-
-    // ── Sparse Jacobian ──
-
-    /// Compute a sparse Jacobian using structural sparsity detection and graph coloring.
-    ///
-    /// Auto-selects forward-mode (column compression) or reverse-mode (row compression)
-    /// based on which requires fewer sweeps.
-    ///
-    /// Returns `(output_values, pattern, jacobian_values)`.
-    pub fn sparse_jacobian(
-        &mut self,
-        x: &[F],
-    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
-        self.forward(x);
-        let pattern = self.detect_jacobian_sparsity();
-        let (col_colors, num_col_colors) = crate::sparse::column_coloring(&pattern);
-        let (row_colors, num_row_colors) = crate::sparse::row_coloring(&pattern);
-
-        if num_col_colors <= num_row_colors {
-            let jac_values =
-                self.sparse_jacobian_forward_impl(x, &pattern, &col_colors, num_col_colors);
-            let outputs = self.output_values();
-            (outputs, pattern, jac_values)
-        } else {
-            let jac_values =
-                self.sparse_jacobian_reverse_impl(x, &pattern, &row_colors, num_row_colors);
-            let outputs = self.output_values();
-            (outputs, pattern, jac_values)
-        }
-    }
-
-    /// Sparse Jacobian via forward-mode (column compression).
-    pub fn sparse_jacobian_forward(
-        &mut self,
-        x: &[F],
-    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
-        self.forward(x);
-        let pattern = self.detect_jacobian_sparsity();
-        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
-        let jac_values = self.sparse_jacobian_forward_impl(x, &pattern, &colors, num_colors);
-        let outputs = self.output_values();
-        (outputs, pattern, jac_values)
-    }
-
-    /// Sparse Jacobian via reverse-mode (row compression).
-    pub fn sparse_jacobian_reverse(
-        &mut self,
-        x: &[F],
-    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
-        self.forward(x);
-        let pattern = self.detect_jacobian_sparsity();
-        let (colors, num_colors) = crate::sparse::row_coloring(&pattern);
-        let jac_values = self.sparse_jacobian_reverse_impl(x, &pattern, &colors, num_colors);
-        let outputs = self.output_values();
-        (outputs, pattern, jac_values)
-    }
-
-    /// Sparse Jacobian with a precomputed sparsity pattern and coloring.
-    ///
-    /// Skips re-detection of sparsity on repeated calls. Use `column_coloring` colors
-    /// for forward mode or `row_coloring` colors for reverse mode. The `forward` flag
-    /// selects the mode.
-    pub fn sparse_jacobian_with_pattern(
-        &mut self,
-        x: &[F],
-        pattern: &crate::sparse::JacobianSparsityPattern,
-        colors: &[u32],
-        num_colors: u32,
-        forward_mode: bool,
-    ) -> (Vec<F>, Vec<F>) {
-        self.forward(x);
-        let jac_values = if forward_mode {
-            self.sparse_jacobian_forward_impl(x, pattern, colors, num_colors)
-        } else {
-            self.sparse_jacobian_reverse_impl(x, pattern, colors, num_colors)
-        };
-        let outputs = self.output_values();
-        (outputs, jac_values)
-    }
-
-    /// Forward-mode sparse Jacobian implementation (column compression).
-    ///
-    /// Each color group seeds a forward pass with tangent 1.0 in the columns
-    /// sharing that color. The resulting tangent at each output gives J[row][col].
-    fn sparse_jacobian_forward_impl(
-        &self,
-        x: &[F],
-        pattern: &crate::sparse::JacobianSparsityPattern,
-        colors: &[u32],
-        num_colors: u32,
-    ) -> Vec<F> {
-        let n = self.num_inputs as usize;
-        let mut jac_values = vec![F::zero(); pattern.nnz()];
-
-        let out_indices = self.all_output_indices();
-
-        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
-        let mut dual_vals_buf: Vec<Dual<F>> = Vec::new();
-
-        for color in 0..num_colors {
-            // Build Dual inputs: tangent = 1 for inputs with this color
-            dual_input_buf.clear();
-            dual_input_buf.extend((0..n).map(|i| {
-                Dual::new(
-                    x[i],
-                    if colors[i] == color {
-                        F::one()
-                    } else {
-                        F::zero()
-                    },
-                )
-            }));
-
-            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
-
-            // Extract Jacobian entries: for entry (row, col) with colors[col] == color,
-            // the tangent at output_indices[row] gives J[row][col]
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                if colors[col as usize] == color {
-                    jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps;
-                }
-            }
-        }
-
-        jac_values
-    }
-
-    /// Reverse-mode sparse Jacobian implementation (row compression).
-    ///
-    /// Each color group seeds a reverse pass with adjoint 1.0 at the outputs
-    /// sharing that color.
-    fn sparse_jacobian_reverse_impl(
-        &self,
-        _x: &[F],
-        pattern: &crate::sparse::JacobianSparsityPattern,
-        colors: &[u32],
-        num_colors: u32,
-    ) -> Vec<F> {
-        let m = self.num_outputs();
-        let mut jac_values = vec![F::zero(); pattern.nnz()];
-
-        let out_indices = self.all_output_indices();
-
-        for color in 0..num_colors {
-            // Build seeds: weight = 1 for outputs with this color
-            let seeds: Vec<F> = (0..m)
-                .map(|i| {
-                    if colors[i] == color {
-                        F::one()
-                    } else {
-                        F::zero()
-                    }
-                })
-                .collect();
-
-            let adjoints = self.reverse_seeded_full(&seeds, out_indices);
-
-            // Extract Jacobian entries: for entry (row, col) with colors[row] == color,
-            // adjoint[col] gives J[row][col]
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                if colors[row as usize] == color {
-                    jac_values[k] = adjoints[col as usize];
-                }
-            }
-        }
-
-        jac_values
-    }
-
-    /// Reverse sweep with weighted seeds, returning full adjoint vector.
-    fn reverse_seeded_full(&self, seeds: &[F], out_indices: &[u32]) -> Vec<F> {
-        let n = self.num_variables as usize;
-        let mut adjoints = vec![F::zero(); n];
-
-        for (&out_idx, &weight) in out_indices.iter().zip(seeds.iter()) {
-            if weight == F::zero() {
-                continue;
-            }
-            adjoints[out_idx as usize] = adjoints[out_idx as usize] + weight;
-        }
-
-        self.reverse_sweep_core(&mut adjoints, &self.values, None);
-        adjoints
-    }
-
-    /// Batched sparse Jacobian: packs N colors per forward sweep using DualVec.
-    ///
-    /// Reduces the number of forward sweeps from `num_colors` to
-    /// `ceil(num_colors / N)`.
-    pub fn sparse_jacobian_vec<const N: usize>(
-        &mut self,
-        x: &[F],
-    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
-        self.forward(x);
-        let pattern = self.detect_jacobian_sparsity();
-        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
-
-        let n = self.num_inputs as usize;
-        let mut jac_values = vec![F::zero(); pattern.nnz()];
-
-        let out_indices = self.all_output_indices();
-
-        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
-        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
-
-        let num_batches = (num_colors as usize).div_ceil(N);
-        for batch in 0..num_batches {
-            let base_color = (batch * N) as u32;
-
-            dual_input_buf.clear();
-            dual_input_buf.extend((0..n).map(|i| {
-                let eps = std::array::from_fn(|lane| {
-                    let target_color = base_color + lane as u32;
-                    if target_color < num_colors && colors[i] == target_color {
-                        F::one()
-                    } else {
-                        F::zero()
-                    }
-                });
-                DualVec::new(x[i], eps)
-            }));
-
-            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
-
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                let col_color = colors[col as usize];
-                if col_color >= base_color && col_color < base_color + N as u32 {
-                    let lane = (col_color - base_color) as usize;
-                    jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps[lane];
-                }
-            }
-        }
-
-        let outputs = self.output_values();
-        (outputs, pattern, jac_values)
-    }
-
-    /// Dense Jacobian via forward mode (one forward-tangent pass per input).
-    ///
-    /// More efficient than reverse mode when `num_inputs < num_outputs`.
-    pub fn jacobian_forward(&self, x: &[F]) -> Vec<Vec<F>> {
-        let n = self.num_inputs as usize;
-
-        let out_indices = self.all_output_indices();
-        let m = out_indices.len();
-
-        let mut jac = vec![vec![F::zero(); n]; m];
-        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
-        let mut dual_vals_buf: Vec<Dual<F>> = Vec::new();
-
-        // Indexing by `col` is clearer than enumerate here: col seeds the tangent direction
-        #[allow(clippy::needless_range_loop)]
-        for col in 0..n {
-            dual_input_buf.clear();
-            dual_input_buf.extend(
-                (0..n).map(|i| Dual::new(x[i], if i == col { F::one() } else { F::zero() })),
-            );
-
-            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
-
-            for (row_idx, &out_idx) in out_indices.iter().enumerate() {
-                jac[row_idx][col] = dual_vals_buf[out_idx as usize].eps;
-            }
-        }
-
-        jac
-    }
-
-    /// Dense Jacobian via cross-country (vertex) elimination.
-    ///
-    /// Builds a linearized DAG from the tape, then eliminates intermediate
-    /// vertices in Markowitz order. For functions where `m ≈ n` and the
-    /// graph has moderate connectivity, this can require fewer operations
-    /// than either pure forward mode (`n` passes) or reverse mode (`m` passes).
-    pub fn jacobian_cross_country(&mut self, inputs: &[F]) -> Vec<Vec<F>> {
-        self.forward(inputs);
-
-        let out_indices = self.all_output_indices();
-
-        let mut graph = crate::cross_country::LinearizedGraph::from_tape(
-            &self.opcodes,
-            &self.arg_indices,
-            &self.values,
-            self.num_inputs as usize,
-            out_indices,
-            &self.custom_ops,
-            &self.custom_second_args,
-        );
-
-        graph.eliminate_all();
-        graph.extract_jacobian()
-    }
-
-    /// Sparse Hessian with a precomputed sparsity pattern and coloring.
-    ///
-    /// Skips re-detection on repeated calls (e.g. in solver loops).
-    pub fn sparse_hessian_with_pattern(
-        &self,
-        x: &[F],
-        pattern: &crate::sparse::SparsityPattern,
-        colors: &[u32],
-        num_colors: u32,
-    ) -> (F, Vec<F>, Vec<F>) {
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let mut hessian_values = vec![F::zero(); pattern.nnz()];
-        let mut gradient = vec![F::zero(); n];
-        let mut value = F::zero();
-
-        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
-        let mut dual_vals_buf = Vec::new();
-        let mut adjoint_buf = Vec::new();
-        let mut v = vec![F::zero(); n];
-
-        for color in 0..num_colors {
-            for i in 0..n {
-                v[i] = if colors[i] == color {
-                    F::one()
-                } else {
-                    F::zero()
-                };
-            }
-
-            self.hvp_with_all_bufs(
-                x,
-                &v,
-                &mut dual_input_buf,
-                &mut dual_vals_buf,
-                &mut adjoint_buf,
-            );
-
-            if color == 0 {
-                value = dual_vals_buf[self.output_index as usize].re;
-                for i in 0..n {
-                    gradient[i] = adjoint_buf[i].re;
-                }
-            }
-
-            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-                if colors[col as usize] == color {
-                    hessian_values[k] = adjoint_buf[row as usize].eps;
-                }
-            }
-        }
-
-        (value, gradient, hessian_values)
-    }
-
-    /// Run all tape optimizations: CSE followed by DCE.
-    ///
-    /// In debug builds, validates internal consistency after optimization.
-    pub fn optimize(&mut self) {
-        self.cse();
-        self.dead_code_elimination();
-
-        // Validate internal consistency in debug builds.
-        #[cfg(debug_assertions)]
-        {
-            let n = self.opcodes.len();
-            // All arg_indices must point to valid entries.
-            for i in 0..n {
-                let [a, b] = self.arg_indices[i];
-                match self.opcodes[i] {
-                    OpCode::Input | OpCode::Const => {
-                        assert_eq!(a, UNUSED, "Input/Const should have UNUSED args");
-                        assert_eq!(b, UNUSED, "Input/Const should have UNUSED args");
-                    }
-                    OpCode::Powi => {
-                        assert!(
-                            (a as usize) < n,
-                            "Powi arg0 {} out of bounds (tape len {})",
-                            a,
-                            n
-                        );
-                    }
-                    _ => {
-                        assert!(
-                            (a as usize) < i,
-                            "arg0 {} not before op {} (tape len {})",
-                            a,
-                            i,
-                            n
-                        );
-                        if b != UNUSED {
-                            assert!(
-                                (b as usize) < i,
-                                "arg1 {} not before op {} (tape len {})",
-                                b,
-                                i,
-                                n
-                            );
-                        }
-                    }
-                }
-            }
-            // output_index must be valid.
-            assert!(
-                (self.output_index as usize) < n,
-                "output_index {} out of bounds (tape len {})",
-                self.output_index,
-                n
-            );
-            for &oi in &self.output_indices {
-                assert!(
-                    (oi as usize) < n,
-                    "output_indices entry {} out of bounds (tape len {})",
-                    oi,
-                    n
-                );
-            }
-            // num_inputs must be preserved.
-            let input_count = self
-                .opcodes
-                .iter()
-                .filter(|&&op| op == OpCode::Input)
-                .count();
-            assert_eq!(
-                input_count, self.num_inputs as usize,
-                "num_inputs mismatch after optimization"
-            );
-        }
-    }
-}
+}
 
 impl<F: Float> Default for BytecodeTape<F> {
     fn default() -> Self {
         Self::new()
     }
 }
-
-// ══════════════════════════════════════════════
-//  Parallel methods (rayon)
-// ══════════════════════════════════════════════
-
-#[cfg(feature = "parallel")]
-impl<F: Float> BytecodeTape<F> {
-    /// Parallel gradient: forward + reverse using external buffers.
-    ///
-    /// Takes `&self` instead of `&mut self`, enabling shared access across threads.
-    pub fn gradient_par(&self, inputs: &[F]) -> Vec<F> {
-        let mut values_buf = Vec::new();
-        self.forward_into(inputs, &mut values_buf);
-        let adjoints = self.reverse_from(&values_buf, self.output_index);
-        adjoints[..self.num_inputs as usize].to_vec()
-    }
-
-    /// Parallel Jacobian: one reverse sweep per output, parallelized.
-    ///
-    /// Returns `J[i][j] = ∂f_i/∂x_j`.
-    pub fn jacobian_par(&self, inputs: &[F]) -> Vec<Vec<F>> {
-        use rayon::prelude::*;
-
-        let mut values_buf = Vec::new();
-        self.forward_into(inputs, &mut values_buf);
-
-        let out_indices = self.all_output_indices();
-
-        let ni = self.num_inputs as usize;
-        out_indices
-            .par_iter()
-            .map(|&out_idx| {
-                let adjoints = self.reverse_from(&values_buf, out_idx);
-                adjoints[..ni].to_vec()
-            })
-            .collect()
-    }
-
-    /// Parallel Hessian: one HVP per column, parallelized over columns.
-    ///
-    /// Returns `(value, gradient, hessian)`.
-    pub fn hessian_par(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
-        use rayon::prelude::*;
-
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        // Compute gradient and value from column 0 (serial).
-        let dual_input_buf: Vec<Dual<F>> = (0..n)
-            .map(|i| Dual::new(x[i], if i == 0 { F::one() } else { F::zero() }))
-            .collect();
-        let mut dual_vals_buf = Vec::new();
-        let mut adjoint_buf = Vec::new();
-        self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf);
-        self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf);
-
-        let value = dual_vals_buf[self.output_index as usize].re;
-        let gradient: Vec<F> = (0..n).map(|i| adjoint_buf[i].re).collect();
-        let col0: Vec<F> = (0..n).map(|i| adjoint_buf[i].eps).collect();
-
-        // Parallelize remaining columns.
-        let other_cols: Vec<Vec<F>> = (1..n)
-            .into_par_iter()
-            .map(|j| {
-                let inputs: Vec<Dual<F>> = (0..n)
-                    .map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() }))
-                    .collect();
-                let mut dv = Vec::new();
-                let mut ab = Vec::new();
-                self.forward_tangent_dual(&inputs, &mut dv);
-                self.reverse_tangent_dual(&dv, &mut ab);
-                (0..n).map(|i| ab[i].eps).collect()
-            })
-            .collect();
-
-        let mut hessian = vec![vec![F::zero(); n]; n];
-        for i in 0..n {
-            hessian[i][0] = col0[i];
-        }
-        for (j_minus_1, col) in other_cols.iter().enumerate() {
-            let j = j_minus_1 + 1;
-            for i in 0..n {
-                hessian[i][j] = col[i];
-            }
-        }
-
-        (value, gradient, hessian)
-    }
-
-    /// Parallel sparse Hessian: parallelized over colors.
-    ///
-    /// Returns `(value, gradient, pattern, hessian_values)`.
-    pub fn sparse_hessian_par(
-        &self,
-        x: &[F],
-    ) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
-        use rayon::prelude::*;
-
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let pattern = self.detect_sparsity();
-        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
-
-        // Compute value/gradient from color 0 (serial).
-        let mut v0 = vec![F::zero(); n];
-        for i in 0..n {
-            v0[i] = if colors[i] == 0 { F::one() } else { F::zero() };
-        }
-        let di: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], v0[i])).collect();
-        let mut dv = Vec::new();
-        let mut ab = Vec::new();
-        self.forward_tangent_dual(&di, &mut dv);
-        self.reverse_tangent_dual(&dv, &mut ab);
-        let value = dv[self.output_index as usize].re;
-        let gradient: Vec<F> = (0..n).map(|i| ab[i].re).collect();
-
-        // Collect all color results in parallel.
-        let color_results: Vec<Vec<Dual<F>>> = (0..num_colors)
-            .into_par_iter()
-            .map(|color| {
-                let mut v = vec![F::zero(); n];
-                for i in 0..n {
-                    v[i] = if colors[i] == color {
-                        F::one()
-                    } else {
-                        F::zero()
-                    };
-                }
-                let inputs: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], v[i])).collect();
-                let mut dv_local = Vec::new();
-                let mut ab_local = Vec::new();
-                self.forward_tangent_dual(&inputs, &mut dv_local);
-                self.reverse_tangent_dual(&dv_local, &mut ab_local);
-                ab_local
-            })
-            .collect();
-
-        let mut hessian_values = vec![F::zero(); pattern.nnz()];
-        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-            let color = colors[col as usize] as usize;
-            hessian_values[k] = color_results[color][row as usize].eps;
-        }
-
-        (value, gradient, pattern, hessian_values)
-    }
-
-    /// Parallel sparse Jacobian: parallelized over colors.
-    ///
-    /// Auto-selects forward (column compression) or reverse (row compression)
-    /// based on `num_outputs` vs `num_inputs`.
-    pub fn sparse_jacobian_par(
-        &self,
-        x: &[F],
-    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
-        use rayon::prelude::*;
-
-        let n = self.num_inputs as usize;
-        assert_eq!(x.len(), n, "wrong number of inputs");
-
-        let mut values_buf = Vec::new();
-        self.forward_into(x, &mut values_buf);
-
-        let out_indices = self.all_output_indices();
-        let m = out_indices.len();
-        let outputs: Vec<F> = out_indices
-            .iter()
-            .map(|&oi| values_buf[oi as usize])
-            .collect();
-
-        let jac_pattern = self.detect_jacobian_sparsity();
-        let ni = self.num_inputs as usize;
-
-        if m <= n {
-            // Row compression (reverse mode)
-            let (row_colors, num_colors) = crate::sparse::row_coloring(&jac_pattern);
-
-            let color_results: Vec<Vec<F>> = (0..num_colors)
-                .into_par_iter()
-                .map(|color| {
-                    let n_vars = self.num_variables as usize;
-                    let mut adjoints = vec![F::zero(); n_vars];
-                    for (i, &oi) in out_indices.iter().enumerate() {
-                        if row_colors[i] == color {
-                            adjoints[oi as usize] = F::one();
-                        }
-                    }
-
-                    self.reverse_sweep_core(&mut adjoints, &values_buf, None);
-                    adjoints[..ni].to_vec()
-                })
-                .collect();
-
-            let mut jac_values = vec![F::zero(); jac_pattern.nnz()];
-            for (k, (&row, &col)) in jac_pattern
-                .rows
-                .iter()
-                .zip(jac_pattern.cols.iter())
-                .enumerate()
-            {
-                let color = row_colors[row as usize] as usize;
-                jac_values[k] = color_results[color][col as usize];
-            }
-
-            (outputs, jac_pattern, jac_values)
-        } else {
-            // Column compression (forward mode) — parallelize forward tangent sweeps
-            let (col_colors, num_colors) = crate::sparse::column_coloring(&jac_pattern);
-
-            let color_results: Vec<Vec<F>> = (0..num_colors)
-                .into_par_iter()
-                .map(|color| {
-                    let dir: Vec<F> = (0..n)
-                        .map(|i| {
-                            if col_colors[i] == color {
-                                F::one()
-                            } else {
-                                F::zero()
-                            }
-                        })
-                        .collect();
-                    // Forward tangent with F (not Dual) direction
-                    // We need to do forward_into first, then forward tangent
-                    // But forward_tangent uses self.values, so we need a workaround.
-                    // Use Dual<F> with tangent = direction.
-                    let inputs: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], dir[i])).collect();
-                    let mut dv = Vec::new();
-                    self.forward_tangent(&inputs, &mut dv);
-                    out_indices.iter().map(|&oi| dv[oi as usize].eps).collect()
-                })
-                .collect();
-
-            let mut jac_values = vec![F::zero(); jac_pattern.nnz()];
-            for (k, (&row, &col)) in jac_pattern
-                .rows
-                .iter()
-                .zip(jac_pattern.cols.iter())
-                .enumerate()
-            {
-                let color = col_colors[col as usize] as usize;
-                jac_values[k] = color_results[color][row as usize];
-            }
-
-            (outputs, jac_pattern, jac_values)
-        }
-    }
-
-    /// Evaluate the gradient at multiple input points in parallel.
-    ///
-    /// Uses `forward_into` + `reverse_from` with per-thread buffers.
-    pub fn gradient_batch_par(&self, inputs: &[&[F]]) -> Vec<Vec<F>> {
-        use rayon::prelude::*;
-
-        let ni = self.num_inputs as usize;
-        let out_idx = self.output_index;
-
-        inputs
-            .par_iter()
-            .map(|x| {
-                let mut values_buf = Vec::new();
-                self.forward_into(x, &mut values_buf);
-                let adjoints = self.reverse_from(&values_buf, out_idx);
-                adjoints[..ni].to_vec()
-            })
-            .collect()
-    }
-
-    /// Compute Hessian at multiple input points in parallel.
-    ///
-    /// Returns `(value, gradient, hessian)` for each input point.
-    pub fn hessian_batch_par(&self, inputs: &[&[F]]) -> Vec<(F, Vec<F>, Vec<Vec<F>>)> {
-        use rayon::prelude::*;
-
-        inputs.par_iter().map(|x| self.hessian_par(x)).collect()
-    }
-}
-
-// ══════════════════════════════════════════════
-//  Thread-local active bytecode tape
-// ══════════════════════════════════════════════
-//  Serde support
-// ══════════════════════════════════════════════
-
-#[cfg(feature = "serde")]
-mod serde_support {
-    use super::*;
-    use serde::ser::SerializeStruct;
-    use serde::{Deserialize, Deserializer, Serialize, Serializer};
-
-    impl<F: Float + Serialize> Serialize for BytecodeTape<F> {
-        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
-            if !self.custom_ops.is_empty() {
-                return Err(serde::ser::Error::custom(
-                    "cannot serialize a BytecodeTape containing custom ops; \
-                     custom ops must be re-registered after deserialization",
-                ));
-            }
-            let mut s = serializer.serialize_struct("BytecodeTape", 8)?;
-            s.serialize_field("opcodes", &self.opcodes)?;
-            s.serialize_field("arg_indices", &self.arg_indices)?;
-            s.serialize_field("values", &self.values)?;
-            s.serialize_field("num_inputs", &self.num_inputs)?;
-            s.serialize_field("num_variables", &self.num_variables)?;
-            s.serialize_field("output_index", &self.output_index)?;
-            s.serialize_field("output_indices", &self.output_indices)?;
-            s.serialize_field("custom_second_args", &self.custom_second_args)?;
-            s.end()
-        }
-    }
-
-    impl<'de, F: Float + Deserialize<'de>> Deserialize<'de> for BytecodeTape<F> {
-        fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
-            #[derive(Deserialize)]
-            struct TapeData<F> {
-                opcodes: Vec<OpCode>,
-                arg_indices: Vec<[u32; 2]>,
-                values: Vec<F>,
-                num_inputs: u32,
-                num_variables: u32,
-                output_index: u32,
-                #[serde(default)]
-                output_indices: Vec<u32>,
-                #[serde(default)]
-                custom_second_args: HashMap<u32, u32>,
-            }
-
-            let data = TapeData::<F>::deserialize(deserializer)?;
-            Ok(BytecodeTape {
-                opcodes: data.opcodes,
-                arg_indices: data.arg_indices,
-                values: data.values,
-                num_inputs: data.num_inputs,
-                num_variables: data.num_variables,
-                output_index: data.output_index,
-                output_indices: data.output_indices,
-                custom_ops: Vec::new(),
-                custom_second_args: data.custom_second_args,
-            })
-        }
-    }
-}
-
-// ══════════════════════════════════════════════
-
-thread_local! {
-    static BTAPE_F32: Cell<*mut BytecodeTape<f32>> = const { Cell::new(std::ptr::null_mut()) };
-    static BTAPE_F64: Cell<*mut BytecodeTape<f64>> = const { Cell::new(std::ptr::null_mut()) };
-    static BTAPE_DUAL_F32: Cell<*mut BytecodeTape<Dual<f32>>> = const { Cell::new(std::ptr::null_mut()) };
-    static BTAPE_DUAL_F64: Cell<*mut BytecodeTape<Dual<f64>>> = const { Cell::new(std::ptr::null_mut()) };
-}
-
-/// Trait to select the correct thread-local for a given float type.
-///
-/// Implemented for `f32`, `f64`, `Dual<f32>`, and `Dual<f64>`, enabling
-/// `BReverse<F>` to be used with these base types.
-pub trait BtapeThreadLocal: Float {
-    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>>;
-}
-
-impl BtapeThreadLocal for f32 {
-    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
-        &BTAPE_F32
-    }
-}
-
-impl BtapeThreadLocal for f64 {
-    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
-        &BTAPE_F64
-    }
-}
-
-impl BtapeThreadLocal for Dual<f32> {
-    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
-        &BTAPE_DUAL_F32
-    }
-}
-
-impl BtapeThreadLocal for Dual<f64> {
-    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
-        &BTAPE_DUAL_F64
-    }
-}
-
-/// Access the active bytecode tape for the current thread.
-/// Panics if no tape is active.
-#[inline]
-pub fn with_active_btape<F: BtapeThreadLocal, R>(f: impl FnOnce(&mut BytecodeTape<F>) -> R) -> R {
-    F::btape_cell().with(|cell| {
-        let ptr = cell.get();
-        assert!(
-            !ptr.is_null(),
-            "No active bytecode tape. Use echidna::record() to record a function."
-        );
-        // SAFETY: BtapeGuard guarantees validity for the duration of the
-        // recording scope, single-threaded via thread-local.
-        let tape = unsafe { &mut *ptr };
-        f(tape)
-    })
-}
-
-/// RAII guard that sets a bytecode tape as the thread-local active tape.
-pub struct BtapeGuard<F: BtapeThreadLocal> {
-    prev: *mut BytecodeTape<F>,
-}
-
-impl<F: BtapeThreadLocal> BtapeGuard<F> {
-    /// Activate `tape` as the thread-local bytecode tape.
-    pub fn new(tape: &mut BytecodeTape<F>) -> Self {
-        let prev = F::btape_cell().with(|cell| {
-            let prev = cell.get();
-            cell.set(tape as *mut BytecodeTape<F>);
-            prev
-        });
-        BtapeGuard { prev }
-    }
-}
-
-impl<F: BtapeThreadLocal> Drop for BtapeGuard<F> {
-    fn drop(&mut self) {
-        F::btape_cell().with(|cell| {
-            cell.set(self.prev);
-        });
-    }
-}
diff --git a/src/bytecode_tape/optimize.rs b/src/bytecode_tape/optimize.rs
new file mode 100644
index 0000000..9e66e3e
--- /dev/null
+++ b/src/bytecode_tape/optimize.rs
@@ -0,0 +1,271 @@
+use std::collections::HashMap;
+
+use crate::float::Float;
+use crate::opcode::{OpCode, UNUSED};
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Eliminate dead (unreachable) entries from the tape.
+    ///
+    /// Walks backward from all outputs, marks reachable entries, then compacts
+    /// the tape in-place with an index remap. Inputs are never removed.
+    /// Core DCE: reachability walk from `seeds`, compact tape, return index remap.
+    ///
+    /// Shared by [`dead_code_elimination`](Self::dead_code_elimination) and
+    /// [`dead_code_elimination_for_outputs`](Self::dead_code_elimination_for_outputs).
+    /// Callers handle output index updates after compaction.
+    fn dce_compact(&mut self, seeds: &[u32]) -> Vec<u32> {
+        let n = self.opcodes.len();
+        let mut reachable = vec![false; n];
+
+        // Mark all inputs as reachable.
+        for flag in reachable.iter_mut().take(self.num_inputs as usize) {
+            *flag = true;
+        }
+
+        let mut stack: Vec<u32> = seeds.to_vec();
+
+        while let Some(idx) = stack.pop() {
+            let i = idx as usize;
+            if reachable[i] {
+                continue;
+            }
+            reachable[i] = true;
+            let [a, b] = self.arg_indices[i];
+            if a != UNUSED {
+                stack.push(a);
+            }
+            if b != UNUSED && self.opcodes[i] != OpCode::Powi {
+                stack.push(b);
+            }
+        }
+
+        // Build remap: old index -> new index.
+        let mut remap = vec![0u32; n];
+        let mut new_idx = 0u32;
+        for i in 0..n {
+            if reachable[i] {
+                remap[i] = new_idx;
+                new_idx += 1;
+            }
+        }
+        let new_len = new_idx as usize;
+
+        // Compact in-place.
+        let mut write = 0;
+        for (read, &is_reachable) in reachable.iter().enumerate().take(n) {
+            if is_reachable {
+                self.opcodes[write] = self.opcodes[read];
+                self.values[write] = self.values[read];
+                let [a, b] = self.arg_indices[read];
+                let ra = if a != UNUSED {
+                    remap[a as usize]
+                } else {
+                    UNUSED
+                };
+                let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi {
+                    remap[b as usize]
+                } else {
+                    b
+                };
+                self.arg_indices[write] = [ra, rb];
+                write += 1;
+            }
+        }
+
+        self.opcodes.truncate(new_len);
+        self.arg_indices.truncate(new_len);
+        self.values.truncate(new_len);
+        self.num_variables = new_len as u32;
+
+        remap
+    }
+
+    pub fn dead_code_elimination(&mut self) {
+        let mut seeds = vec![self.output_index];
+        seeds.extend_from_slice(&self.output_indices);
+        let remap = self.dce_compact(&seeds);
+        self.output_index = remap[self.output_index as usize];
+        for oi in &mut self.output_indices {
+            *oi = remap[*oi as usize];
+        }
+    }
+
+    /// Eliminate dead code, keeping only the specified outputs alive.
+    ///
+    /// Like [`dead_code_elimination`](Self::dead_code_elimination) but seeds
+    /// reachability only from `active_outputs`. After compaction,
+    /// `output_indices` contains only the active outputs (remapped), and
+    /// `output_index` is set to the first active output.
+    ///
+    /// **Note**: Like `dead_code_elimination`, this does not remap
+    /// `custom_second_args` (pre-existing limitation).
+    ///
+    /// # Panics
+    /// Panics if `active_outputs` is empty.
+    pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) {
+        assert!(
+            !active_outputs.is_empty(),
+            "active_outputs must not be empty"
+        );
+        let remap = self.dce_compact(active_outputs);
+        self.output_indices = active_outputs
+            .iter()
+            .map(|&oi| remap[oi as usize])
+            .collect();
+        self.output_index = self.output_indices[0];
+    }
+
+    /// Common subexpression elimination.
+    ///
+    /// Deduplicates identical `(OpCode, arg0, arg1)` triples, normalising
+    /// argument order for commutative ops. Finishes with a DCE pass to
+    /// remove the now-dead duplicates.
+    pub fn cse(&mut self) {
+        let n = self.opcodes.len();
+        // Maps canonical (op, arg0, arg1) -> first index that computed it.
+        let mut seen: HashMap<(OpCode, u32, u32), u32> = HashMap::new();
+        // remap[i] = canonical index for entry i (identity by default).
+        let mut remap: Vec<u32> = (0..n as u32).collect();
+
+        let is_commutative = |op: OpCode| -> bool {
+            matches!(
+                op,
+                OpCode::Add | OpCode::Mul | OpCode::Max | OpCode::Min | OpCode::Hypot
+            )
+        };
+
+        for i in 0..n {
+            let op = self.opcodes[i];
+            match op {
+                OpCode::Input | OpCode::Const => continue,
+                _ => {}
+            }
+
+            let [mut a, mut b] = self.arg_indices[i];
+            // Apply remap to args (except Powi exponent in b).
+            a = remap[a as usize];
+            if b != UNUSED && op != OpCode::Powi {
+                b = remap[b as usize];
+            }
+            // Update arg_indices with remapped values.
+            self.arg_indices[i] = [a, b];
+
+            // Build the canonical key.
+            let key = if b == UNUSED {
+                // Unary: hash (op, arg0) only; use UNUSED as placeholder.
+                (op, a, UNUSED)
+            } else if is_commutative(op) {
+                let (lo, hi) = if a <= b { (a, b) } else { (b, a) };
+                (op, lo, hi)
+            } else {
+                (op, a, b)
+            };
+
+            if let Some(&canonical) = seen.get(&key) {
+                remap[i] = canonical;
+            } else {
+                seen.insert(key, i as u32);
+            }
+        }
+
+        // Apply remap to all arg_indices (for entries that reference CSE'd nodes).
+        for i in 0..n {
+            let op = self.opcodes[i];
+            if matches!(op, OpCode::Input | OpCode::Const) {
+                continue;
+            }
+            let [a, b] = self.arg_indices[i];
+            let ra = remap[a as usize];
+            let rb = if b != UNUSED && op != OpCode::Powi {
+                remap[b as usize]
+            } else {
+                b
+            };
+            self.arg_indices[i] = [ra, rb];
+        }
+
+        // Update output indices.
+        self.output_index = remap[self.output_index as usize];
+        for oi in &mut self.output_indices {
+            *oi = remap[*oi as usize];
+        }
+
+        // DCE removes the now-unreachable duplicate entries.
+        self.dead_code_elimination();
+    }
+
+    /// Run all tape optimizations: CSE followed by DCE.
+    ///
+    /// In debug builds, validates internal consistency after optimization.
+    pub fn optimize(&mut self) {
+        self.cse();
+        self.dead_code_elimination();
+
+        // Validate internal consistency in debug builds.
+        #[cfg(debug_assertions)]
+        {
+            let n = self.opcodes.len();
+            // All arg_indices must point to valid entries.
+            for i in 0..n {
+                let [a, b] = self.arg_indices[i];
+                match self.opcodes[i] {
+                    OpCode::Input | OpCode::Const => {
+                        assert_eq!(a, UNUSED, "Input/Const should have UNUSED args");
+                        assert_eq!(b, UNUSED, "Input/Const should have UNUSED args");
+                    }
+                    OpCode::Powi => {
+                        assert!(
+                            (a as usize) < n,
+                            "Powi arg0 {} out of bounds (tape len {})",
+                            a,
+                            n
+                        );
+                    }
+                    _ => {
+                        assert!(
+                            (a as usize) < i,
+                            "arg0 {} not before op {} (tape len {})",
+                            a,
+                            i,
+                            n
+                        );
+                        if b != UNUSED {
+                            assert!(
+                                (b as usize) < i,
+                                "arg1 {} not before op {} (tape len {})",
+                                b,
+                                i,
+                                n
+                            );
+                        }
+                    }
+                }
+            }
+            // output_index must be valid.
+            assert!(
+                (self.output_index as usize) < n,
+                "output_index {} out of bounds (tape len {})",
+                self.output_index,
+                n
+            );
+            for &oi in &self.output_indices {
+                assert!(
+                    (oi as usize) < n,
+                    "output_indices entry {} out of bounds (tape len {})",
+                    oi,
+                    n
+                );
+            }
+            // num_inputs must be preserved.
+            let input_count = self
+                .opcodes
+                .iter()
+                .filter(|&&op| op == OpCode::Input)
+                .count();
+            assert_eq!(
+                input_count, self.num_inputs as usize,
+                "num_inputs mismatch after optimization"
+            );
+        }
+    }
+}
diff --git a/src/bytecode_tape/parallel.rs b/src/bytecode_tape/parallel.rs
new file mode 100644
index 0000000..bf30336
--- /dev/null
+++ b/src/bytecode_tape/parallel.rs
@@ -0,0 +1,263 @@
+use rayon::prelude::*;
+
+use crate::dual::Dual;
+use crate::float::Float;
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Parallel gradient: forward + reverse using external buffers.
+    ///
+    /// Takes `&self` instead of `&mut self`, enabling shared access across threads.
+    pub fn gradient_par(&self, inputs: &[F]) -> Vec<F> {
+        let mut values_buf = Vec::new();
+        self.forward_into(inputs, &mut values_buf);
+        let adjoints = self.reverse_from(&values_buf, self.output_index);
+        adjoints[..self.num_inputs as usize].to_vec()
+    }
+
+    /// Parallel Jacobian: one reverse sweep per output, parallelized.
+    ///
+    /// Returns `J[i][j] = ∂f_i/∂x_j`.
+    pub fn jacobian_par(&self, inputs: &[F]) -> Vec<Vec<F>> {
+        let mut values_buf = Vec::new();
+        self.forward_into(inputs, &mut values_buf);
+
+        let out_indices = self.all_output_indices();
+
+        let ni = self.num_inputs as usize;
+        out_indices
+            .par_iter()
+            .map(|&out_idx| {
+                let adjoints = self.reverse_from(&values_buf, out_idx);
+                adjoints[..ni].to_vec()
+            })
+            .collect()
+    }
+
+    /// Parallel Hessian: one HVP per column, parallelized over columns.
+    ///
+    /// Returns `(value, gradient, hessian)`.
+    pub fn hessian_par(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        // Compute gradient and value from column 0 (serial).
+        let dual_input_buf: Vec<Dual<F>> = (0..n)
+            .map(|i| Dual::new(x[i], if i == 0 { F::one() } else { F::zero() }))
+            .collect();
+        let mut dual_vals_buf = Vec::new();
+        let mut adjoint_buf = Vec::new();
+        self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf);
+        self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf);
+
+        let value = dual_vals_buf[self.output_index as usize].re;
+        let gradient: Vec<F> = (0..n).map(|i| adjoint_buf[i].re).collect();
+        let col0: Vec<F> = (0..n).map(|i| adjoint_buf[i].eps).collect();
+
+        // Parallelize remaining columns.
+        let other_cols: Vec<Vec<F>> = (1..n)
+            .into_par_iter()
+            .map(|j| {
+                let inputs: Vec<Dual<F>> = (0..n)
+                    .map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() }))
+                    .collect();
+                let mut dv = Vec::new();
+                let mut ab = Vec::new();
+                self.forward_tangent_dual(&inputs, &mut dv);
+                self.reverse_tangent_dual(&dv, &mut ab);
+                (0..n).map(|i| ab[i].eps).collect()
+            })
+            .collect();
+
+        let mut hessian = vec![vec![F::zero(); n]; n];
+        for i in 0..n {
+            hessian[i][0] = col0[i];
+        }
+        for (j_minus_1, col) in other_cols.iter().enumerate() {
+            let j = j_minus_1 + 1;
+            for i in 0..n {
+                hessian[i][j] = col[i];
+            }
+        }
+
+        (value, gradient, hessian)
+    }
+
+    /// Parallel sparse Hessian: parallelized over colors.
+    ///
+    /// Returns `(value, gradient, pattern, hessian_values)`.
+    pub fn sparse_hessian_par(
+        &self,
+        x: &[F],
+    ) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let pattern = self.detect_sparsity();
+        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
+
+        // Compute value/gradient from color 0 (serial).
+        let mut v0 = vec![F::zero(); n];
+        for i in 0..n {
+            v0[i] = if colors[i] == 0 { F::one() } else { F::zero() };
+        }
+        let di: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], v0[i])).collect();
+        let mut dv = Vec::new();
+        let mut ab = Vec::new();
+        self.forward_tangent_dual(&di, &mut dv);
+        self.reverse_tangent_dual(&dv, &mut ab);
+        let value = dv[self.output_index as usize].re;
+        let gradient: Vec<F> = (0..n).map(|i| ab[i].re).collect();
+
+        // Collect all color results in parallel.
+        let color_results: Vec<Vec<Dual<F>>> = (0..num_colors)
+            .into_par_iter()
+            .map(|color| {
+                let mut v = vec![F::zero(); n];
+                for i in 0..n {
+                    v[i] = if colors[i] == color {
+                        F::one()
+                    } else {
+                        F::zero()
+                    };
+                }
+                let inputs: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], v[i])).collect();
+                let mut dv_local = Vec::new();
+                let mut ab_local = Vec::new();
+                self.forward_tangent_dual(&inputs, &mut dv_local);
+                self.reverse_tangent_dual(&dv_local, &mut ab_local);
+                ab_local
+            })
+            .collect();
+
+        let mut hessian_values = vec![F::zero(); pattern.nnz()];
+        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+            let color = colors[col as usize] as usize;
+            hessian_values[k] = color_results[color][row as usize].eps;
+        }
+
+        (value, gradient, pattern, hessian_values)
+    }
+
+    /// Parallel sparse Jacobian: parallelized over colors.
+    ///
+    /// Auto-selects forward (column compression) or reverse (row compression)
+    /// based on `num_outputs` vs `num_inputs`.
+    pub fn sparse_jacobian_par(
+        &self,
+        x: &[F],
+    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let mut values_buf = Vec::new();
+        self.forward_into(x, &mut values_buf);
+
+        let out_indices = self.all_output_indices();
+        let m = out_indices.len();
+        let outputs: Vec<F> = out_indices
+            .iter()
+            .map(|&oi| values_buf[oi as usize])
+            .collect();
+
+        let jac_pattern = self.detect_jacobian_sparsity();
+        let ni = self.num_inputs as usize;
+
+        if m <= n {
+            // Row compression (reverse mode)
+            let (row_colors, num_colors) = crate::sparse::row_coloring(&jac_pattern);
+
+            let color_results: Vec<Vec<F>> = (0..num_colors)
+                .into_par_iter()
+                .map(|color| {
+                    let n_vars = self.num_variables as usize;
+                    let mut adjoints = vec![F::zero(); n_vars];
+                    for (i, &oi) in out_indices.iter().enumerate() {
+                        if row_colors[i] == color {
+                            adjoints[oi as usize] = F::one();
+                        }
+                    }
+
+                    self.reverse_sweep_core(&mut adjoints, &values_buf, None);
+                    adjoints[..ni].to_vec()
+                })
+                .collect();
+
+            let mut jac_values = vec![F::zero(); jac_pattern.nnz()];
+            for (k, (&row, &col)) in jac_pattern
+                .rows
+                .iter()
+                .zip(jac_pattern.cols.iter())
+                .enumerate()
+            {
+                let color = row_colors[row as usize] as usize;
+                jac_values[k] = color_results[color][col as usize];
+            }
+
+            (outputs, jac_pattern, jac_values)
+        } else {
+            // Column compression (forward mode) — parallelize forward tangent sweeps
+            let (col_colors, num_colors) = crate::sparse::column_coloring(&jac_pattern);
+
+            let color_results: Vec<Vec<F>> = (0..num_colors)
+                .into_par_iter()
+                .map(|color| {
+                    let dir: Vec<F> = (0..n)
+                        .map(|i| {
+                            if col_colors[i] == color {
+                                F::one()
+                            } else {
+                                F::zero()
+                            }
+                        })
+                        .collect();
+                    // Forward tangent with F (not Dual) direction
+                    // We need to do forward_into first, then forward tangent
+                    // But forward_tangent uses self.values, so we need a workaround.
+                    // Use Dual<F> with tangent = direction.
+                    let inputs: Vec<Dual<F>> = (0..n).map(|i| Dual::new(x[i], dir[i])).collect();
+                    let mut dv = Vec::new();
+                    self.forward_tangent(&inputs, &mut dv);
+                    out_indices.iter().map(|&oi| dv[oi as usize].eps).collect()
+                })
+                .collect();
+
+            let mut jac_values = vec![F::zero(); jac_pattern.nnz()];
+            for (k, (&row, &col)) in jac_pattern
+                .rows
+                .iter()
+                .zip(jac_pattern.cols.iter())
+                .enumerate()
+            {
+                let color = col_colors[col as usize] as usize;
+                jac_values[k] = color_results[color][row as usize];
+            }
+
+            (outputs, jac_pattern, jac_values)
+        }
+    }
+
+    /// Evaluate the gradient at multiple input points in parallel.
+    ///
+    /// Uses `forward_into` + `reverse_from` with per-thread buffers.
+    pub fn gradient_batch_par(&self, inputs: &[&[F]]) -> Vec<Vec<F>> {
+        let ni = self.num_inputs as usize;
+        let out_idx = self.output_index;
+
+        inputs
+            .par_iter()
+            .map(|x| {
+                let mut values_buf = Vec::new();
+                self.forward_into(x, &mut values_buf);
+                let adjoints = self.reverse_from(&values_buf, out_idx);
+                adjoints[..ni].to_vec()
+            })
+            .collect()
+    }
+
+    /// Compute Hessian at multiple input points in parallel.
+    ///
+    /// Returns `(value, gradient, hessian)` for each input point.
+    pub fn hessian_batch_par(&self, inputs: &[&[F]]) -> Vec<(F, Vec<F>, Vec<Vec<F>>)> {
+        inputs.par_iter().map(|x| self.hessian_par(x)).collect()
+    }
+}
diff --git a/src/bytecode_tape/reverse.rs b/src/bytecode_tape/reverse.rs
new file mode 100644
index 0000000..a1be6af
--- /dev/null
+++ b/src/bytecode_tape/reverse.rs
@@ -0,0 +1,170 @@
+use std::collections::HashMap;
+
+use crate::float::Float;
+use crate::opcode::{self, OpCode, UNUSED};
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Reverse sweep with weighted seeds for multiple outputs.
+    ///
+    /// Computes `∑_i weights[i] * ∂output_i/∂x` — a vector-Jacobian product.
+    ///
+    /// Returns the gradient with respect to all inputs (length [`num_inputs`](Self::num_inputs)).
+    pub fn reverse_seeded(&self, seeds: &[F]) -> Vec<F> {
+        let out_indices = self.all_output_indices();
+
+        assert_eq!(
+            seeds.len(),
+            out_indices.len(),
+            "seeds length must match number of outputs"
+        );
+
+        let ni = self.num_inputs as usize;
+        let adjoints = self.reverse_seeded_full(seeds, out_indices);
+        adjoints[..ni].to_vec()
+    }
+
+    /// Core reverse sweep loop shared by all scalar reverse sweep variants.
+    ///
+    /// Expects `adjoints` to be pre-seeded by the caller (length = `num_variables`).
+    /// Reads primal values from `values` (either `self.values` or an external buffer).
+    /// When `forced_signs` is `Some`, uses forced partials at matching tape indices.
+    pub(super) fn reverse_sweep_core(
+        &self,
+        adjoints: &mut [F],
+        values: &[F],
+        forced_signs: Option<&HashMap<u32, i8>>,
+    ) {
+        for i in (0..self.opcodes.len()).rev() {
+            let adj = adjoints[i];
+            if adj == F::zero() {
+                continue;
+            }
+
+            match self.opcodes[i] {
+                OpCode::Input | OpCode::Const => continue,
+                OpCode::Custom => {
+                    adjoints[i] = F::zero();
+                    let [a_idx, cb_idx] = self.arg_indices[i];
+                    let a = values[a_idx as usize];
+                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+                    let b = b_idx_opt.map(|bi| values[bi as usize]).unwrap_or(F::zero());
+                    let r = values[i];
+                    let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r);
+                    adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
+                    if let Some(bi) = b_idx_opt {
+                        adjoints[bi as usize] = adjoints[bi as usize] + db * adj;
+                    }
+                }
+                op => {
+                    adjoints[i] = F::zero();
+                    let [a_idx, b_idx] = self.arg_indices[i];
+                    let a = values[a_idx as usize];
+                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                        values[b_idx as usize]
+                    } else if op == OpCode::Powi {
+                        F::from(b_idx).unwrap_or_else(|| F::zero())
+                    } else {
+                        F::zero()
+                    };
+                    let r = values[i];
+
+                    let (da, db) = match forced_signs.and_then(|fs| fs.get(&(i as u32))) {
+                        Some(&sign) => opcode::forced_reverse_partials(op, a, b, r, sign),
+                        None => opcode::reverse_partials(op, a, b, r),
+                    };
+
+                    adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj;
+                    if b_idx != UNUSED && op != OpCode::Powi {
+                        adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Reverse sweep: compute adjoints seeded at the output.
+    ///
+    /// Returns the full adjoint vector (length = `num_variables`).
+    pub fn reverse(&self, seed_index: u32) -> Vec<F> {
+        let n = self.num_variables as usize;
+        let mut adjoints = vec![F::zero(); n];
+        adjoints[seed_index as usize] = F::one();
+        self.reverse_sweep_core(&mut adjoints, &self.values, None);
+        adjoints
+    }
+
+    /// Reverse sweep with forced branch choices at specified tape indices.
+    pub(super) fn reverse_with_forced_signs(
+        &self,
+        seed_index: u32,
+        forced_signs: &HashMap<u32, i8>,
+    ) -> Vec<F> {
+        let n = self.num_variables as usize;
+        let mut adjoints = vec![F::zero(); n];
+        adjoints[seed_index as usize] = F::one();
+        self.reverse_sweep_core(&mut adjoints, &self.values, Some(forced_signs));
+        adjoints
+    }
+
+    /// Reverse sweep reading from an external values buffer.
+    ///
+    /// Like [`reverse`](Self::reverse) but reads primal values from `values`
+    /// instead of `self.values`. Pair with [`forward_into`](Self::forward_into)
+    /// for parallel evaluation.
+    pub fn reverse_from(&self, values: &[F], seed_index: u32) -> Vec<F> {
+        let n = self.num_variables as usize;
+        assert_eq!(values.len(), n, "values buffer has wrong length");
+        let mut adjoints = vec![F::zero(); n];
+        adjoints[seed_index as usize] = F::one();
+        self.reverse_sweep_core(&mut adjoints, values, None);
+        adjoints
+    }
+
+    /// Forward + reverse: compute the gradient at new inputs.
+    ///
+    /// Returns only the input adjoints (indices `0..num_inputs`).
+    pub fn gradient(&mut self, inputs: &[F]) -> Vec<F> {
+        self.forward(inputs);
+        let adjoints = self.reverse(self.output_index);
+        adjoints[..self.num_inputs as usize].to_vec()
+    }
+
+    /// Like [`gradient`](Self::gradient) but reuses a caller-provided buffer
+    /// for the adjoint vector, avoiding allocation on repeated calls.
+    pub fn gradient_with_buf(&mut self, inputs: &[F], adjoint_buf: &mut Vec<F>) -> Vec<F> {
+        self.forward(inputs);
+
+        let n = self.num_variables as usize;
+        adjoint_buf.clear();
+        adjoint_buf.resize(n, F::zero());
+        adjoint_buf[self.output_index as usize] = F::one();
+
+        self.reverse_sweep_core(adjoint_buf, &self.values, None);
+        adjoint_buf[..self.num_inputs as usize].to_vec()
+    }
+
+    /// Reverse sweep with weighted seeds, returning full adjoint vector.
+    pub(super) fn reverse_seeded_full(&self, seeds: &[F], out_indices: &[u32]) -> Vec<F> {
+        let n = self.num_variables as usize;
+        let mut adjoints = vec![F::zero(); n];
+
+        for (&out_idx, &weight) in out_indices.iter().zip(seeds.iter()) {
+            if weight == F::zero() {
+                continue;
+            }
+            adjoints[out_idx as usize] = adjoints[out_idx as usize] + weight;
+        }
+
+        self.reverse_sweep_core(&mut adjoints, &self.values, None);
+        adjoints
+    }
+
+    // ── Batch evaluation ──
+
+    /// Evaluate the gradient at multiple input points.
+    ///
+    /// Returns one gradient vector per input point.
+    pub fn gradient_batch(&mut self, inputs: &[&[F]]) -> Vec<Vec<F>> {
+        inputs.iter().map(|x| self.gradient(x)).collect()
+    }
+}
diff --git a/src/bytecode_tape/serde_support.rs b/src/bytecode_tape/serde_support.rs
new file mode 100644
index 0000000..b99288e
--- /dev/null
+++ b/src/bytecode_tape/serde_support.rs
@@ -0,0 +1,61 @@
+use std::collections::HashMap;
+
+use serde::ser::SerializeStruct;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+use crate::float::Float;
+use crate::opcode::OpCode;
+
+use super::BytecodeTape;
+
+impl<F: Float + Serialize> Serialize for BytecodeTape<F> {
+    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        if !self.custom_ops.is_empty() {
+            return Err(serde::ser::Error::custom(
+                "cannot serialize a BytecodeTape containing custom ops; \
+                 custom ops must be re-registered after deserialization",
+            ));
+        }
+        let mut s = serializer.serialize_struct("BytecodeTape", 8)?;
+        s.serialize_field("opcodes", &self.opcodes)?;
+        s.serialize_field("arg_indices", &self.arg_indices)?;
+        s.serialize_field("values", &self.values)?;
+        s.serialize_field("num_inputs", &self.num_inputs)?;
+        s.serialize_field("num_variables", &self.num_variables)?;
+        s.serialize_field("output_index", &self.output_index)?;
+        s.serialize_field("output_indices", &self.output_indices)?;
+        s.serialize_field("custom_second_args", &self.custom_second_args)?;
+        s.end()
+    }
+}
+
+impl<'de, F: Float + Deserialize<'de>> Deserialize<'de> for BytecodeTape<F> {
+    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        #[derive(Deserialize)]
+        struct TapeData<F> {
+            opcodes: Vec<OpCode>,
+            arg_indices: Vec<[u32; 2]>,
+            values: Vec<F>,
+            num_inputs: u32,
+            num_variables: u32,
+            output_index: u32,
+            #[serde(default)]
+            output_indices: Vec<u32>,
+            #[serde(default)]
+            custom_second_args: HashMap<u32, u32>,
+        }
+
+        let data = TapeData::<F>::deserialize(deserializer)?;
+        Ok(BytecodeTape {
+            opcodes: data.opcodes,
+            arg_indices: data.arg_indices,
+            values: data.values,
+            num_inputs: data.num_inputs,
+            num_variables: data.num_variables,
+            output_index: data.output_index,
+            output_indices: data.output_indices,
+            custom_ops: Vec::new(),
+            custom_second_args: data.custom_second_args,
+        })
+    }
+}
diff --git a/src/bytecode_tape/sparse.rs b/src/bytecode_tape/sparse.rs
new file mode 100644
index 0000000..b0998c3
--- /dev/null
+++ b/src/bytecode_tape/sparse.rs
@@ -0,0 +1,435 @@
+use crate::dual::Dual;
+use crate::dual_vec::DualVec;
+use crate::float::Float;
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Detect the structural sparsity pattern of the Hessian.
+    ///
+    /// Walks the tape forward propagating input-dependency bitsets.
+    /// At nonlinear operations, marks cross-pairs as potential Hessian interactions.
+    pub fn detect_sparsity(&self) -> crate::sparse::SparsityPattern {
+        crate::sparse::detect_sparsity_impl(
+            &self.opcodes,
+            &self.arg_indices,
+            self.num_inputs as usize,
+            self.num_variables as usize,
+        )
+    }
+
+    /// Detect the structural sparsity pattern of the Jacobian.
+    ///
+    /// Walks the tape forward propagating input-dependency bitsets (first-order).
+    /// For each output, determines which inputs it depends on.
+    pub fn detect_jacobian_sparsity(&self) -> crate::sparse::JacobianSparsityPattern {
+        let out_indices = self.all_output_indices();
+        crate::sparse::detect_jacobian_sparsity_impl(
+            &self.opcodes,
+            &self.arg_indices,
+            self.num_inputs as usize,
+            self.num_variables as usize,
+            out_indices,
+        )
+    }
+
+    /// Compute a sparse Hessian using structural sparsity detection and graph coloring.
+    ///
+    /// Returns `(value, gradient, pattern, hessian_values)` where
+    /// `hessian_values[k]` corresponds to `(pattern.rows[k], pattern.cols[k])`.
+    ///
+    /// For problems with sparse Hessians, this requires only `chromatic_number`
+    /// HVP calls instead of `n`, which can be dramatically fewer for banded
+    /// or sparse interaction structures.
+    pub fn sparse_hessian(&self, x: &[F]) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let pattern = self.detect_sparsity();
+        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
+
+        let mut hessian_values = vec![F::zero(); pattern.nnz()];
+        let mut gradient = vec![F::zero(); n];
+        let mut value = F::zero();
+
+        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
+        let mut dual_vals_buf = Vec::new();
+        let mut adjoint_buf = Vec::new();
+        let mut v = vec![F::zero(); n];
+
+        for color in 0..num_colors {
+            // Form direction vector: v[i] = 1 if colors[i] == color, else 0
+            for i in 0..n {
+                v[i] = if colors[i] == color {
+                    F::one()
+                } else {
+                    F::zero()
+                };
+            }
+
+            self.hvp_with_all_bufs(
+                x,
+                &v,
+                &mut dual_input_buf,
+                &mut dual_vals_buf,
+                &mut adjoint_buf,
+            );
+
+            if color == 0 {
+                value = dual_vals_buf[self.output_index as usize].re;
+                for i in 0..n {
+                    gradient[i] = adjoint_buf[i].re;
+                }
+            }
+
+            // Extract Hessian entries for this color.
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                if colors[col as usize] == color {
+                    hessian_values[k] = adjoint_buf[row as usize].eps;
+                }
+            }
+        }
+
+        (value, gradient, pattern, hessian_values)
+    }
+
+    /// Batched sparse Hessian: packs N colors per sweep using DualVec.
+    ///
+    /// Reduces the number of forward+reverse sweeps from `num_colors` to
+    /// `ceil(num_colors / N)`. Each sweep processes N colors simultaneously.
+    ///
+    /// Returns `(value, gradient, pattern, hessian_values)`.
+    pub fn sparse_hessian_vec<const N: usize>(
+        &self,
+        x: &[F],
+    ) -> (F, Vec<F>, crate::sparse::SparsityPattern, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let pattern = self.detect_sparsity();
+        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
+
+        let mut hessian_values = vec![F::zero(); pattern.nnz()];
+        let mut gradient = vec![F::zero(); n];
+        let mut value = F::zero();
+
+        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
+        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
+        let mut adjoint_buf: Vec<DualVec<F, N>> = Vec::new();
+
+        let num_batches = (num_colors as usize).div_ceil(N);
+        for batch in 0..num_batches {
+            let base_color = (batch * N) as u32;
+
+            // Build DualVec inputs: lane k has v[i]=1 if colors[i] == base_color+k
+            dual_input_buf.clear();
+            dual_input_buf.extend((0..n).map(|i| {
+                let eps = std::array::from_fn(|lane| {
+                    let target_color = base_color + lane as u32;
+                    if target_color < num_colors && colors[i] == target_color {
+                        F::one()
+                    } else {
+                        F::zero()
+                    }
+                });
+                DualVec::new(x[i], eps)
+            }));
+
+            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
+            self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf);
+
+            if batch == 0 {
+                value = dual_vals_buf[self.output_index as usize].re;
+                for i in 0..n {
+                    gradient[i] = adjoint_buf[i].re;
+                }
+            }
+
+            // Extract Hessian entries: for entry (row, col) with colors[col] == base_color+lane,
+            // read adjoint_buf[row].eps[lane]
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                let col_color = colors[col as usize];
+                if col_color >= base_color && col_color < base_color + N as u32 {
+                    let lane = (col_color - base_color) as usize;
+                    hessian_values[k] = adjoint_buf[row as usize].eps[lane];
+                }
+            }
+        }
+
+        (value, gradient, pattern, hessian_values)
+    }
+
+    // ── Sparse Jacobian ──
+
+    /// Compute a sparse Jacobian using structural sparsity detection and graph coloring.
+    ///
+    /// Auto-selects forward-mode (column compression) or reverse-mode (row compression)
+    /// based on which requires fewer sweeps.
+    ///
+    /// Returns `(output_values, pattern, jacobian_values)`.
+    pub fn sparse_jacobian(
+        &mut self,
+        x: &[F],
+    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
+        self.forward(x);
+        let pattern = self.detect_jacobian_sparsity();
+        let (col_colors, num_col_colors) = crate::sparse::column_coloring(&pattern);
+        let (row_colors, num_row_colors) = crate::sparse::row_coloring(&pattern);
+
+        if num_col_colors <= num_row_colors {
+            let jac_values =
+                self.sparse_jacobian_forward_impl(x, &pattern, &col_colors, num_col_colors);
+            let outputs = self.output_values();
+            (outputs, pattern, jac_values)
+        } else {
+            let jac_values =
+                self.sparse_jacobian_reverse_impl(x, &pattern, &row_colors, num_row_colors);
+            let outputs = self.output_values();
+            (outputs, pattern, jac_values)
+        }
+    }
+
+    /// Sparse Jacobian via forward-mode (column compression).
+    pub fn sparse_jacobian_forward(
+        &mut self,
+        x: &[F],
+    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
+        self.forward(x);
+        let pattern = self.detect_jacobian_sparsity();
+        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
+        let jac_values = self.sparse_jacobian_forward_impl(x, &pattern, &colors, num_colors);
+        let outputs = self.output_values();
+        (outputs, pattern, jac_values)
+    }
+
+    /// Sparse Jacobian via reverse-mode (row compression).
+    pub fn sparse_jacobian_reverse(
+        &mut self,
+        x: &[F],
+    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
+        self.forward(x);
+        let pattern = self.detect_jacobian_sparsity();
+        let (colors, num_colors) = crate::sparse::row_coloring(&pattern);
+        let jac_values = self.sparse_jacobian_reverse_impl(x, &pattern, &colors, num_colors);
+        let outputs = self.output_values();
+        (outputs, pattern, jac_values)
+    }
+
+    /// Sparse Jacobian with a precomputed sparsity pattern and coloring.
+    ///
+    /// Skips re-detection of sparsity on repeated calls. Use `column_coloring` colors
+    /// for forward mode or `row_coloring` colors for reverse mode. The `forward` flag
+    /// selects the mode.
+    pub fn sparse_jacobian_with_pattern(
+        &mut self,
+        x: &[F],
+        pattern: &crate::sparse::JacobianSparsityPattern,
+        colors: &[u32],
+        num_colors: u32,
+        forward_mode: bool,
+    ) -> (Vec<F>, Vec<F>) {
+        self.forward(x);
+        let jac_values = if forward_mode {
+            self.sparse_jacobian_forward_impl(x, pattern, colors, num_colors)
+        } else {
+            self.sparse_jacobian_reverse_impl(x, pattern, colors, num_colors)
+        };
+        let outputs = self.output_values();
+        (outputs, jac_values)
+    }
+
+    /// Forward-mode sparse Jacobian implementation (column compression).
+    ///
+    /// Each color group seeds a forward pass with tangent 1.0 in the columns
+    /// sharing that color. The resulting tangent at each output gives J[row][col].
+    fn sparse_jacobian_forward_impl(
+        &self,
+        x: &[F],
+        pattern: &crate::sparse::JacobianSparsityPattern,
+        colors: &[u32],
+        num_colors: u32,
+    ) -> Vec<F> {
+        let n = self.num_inputs as usize;
+        let mut jac_values = vec![F::zero(); pattern.nnz()];
+
+        let out_indices = self.all_output_indices();
+
+        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
+        let mut dual_vals_buf: Vec<Dual<F>> = Vec::new();
+
+        for color in 0..num_colors {
+            // Build Dual inputs: tangent = 1 for inputs with this color
+            dual_input_buf.clear();
+            dual_input_buf.extend((0..n).map(|i| {
+                Dual::new(
+                    x[i],
+                    if colors[i] == color {
+                        F::one()
+                    } else {
+                        F::zero()
+                    },
+                )
+            }));
+
+            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
+
+            // Extract Jacobian entries: for entry (row, col) with colors[col] == color,
+            // the tangent at output_indices[row] gives J[row][col]
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                if colors[col as usize] == color {
+                    jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps;
+                }
+            }
+        }
+
+        jac_values
+    }
+
+    /// Reverse-mode sparse Jacobian implementation (row compression).
+    ///
+    /// Each color group seeds a reverse pass with adjoint 1.0 at the outputs
+    /// sharing that color.
+    fn sparse_jacobian_reverse_impl(
+        &self,
+        _x: &[F],
+        pattern: &crate::sparse::JacobianSparsityPattern,
+        colors: &[u32],
+        num_colors: u32,
+    ) -> Vec<F> {
+        let m = self.num_outputs();
+        let mut jac_values = vec![F::zero(); pattern.nnz()];
+
+        let out_indices = self.all_output_indices();
+
+        for color in 0..num_colors {
+            // Build seeds: weight = 1 for outputs with this color
+            let seeds: Vec<F> = (0..m)
+                .map(|i| {
+                    if colors[i] == color {
+                        F::one()
+                    } else {
+                        F::zero()
+                    }
+                })
+                .collect();
+
+            let adjoints = self.reverse_seeded_full(&seeds, out_indices);
+
+            // Extract Jacobian entries: for entry (row, col) with colors[row] == color,
+            // adjoint[col] gives J[row][col]
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                if colors[row as usize] == color {
+                    jac_values[k] = adjoints[col as usize];
+                }
+            }
+        }
+
+        jac_values
+    }
+
+    /// Batched sparse Jacobian: packs N colors per forward sweep using DualVec.
+    ///
+    /// Reduces the number of forward sweeps from `num_colors` to
+    /// `ceil(num_colors / N)`.
+    pub fn sparse_jacobian_vec<const N: usize>(
+        &mut self,
+        x: &[F],
+    ) -> (Vec<F>, crate::sparse::JacobianSparsityPattern, Vec<F>) {
+        self.forward(x);
+        let pattern = self.detect_jacobian_sparsity();
+        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
+
+        let n = self.num_inputs as usize;
+        let mut jac_values = vec![F::zero(); pattern.nnz()];
+
+        let out_indices = self.all_output_indices();
+
+        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
+        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
+
+        let num_batches = (num_colors as usize).div_ceil(N);
+        for batch in 0..num_batches {
+            let base_color = (batch * N) as u32;
+
+            dual_input_buf.clear();
+            dual_input_buf.extend((0..n).map(|i| {
+                let eps = std::array::from_fn(|lane| {
+                    let target_color = base_color + lane as u32;
+                    if target_color < num_colors && colors[i] == target_color {
+                        F::one()
+                    } else {
+                        F::zero()
+                    }
+                });
+                DualVec::new(x[i], eps)
+            }));
+
+            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
+
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                let col_color = colors[col as usize];
+                if col_color >= base_color && col_color < base_color + N as u32 {
+                    let lane = (col_color - base_color) as usize;
+                    jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps[lane];
+                }
+            }
+        }
+
+        let outputs = self.output_values();
+        (outputs, pattern, jac_values)
+    }
+
+    /// Sparse Hessian with a precomputed sparsity pattern and coloring.
+    ///
+    /// Skips re-detection on repeated calls (e.g. in solver loops).
+    pub fn sparse_hessian_with_pattern(
+        &self,
+        x: &[F],
+        pattern: &crate::sparse::SparsityPattern,
+        colors: &[u32],
+        num_colors: u32,
+    ) -> (F, Vec<F>, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let mut hessian_values = vec![F::zero(); pattern.nnz()];
+        let mut gradient = vec![F::zero(); n];
+        let mut value = F::zero();
+
+        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
+        let mut dual_vals_buf = Vec::new();
+        let mut adjoint_buf = Vec::new();
+        let mut v = vec![F::zero(); n];
+
+        for color in 0..num_colors {
+            for i in 0..n {
+                v[i] = if colors[i] == color {
+                    F::one()
+                } else {
+                    F::zero()
+                };
+            }
+
+            self.hvp_with_all_bufs(
+                x,
+                &v,
+                &mut dual_input_buf,
+                &mut dual_vals_buf,
+                &mut adjoint_buf,
+            );
+
+            if color == 0 {
+                value = dual_vals_buf[self.output_index as usize].re;
+                for i in 0..n {
+                    gradient[i] = adjoint_buf[i].re;
+                }
+            }
+
+            for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+                if colors[col as usize] == color {
+                    hessian_values[k] = adjoint_buf[row as usize].eps;
+                }
+            }
+        }
+
+        (value, gradient, hessian_values)
+    }
+}
diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs
new file mode 100644
index 0000000..a15433e
--- /dev/null
+++ b/src/bytecode_tape/tangent.rs
@@ -0,0 +1,380 @@
+use crate::dual::Dual;
+use crate::dual_vec::DualVec;
+use crate::float::{Float, IsAllZero};
+use crate::opcode::{self, OpCode, UNUSED};
+use num_traits::Float as NumFloat;
+
+impl<F: Float> super::BytecodeTape<F> {
+    // ── Forward-over-reverse (second-order) ──
+
+    /// Forward sweep with tangent-carrying numbers. Reads opcodes and constants
+    /// from `self`, writing results into `buf`. Does not mutate the tape.
+    ///
+    /// Generic over `T: NumFloat` so it works with both `Dual<F>` and
+    /// `DualVec<F, N>`.
+    pub fn forward_tangent<T: NumFloat>(&self, inputs: &[T], buf: &mut Vec<T>) {
+        self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| {
+            // First-order chain rule: evaluate on primals, convert partials to T.
+            let [a_idx, cb_idx] = self.arg_indices[i];
+            let a_primal = self.values[a_idx as usize];
+            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+            let b_primal = b_idx_opt
+                .map(|bi| self.values[bi as usize])
+                .unwrap_or(F::zero());
+            let result = self.custom_ops[cb_idx as usize].eval(a_primal, b_primal);
+            let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, result);
+            let result_t = T::from(result).unwrap();
+            let da_t = T::from(da).unwrap();
+            let db_t = T::from(db).unwrap();
+            let a_re_t = T::from(a_primal).unwrap();
+            let b_re_t = T::from(b_primal).unwrap();
+            result_t + da_t * (a_t - a_re_t) + db_t * (b_t - b_re_t)
+        });
+    }
+
+    /// Forward sweep specialized for `Dual<F>`, calling [`CustomOp::eval_dual`]
+    /// so that custom ops propagate tangent information for second-order derivatives.
+    pub(super) fn forward_tangent_dual(&self, inputs: &[Dual<F>], buf: &mut Vec<Dual<F>>) {
+        self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| {
+            let [_a_idx, cb_idx] = self.arg_indices[i];
+            self.custom_ops[cb_idx as usize].eval_dual(a_t, b_t)
+        });
+    }
+
+    /// Common forward-tangent loop. The `handle_custom` closure receives
+    /// `(tape_index, a_value, b_value)` for custom op slots and returns
+    /// the result to store.
+    fn forward_tangent_inner<T: NumFloat>(
+        &self,
+        inputs: &[T],
+        buf: &mut Vec<T>,
+        handle_custom: impl Fn(usize, T, T) -> T,
+    ) {
+        assert_eq!(
+            inputs.len(),
+            self.num_inputs as usize,
+            "wrong number of inputs"
+        );
+
+        let n = self.num_variables as usize;
+        buf.clear();
+        buf.resize(n, T::zero());
+
+        let mut input_idx = 0usize;
+        for i in 0..self.opcodes.len() {
+            match self.opcodes[i] {
+                OpCode::Input => {
+                    buf[i] = inputs[input_idx];
+                    input_idx += 1;
+                }
+                OpCode::Const => {
+                    buf[i] = T::from(self.values[i]).unwrap();
+                }
+                OpCode::Custom => {
+                    let [a_idx, _cb_idx] = self.arg_indices[i];
+                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+                    let a_t = buf[a_idx as usize];
+                    let b_t = b_idx_opt.map(|bi| buf[bi as usize]).unwrap_or(T::zero());
+                    buf[i] = handle_custom(i, a_t, b_t);
+                }
+                op => {
+                    let [a_idx, b_idx] = self.arg_indices[i];
+                    let a = buf[a_idx as usize];
+                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                        buf[b_idx as usize]
+                    } else if op == OpCode::Powi {
+                        T::from(b_idx).unwrap_or_else(|| T::zero())
+                    } else {
+                        T::zero()
+                    };
+                    buf[i] = opcode::eval_forward(op, a, b);
+                }
+            }
+        }
+    }
+
+    /// Reverse sweep with tangent-carrying adjoints. Uses values from
+    /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to
+    /// safely skip zero adjoints without dropping tangent contributions.
+    pub(super) fn reverse_tangent<T: NumFloat + IsAllZero>(&self, tangent_vals: &[T], buf: &mut Vec<T>) {
+        self.reverse_tangent_inner(tangent_vals, buf, |i| {
+            // First-order: convert primal-float partials to T.
+            let [a_idx, cb_idx] = self.arg_indices[i];
+            let a_primal = self.values[a_idx as usize];
+            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+            let b_primal = b_idx_opt
+                .map(|bi| self.values[bi as usize])
+                .unwrap_or(F::zero());
+            let r_primal = self.values[i];
+            let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, r_primal);
+            (T::from(da).unwrap(), T::from(db).unwrap())
+        });
+    }
+
+    /// Reverse sweep specialized for `Dual<F>`, calling [`CustomOp::partials_dual`]
+    /// so that custom op partials carry tangent information for second-order derivatives.
+    pub(super) fn reverse_tangent_dual(&self, tangent_vals: &[Dual<F>], buf: &mut Vec<Dual<F>>) {
+        self.reverse_tangent_inner(tangent_vals, buf, |i| {
+            let [a_idx, cb_idx] = self.arg_indices[i];
+            let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+            let a_dual = tangent_vals[a_idx as usize];
+            let b_dual = b_idx_opt
+                .map(|bi| tangent_vals[bi as usize])
+                .unwrap_or(Dual::constant(F::zero()));
+            let r_dual = tangent_vals[i];
+            self.custom_ops[cb_idx as usize].partials_dual(a_dual, b_dual, r_dual)
+        });
+    }
+
+    /// Common reverse-tangent loop. The `custom_partials` closure receives
+    /// `tape_index` for custom op slots and returns `(da, db)` as T-valued partials.
+    fn reverse_tangent_inner<T: NumFloat + IsAllZero>(
+        &self,
+        tangent_vals: &[T],
+        buf: &mut Vec<T>,
+        custom_partials: impl Fn(usize) -> (T, T),
+    ) {
+        let n = self.num_variables as usize;
+        buf.clear();
+        buf.resize(n, T::zero());
+        buf[self.output_index as usize] = T::one();
+
+        for i in (0..self.opcodes.len()).rev() {
+            match self.opcodes[i] {
+                OpCode::Input | OpCode::Const => continue,
+                OpCode::Custom => {
+                    let adj = buf[i];
+                    if adj.is_all_zero() {
+                        continue;
+                    }
+                    buf[i] = T::zero();
+
+                    let [a_idx, _cb_idx] = self.arg_indices[i];
+                    let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied();
+                    let (da_t, db_t) = custom_partials(i);
+                    buf[a_idx as usize] = buf[a_idx as usize] + da_t * adj;
+                    if let Some(bi) = b_idx_opt {
+                        buf[bi as usize] = buf[bi as usize] + db_t * adj;
+                    }
+                }
+                op => {
+                    let adj = buf[i];
+                    if adj.is_all_zero() {
+                        continue;
+                    }
+                    buf[i] = T::zero();
+
+                    let [a_idx, b_idx] = self.arg_indices[i];
+                    let a = tangent_vals[a_idx as usize];
+                    let b = if b_idx != UNUSED && op != OpCode::Powi {
+                        tangent_vals[b_idx as usize]
+                    } else if op == OpCode::Powi {
+                        T::from(b_idx).unwrap_or_else(|| T::zero())
+                    } else {
+                        T::zero()
+                    };
+                    let r = tangent_vals[i];
+                    let (da, db) = opcode::reverse_partials(op, a, b, r);
+
+                    buf[a_idx as usize] = buf[a_idx as usize] + da * adj;
+                    if b_idx != UNUSED && op != OpCode::Powi {
+                        buf[b_idx as usize] = buf[b_idx as usize] + db * adj;
+                    }
+                }
+            }
+        }
+    }
+
+    /// Hessian-vector product via forward-over-reverse.
+    ///
+    /// Returns `(gradient, H·v)` where both are `Vec<F>` of length
+    /// [`num_inputs`](Self::num_inputs). The tape is not mutated.
+    pub fn hvp(&self, x: &[F], v: &[F]) -> (Vec<F>, Vec<F>) {
+        let mut dual_vals = Vec::new();
+        let mut adjoint_buf = Vec::new();
+        self.hvp_with_buf(x, v, &mut dual_vals, &mut adjoint_buf)
+    }
+
+    /// Like [`hvp`](Self::hvp) but reuses caller-provided buffers to avoid
+    /// allocation on repeated calls (e.g. inside [`hessian`](Self::hessian)).
+    pub fn hvp_with_buf(
+        &self,
+        x: &[F],
+        v: &[F],
+        dual_vals_buf: &mut Vec<Dual<F>>,
+        adjoint_buf: &mut Vec<Dual<F>>,
+    ) -> (Vec<F>, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+        assert_eq!(v.len(), n, "wrong number of directions");
+
+        let dual_inputs: Vec<Dual<F>> = x
+            .iter()
+            .zip(v.iter())
+            .map(|(&xi, &vi)| Dual::new(xi, vi))
+            .collect();
+
+        self.forward_tangent_dual(&dual_inputs, dual_vals_buf);
+        self.reverse_tangent_dual(dual_vals_buf, adjoint_buf);
+
+        let gradient: Vec<F> = (0..n).map(|i| adjoint_buf[i].re).collect();
+        let hvp: Vec<F> = (0..n).map(|i| adjoint_buf[i].eps).collect();
+        (gradient, hvp)
+    }
+
+    /// Like [`hvp_with_buf`](Self::hvp_with_buf) but also reuses a caller-provided
+    /// input buffer, eliminating all allocations on repeated calls.
+    pub(super) fn hvp_with_all_bufs(
+        &self,
+        x: &[F],
+        v: &[F],
+        dual_input_buf: &mut Vec<Dual<F>>,
+        dual_vals_buf: &mut Vec<Dual<F>>,
+        adjoint_buf: &mut Vec<Dual<F>>,
+    ) {
+        // Reuse the input buffer instead of allocating
+        dual_input_buf.clear();
+        dual_input_buf.extend(x.iter().zip(v.iter()).map(|(&xi, &vi)| Dual::new(xi, vi)));
+
+        self.forward_tangent_dual(dual_input_buf, dual_vals_buf);
+        self.reverse_tangent_dual(dual_vals_buf, adjoint_buf);
+    }
+
+    /// Full Hessian matrix via `n` Hessian-vector products.
+    ///
+    /// Returns `(value, gradient, hessian)` where `hessian[i][j] = ∂²f/∂x_i∂x_j`.
+    /// The tape is not mutated.
+    pub fn hessian(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let mut dual_input_buf: Vec<Dual<F>> = Vec::with_capacity(n);
+        let mut dual_vals_buf = Vec::new();
+        let mut adjoint_buf = Vec::new();
+        let mut hessian = vec![vec![F::zero(); n]; n];
+        let mut gradient = vec![F::zero(); n];
+        let mut value = F::zero();
+
+        for j in 0..n {
+            // Reuse input buffer
+            dual_input_buf.clear();
+            dual_input_buf
+                .extend((0..n).map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() })));
+
+            self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf);
+            self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf);
+
+            if j == 0 {
+                value = dual_vals_buf[self.output_index as usize].re;
+                for i in 0..n {
+                    gradient[i] = adjoint_buf[i].re;
+                }
+            }
+
+            for (row, adj) in hessian.iter_mut().zip(adjoint_buf.iter()) {
+                row[j] = adj.eps;
+            }
+        }
+
+        (value, gradient, hessian)
+    }
+
+    /// Full Hessian matrix via batched forward-over-reverse.
+    ///
+    /// Processes `ceil(n/N)` batches instead of `n` individual HVPs,
+    /// computing N Hessian columns simultaneously.
+    pub fn hessian_vec<const N: usize>(&self, x: &[F]) -> (F, Vec<F>, Vec<Vec<F>>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+
+        let mut dual_input_buf: Vec<DualVec<F, N>> = Vec::with_capacity(n);
+        let mut dual_vals_buf: Vec<DualVec<F, N>> = Vec::new();
+        let mut adjoint_buf: Vec<DualVec<F, N>> = Vec::new();
+        let mut hessian = vec![vec![F::zero(); n]; n];
+        let mut gradient = vec![F::zero(); n];
+        let mut value = F::zero();
+
+        let num_batches = n.div_ceil(N);
+        for batch in 0..num_batches {
+            let base = batch * N;
+
+            // Reuse input buffer
+            dual_input_buf.clear();
+            dual_input_buf.extend((0..n).map(|i| {
+                let eps = std::array::from_fn(|lane| {
+                    let col = base + lane;
+                    if col < n && i == col {
+                        F::one()
+                    } else {
+                        F::zero()
+                    }
+                });
+                DualVec::new(x[i], eps)
+            }));
+
+            self.forward_tangent(&dual_input_buf, &mut dual_vals_buf);
+            self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf);
+
+            if batch == 0 {
+                value = dual_vals_buf[self.output_index as usize].re;
+                for i in 0..n {
+                    gradient[i] = adjoint_buf[i].re;
+                }
+            }
+
+            for lane in 0..N {
+                let col = base + lane;
+                if col >= n {
+                    break;
+                }
+                for i in 0..n {
+                    hessian[i][col] = adjoint_buf[i].eps[lane];
+                }
+            }
+        }
+
+        (value, gradient, hessian)
+    }
+
+    // ── Higher-order derivatives ──
+
+    /// Third-order directional derivative: `∑_{jk} (∂³f/∂x_i∂x_j∂x_k) v1_j v2_k`.
+    ///
+    /// Given directions `v1` and `v2`, computes:
+    /// - `gradient`: `∇f(x)`
+    /// - `hvp`: `H(x) · v1` (Hessian-vector product)
+    /// - `third`: `(∂/∂v2)(H · v1)` (third-order tensor contracted with v1 and v2)
+    ///
+    /// Uses `Dual<Dual<F>>` (nested dual numbers): inner tangent for `v1`,
+    /// outer tangent for `v2`.
+    pub fn third_order_hvvp(&self, x: &[F], v1: &[F], v2: &[F]) -> (Vec<F>, Vec<F>, Vec<F>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+        assert_eq!(v1.len(), n, "wrong v1 length");
+        assert_eq!(v2.len(), n, "wrong v2 length");
+
+        // Build Dual<Dual<F>> inputs:
+        //   inner.re = x[i], inner.eps = v1[i]  (for HVP direction)
+        //   outer.re = inner, outer.eps = v2[i]  (for third-order direction)
+        //
+        // outer = Dual { re: Dual(x[i], v1[i]), eps: Dual(v2[i], 0) }
+        let dd_inputs: Vec<Dual<Dual<F>>> = (0..n)
+            .map(|i| Dual {
+                re: Dual::new(x[i], v1[i]),
+                eps: Dual::new(v2[i], F::zero()),
+            })
+            .collect();
+
+        let mut dd_vals: Vec<Dual<Dual<F>>> = Vec::new();
+        let mut dd_adj: Vec<Dual<Dual<F>>> = Vec::new();
+
+        self.forward_tangent(&dd_inputs, &mut dd_vals);
+        self.reverse_tangent(&dd_vals, &mut dd_adj);
+
+        let gradient: Vec<F> = (0..n).map(|i| dd_adj[i].re.re).collect();
+        let hvp: Vec<F> = (0..n).map(|i| dd_adj[i].re.eps).collect();
+        let third: Vec<F> = (0..n).map(|i| dd_adj[i].eps.eps).collect();
+
+        (gradient, hvp, third)
+    }
+}
diff --git a/src/bytecode_tape/taylor.rs b/src/bytecode_tape/taylor.rs
new file mode 100644
index 0000000..7047ceb
--- /dev/null
+++ b/src/bytecode_tape/taylor.rs
@@ -0,0 +1,118 @@
+use crate::float::Float;
+use crate::taylor::Taylor;
+
+impl<F: Float> super::BytecodeTape<F> {
+    /// Forward-reverse Taylor pass for gradient + higher-order directional adjoints.
+    ///
+    /// Builds Taylor inputs `x_i(t) = x_i + v_i * t` (with zero higher coefficients),
+    /// runs `forward_tangent`, then `reverse_tangent` to get Taylor-valued adjoints.
+    ///
+    /// Returns `(output, adjoints)` where:
+    /// - `output` is the Taylor expansion of `f` along direction `v`
+    /// - `adjoints[i].coeff(0)` = `∂f/∂x_i` (gradient)
+    /// - `adjoints[i].coeff(1)` = `Σ_j (∂²f/∂x_i∂x_j) v_j` (HVP)
+    /// - `adjoints[i].derivative(k)` = k-th order directional adjoint
+    ///
+    /// For K=2, the HVP component is equivalent to [`hvp`](Self::hvp).
+    /// For K≥3, yields additional higher-order information in the same pass.
+    ///
+    /// Like [`hvp`](Self::hvp), takes `&self` and does not call `forward(x)`
+    /// before the Taylor pass. Custom ops will use primal values from recording time.
+    pub fn taylor_grad<const K: usize>(
+        &self,
+        x: &[F],
+        v: &[F],
+    ) -> (Taylor<F, K>, Vec<Taylor<F, K>>) {
+        let mut fwd_buf = Vec::new();
+        let mut adj_buf = Vec::new();
+        self.taylor_grad_with_buf(x, v, &mut fwd_buf, &mut adj_buf)
+    }
+
+    /// Like [`taylor_grad`](Self::taylor_grad) but reuses caller-provided buffers
+    /// to avoid allocation on repeated calls.
+    pub fn taylor_grad_with_buf<const K: usize>(
+        &self,
+        x: &[F],
+        v: &[F],
+        fwd_buf: &mut Vec<Taylor<F, K>>,
+        adj_buf: &mut Vec<Taylor<F, K>>,
+    ) -> (Taylor<F, K>, Vec<Taylor<F, K>>) {
+        let n = self.num_inputs as usize;
+        assert_eq!(x.len(), n, "wrong number of inputs");
+        assert_eq!(v.len(), n, "wrong number of directions");
+
+        // Build Taylor inputs: x_i(t) = x_i + v_i * t
+        let taylor_inputs: Vec<Taylor<F, K>> = x
+            .iter()
+            .zip(v.iter())
+            .map(|(&xi, &vi)| {
+                let mut coeffs = [F::zero(); K];
+                coeffs[0] = xi;
+                if K > 1 {
+                    coeffs[1] = vi;
+                }
+                Taylor::new(coeffs)
+            })
+            .collect();
+
+        self.forward_tangent(&taylor_inputs, fwd_buf);
+        let output = fwd_buf[self.output_index as usize];
+        self.reverse_tangent(fwd_buf, adj_buf);
+
+        (output, adj_buf[..n].to_vec())
+    }
+
+    // ── ODE Taylor integration ──
+
+    /// Compute the Taylor expansion of the ODE solution `y(t)` to order K.
+    ///
+    /// Given a tape representing the right-hand side `f: R^n → R^n` of the ODE
+    /// `y' = f(y)`, and an initial condition `y(0) = y0`, computes the Taylor
+    /// coefficients `y_0, y_1, ..., y_{K-1}` such that
+    /// `y(t) ≈ y_0 + y_1·t + y_2·t² + ... + y_{K-1}·t^{K-1}`.
+    ///
+    /// The tape must have `num_outputs == num_inputs` (autonomous ODE: f maps R^n → R^n).
+    ///
+    /// Returns one `Taylor<F, K>` per state variable. Use [`Taylor::eval_at`] to
+    /// evaluate at a step size `h`, or inspect coefficients for error estimation.
+    pub fn ode_taylor_step<const K: usize>(&self, y0: &[F]) -> Vec<Taylor<F, K>> {
+        let mut buf = Vec::new();
+        self.ode_taylor_step_with_buf(y0, &mut buf)
+    }
+
+    /// Like [`ode_taylor_step`](Self::ode_taylor_step) but reuses a caller-provided
+    /// buffer to avoid allocation on repeated calls.
+    pub fn ode_taylor_step_with_buf<const K: usize>(
+        &self,
+        y0: &[F],
+        buf: &mut Vec<Taylor<F, K>>,
+    ) -> Vec<Taylor<F, K>> {
+        let n = self.num_inputs as usize;
+        assert_eq!(y0.len(), n, "y0 length must match num_inputs");
+        assert_eq!(
+            self.num_outputs(),
+            n,
+            "ODE tape must have num_outputs == num_inputs (f: R^n -> R^n)"
+        );
+
+        let out_indices = self.all_output_indices();
+
+        let mut y_coeffs = vec![[F::zero(); K]; n];
+        for i in 0..n {
+            y_coeffs[i][0] = y0[i];
+        }
+
+        for k in 0..K - 1 {
+            let inputs: Vec<Taylor<F, K>> = (0..n).map(|i| Taylor::new(y_coeffs[i])).collect();
+
+            self.forward_tangent(&inputs, buf);
+
+            let divisor = F::from(k + 1).unwrap();
+            for i in 0..n {
+                y_coeffs[i][k + 1] = buf[out_indices[i] as usize].coeff(k) / divisor;
+            }
+        }
+
+        (0..n).map(|i| Taylor::new(y_coeffs[i])).collect()
+    }
+}
diff --git a/src/bytecode_tape/thread_local.rs b/src/bytecode_tape/thread_local.rs
new file mode 100644
index 0000000..32c0b65
--- /dev/null
+++ b/src/bytecode_tape/thread_local.rs
@@ -0,0 +1,87 @@
+use std::cell::Cell;
+
+use crate::dual::Dual;
+use crate::float::Float;
+
+use super::BytecodeTape;
+
+thread_local! {
+    static BTAPE_F32: Cell<*mut BytecodeTape<f32>> = const { Cell::new(std::ptr::null_mut()) };
+    static BTAPE_F64: Cell<*mut BytecodeTape<f64>> = const { Cell::new(std::ptr::null_mut()) };
+    static BTAPE_DUAL_F32: Cell<*mut BytecodeTape<Dual<f32>>> = const { Cell::new(std::ptr::null_mut()) };
+    static BTAPE_DUAL_F64: Cell<*mut BytecodeTape<Dual<f64>>> = const { Cell::new(std::ptr::null_mut()) };
+}
+
+/// Trait to select the correct thread-local for a given float type.
+///
+/// Implemented for `f32`, `f64`, `Dual<f32>`, and `Dual<f64>`, enabling
+/// `BReverse<F>` to be used with these base types.
+pub trait BtapeThreadLocal: Float {
+    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>>;
+}
+
+impl BtapeThreadLocal for f32 {
+    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
+        &BTAPE_F32
+    }
+}
+
+impl BtapeThreadLocal for f64 {
+    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
+        &BTAPE_F64
+    }
+}
+
+impl BtapeThreadLocal for Dual<f32> {
+    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
+        &BTAPE_DUAL_F32
+    }
+}
+
+impl BtapeThreadLocal for Dual<f64> {
+    fn btape_cell() -> &'static std::thread::LocalKey<Cell<*mut BytecodeTape<Self>>> {
+        &BTAPE_DUAL_F64
+    }
+}
+
+/// Access the active bytecode tape for the current thread.
+/// Panics if no tape is active.
+#[inline]
+pub fn with_active_btape<F: BtapeThreadLocal, R>(f: impl FnOnce(&mut BytecodeTape<F>) -> R) -> R {
+    F::btape_cell().with(|cell| {
+        let ptr = cell.get();
+        assert!(
+            !ptr.is_null(),
+            "No active bytecode tape. Use echidna::record() to record a function."
+        );
+        // SAFETY: BtapeGuard guarantees validity for the duration of the
+        // recording scope, single-threaded via thread-local.
+        let tape = unsafe { &mut *ptr };
+        f(tape)
+    })
+}
+
+/// RAII guard that sets a bytecode tape as the thread-local active tape.
+pub struct BtapeGuard<F: BtapeThreadLocal> {
+    prev: *mut BytecodeTape<F>,
+}
+
+impl<F: BtapeThreadLocal> BtapeGuard<F> {
+    /// Activate `tape` as the thread-local bytecode tape.
+    pub fn new(tape: &mut BytecodeTape<F>) -> Self {
+        let prev = F::btape_cell().with(|cell| {
+            let prev = cell.get();
+            cell.set(tape as *mut BytecodeTape<F>);
+            prev
+        });
+        BtapeGuard { prev }
+    }
+}
+
+impl<F: BtapeThreadLocal> Drop for BtapeGuard<F> {
+    fn drop(&mut self) {
+        F::btape_cell().with(|cell| {
+            cell.set(self.prev);
+        });
+    }
+}

From af656de76f48cb4d4656135d5eacf431f580e1bf Mon Sep 17 00:00:00 2001
From: Greg von Nessi <greg.vonnessi@entrolution.ai>
Date: Thu, 26 Feb 2026 01:45:56 +0000
Subject: [PATCH 4/5] Bump to v0.4.0 and update docs

BytecodeTape decomposition (10 submodules), reverse sweep
deduplication, CUDA macro extraction, nalgebra 0.34, and
accumulated doc/test improvements since v0.3.0.
---
 CHANGELOG.md                       |   16 +-
 CONTRIBUTING.md                    |   50 +-
 Cargo.toml                         |    6 +-
 README.md                          |   20 +-
 SECURITY.md                        |    6 +-
 docs/adr-deferred-work.md          |   27 +-
 docs/algorithms.md                 |   11 +-
 docs/roadmap.md                    |    6 +-
 echidna-optim/Cargo.toml           |    4 +-
 echidna-optim/tests/integration.rs |    4 +-
 echidna-optim/tests/solvers.rs     |  651 ++++++++++++++++
 src/gpu/cuda_backend.rs            | 1147 ++++++++++------------------
 src/gpu/mod.rs                     |    2 +-
 src/opcode.rs                      |    2 +-
 14 files changed, 1185 insertions(+), 767 deletions(-)
 create mode 100644 echidna-optim/tests/solvers.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1e0909..054d493 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.0] - 2026-02-26
+
+### Changed
+
+#### Internal Architecture
+- **BytecodeTape decomposition**: split 2,689-line monolithic `bytecode_tape.rs` into a directory module with 10 focused submodules (`forward.rs`, `reverse.rs`, `tangent.rs`, `jacobian.rs`, `sparse.rs`, `optimize.rs`, `taylor.rs`, `parallel.rs`, `serde_support.rs`, `thread_local.rs`). Zero public API changes; benchmarks confirm no performance impact.
+- Deduplicated reverse sweep in `gradient_with_buf()` and `sparse_jacobian_par()` — both now call shared `reverse_sweep_core()` instead of inlining the loop. `gradient_with_buf` gains the zero-adjoint skip optimization it was previously missing.
+- Bumped `nalgebra` dependency from 0.33 to 0.34
+
+### Fixed
+- Corrected opcode variant count in documentation (44 variants, not 38/43)
+- Fixed CONTRIBUTING.md MSRV reference (1.93, not 1.80)
+
 ## [0.3.0] - 2026-02-25
 
 ### Added
@@ -162,7 +175,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Forward-vs-reverse cross-validation on Rosenbrock, Beale, Ackley, Booth, and more
 - Criterion benchmarks for forward overhead and reverse gradient
 
-[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.3.0...HEAD
+[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.0...HEAD
+[0.4.0]: https://github.com/Entrolution/echidna/compare/v0.3.0...v0.4.0
 [0.3.0]: https://github.com/Entrolution/echidna/compare/v0.2.0...v0.3.0
 [0.2.0]: https://github.com/Entrolution/echidna/compare/v0.1.0...v0.2.0
 [0.1.0]: https://github.com/Entrolution/echidna/releases/tag/v0.1.0
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9ad291..0620da4 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,7 +21,7 @@ This project is governed by the [Contributor Covenant Code of Conduct](CODE_OF_C
 
 ### Prerequisites
 
-- Rust 1.80 or later (install via [rustup](https://rustup.rs/))
+- Rust 1.93 or later (install via [rustup](https://rustup.rs/))
 - Cargo (included with Rust)
 
 ### Building
@@ -37,9 +37,19 @@ cargo build --release
 ### Testing
 
 ```bash
-# Run all tests
+# Run core tests (no feature flags)
 cargo test
 
+# Run with all CPU features
+cargo test --features "bytecode,taylor,laurent,stde,diffop,serde,faer,nalgebra,ndarray"
+
+# Run GPU tests (requires hardware)
+cargo test --features "bytecode,gpu-wgpu"
+cargo test --features "bytecode,gpu-cuda"
+
+# Run echidna-optim tests
+cargo test -p echidna-optim
+
 # Run tests with output
 cargo test -- --nocapture
 
@@ -65,12 +75,29 @@ cargo doc --no-deps
 ### Benchmarks
 
 ```bash
-# Run all benchmarks
+# Run core benchmarks (forward + reverse)
 cargo bench
 
-# Run a specific benchmark
-cargo bench --bench forward
-cargo bench --bench reverse
+# Bytecode tape
+cargo bench --features bytecode --bench bytecode
+
+# Taylor mode
+cargo bench --features stde --bench taylor
+
+# STDE estimators
+cargo bench --features stde --bench stde
+
+# Differential operators
+cargo bench --features diffop --bench diffop
+
+# Cross-country, sparse, nonsmooth
+cargo bench --features bytecode --bench advanced
+
+# GPU
+cargo bench --features gpu-wgpu --bench gpu
+
+# Comparison vs other libraries
+cargo bench --features bytecode --bench comparison
 ```
 
 ### Security Audits
@@ -136,7 +163,7 @@ src/
 ├── api.rs                 # Public API: grad, jvp, vjp, jacobian, hessian, ...
 ├── breverse.rs            # BReverse<F> bytecode-tape reverse variable [bytecode]
 ├── bytecode_tape.rs       # BytecodeTape SoA representation [bytecode]
-├── opcode.rs              # Opcode definitions and dispatch [bytecode]
+├── opcode.rs              # Opcode definitions and dispatch (44 opcodes) [bytecode]
 ├── sparse.rs              # Sparsity detection and graph coloring [bytecode]
 ├── cross_country.rs       # Markowitz vertex elimination [bytecode]
 ├── nonsmooth.rs           # Branch tracking, Clarke Jacobian [bytecode]
@@ -146,7 +173,14 @@ src/
 ├── taylor_ops.rs          # Shared Taylor propagation rules [taylor]
 ├── laurent.rs             # Laurent<F, K> singularity analysis [laurent]
 ├── stde.rs                # Stochastic derivative estimators [stde]
-├── gpu/                   # GPU acceleration [gpu-wgpu, gpu-cuda]
+├── diffop.rs              # Arbitrary differential operators via jets [diffop]
+├── gpu/
+│   ├── mod.rs             # GpuBackend trait, GpuTapeData, GpuError
+│   ├── wgpu_backend.rs    # WgpuContext (Metal/Vulkan/DX12, f32) [gpu-wgpu]
+│   ├── cuda_backend.rs    # CudaContext (NVIDIA, f32+f64) [gpu-cuda]
+│   ├── stde_gpu.rs        # GPU-accelerated STDE functions [stde]
+│   ├── shaders/           # 5 WGSL compute shaders [gpu-wgpu]
+│   └── kernels/           # CUDA kernels (tape_eval.cu, taylor_eval.cu) [gpu-cuda]
 ├── faer_support.rs        # faer integration [faer]
 ├── nalgebra_support.rs    # nalgebra integration [nalgebra]
 ├── ndarray_support.rs     # ndarray integration [ndarray]
diff --git a/Cargo.toml b/Cargo.toml
index f6dd079..03fa10a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ members = [".", "echidna-optim"]
 
 [package]
 name = "echidna"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 rust-version = "1.93"
 description = "A high-performance automatic differentiation library for Rust"
@@ -26,7 +26,7 @@ rayon = { version = "1", optional = true }
 serde = { version = "1", features = ["derive"], optional = true }
 ndarray = { version = "0.17", optional = true }
 faer = { version = "0.24", optional = true }
-nalgebra = { version = "0.33", optional = true }
+nalgebra = { version = "0.34", optional = true }
 wgpu = { version = "28", optional = true }
 bytemuck = { version = "1", features = ["derive"], optional = true }
 pollster = { version = "0.4", optional = true }
@@ -37,7 +37,7 @@ criterion = { version = "0.8", features = ["html_reports"] }
 approx = "0.5"
 serde_json = "1"
 ciborium = "0.2"
-nalgebra = "0.33"
+nalgebra = "0.34"
 num-dual = "0.11"
 ad_trait = "0.2.2"
 tempfile = "3"
diff --git a/README.md b/README.md
index f0f28ac..4e95577 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,7 @@ Add to `Cargo.toml`:
 
 ```toml
 [dependencies]
-echidna = "0.3"
+echidna = "0.4"
 ```
 
 ### Gradient via reverse mode
@@ -157,7 +157,7 @@ assert!((d2_dxdy - 2.0).abs() < 1e-6);
 Enable features in `Cargo.toml`:
 
 ```toml
-echidna = { version = "0.3", features = ["bytecode", "taylor"] }
+echidna = { version = "0.4", features = ["bytecode", "taylor"] }
 ```
 
 ## API
@@ -200,12 +200,18 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] }
 | `tape.ode_taylor_step::<K>(y0)` | ODE Taylor integration step |
 | `stde::laplacian(tape, x, dirs)` | Stochastic Laplacian estimate |
 | `stde::hessian_diagonal(tape, x)` | Stochastic Hessian diagonal |
+| `stde::diagonal_kth_order(tape, x, k)` | Exact k-th order diagonal (dynamic, arena-based) |
 | `stde::diagonal_kth_order_const::<K>(tape, x)` | Exact k-th order diagonal (const-generic, stack-allocated) |
+| `stde::diagonal_kth_order_stochastic(tape, x, k, indices)` | Stochastic k-th order diagonal via subsampling |
 | `stde::dense_stde_2nd(tape, x, cholesky, z)` | Dense STDE for positive-definite 2nd-order operators |
+| `stde::stde_sparse(tape, x, dist, indices)` | Sparse STDE for arbitrary differential operators |
+| `stde::parabolic_diffusion(tape, x, sigma)` | Parabolic PDE σ-transform diffusion operator |
 | `stde::directional_derivatives(tape, x, dirs)` | Batched 1st and 2nd order directional derivatives |
 | `stde::laplacian_with_control(tape, x, dirs, ctrl)` | Laplacian with control variate variance reduction |
 | `stde::laplacian_hutchpp(tape, x, dirs_s, dirs_g)` | Hutch++ Laplacian estimator (lower variance) |
 | `stde::divergence(tape, x, dirs)` | Stochastic divergence (trace of Jacobian) estimator |
+| `stde::estimate(estimator, tape, x, dirs)` | Run an `Estimator` with Welford statistics |
+| `stde::estimate_weighted(estimator, tape, x, dirs, weights)` | Importance-weighted estimator (West's algorithm) |
 
 ### Differential Operators (requires `diffop`)
 
@@ -215,6 +221,10 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] }
 | `diffop::hessian(tape, x)` | Full Hessian via jet extraction |
 | `JetPlan::plan(n, indices)` | Plan once for a set of multi-indices |
 | `diffop::eval_dyn(plan, tape, x)` | Evaluate a plan at a new point (reuses precomputed slot assignments) |
+| `DiffOp::new(n, terms)` | Differential operator from `(coefficient, MultiIndex)` pairs |
+| `DiffOp::laplacian(n)` / `biharmonic(n)` / `diagonal(n, k)` | Common operator constructors |
+| `diffop.eval(tape, x)` | Evaluate operator at a point via `JetPlan` |
+| `diffop.sparse_distribution()` | Build `SparseSamplingDistribution` for stochastic estimation |
 
 ### GPU (requires `gpu-wgpu` or `gpu-cuda`)
 
@@ -229,6 +239,8 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] }
 | `stde_gpu::laplacian_gpu(ctx, bufs, x, dirs)` | GPU-accelerated Laplacian estimator |
 | `stde_gpu::hessian_diagonal_gpu(ctx, bufs, x)` | GPU-accelerated exact Hessian diagonal |
 | `stde_gpu::laplacian_with_control_gpu(ctx, bufs, x, dirs, ctrl)` | GPU Laplacian with control variate |
+| `stde_gpu::laplacian_gpu_cuda(ctx, bufs, x, dirs)` | CUDA-accelerated Laplacian estimator |
+| `stde_gpu::hessian_diagonal_gpu_cuda(ctx, bufs, x)` | CUDA-accelerated exact Hessian diagonal |
 
 CUDA additionally supports `_f64` variants of all methods.
 
@@ -267,7 +279,7 @@ cargo bench                                         # Forward + reverse
 cargo bench --features bytecode --bench bytecode    # Bytecode tape
 cargo bench --features stde --bench taylor          # Taylor mode
 cargo bench --features gpu-wgpu --bench gpu         # GPU
-cargo bench --features "bytecode,nalgebra" --bench comparison  # vs other libraries
+cargo bench --features bytecode --bench comparison             # vs other libraries
 ```
 
 ### Gradient benchmark (Rosenbrock, lower is better)
@@ -296,7 +308,7 @@ The [`echidna-optim`](echidna-optim/) crate provides optimization solvers and im
 
 ```toml
 [dependencies]
-echidna-optim = "0.3"
+echidna-optim = "0.4"
 ```
 
 Optional features: `parallel` (enables rayon parallelism via `echidna/parallel`), `sparse-implicit` (sparse implicit differentiation via faer).
diff --git a/SECURITY.md b/SECURITY.md
index 18b67f5..63c0c2f 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,8 +4,8 @@
 
 | Version | Supported |
 |---------|-----------|
-| 0.3.x   | Yes       |
-| < 0.3   | No        |
+| 0.4.x   | Yes       |
+| < 0.4   | No        |
 
 Only the latest minor release receives security updates.
 
@@ -30,7 +30,7 @@ echidna uses `unsafe` in the following scoped contexts:
 
 | Location | Purpose |
 |----------|---------|
-| `tape.rs`, `bytecode_tape.rs`, `taylor_dyn.rs` | Thread-local mutable pointer dereference for tape/arena access. Each is RAII-guarded: the pointer is valid for the lifetime of the enclosing scope guard. |
+| `tape.rs`, `bytecode_tape/thread_local.rs`, `taylor_dyn.rs` | Thread-local mutable pointer dereference for tape/arena access. Each is RAII-guarded: the pointer is valid for the lifetime of the enclosing scope guard. |
 | `checkpoint.rs` | Byte-level transmutation (`&[F]` ↔ `&[u8]`) for disk-backed checkpoint serialisation. Relies on `F: Float` being `f32`/`f64` (IEEE 754, no padding). |
 | `gpu/cuda_backend.rs` | FFI kernel launches via `cudarc`. Each call passes validated buffer sizes and grid dimensions to the CUDA driver. |
 | `traits/simba_impls.rs` | `extract_unchecked` / `replace_unchecked` for simba's `SimdValue` trait. Scalar types have only one lane, so the index is always 0. |
diff --git a/docs/adr-deferred-work.md b/docs/adr-deferred-work.md
index abc8a6c..419d1f4 100644
--- a/docs/adr-deferred-work.md
+++ b/docs/adr-deferred-work.md
@@ -2,8 +2,9 @@
 
 **Date**: 2026-02-25
 **Status**: Accepted
+**Last updated**: 2026-02-25 (added STDE deferred items)
 
-All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captures every evaluated-but-not-implemented item with explicit reasoning, so future planning doesn't re-investigate the same paths.
+All core roadmap items (R1–R13), Phases 1–5, and the STDE deferred items plan are complete. This ADR captures every evaluated-but-not-implemented item with explicit reasoning, so future planning doesn't re-investigate the same paths.
 
 ---
 
@@ -12,12 +13,26 @@ All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captur
 | Item | Blocker | Revisit When |
 |------|---------|--------------|
 | RUSTSEC-2024-0436 (paste/simba) | Upstream simba must release with paste alternative | simba publishes a new release |
-| nalgebra 0.33 → 0.34 | nalgebra 0.34 requires Rust 1.87+; MSRV is 1.80 | MSRV is raised to 1.87+ |
+
+---
+
+## Deferred (valuable, not yet needed)
+
+| Item | Reasoning | Revisit When |
+|------|-----------|--------------|
+| nalgebra 0.33 → 0.34 | MSRV blocker resolved (now 1.93, nalgebra 0.34 requires 1.87+). May have API changes in `NalgebraVec` and sparse matrix types. | When convenient; test thoroughly for API changes. |
+| Indefinite dense STDE | Eigendecomposition-based approach for indefinite coefficient matrices C (6 parameters, sign-splitting into C⁺ - C⁻) adds significant API complexity. The positive-definite Cholesky case (`dense_stde_2nd`) covers most PDE use cases (Fokker-Planck, Black-Scholes, HJB). Users with indefinite C can manually split into C⁺ - C⁻, compute Cholesky factors for each, and call `dense_stde_2nd` twice. | A concrete user need for indefinite C arises |
+| General-K GPU Taylor kernels | GPU Taylor kernel is hardcoded to K=3 (second-order). Hardcoding allows complete loop unrolling — critical for GPU performance. General K would need dynamic loops or a family of K-specialized kernels. | Need for GPU-accelerated 3rd+ order derivatives |
+| Chunked GPU Taylor dispatch | Working buffer is `3 * num_variables * batch_size * 4` bytes. WebGPU's 128 MB limit caps `num_variables * batch_size ≤ ~10M`. For larger problems, the dispatch function should chunk the batch and accumulate results. | Users hit the buffer limit in practice |
+| CUDA `laplacian_with_control_gpu` | `laplacian_with_control_gpu` is currently wgpu-only. CUDA equivalent (`laplacian_with_control_gpu_cuda`) is straightforward to add — same CPU-side Welford aggregation, just dispatches through `CudaContext`. | CUDA users need variance-reduced Laplacian |
+| `taylor_forward_2nd_batch` in `GpuBackend` trait | Currently an inherent method on each backend, not part of the `GpuBackend` trait. Adding to the trait would enable generic code over backends but requires an associated type for Taylor results. | Multiple backends need to be used generically for Taylor |
 
 ---
 
 ## Rejected (evaluated, explicit reasoning)
 
+### Core AD
+
 | Item | Reasoning | What exists instead |
 |------|-----------|---------------------|
 | Constant deduplication (5.2) | `FloatBits` orphan rule blocks impl for `Dual<F>`. CSE handles ops over duplicate constants. | CSE pass in bytecode tape |
@@ -28,3 +43,11 @@ All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captur
 | Preaccumulation of straight-line segments | Superseded by cross-country Markowitz elimination (R5), which achieves the same goal more generally. | `jacobian_cross_country()` |
 | Trait impl macros for num_traits | ~3,200 lines are mechanical but readable. Macros hurt error messages, IDE support, and debuggability. Low maintenance cost since impls rarely change. | Manual trait impls |
 | DiffOperator trait abstraction | Concrete estimators work well as standalone functions. `Estimator` trait (5.1) already provides the needed abstraction. Another trait layer would over-abstract. | `Estimator` trait + concrete functions |
+
+### STDE Paper Items
+
+| Item | Paper Section | Reasoning | What exists instead |
+|------|---------------|-----------|---------------------|
+| Multi-pushforward correction | App F, Eq 48-52 | Current collision-free prime-window approach in `diagonal_kth_order` is simpler and equally effective for practical operators. The correction addresses collisions in multi-index sampling that don't occur with the prime-window design. | `diagonal_kth_order`, `diagonal_kth_order_const` |
+| Amortized PINN training | §5.1, Eq 25 | Application-level integration — amortizing STDE samples across training steps is a training loop concern, not a core AD primitive. Users can implement this in their training code using existing `stde::*` functions. | `stde::laplacian`, `stde::stde_sparse` |
+| Weight sharing across network layers | App G | Network architecture concern — sharing Taylor jet computation across layers with shared weights requires network-level orchestration, not AD-level support. | — |
diff --git a/docs/algorithms.md b/docs/algorithms.md
index 81b6522..7606e71 100644
--- a/docs/algorithms.md
+++ b/docs/algorithms.md
@@ -72,7 +72,7 @@ inputs. The tape uses a Structure-of-Arrays (SoA) layout: separate contiguous
 arrays for opcodes, operand indices, and constant values. This layout improves
 cache utilization — the opcode stream is compact enough to fit in L1 cache for
 moderate-sized tapes, and the separation allows independent vectorization of
-forward and reverse sweeps. The tape currently defines 38 opcodes covering all
+forward and reverse sweeps. The tape currently defines 44 opcodes covering all
 standard elemental operations (arithmetic, transcendental, power, comparison).
 
 A key advantage of the graph representation is that it enables optimization
@@ -505,11 +505,12 @@ numerically intensive phase. Custom operations are rejected at tape upload
 time since they cannot be compiled to GPU code.
 
 The wgpu backend (feature `gpu-wgpu`) provides cross-platform GPU access via
-Metal, Vulkan, and DX12. It implements four WGSL compute shaders: `forward`
+Metal, Vulkan, and DX12. It implements five WGSL compute shaders: `forward`
 (batched forward evaluation), `reverse` (batched adjoint sweep),
-`tangent_forward` (forward tangent for JVP and sparse Jacobian), and
-`tangent_reverse` (forward-over-reverse for HVP and sparse Hessian). All 43
-opcodes are implemented in each shader. The backend is limited to f32 due to
+`tangent_forward` (forward tangent for JVP and sparse Jacobian),
+`tangent_reverse` (forward-over-reverse for HVP and sparse Hessian), and
+`taylor_forward_2nd` (batched second-order Taylor forward propagation for STDE).
+All 44 opcodes are implemented in each shader. The backend is limited to f32 due to
 WGSL's lack of native f64 support. The CUDA backend (feature `gpu-cuda`)
 targets NVIDIA GPUs and supports both f32 and f64. It uses a single templated
 CUDA kernel file compiled via NVRTC at runtime, instantiated for both `float`
diff --git a/docs/roadmap.md b/docs/roadmap.md
index af6e99d..80660c0 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -5,7 +5,7 @@
 This roadmap synthesizes:
 - Deferred items from all implementation phases to date
 - Remaining phases from the [book breakdown](book-breakdown.md)
-- Taylor mode AD and stochastic estimators from the STDE paper (Schatz et al., NeurIPS 2024)
+- Taylor mode AD and stochastic estimators from the STDE paper (Shi et al., NeurIPS 2024)
 - Organic improvements identified during implementation
 
 ---
@@ -70,7 +70,7 @@ Given `y' = f(y)`, computes the Taylor expansion of `y(t)` to order K via coeffi
 Key file: `src/bytecode_tape.rs`.
 
 ### R2. Stochastic Taylor Derivative Estimators (STDE)
-**STDE paper (Schatz et al., NeurIPS 2024)**
+**STDE paper (Shi et al., NeurIPS 2024)**
 
 Builds on R1. Uses random jets to estimate differential operators without computing full derivative tensors.
 
@@ -219,7 +219,7 @@ Two GPU backends for batched tape evaluation:
 
 **CUDA backend** (`gpu-cuda` feature) — NVIDIA only, f32 + f64. Single templated CUDA kernel file `tape_eval.cu` with four kernels (`forward_eval`, `reverse_sweep`, `tangent_forward`, `tangent_reverse`), compiled via NVRTC at runtime for both float and double. `CudaContext` mirrors the wgpu API surface plus f64 variants: `forward_batch_f64`, `gradient_batch_f64`, `sparse_jacobian_f64`, `sparse_hessian_f64`, `hvp_batch_f64`. 6 CUDA tests (f32 + f64).
 
-CPU-side sparsity detection and graph coloring drive the GPU tangent sweeps. Custom ops are rejected at upload time (`GpuTapeData::from_tape`). All 38 opcodes implemented in all four shader/kernel variants.
+CPU-side sparsity detection and graph coloring drive the GPU tangent sweeps. Custom ops are rejected at upload time (`GpuTapeData::from_tape`). All 44 opcodes implemented in all five shader/kernel variants.
 
 Key files: `src/gpu/mod.rs`, `src/gpu/wgpu_backend.rs`, `src/gpu/cuda_backend.rs`, `src/gpu/shaders/forward.wgsl`, `src/gpu/shaders/reverse.wgsl`, `src/gpu/shaders/tangent_forward.wgsl`, `src/gpu/shaders/tangent_reverse.wgsl`, `src/gpu/kernels/tape_eval.cu`, `tests/gpu_wgpu_tests.rs`, `tests/gpu_cuda_tests.rs`.
 
diff --git a/echidna-optim/Cargo.toml b/echidna-optim/Cargo.toml
index 0f66d91..267f5ba 100644
--- a/echidna-optim/Cargo.toml
+++ b/echidna-optim/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "echidna-optim"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 rust-version = "1.93"
 description = "Optimization solvers and implicit differentiation for echidna"
@@ -12,7 +12,7 @@ keywords = ["optimization", "differentiation", "implicit", "gradient", "scientif
 categories = ["mathematics", "science", "algorithms"]
 
 [dependencies]
-echidna = { version = "0.3.0", path = "..", features = ["bytecode"] }
+echidna = { version = "0.4.0", path = "..", features = ["bytecode"] }
 num-traits = "0.2"
 faer = { version = "0.24", optional = true }
 
diff --git a/echidna-optim/tests/integration.rs b/echidna-optim/tests/integration.rs
index d52bf13..b586bf0 100644
--- a/echidna-optim/tests/integration.rs
+++ b/echidna-optim/tests/integration.rs
@@ -17,7 +17,7 @@ fn rosenbrock<T: Scalar>(x: &[T]) -> T {
 }
 
 fn make_tape() -> echidna::BytecodeTape<f64> {
-    let (tape, _) = echidna::record(|x| rosenbrock(x), &[0.0_f64, 0.0]);
+    let (tape, _) = echidna::record(rosenbrock, &[0.0_f64, 0.0]);
     tape
 }
 
@@ -97,7 +97,7 @@ fn newton_fewer_iters_than_lbfgs() {
 
 #[test]
 fn optimized_tape_still_works() {
-    let (mut tape, _) = echidna::record(|x| rosenbrock(x), &[0.0_f64, 0.0]);
+    let (mut tape, _) = echidna::record(rosenbrock, &[0.0_f64, 0.0]);
     tape.optimize();
 
     let mut obj = TapeObjective::new(tape);
diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs
new file mode 100644
index 0000000..b6e9d85
--- /dev/null
+++ b/echidna-optim/tests/solvers.rs
@@ -0,0 +1,651 @@
+use echidna::Scalar;
+use echidna_optim::{
+    lbfgs, newton, trust_region, ConvergenceParams, LbfgsConfig, NewtonConfig, Objective,
+    OptimResult, TapeObjective, TerminationReason, TrustRegionConfig,
+};
+
+// ============================================================
+// Test objectives
+// ============================================================
+
+/// f(x) = 0.5 * sum(x_i^2). Minimum at origin, value 0.
+struct Quadratic {
+    dim: usize,
+}
+
+impl Objective<f64> for Quadratic {
+    fn dim(&self) -> usize {
+        self.dim
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let f: f64 = 0.5 * x.iter().map(|&xi| xi * xi).sum::<f64>();
+        let g: Vec<f64> = x.to_vec();
+        (f, g)
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let (f, g) = self.eval_grad(x);
+        let n = x.len();
+        let mut h = vec![vec![0.0; n]; n];
+        for (i, row) in h.iter_mut().enumerate() {
+            row[i] = 1.0;
+        }
+        (f, g, h)
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let g = x.to_vec();
+        let hv = v.to_vec(); // H = I, so Hv = v
+        (g, hv)
+    }
+}
+
+/// f(x) = 0.5 * (a*x0^2 + b*x1^2). Ill-conditioned when a/b >> 1.
+struct IllConditionedQuadratic {
+    a: f64,
+    b: f64,
+}
+
+impl Objective<f64> for IllConditionedQuadratic {
+    fn dim(&self) -> usize {
+        2
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let f = 0.5 * (self.a * x[0] * x[0] + self.b * x[1] * x[1]);
+        let g = vec![self.a * x[0], self.b * x[1]];
+        (f, g)
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let (f, g) = self.eval_grad(x);
+        let h = vec![vec![self.a, 0.0], vec![0.0, self.b]];
+        (f, g, h)
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let g = vec![self.a * x[0], self.b * x[1]];
+        let hv = vec![self.a * v[0], self.b * v[1]];
+        (g, hv)
+    }
+}
+
+/// Rosenbrock: f(x) = (1 - x0)^2 + 100*(x1 - x0^2)^2. Minimum at (1,1), value 0.
+struct Rosenbrock2D;
+
+impl Objective<f64> for Rosenbrock2D {
+    fn dim(&self) -> usize {
+        2
+    }
+
+    fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec<f64>) {
+        let a = 1.0 - x[0];
+        let b = x[1] - x[0] * x[0];
+        let f = a * a + 100.0 * b * b;
+        let g0 = -2.0 * a - 400.0 * x[0] * b;
+        let g1 = 200.0 * b;
+        (f, vec![g0, g1])
+    }
+
+    fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec<f64>, Vec<Vec<f64>>) {
+        let a = 1.0 - x[0];
+        let b = x[1] - x[0] * x[0];
+        let f = a * a + 100.0 * b * b;
+        let g0 = -2.0 * a - 400.0 * x[0] * b;
+        let g1 = 200.0 * b;
+
+        let h00 = 2.0 - 400.0 * (x[1] - 3.0 * x[0] * x[0]);
+        let h01 = -400.0 * x[0];
+        let h11 = 200.0;
+
+        (f, vec![g0, g1], vec![vec![h00, h01], vec![h01, h11]])
+    }
+
+    fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec<f64>, Vec<f64>) {
+        let h00 = 2.0 - 400.0 * (x[1] - 3.0 * x[0] * x[0]);
+        let h01 = -400.0 * x[0];
+        let h11 = 200.0;
+
+        let g0 = -2.0 * (1.0 - x[0]) - 400.0 * x[0] * (x[1] - x[0] * x[0]);
+        let g1 = 200.0 * (x[1] - x[0] * x[0]);
+
+        let hv0 = h00 * v[0] + h01 * v[1];
+        let hv1 = h01 * v[0] + h11 * v[1];
+
+        (vec![g0, g1], vec![hv0, hv1])
+    }
+}
+
+fn assert_near_origin(result: &OptimResult<f64>, tol: f64) {
+    for (i, &xi) in result.x.iter().enumerate() {
+        assert!(
+            xi.abs() < tol,
+            "x[{}] = {}, expected ~0",
+            i,
+            xi
+        );
+    }
+    assert!(result.value < tol, "f = {}, expected ~0", result.value);
+}
+
+// ============================================================
+// Newton: simple quadratic
+// ============================================================
+
+#[test]
+fn newton_quadratic_2d() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = NewtonConfig::default();
+    let result = newton(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    // Newton on a quadratic converges in 1 iteration
+    assert_eq!(result.iterations, 1, "Newton should solve quadratic in 1 step");
+    assert_near_origin(&result, 1e-10);
+}
+
+#[test]
+fn newton_quadratic_4d() {
+    let mut obj = Quadratic { dim: 4 };
+    let config = NewtonConfig::default();
+    let result = newton(&mut obj, &[10.0, -7.0, 3.0, -1.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 1);
+    assert_near_origin(&result, 1e-10);
+}
+
+#[test]
+fn newton_ill_conditioned() {
+    let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 };
+    let config = NewtonConfig::default();
+    let result = newton(&mut obj, &[5.0, -3.0], &config);
+
+    // Newton with exact Hessian is unaffected by conditioning
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 1);
+    assert_near_origin(&result, 1e-10);
+}
+
+#[test]
+fn newton_at_optimum() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = NewtonConfig::default();
+    let result = newton(&mut obj, &[0.0, 0.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 0);
+    assert_eq!(result.func_evals, 1); // single eval to check gradient
+}
+
+#[test]
+fn newton_max_iter_zero() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = NewtonConfig {
+        convergence: ConvergenceParams {
+            max_iter: 0,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = newton(&mut obj, &[1.0, 1.0], &config);
+    assert_eq!(result.termination, TerminationReason::NumericalError);
+    assert_eq!(result.iterations, 0);
+}
+
+// ============================================================
+// L-BFGS: simple quadratic
+// ============================================================
+
+#[test]
+fn lbfgs_quadratic_2d() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = LbfgsConfig::default();
+    let result = lbfgs(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn lbfgs_quadratic_8d() {
+    let mut obj = Quadratic { dim: 8 };
+    let config = LbfgsConfig::default();
+    let result = lbfgs(&mut obj, &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn lbfgs_ill_conditioned() {
+    let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 };
+    let config = LbfgsConfig::default();
+    let result = lbfgs(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-4);
+}
+
+#[test]
+fn lbfgs_memory_1() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = LbfgsConfig {
+        memory: 1,
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn lbfgs_at_optimum() {
+    let mut obj = Quadratic { dim: 3 };
+    let config = LbfgsConfig::default();
+    let result = lbfgs(&mut obj, &[0.0, 0.0, 0.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 0);
+}
+
+#[test]
+fn lbfgs_max_iter_zero() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = LbfgsConfig {
+        convergence: ConvergenceParams {
+            max_iter: 0,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[1.0, 1.0], &config);
+    assert_eq!(result.termination, TerminationReason::NumericalError);
+}
+
+#[test]
+fn lbfgs_memory_zero() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = LbfgsConfig {
+        memory: 0,
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[1.0, 1.0], &config);
+    assert_eq!(result.termination, TerminationReason::NumericalError);
+}
+
+// ============================================================
+// Trust-region: simple quadratic
+// ============================================================
+
+#[test]
+fn trust_region_quadratic_2d() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = TrustRegionConfig::default();
+    let result = trust_region(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn trust_region_quadratic_8d() {
+    let mut obj = Quadratic { dim: 8 };
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(
+        &mut obj,
+        &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0],
+        &config,
+    );
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn trust_region_ill_conditioned() {
+    let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 };
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[5.0, -3.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-4);
+}
+
+#[test]
+fn trust_region_at_optimum() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = TrustRegionConfig::default();
+    let result = trust_region(&mut obj, &[0.0, 0.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 0);
+}
+
+#[test]
+fn trust_region_small_radius() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = TrustRegionConfig {
+        initial_radius: 0.01,
+        max_radius: 100.0,
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[5.0, -3.0], &config);
+
+    // Should still converge, just take more iterations
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn trust_region_max_iter_zero() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 0,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[1.0, 1.0], &config);
+    assert_eq!(result.termination, TerminationReason::NumericalError);
+}
+
+#[test]
+fn trust_region_zero_radius() {
+    let mut obj = Quadratic { dim: 2 };
+    let config = TrustRegionConfig {
+        initial_radius: 0.0,
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &[1.0, 1.0], &config);
+    assert_eq!(result.termination, TerminationReason::NumericalError);
+}
+
+// ============================================================
+// Convergence criteria
+// ============================================================
+
+#[test]
+fn func_tol_terminates_lbfgs() {
+    // Use Rosenbrock (takes many iterations) so func_tol can fire
+    let mut obj = Rosenbrock2D;
+    let config = LbfgsConfig {
+        convergence: ConvergenceParams {
+            max_iter: 1000,
+            grad_tol: 0.0,  // disable gradient termination
+            step_tol: 0.0,  // disable step termination
+            func_tol: 1e-2, // generous func_tol — fires when progress stalls
+        },
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[0.0, 0.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::FunctionChange);
+    // Should have made progress toward the minimum
+    assert!(result.value < 1.0, "f = {}", result.value);
+}
+
+#[test]
+fn max_iter_terminates_lbfgs() {
+    // Use Rosenbrock so the solver doesn't converge before max_iter
+    let mut obj = Rosenbrock2D;
+    let config = LbfgsConfig {
+        convergence: ConvergenceParams {
+            max_iter: 3,
+            grad_tol: 0.0,
+            step_tol: 0.0,
+            func_tol: 0.0,
+        },
+        ..Default::default()
+    };
+    let result = lbfgs(&mut obj, &[0.0, 0.0], &config);
+
+    assert_eq!(result.termination, TerminationReason::MaxIterations);
+    assert_eq!(result.iterations, 3);
+}
+
+#[test]
+fn func_tol_terminates_newton() {
+    // Use Rosenbrock so Newton takes multiple iterations
+    let mut obj = Rosenbrock2D;
+    let config = NewtonConfig {
+        convergence: ConvergenceParams {
+            max_iter: 1000,
+            grad_tol: 0.0,
+            step_tol: 0.0,
+            func_tol: 1e-2,
+        },
+        ..Default::default()
+    };
+    let result = newton(&mut obj, &[0.0, 0.0], &config);
+
+    assert_eq!(
+        result.termination,
+        TerminationReason::FunctionChange,
+        "terminated with {:?}",
+        result.termination
+    );
+}
+
+// ============================================================
+// Cross-solver agreement on quadratic
+// ============================================================
+
+#[test]
+fn all_solvers_agree_quadratic() {
+    let x0 = &[3.0, -4.0, 1.0];
+
+    let mut obj1 = Quadratic { dim: 3 };
+    let r1 = lbfgs(&mut obj1, x0, &LbfgsConfig::default());
+
+    let mut obj2 = Quadratic { dim: 3 };
+    let r2 = newton(&mut obj2, x0, &NewtonConfig::default());
+
+    let mut obj3 = Quadratic { dim: 3 };
+    let r3 = trust_region(
+        &mut obj3,
+        x0,
+        &TrustRegionConfig {
+            convergence: ConvergenceParams {
+                max_iter: 200,
+                ..Default::default()
+            },
+            ..Default::default()
+        },
+    );
+
+    for (name, r) in [("L-BFGS", &r1), ("Newton", &r2), ("Trust-region", &r3)] {
+        assert_eq!(
+            r.termination,
+            TerminationReason::GradientNorm,
+            "{} did not converge: {:?}",
+            name,
+            r.termination
+        );
+        assert_near_origin(r, 1e-6);
+    }
+}
+
+// ============================================================
+// TapeObjective tests (higher-dimensional, tape-based)
+// ============================================================
+
+fn quadratic_tape<T: Scalar>(x: &[T]) -> T {
+    let half = T::from_f(<T::Float as num_traits::FromPrimitive>::from_f64(0.5).unwrap());
+    let mut sum = T::zero();
+    for &xi in x {
+        sum = sum + xi * xi;
+    }
+    half * sum
+}
+
+fn rosenbrock<T: Scalar>(x: &[T]) -> T {
+    let one = T::from_f(<T::Float as num_traits::FromPrimitive>::from_f64(1.0).unwrap());
+    let hundred = T::from_f(<T::Float as num_traits::FromPrimitive>::from_f64(100.0).unwrap());
+    let mut sum = T::zero();
+    for i in 0..x.len() - 1 {
+        let t1 = one - x[i];
+        let t2 = x[i + 1] - x[i] * x[i];
+        sum = sum + t1 * t1 + hundred * t2 * t2;
+    }
+    sum
+}
+
+#[test]
+fn tape_lbfgs_quadratic_6d() {
+    let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0];
+    let (tape, _) = echidna::record(quadratic_tape, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default());
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn tape_newton_quadratic_6d() {
+    let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0];
+    let (tape, _) = echidna::record(quadratic_tape, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let result = newton(&mut obj, &x0, &NewtonConfig::default());
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_eq!(result.iterations, 1);
+    assert_near_origin(&result, 1e-10);
+}
+
+#[test]
+fn tape_trust_region_quadratic_6d() {
+    let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0];
+    let (tape, _) = echidna::record(quadratic_tape, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 200,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &x0, &config);
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+#[test]
+fn tape_rosenbrock_4d_newton() {
+    let x0 = [0.0_f64, 0.0, 0.0, 0.0];
+    let (tape, _) = echidna::record(rosenbrock, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let result = newton(&mut obj, &x0, &NewtonConfig::default());
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    for i in 0..4 {
+        assert!(
+            (result.x[i] - 1.0).abs() < 1e-5,
+            "x[{}] = {}, expected 1.0",
+            i,
+            result.x[i]
+        );
+    }
+}
+
+#[test]
+fn tape_rosenbrock_4d_lbfgs() {
+    let x0 = [0.0_f64, 0.0, 0.0, 0.0];
+    let (tape, _) = echidna::record(rosenbrock, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default());
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    for i in 0..4 {
+        assert!(
+            (result.x[i] - 1.0).abs() < 1e-4,
+            "x[{}] = {}, expected 1.0",
+            i,
+            result.x[i]
+        );
+    }
+}
+
+#[test]
+fn tape_rosenbrock_4d_trust_region() {
+    let x0 = [0.0_f64, 0.0, 0.0, 0.0];
+    let (tape, _) = echidna::record(rosenbrock, &x0);
+    let mut obj = TapeObjective::new(tape);
+    let config = TrustRegionConfig {
+        convergence: ConvergenceParams {
+            max_iter: 500,
+            ..Default::default()
+        },
+        ..Default::default()
+    };
+    let result = trust_region(&mut obj, &x0, &config);
+
+    assert_eq!(
+        result.termination,
+        TerminationReason::GradientNorm,
+        "terminated with {:?} after {} iters",
+        result.termination,
+        result.iterations
+    );
+    for i in 0..4 {
+        assert!(
+            (result.x[i] - 1.0).abs() < 1e-4,
+            "x[{}] = {}, expected 1.0",
+            i,
+            result.x[i]
+        );
+    }
+}
+
+// ============================================================
+// TapeObjective with optimized tape
+// ============================================================
+
+#[test]
+fn optimized_tape_quadratic() {
+    let x0 = [2.0_f64, -3.0, 4.0];
+    let (mut tape, _) = echidna::record(quadratic_tape, &x0);
+    tape.optimize();
+
+    let mut obj = TapeObjective::new(tape);
+    let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default());
+
+    assert_eq!(result.termination, TerminationReason::GradientNorm);
+    assert_near_origin(&result, 1e-6);
+}
+
+// ============================================================
+// func_evals tracking
+// ============================================================
+
+#[test]
+fn tape_objective_counts_evals() {
+    let x0 = [1.0_f64, 1.0];
+    let (tape, _) = echidna::record(quadratic_tape, &x0);
+    let mut obj = TapeObjective::new(tape);
+    assert_eq!(obj.func_evals(), 0);
+
+    let _ = lbfgs(&mut obj, &x0, &LbfgsConfig::default());
+    assert!(obj.func_evals() > 0, "should have counted evaluations");
+}
diff --git a/src/gpu/cuda_backend.rs b/src/gpu/cuda_backend.rs
index 32238ba..8807398 100644
--- a/src/gpu/cuda_backend.rs
+++ b/src/gpu/cuda_backend.rs
@@ -29,6 +29,394 @@ const KERNEL_SRC: &str = include_str!("kernels/tape_eval.cu");
 const TAYLOR_KERNEL_SRC: &str = include_str!("kernels/taylor_eval.cu");
 const BLOCK_SIZE: u32 = 256;
 
+// ── Macros to deduplicate f32/f64 CUDA dispatch methods ──
+//
+// Each macro expands to a block expression returning the method's result type.
+// Parameters are: self, tape buffer, kernel field, constants field, and float type.
+
+macro_rules! cuda_forward_batch_body {
+    ($self:expr, $tape:expr, $inputs:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{
+        let s = &$self.stream;
+        let ni = $tape.num_inputs;
+        let nv = $tape.num_variables;
+        let no = $tape.num_outputs;
+
+        assert_eq!($inputs.len(), ($batch_size * ni) as usize);
+
+        let d_inputs = s.clone_htod($inputs).map_err(cuda_err)?;
+        let mut d_values = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_outputs = s
+            .alloc_zeros::<$F>(($batch_size * no) as usize)
+            .map_err(cuda_err)?;
+
+        let cfg = LaunchConfig {
+            grid_dim: CudaContext::grid_dim($batch_size),
+            block_dim: CudaContext::block_dim(),
+            shared_mem_bytes: 0,
+        };
+
+        let mut builder = s.launch_builder(&$self.$kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&$tape.$constants);
+        builder.arg(&d_inputs);
+        builder.arg(&mut d_values);
+        builder.arg(&mut d_outputs);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&ni);
+        builder.arg(&nv);
+        builder.arg(&no);
+        builder.arg(&$batch_size);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        s.synchronize().map_err(cuda_err)?;
+        let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
+        Ok(results)
+    }};
+}
+
+macro_rules! cuda_gradient_batch_body {
+    ($self:expr, $tape:expr, $inputs:expr, $batch_size:expr, $F:ty, $constants:ident, $fwd_kernel:ident, $rev_kernel:ident) => {{
+        let s = &$self.stream;
+        let ni = $tape.num_inputs;
+        let nv = $tape.num_variables;
+        let no = $tape.num_outputs;
+
+        assert_eq!($inputs.len(), ($batch_size * ni) as usize);
+
+        let d_inputs = s.clone_htod($inputs).map_err(cuda_err)?;
+        let mut d_values = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_outputs = s
+            .alloc_zeros::<$F>(($batch_size * no) as usize)
+            .map_err(cuda_err)?;
+
+        let cfg = LaunchConfig {
+            grid_dim: CudaContext::grid_dim($batch_size),
+            block_dim: CudaContext::block_dim(),
+            shared_mem_bytes: 0,
+        };
+
+        // Forward pass
+        let mut builder = s.launch_builder(&$self.$fwd_kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&$tape.$constants);
+        builder.arg(&d_inputs);
+        builder.arg(&mut d_values);
+        builder.arg(&mut d_outputs);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&ni);
+        builder.arg(&nv);
+        builder.arg(&no);
+        builder.arg(&$batch_size);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        // Reverse pass
+        let mut d_adjoints = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_grads = s
+            .alloc_zeros::<$F>(($batch_size * ni) as usize)
+            .map_err(cuda_err)?;
+
+        let mut builder = s.launch_builder(&$self.$rev_kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&d_values);
+        builder.arg(&mut d_adjoints);
+        builder.arg(&mut d_grads);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&ni);
+        builder.arg(&nv);
+        builder.arg(&$batch_size);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        s.synchronize().map_err(cuda_err)?;
+        let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
+        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
+        Ok((output_vals, grads))
+    }};
+}
+
+macro_rules! cuda_hvp_batch_body {
+    ($self:expr, $tape:expr, $x:expr, $tangent_dirs:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{
+        let s = &$self.stream;
+        let ni = $tape.num_inputs;
+        let nv = $tape.num_variables;
+
+        assert_eq!($x.len(), ni as usize);
+        assert_eq!($tangent_dirs.len(), ($batch_size * ni) as usize);
+
+        let mut primal_inputs = Vec::with_capacity(($batch_size * ni) as usize);
+        for _ in 0..$batch_size {
+            primal_inputs.extend_from_slice($x);
+        }
+
+        let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?;
+        let d_seeds = s.clone_htod($tangent_dirs).map_err(cuda_err)?;
+        let mut d_primals = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_tans = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_adj_re = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_adj_eps = s
+            .alloc_zeros::<$F>(($batch_size * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_grads = s
+            .alloc_zeros::<$F>(($batch_size * ni) as usize)
+            .map_err(cuda_err)?;
+        let mut d_hvps = s
+            .alloc_zeros::<$F>(($batch_size * ni) as usize)
+            .map_err(cuda_err)?;
+
+        let cfg = LaunchConfig {
+            grid_dim: CudaContext::grid_dim($batch_size),
+            block_dim: CudaContext::block_dim(),
+            shared_mem_bytes: 0,
+        };
+
+        let mut builder = s.launch_builder(&$self.$kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&$tape.$constants);
+        builder.arg(&d_primal_in);
+        builder.arg(&d_seeds);
+        builder.arg(&mut d_primals);
+        builder.arg(&mut d_tans);
+        builder.arg(&mut d_adj_re);
+        builder.arg(&mut d_adj_eps);
+        builder.arg(&mut d_grads);
+        builder.arg(&mut d_hvps);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&ni);
+        builder.arg(&nv);
+        builder.arg(&$batch_size);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        s.synchronize().map_err(cuda_err)?;
+        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
+        let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?;
+        Ok((grads, hvps))
+    }};
+}
+
+macro_rules! cuda_sparse_jacobian_body {
+    ($self:expr, $tape:expr, $tape_cpu:expr, $x:expr, $F:ty, $constants:ident, $tangent_fwd_kernel:ident) => {{
+        let ni = $tape.num_inputs as usize;
+        let no = $tape.num_outputs as usize;
+
+        let pattern = $tape_cpu.detect_jacobian_sparsity();
+        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
+
+        if num_colors == 0 {
+            $tape_cpu.forward($x);
+            let vals = $tape_cpu.output_values();
+            return Ok((vals, pattern, vec![]));
+        }
+
+        let batch = num_colors as u32;
+        let mut seeds = Vec::with_capacity(batch as usize * ni);
+        for c in 0..num_colors {
+            for i in 0..ni {
+                seeds.push(if colors[i] == c as u32 {
+                    (1.0 as $F)
+                } else {
+                    (0.0 as $F)
+                });
+            }
+        }
+
+        let s = &$self.stream;
+        let nv = $tape.num_variables;
+
+        let d_primals_in = {
+            let mut replicated = Vec::with_capacity(batch as usize * ni);
+            for _ in 0..batch {
+                replicated.extend_from_slice($x);
+            }
+            s.clone_htod(&replicated).map_err(cuda_err)?
+        };
+        let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?;
+        let mut d_primals = s
+            .alloc_zeros::<$F>((batch * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_tangents = s
+            .alloc_zeros::<$F>((batch * nv) as usize)
+            .map_err(cuda_err)?;
+        let mut d_tangent_out = s
+            .alloc_zeros::<$F>((batch * $tape.num_outputs) as usize)
+            .map_err(cuda_err)?;
+
+        let cfg = LaunchConfig {
+            grid_dim: CudaContext::grid_dim(batch),
+            block_dim: CudaContext::block_dim(),
+            shared_mem_bytes: 0,
+        };
+
+        let mut builder = s.launch_builder(&$self.$tangent_fwd_kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&$tape.$constants);
+        builder.arg(&d_primals_in);
+        builder.arg(&d_seeds);
+        builder.arg(&mut d_primals);
+        builder.arg(&mut d_tangents);
+        builder.arg(&mut d_tangent_out);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&$tape.num_inputs);
+        builder.arg(&nv);
+        builder.arg(&$tape.num_outputs);
+        builder.arg(&batch);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        s.synchronize().map_err(cuda_err)?;
+        let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?;
+
+        $tape_cpu.forward($x);
+        let output_values = $tape_cpu.output_values();
+
+        let nnz = pattern.nnz();
+        let mut jac_values = vec![(0.0 as $F); nnz];
+        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+            let c = colors[col as usize] as usize;
+            jac_values[k] = tangent_outs[c * no + row as usize];
+        }
+
+        Ok((output_values, pattern, jac_values))
+    }};
+}
+
+macro_rules! cuda_sparse_hessian_body {
+    ($self:expr, $tape:expr, $tape_cpu:expr, $x:expr, $F:ty, $hvp_method:ident) => {{
+        let ni = $tape.num_inputs as usize;
+
+        let pattern = $tape_cpu.detect_sparsity();
+        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
+
+        if num_colors == 0 {
+            $tape_cpu.forward($x);
+            let val = $tape_cpu.output_value();
+            let grad = $tape_cpu.gradient($x);
+            return Ok((val, grad, pattern, vec![]));
+        }
+
+        let batch = num_colors as u32;
+        let mut tangent_dirs = Vec::with_capacity(batch as usize * ni);
+        for c in 0..num_colors {
+            for i in 0..ni {
+                tangent_dirs.push(if colors[i] == c as u32 {
+                    (1.0 as $F)
+                } else {
+                    (0.0 as $F)
+                });
+            }
+        }
+
+        let (grads, hvps) = $self.$hvp_method($tape, $x, &tangent_dirs, batch)?;
+
+        let gradient: Vec<$F> = grads[..ni].to_vec();
+
+        let nnz = pattern.nnz();
+        let mut hess_values = vec![(0.0 as $F); nnz];
+        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
+            let c = colors[col as usize] as usize;
+            hess_values[k] = hvps[c * ni + row as usize];
+        }
+
+        $tape_cpu.forward($x);
+        let value = $tape_cpu.output_value();
+
+        Ok((value, gradient, pattern, hess_values))
+    }};
+}
+
+macro_rules! cuda_taylor_fwd_2nd_body {
+    ($self:expr, $tape:expr, $primal_inputs:expr, $direction_seeds:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{
+        let s = &$self.stream;
+        let ni = $tape.num_inputs;
+        let nv = $tape.num_variables;
+        let no = $tape.num_outputs;
+        let total_in = ($batch_size * ni) as usize;
+
+        assert_eq!(
+            $primal_inputs.len(),
+            total_in,
+            "primal_inputs length mismatch"
+        );
+        assert_eq!(
+            $direction_seeds.len(),
+            total_in,
+            "direction_seeds length mismatch"
+        );
+
+        let d_primals = s.clone_htod($primal_inputs).map_err(cuda_err)?;
+        let d_seeds = s.clone_htod($direction_seeds).map_err(cuda_err)?;
+        let mut d_jets = s
+            .alloc_zeros::<$F>(($batch_size * nv * 3) as usize)
+            .map_err(cuda_err)?;
+        let mut d_jet_out = s
+            .alloc_zeros::<$F>(($batch_size * no * 3) as usize)
+            .map_err(cuda_err)?;
+
+        let cfg = LaunchConfig {
+            grid_dim: CudaContext::grid_dim($batch_size),
+            block_dim: CudaContext::block_dim(),
+            shared_mem_bytes: 0,
+        };
+
+        let mut builder = s.launch_builder(&$self.$kernel);
+        builder.arg(&$tape.opcodes);
+        builder.arg(&$tape.arg0);
+        builder.arg(&$tape.arg1);
+        builder.arg(&$tape.$constants);
+        builder.arg(&d_primals);
+        builder.arg(&d_seeds);
+        builder.arg(&mut d_jets);
+        builder.arg(&mut d_jet_out);
+        builder.arg(&$tape.output_indices);
+        builder.arg(&$tape.num_ops);
+        builder.arg(&ni);
+        builder.arg(&nv);
+        builder.arg(&no);
+        builder.arg(&$batch_size);
+        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
+
+        s.synchronize().map_err(cuda_err)?;
+        let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?;
+
+        // Deinterleave [c0, c1, c2, c0, c1, c2, ...]
+        let total_out = ($batch_size * no) as usize;
+        let mut values = Vec::with_capacity(total_out);
+        let mut c1s = Vec::with_capacity(total_out);
+        let mut c2s = Vec::with_capacity(total_out);
+        for i in 0..total_out {
+            values.push(raw[i * 3]);
+            c1s.push(raw[i * 3 + 1]);
+            c2s.push(raw[i * 3 + 2]);
+        }
+
+        Ok(super::TaylorBatchResult { values, c1s, c2s })
+    }};
+}
+
 /// Convert any `Display` error into `GpuError::Other`.
 fn cuda_err(e: impl std::fmt::Display) -> GpuError {
     GpuError::Other(format!("{e}"))
@@ -197,218 +585,27 @@ impl GpuBackend for CudaContext {
         inputs: &[f32],
         batch_size: u32,
     ) -> Result<Vec<f32>, GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
+        cuda_forward_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32)
+    }
 
-        assert_eq!(inputs.len(), (batch_size * ni) as usize);
+    fn gradient_batch(
+        &self,
+        tape: &CudaTapeBuffers,
+        inputs: &[f32],
+        batch_size: u32,
+    ) -> Result<(Vec<f32>, Vec<f32>), GpuError> {
+        cuda_gradient_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32, reverse_f32)
+    }
 
-        let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?;
-        let mut d_values = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_outputs = s
-            .alloc_zeros::<f32>((batch_size * no) as usize)
-            .map_err(cuda_err)?;
+    fn sparse_jacobian(
+        &self,
+        tape: &CudaTapeBuffers,
+        tape_cpu: &mut crate::BytecodeTape<f32>,
+        x: &[f32],
+    ) -> Result<(Vec<f32>, crate::sparse::JacobianSparsityPattern, Vec<f32>), GpuError> {
+        cuda_sparse_jacobian_body!(self, tape, tape_cpu, x, f32, constants_f32, tangent_fwd_f32)
+    }
 
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.forward_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f32);
-        builder.arg(&d_inputs);
-        builder.arg(&mut d_values);
-        builder.arg(&mut d_outputs);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
-        Ok(results)
-    }
-
-    /// Batched gradient (forward + reverse) (f32).
-    fn gradient_batch(
-        &self,
-        tape: &CudaTapeBuffers,
-        inputs: &[f32],
-        batch_size: u32,
-    ) -> Result<(Vec<f32>, Vec<f32>), GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
-
-        assert_eq!(inputs.len(), (batch_size * ni) as usize);
-
-        let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?;
-        let mut d_values = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_outputs = s
-            .alloc_zeros::<f32>((batch_size * no) as usize)
-            .map_err(cuda_err)?;
-
-        // Forward pass
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.forward_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f32);
-        builder.arg(&d_inputs);
-        builder.arg(&mut d_values);
-        builder.arg(&mut d_outputs);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        // Reverse pass
-        let mut d_adjoints = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_grads = s
-            .alloc_zeros::<f32>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-
-        let mut builder = s.launch_builder(&self.reverse_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&d_values);
-        builder.arg(&mut d_adjoints);
-        builder.arg(&mut d_grads);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
-        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
-        Ok((output_vals, grads))
-    }
-
-    /// Sparse Jacobian using forward-mode tangent sweeps (f32).
-    fn sparse_jacobian(
-        &self,
-        tape: &CudaTapeBuffers,
-        tape_cpu: &mut crate::BytecodeTape<f32>,
-        x: &[f32],
-    ) -> Result<(Vec<f32>, crate::sparse::JacobianSparsityPattern, Vec<f32>), GpuError> {
-        let ni = tape.num_inputs as usize;
-        let no = tape.num_outputs as usize;
-
-        let pattern = tape_cpu.detect_jacobian_sparsity();
-        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
-
-        if num_colors == 0 {
-            tape_cpu.forward(x);
-            let vals = tape_cpu.output_values();
-            return Ok((vals, pattern, vec![]));
-        }
-
-        // Build tangent seeds for each color
-        let batch = num_colors as u32;
-        let mut seeds = Vec::with_capacity(batch as usize * ni);
-        for c in 0..num_colors {
-            for i in 0..ni {
-                seeds.push(if colors[i] == c as u32 {
-                    1.0f32
-                } else {
-                    0.0f32
-                });
-            }
-        }
-
-        // Dispatch tangent forward kernel
-        let s = &self.stream;
-        let nv = tape.num_variables;
-
-        let d_primals_in = {
-            let mut replicated = Vec::with_capacity(batch as usize * ni);
-            for _ in 0..batch {
-                replicated.extend_from_slice(x);
-            }
-            s.clone_htod(&replicated).map_err(cuda_err)?
-        };
-        let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?;
-        let mut d_primals = s
-            .alloc_zeros::<f32>((batch * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tangents = s
-            .alloc_zeros::<f32>((batch * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tangent_out = s
-            .alloc_zeros::<f32>((batch * tape.num_outputs) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.tangent_fwd_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f32);
-        builder.arg(&d_primals_in);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_primals);
-        builder.arg(&mut d_tangents);
-        builder.arg(&mut d_tangent_out);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&tape.num_inputs);
-        builder.arg(&nv);
-        builder.arg(&tape.num_outputs);
-        builder.arg(&batch);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?;
-
-        // Get output values from CPU
-        tape_cpu.forward(x);
-        let output_values = tape_cpu.output_values();
-
-        // Extract Jacobian entries
-        let nnz = pattern.nnz();
-        let mut jac_values = vec![0.0f32; nnz];
-        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-            let c = colors[col as usize] as usize;
-            jac_values[k] = tangent_outs[c * no + row as usize];
-        }
-
-        Ok((output_values, pattern, jac_values))
-    }
-
-    /// HVP batch (forward-over-reverse) (f32).
     fn hvp_batch(
         &self,
         tape: &CudaTapeBuffers,
@@ -416,123 +613,20 @@ impl GpuBackend for CudaContext {
         tangent_dirs: &[f32],
         batch_size: u32,
     ) -> Result<(Vec<f32>, Vec<f32>), GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-
-        assert_eq!(x.len(), ni as usize);
-        assert_eq!(tangent_dirs.len(), (batch_size * ni) as usize);
-
-        let mut primal_inputs = Vec::with_capacity((batch_size * ni) as usize);
-        for _ in 0..batch_size {
-            primal_inputs.extend_from_slice(x);
-        }
-
-        let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?;
-        let d_seeds = s.clone_htod(tangent_dirs).map_err(cuda_err)?;
-        let mut d_primals = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tans = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_adj_re = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_adj_eps = s
-            .alloc_zeros::<f32>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_grads = s
-            .alloc_zeros::<f32>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-        let mut d_hvps = s
-            .alloc_zeros::<f32>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.tangent_rev_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f32);
-        builder.arg(&d_primal_in);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_primals);
-        builder.arg(&mut d_tans);
-        builder.arg(&mut d_adj_re);
-        builder.arg(&mut d_adj_eps);
-        builder.arg(&mut d_grads);
-        builder.arg(&mut d_hvps);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
-        let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?;
-        Ok((grads, hvps))
+        cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f32, constants_f32, tangent_rev_f32)
     }
 
-    /// Sparse Hessian using GPU HVP sweeps (f32).
     fn sparse_hessian(
         &self,
         tape: &CudaTapeBuffers,
         tape_cpu: &mut crate::BytecodeTape<f32>,
         x: &[f32],
     ) -> Result<(f32, Vec<f32>, crate::sparse::SparsityPattern, Vec<f32>), GpuError> {
-        let ni = tape.num_inputs as usize;
-
-        let pattern = tape_cpu.detect_sparsity();
-        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
-
-        if num_colors == 0 {
-            tape_cpu.forward(x);
-            let val = tape_cpu.output_value();
-            let grad = tape_cpu.gradient(x);
-            return Ok((val, grad, pattern, vec![]));
-        }
-
-        let batch = num_colors as u32;
-        let mut tangent_dirs = Vec::with_capacity(batch as usize * ni);
-        for c in 0..num_colors {
-            for i in 0..ni {
-                tangent_dirs.push(if colors[i] == c as u32 {
-                    1.0f32
-                } else {
-                    0.0f32
-                });
-            }
-        }
-
-        let (grads, hvps) = self.hvp_batch(tape, x, &tangent_dirs, batch)?;
-
-        let gradient: Vec<f32> = grads[..ni].to_vec();
-
-        let nnz = pattern.nnz();
-        let mut hess_values = vec![0.0f32; nnz];
-        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-            let c = colors[col as usize] as usize;
-            hess_values[k] = hvps[c * ni + row as usize];
-        }
-
-        tape_cpu.forward(x);
-        let value = tape_cpu.output_value();
-
-        Ok((value, gradient, pattern, hess_values))
+        cuda_sparse_hessian_body!(self, tape, tape_cpu, x, f32, hvp_batch)
     }
 }
 
 impl CudaContext {
-    // ── Taylor forward 2nd-order (f32) ──
-
     /// Batched second-order Taylor forward propagation (f32).
     ///
     /// Each batch element pushes one direction through the tape, producing
@@ -545,74 +639,9 @@ impl CudaContext {
         direction_seeds: &[f32],
         batch_size: u32,
     ) -> Result<super::TaylorBatchResult<f32>, GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
-        let total_in = (batch_size * ni) as usize;
-
-        assert_eq!(
-            primal_inputs.len(),
-            total_in,
-            "primal_inputs length mismatch"
-        );
-        assert_eq!(
-            direction_seeds.len(),
-            total_in,
-            "direction_seeds length mismatch"
-        );
-
-        let d_primals = s.clone_htod(primal_inputs).map_err(cuda_err)?;
-        let d_seeds = s.clone_htod(direction_seeds).map_err(cuda_err)?;
-        let mut d_jets = s
-            .alloc_zeros::<f32>((batch_size * nv * 3) as usize)
-            .map_err(cuda_err)?;
-        let mut d_jet_out = s
-            .alloc_zeros::<f32>((batch_size * no * 3) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.taylor_fwd_2nd_f32);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f32);
-        builder.arg(&d_primals);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_jets);
-        builder.arg(&mut d_jet_out);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?;
-
-        // Deinterleave [c0, c1, c2, c0, c1, c2, ...]
-        let total_out = (batch_size * no) as usize;
-        let mut values = Vec::with_capacity(total_out);
-        let mut c1s = Vec::with_capacity(total_out);
-        let mut c2s = Vec::with_capacity(total_out);
-        for i in 0..total_out {
-            values.push(raw[i * 3]);
-            c1s.push(raw[i * 3 + 1]);
-            c2s.push(raw[i * 3 + 2]);
-        }
-
-        Ok(super::TaylorBatchResult { values, c1s, c2s })
+        cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f32, constants_f32, taylor_fwd_2nd_f32)
     }
 
-    // ── Taylor forward 2nd-order (f64) ──
-
     /// Batched second-order Taylor forward propagation (f64, CUDA only).
     pub fn taylor_forward_2nd_batch_f64(
         &self,
@@ -621,69 +650,7 @@ impl CudaContext {
         direction_seeds: &[f64],
         batch_size: u32,
     ) -> Result<super::TaylorBatchResult<f64>, GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
-        let total_in = (batch_size * ni) as usize;
-
-        assert_eq!(
-            primal_inputs.len(),
-            total_in,
-            "primal_inputs length mismatch"
-        );
-        assert_eq!(
-            direction_seeds.len(),
-            total_in,
-            "direction_seeds length mismatch"
-        );
-
-        let d_primals = s.clone_htod(primal_inputs).map_err(cuda_err)?;
-        let d_seeds = s.clone_htod(direction_seeds).map_err(cuda_err)?;
-        let mut d_jets = s
-            .alloc_zeros::<f64>((batch_size * nv * 3) as usize)
-            .map_err(cuda_err)?;
-        let mut d_jet_out = s
-            .alloc_zeros::<f64>((batch_size * no * 3) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.taylor_fwd_2nd_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f64);
-        builder.arg(&d_primals);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_jets);
-        builder.arg(&mut d_jet_out);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?;
-
-        let total_out = (batch_size * no) as usize;
-        let mut values = Vec::with_capacity(total_out);
-        let mut c1s = Vec::with_capacity(total_out);
-        let mut c2s = Vec::with_capacity(total_out);
-        for i in 0..total_out {
-            values.push(raw[i * 3]);
-            c1s.push(raw[i * 3 + 1]);
-            c2s.push(raw[i * 3 + 2]);
-        }
-
-        Ok(super::TaylorBatchResult { values, c1s, c2s })
+        cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f64, constants_f64, taylor_fwd_2nd_f64)
     }
 
     // ── f64 operations ──
@@ -695,46 +662,7 @@ impl CudaContext {
         inputs: &[f64],
         batch_size: u32,
     ) -> Result<Vec<f64>, GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
-
-        assert_eq!(inputs.len(), (batch_size * ni) as usize);
-
-        let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?;
-        let mut d_values = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_outputs = s
-            .alloc_zeros::<f64>((batch_size * no) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.forward_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f64);
-        builder.arg(&d_inputs);
-        builder.arg(&mut d_values);
-        builder.arg(&mut d_outputs);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
-        Ok(results)
+        cuda_forward_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64)
     }
 
     /// Batched gradient (f64, CUDA only).
@@ -744,70 +672,7 @@ impl CudaContext {
         inputs: &[f64],
         batch_size: u32,
     ) -> Result<(Vec<f64>, Vec<f64>), GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-        let no = tape.num_outputs;
-
-        assert_eq!(inputs.len(), (batch_size * ni) as usize);
-
-        let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?;
-        let mut d_values = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_outputs = s
-            .alloc_zeros::<f64>((batch_size * no) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        // Forward
-        let mut builder = s.launch_builder(&self.forward_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f64);
-        builder.arg(&d_inputs);
-        builder.arg(&mut d_values);
-        builder.arg(&mut d_outputs);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&no);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        // Reverse
-        let mut d_adjoints = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_grads = s
-            .alloc_zeros::<f64>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-
-        let mut builder = s.launch_builder(&self.reverse_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&d_values);
-        builder.arg(&mut d_adjoints);
-        builder.arg(&mut d_grads);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?;
-        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
-        Ok((output_vals, grads))
+        cuda_gradient_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64, reverse_f64)
     }
 
     /// Sparse Jacobian (f64, CUDA only).
@@ -817,89 +682,7 @@ impl CudaContext {
         tape_cpu: &mut crate::BytecodeTape<f64>,
         x: &[f64],
     ) -> Result<(Vec<f64>, crate::sparse::JacobianSparsityPattern, Vec<f64>), GpuError> {
-        let ni = tape.num_inputs as usize;
-        let no = tape.num_outputs as usize;
-
-        let pattern = tape_cpu.detect_jacobian_sparsity();
-        let (colors, num_colors) = crate::sparse::column_coloring(&pattern);
-
-        if num_colors == 0 {
-            tape_cpu.forward(x);
-            let vals = tape_cpu.output_values();
-            return Ok((vals, pattern, vec![]));
-        }
-
-        let batch = num_colors as u32;
-        let mut seeds = Vec::with_capacity(batch as usize * ni);
-        for c in 0..num_colors {
-            for i in 0..ni {
-                seeds.push(if colors[i] == c as u32 {
-                    1.0f64
-                } else {
-                    0.0f64
-                });
-            }
-        }
-
-        let s = &self.stream;
-        let nv = tape.num_variables;
-
-        let d_primals_in = {
-            let mut replicated = Vec::with_capacity(batch as usize * ni);
-            for _ in 0..batch {
-                replicated.extend_from_slice(x);
-            }
-            s.clone_htod(&replicated).map_err(cuda_err)?
-        };
-        let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?;
-        let mut d_primals = s
-            .alloc_zeros::<f64>((batch * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tangents = s
-            .alloc_zeros::<f64>((batch * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tangent_out = s
-            .alloc_zeros::<f64>((batch * tape.num_outputs) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.tangent_fwd_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f64);
-        builder.arg(&d_primals_in);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_primals);
-        builder.arg(&mut d_tangents);
-        builder.arg(&mut d_tangent_out);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&tape.num_inputs);
-        builder.arg(&nv);
-        builder.arg(&tape.num_outputs);
-        builder.arg(&batch);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?;
-
-        tape_cpu.forward(x);
-        let output_values = tape_cpu.output_values();
-
-        let nnz = pattern.nnz();
-        let mut jac_values = vec![0.0f64; nnz];
-        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-            let c = colors[col as usize] as usize;
-            jac_values[k] = tangent_outs[c * no + row as usize];
-        }
-
-        Ok((output_values, pattern, jac_values))
+        cuda_sparse_jacobian_body!(self, tape, tape_cpu, x, f64, constants_f64, tangent_fwd_f64)
     }
 
     /// Sparse Hessian (f64, CUDA only).
@@ -909,45 +692,7 @@ impl CudaContext {
         tape_cpu: &mut crate::BytecodeTape<f64>,
         x: &[f64],
     ) -> Result<(f64, Vec<f64>, crate::sparse::SparsityPattern, Vec<f64>), GpuError> {
-        let ni = tape.num_inputs as usize;
-
-        let pattern = tape_cpu.detect_sparsity();
-        let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern);
-
-        if num_colors == 0 {
-            tape_cpu.forward(x);
-            let val = tape_cpu.output_value();
-            let grad = tape_cpu.gradient(x);
-            return Ok((val, grad, pattern, vec![]));
-        }
-
-        let batch = num_colors as u32;
-        let mut tangent_dirs = Vec::with_capacity(batch as usize * ni);
-        for c in 0..num_colors {
-            for i in 0..ni {
-                tangent_dirs.push(if colors[i] == c as u32 {
-                    1.0f64
-                } else {
-                    0.0f64
-                });
-            }
-        }
-
-        let (grads, hvps) = self.hvp_batch_f64(tape, x, &tangent_dirs, batch)?;
-
-        let gradient: Vec<f64> = grads[..ni].to_vec();
-
-        let nnz = pattern.nnz();
-        let mut hess_values = vec![0.0f64; nnz];
-        for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() {
-            let c = colors[col as usize] as usize;
-            hess_values[k] = hvps[c * ni + row as usize];
-        }
-
-        tape_cpu.forward(x);
-        let value = tape_cpu.output_value();
-
-        Ok((value, gradient, pattern, hess_values))
+        cuda_sparse_hessian_body!(self, tape, tape_cpu, x, f64, hvp_batch_f64)
     }
 
     /// HVP batch (f64, CUDA only).
@@ -958,68 +703,6 @@ impl CudaContext {
         tangent_dirs: &[f64],
         batch_size: u32,
     ) -> Result<(Vec<f64>, Vec<f64>), GpuError> {
-        let s = &self.stream;
-        let ni = tape.num_inputs;
-        let nv = tape.num_variables;
-
-        assert_eq!(x.len(), ni as usize);
-        assert_eq!(tangent_dirs.len(), (batch_size * ni) as usize);
-
-        let mut primal_inputs = Vec::with_capacity((batch_size * ni) as usize);
-        for _ in 0..batch_size {
-            primal_inputs.extend_from_slice(x);
-        }
-
-        let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?;
-        let d_seeds = s.clone_htod(tangent_dirs).map_err(cuda_err)?;
-        let mut d_primals = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_tans = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_adj_re = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_adj_eps = s
-            .alloc_zeros::<f64>((batch_size * nv) as usize)
-            .map_err(cuda_err)?;
-        let mut d_grads = s
-            .alloc_zeros::<f64>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-        let mut d_hvps = s
-            .alloc_zeros::<f64>((batch_size * ni) as usize)
-            .map_err(cuda_err)?;
-
-        let cfg = LaunchConfig {
-            grid_dim: Self::grid_dim(batch_size),
-            block_dim: Self::block_dim(),
-            shared_mem_bytes: 0,
-        };
-
-        let mut builder = s.launch_builder(&self.tangent_rev_f64);
-        builder.arg(&tape.opcodes);
-        builder.arg(&tape.arg0);
-        builder.arg(&tape.arg1);
-        builder.arg(&tape.constants_f64);
-        builder.arg(&d_primal_in);
-        builder.arg(&d_seeds);
-        builder.arg(&mut d_primals);
-        builder.arg(&mut d_tans);
-        builder.arg(&mut d_adj_re);
-        builder.arg(&mut d_adj_eps);
-        builder.arg(&mut d_grads);
-        builder.arg(&mut d_hvps);
-        builder.arg(&tape.output_indices);
-        builder.arg(&tape.num_ops);
-        builder.arg(&ni);
-        builder.arg(&nv);
-        builder.arg(&batch_size);
-        unsafe { builder.launch(cfg) }.map_err(cuda_err)?;
-
-        s.synchronize().map_err(cuda_err)?;
-        let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?;
-        let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?;
-        Ok((grads, hvps))
+        cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f64, constants_f64, tangent_rev_f64)
     }
 }
diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs
index 6942113..09acac8 100644
--- a/src/gpu/mod.rs
+++ b/src/gpu/mod.rs
@@ -31,7 +31,7 @@
 //! - [`stde_gpu::laplacian_with_control_gpu`] — variance-reduced Laplacian with diagonal control variate
 //!
 //! The Taylor kernel propagates `(c0, c1, c2)` triples through the tape for
-//! each batch element, where c2 = v^T H v / 2. All 43 opcodes are supported.
+//! each batch element, where c2 = v^T H v / 2. All 44 opcodes are supported.
 
 use crate::bytecode_tape::BytecodeTape;
 use crate::opcode::OpCode;
diff --git a/src/opcode.rs b/src/opcode.rs
index fdbc96a..aa338d4 100644
--- a/src/opcode.rs
+++ b/src/opcode.rs
@@ -10,7 +10,7 @@ pub const UNUSED: u32 = u32::MAX;
 
 /// Elementary operation codes for the bytecode tape.
 ///
-/// Fits in a `u8` (~38 variants). Binary ops use both `arg_indices` slots;
+/// Fits in a `u8` (44 variants). Binary ops use both `arg_indices` slots;
 /// unary ops use slot 0 only (slot 1 = [`UNUSED`], except for [`OpCode::Powi`]
 /// which stores the `i32` exponent reinterpreted as `u32` in slot 1).
 #[repr(u8)]

From 13f62304c8277c572325c1cbbddbc059b6ba08cb Mon Sep 17 00:00:00 2001
From: Greg von Nessi <greg.vonnessi@entrolution.ai>
Date: Thu, 26 Feb 2026 01:50:58 +0000
Subject: [PATCH 5/5] Fix rustfmt formatting for macro invocations

---
 echidna-optim/tests/solvers.rs | 18 +++----
 src/bytecode_tape/mod.rs       |  4 +-
 src/bytecode_tape/tangent.rs   |  6 ++-
 src/gpu/cuda_backend.rs        | 86 ++++++++++++++++++++++++++++++----
 4 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs
index b6e9d85..b92f368 100644
--- a/echidna-optim/tests/solvers.rs
+++ b/echidna-optim/tests/solvers.rs
@@ -119,12 +119,7 @@ impl Objective<f64> for Rosenbrock2D {
 
 fn assert_near_origin(result: &OptimResult<f64>, tol: f64) {
     for (i, &xi) in result.x.iter().enumerate() {
-        assert!(
-            xi.abs() < tol,
-            "x[{}] = {}, expected ~0",
-            i,
-            xi
-        );
+        assert!(xi.abs() < tol, "x[{}] = {}, expected ~0", i, xi);
     }
     assert!(result.value < tol, "f = {}, expected ~0", result.value);
 }
@@ -141,7 +136,10 @@ fn newton_quadratic_2d() {
 
     assert_eq!(result.termination, TerminationReason::GradientNorm);
     // Newton on a quadratic converges in 1 iteration
-    assert_eq!(result.iterations, 1, "Newton should solve quadratic in 1 step");
+    assert_eq!(
+        result.iterations, 1,
+        "Newton should solve quadratic in 1 step"
+    );
     assert_near_origin(&result, 1e-10);
 }
 
@@ -212,7 +210,11 @@ fn lbfgs_quadratic_2d() {
 fn lbfgs_quadratic_8d() {
     let mut obj = Quadratic { dim: 8 };
     let config = LbfgsConfig::default();
-    let result = lbfgs(&mut obj, &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], &config);
+    let result = lbfgs(
+        &mut obj,
+        &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0],
+        &config,
+    );
 
     assert_eq!(result.termination, TerminationReason::GradientNorm);
     assert_near_origin(&result, 1e-6);
diff --git a/src/bytecode_tape/mod.rs b/src/bytecode_tape/mod.rs
index 7b4666c..f08579c 100644
--- a/src/bytecode_tape/mod.rs
+++ b/src/bytecode_tape/mod.rs
@@ -26,12 +26,12 @@ mod reverse;
 mod sparse;
 mod tangent;
 
-#[cfg(feature = "taylor")]
-mod taylor;
 #[cfg(feature = "parallel")]
 mod parallel;
 #[cfg(feature = "serde")]
 mod serde_support;
+#[cfg(feature = "taylor")]
+mod taylor;
 
 mod thread_local;
 pub use self::thread_local::{with_active_btape, BtapeGuard, BtapeThreadLocal};
diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs
index a15433e..9f10d48 100644
--- a/src/bytecode_tape/tangent.rs
+++ b/src/bytecode_tape/tangent.rs
@@ -96,7 +96,11 @@ impl<F: Float> super::BytecodeTape<F> {
     /// Reverse sweep with tangent-carrying adjoints. Uses values from
     /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to
     /// safely skip zero adjoints without dropping tangent contributions.
-    pub(super) fn reverse_tangent<T: NumFloat + IsAllZero>(&self, tangent_vals: &[T], buf: &mut Vec<T>) {
+    pub(super) fn reverse_tangent<T: NumFloat + IsAllZero>(
+        &self,
+        tangent_vals: &[T],
+        buf: &mut Vec<T>,
+    ) {
         self.reverse_tangent_inner(tangent_vals, buf, |i| {
             // First-order: convert primal-float partials to T.
             let [a_idx, cb_idx] = self.arg_indices[i];
diff --git a/src/gpu/cuda_backend.rs b/src/gpu/cuda_backend.rs
index 8807398..2d2be88 100644
--- a/src/gpu/cuda_backend.rs
+++ b/src/gpu/cuda_backend.rs
@@ -585,7 +585,15 @@ impl GpuBackend for CudaContext {
         inputs: &[f32],
         batch_size: u32,
     ) -> Result<Vec<f32>, GpuError> {
-        cuda_forward_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32)
+        cuda_forward_batch_body!(
+            self,
+            tape,
+            inputs,
+            batch_size,
+            f32,
+            constants_f32,
+            forward_f32
+        )
     }
 
     fn gradient_batch(
@@ -594,7 +602,16 @@ impl GpuBackend for CudaContext {
         inputs: &[f32],
         batch_size: u32,
     ) -> Result<(Vec<f32>, Vec<f32>), GpuError> {
-        cuda_gradient_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32, reverse_f32)
+        cuda_gradient_batch_body!(
+            self,
+            tape,
+            inputs,
+            batch_size,
+            f32,
+            constants_f32,
+            forward_f32,
+            reverse_f32
+        )
     }
 
     fn sparse_jacobian(
@@ -613,7 +630,16 @@ impl GpuBackend for CudaContext {
         tangent_dirs: &[f32],
         batch_size: u32,
     ) -> Result<(Vec<f32>, Vec<f32>), GpuError> {
-        cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f32, constants_f32, tangent_rev_f32)
+        cuda_hvp_batch_body!(
+            self,
+            tape,
+            x,
+            tangent_dirs,
+            batch_size,
+            f32,
+            constants_f32,
+            tangent_rev_f32
+        )
     }
 
     fn sparse_hessian(
@@ -639,7 +665,16 @@ impl CudaContext {
         direction_seeds: &[f32],
         batch_size: u32,
     ) -> Result<super::TaylorBatchResult<f32>, GpuError> {
-        cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f32, constants_f32, taylor_fwd_2nd_f32)
+        cuda_taylor_fwd_2nd_body!(
+            self,
+            tape,
+            primal_inputs,
+            direction_seeds,
+            batch_size,
+            f32,
+            constants_f32,
+            taylor_fwd_2nd_f32
+        )
     }
 
     /// Batched second-order Taylor forward propagation (f64, CUDA only).
@@ -650,7 +685,16 @@ impl CudaContext {
         direction_seeds: &[f64],
         batch_size: u32,
     ) -> Result<super::TaylorBatchResult<f64>, GpuError> {
-        cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f64, constants_f64, taylor_fwd_2nd_f64)
+        cuda_taylor_fwd_2nd_body!(
+            self,
+            tape,
+            primal_inputs,
+            direction_seeds,
+            batch_size,
+            f64,
+            constants_f64,
+            taylor_fwd_2nd_f64
+        )
     }
 
     // ── f64 operations ──
@@ -662,7 +706,15 @@ impl CudaContext {
         inputs: &[f64],
         batch_size: u32,
     ) -> Result<Vec<f64>, GpuError> {
-        cuda_forward_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64)
+        cuda_forward_batch_body!(
+            self,
+            tape,
+            inputs,
+            batch_size,
+            f64,
+            constants_f64,
+            forward_f64
+        )
     }
 
     /// Batched gradient (f64, CUDA only).
@@ -672,7 +724,16 @@ impl CudaContext {
         inputs: &[f64],
         batch_size: u32,
     ) -> Result<(Vec<f64>, Vec<f64>), GpuError> {
-        cuda_gradient_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64, reverse_f64)
+        cuda_gradient_batch_body!(
+            self,
+            tape,
+            inputs,
+            batch_size,
+            f64,
+            constants_f64,
+            forward_f64,
+            reverse_f64
+        )
     }
 
     /// Sparse Jacobian (f64, CUDA only).
@@ -703,6 +764,15 @@ impl CudaContext {
         tangent_dirs: &[f64],
         batch_size: u32,
     ) -> Result<(Vec<f64>, Vec<f64>), GpuError> {
-        cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f64, constants_f64, tangent_rev_f64)
+        cuda_hvp_batch_body!(
+            self,
+            tape,
+            x,
+            tangent_dirs,
+            batch_size,
+            f64,
+            constants_f64,
+            tangent_rev_f64
+        )
     }
 }