From 30f89a6c3dd5194c6dc9dea8db3f0fe7ce0e7be6 Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Thu, 26 Feb 2026 00:42:49 +0000 Subject: [PATCH 1/5] Deduplicate reverse sweep in gradient_with_buf and sparse_jacobian_par Both methods inlined the same reverse sweep loop that reverse_sweep_core already implements. Replace with calls to the shared function, which also picks up the zero-adjoint skip optimization in gradient_with_buf. --- src/bytecode_tape.rs | 88 +------------------------------------------- 1 file changed, 2 insertions(+), 86 deletions(-) diff --git a/src/bytecode_tape.rs b/src/bytecode_tape.rs index 6dbbb65..27dbbdd 100644 --- a/src/bytecode_tape.rs +++ b/src/bytecode_tape.rs @@ -898,50 +898,7 @@ impl BytecodeTape { adjoint_buf.resize(n, F::zero()); adjoint_buf[self.output_index as usize] = F::one(); - for i in (0..self.opcodes.len()).rev() { - let adj = adjoint_buf[i]; - if adj == F::zero() { - continue; - } - - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - adjoint_buf[i] = F::zero(); - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let b = b_idx_opt - .map(|bi| self.values[bi as usize]) - .unwrap_or(F::zero()); - let r = self.values[i]; - let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r); - adjoint_buf[a_idx as usize] = adjoint_buf[a_idx as usize] + da * adj; - if let Some(bi) = b_idx_opt { - adjoint_buf[bi as usize] = adjoint_buf[bi as usize] + db * adj; - } - } - op => { - adjoint_buf[i] = F::zero(); - let [a_idx, b_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - self.values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - let r = self.values[i]; - let (da, db) = opcode::reverse_partials(op, a, b, r); - - adjoint_buf[a_idx as usize] = adjoint_buf[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { - adjoint_buf[b_idx as usize] = adjoint_buf[b_idx as usize] + db * adj; - } - } - } - } + self.reverse_sweep_core(adjoint_buf, &self.values, None); adjoint_buf[..self.num_inputs as usize].to_vec() } @@ -2410,48 +2367,7 @@ impl BytecodeTape { } } - for idx in (0..self.opcodes.len()).rev() { - let adj = adjoints[idx]; - if adj == F::zero() { - continue; - } - match self.opcodes[idx] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - adjoints[idx] = F::zero(); - let [a_idx, cb_idx] = self.arg_indices[idx]; - let a = values_buf[a_idx as usize]; - let b_idx_opt = self.custom_second_args.get(&(idx as u32)).copied(); - let b = b_idx_opt - .map(|bi| values_buf[bi as usize]) - .unwrap_or(F::zero()); - let r = values_buf[idx]; - let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r); - adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if let Some(bi) = b_idx_opt { - adjoints[bi as usize] = adjoints[bi as usize] + db * adj; - } - } - op => { - adjoints[idx] = F::zero(); - let [a_idx, b_idx] = self.arg_indices[idx]; - let a = values_buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - values_buf[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - let r = values_buf[idx]; - let (da, db) = opcode::reverse_partials(op, a, b, r); - adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { - adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj; - } - } - } - } + self.reverse_sweep_core(&mut adjoints, &values_buf, None); adjoints[..ni].to_vec() }) .collect(); From e5cf808c18e9967781a8ad64f8220a488504279a Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Thu, 26 Feb 2026 00:43:27 +0000 Subject: [PATCH 2/5] Move bytecode_tape.rs to bytecode_tape/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for submodule decomposition. Pure rename via git mv, zero content changes — preserves full blame history. --- src/{bytecode_tape.rs => bytecode_tape/mod.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{bytecode_tape.rs => bytecode_tape/mod.rs} (100%) diff --git a/src/bytecode_tape.rs b/src/bytecode_tape/mod.rs similarity index 100% rename from src/bytecode_tape.rs rename to src/bytecode_tape/mod.rs From c5153696404f33d1ec9f9e328e57bcbc15a04c6b Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Thu, 26 Feb 2026 00:56:56 +0000 Subject: [PATCH 3/5] Decompose BytecodeTape into directory module with 10 submodules Split the 2,689-line monolith into logical submodules for navigability: mod.rs (struct + construction + accessors), forward.rs, reverse.rs, tangent.rs, jacobian.rs, sparse.rs, optimize.rs, taylor.rs, parallel.rs, serde_support.rs, thread_local.rs. Struct fields changed to pub(crate) for cross-module impl blocks. Seven internal methods upgraded to pub(super) for cross-module calls. Zero public API or behavioral changes. --- src/bytecode_tape/forward.rs | 184 +++ src/bytecode_tape/jacobian.rs | 183 +++ src/bytecode_tape/mod.rs | 2174 +--------------------------- src/bytecode_tape/optimize.rs | 271 ++++ src/bytecode_tape/parallel.rs | 263 ++++ src/bytecode_tape/reverse.rs | 170 +++ src/bytecode_tape/serde_support.rs | 61 + src/bytecode_tape/sparse.rs | 435 ++++++ src/bytecode_tape/tangent.rs | 380 +++++ src/bytecode_tape/taylor.rs | 118 ++ src/bytecode_tape/thread_local.rs | 87 ++ 11 files changed, 2177 insertions(+), 2149 deletions(-) create mode 100644 src/bytecode_tape/forward.rs create mode 100644 src/bytecode_tape/jacobian.rs create mode 100644 src/bytecode_tape/optimize.rs create mode 100644 src/bytecode_tape/parallel.rs create mode 100644 src/bytecode_tape/reverse.rs create mode 100644 src/bytecode_tape/serde_support.rs create mode 100644 src/bytecode_tape/sparse.rs create mode 100644 src/bytecode_tape/tangent.rs create mode 100644 src/bytecode_tape/taylor.rs create mode 100644 src/bytecode_tape/thread_local.rs diff --git a/src/bytecode_tape/forward.rs b/src/bytecode_tape/forward.rs new file mode 100644 index 0000000..e17ebe4 --- /dev/null +++ b/src/bytecode_tape/forward.rs @@ -0,0 +1,184 @@ +use crate::float::Float; +use crate::opcode::{self, OpCode, UNUSED}; + +impl super::BytecodeTape { + /// Re-evaluate the tape at new inputs (forward sweep). + /// + /// Overwrites `values` in-place — no allocation. + pub fn forward(&mut self, inputs: &[F]) { + assert_eq!( + inputs.len(), + self.num_inputs as usize, + "wrong number of inputs" + ); + + // Overwrite input values. + for (i, &v) in inputs.iter().enumerate() { + self.values[i] = v; + } + + // Re-evaluate all non-Input, non-Const ops. + for i in 0..self.opcodes.len() { + match self.opcodes[i] { + OpCode::Input | OpCode::Const => continue, + OpCode::Custom => { + let [a_idx, cb_idx] = self.arg_indices[i]; + let a = self.values[a_idx as usize]; + let b = self + .custom_second_args + .get(&(i as u32)) + .map(|&bi| self.values[bi as usize]) + .unwrap_or(F::zero()); + self.values[i] = self.custom_ops[cb_idx as usize].eval(a, b); + } + op => { + let [a_idx, b_idx] = self.arg_indices[i]; + let a = self.values[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + self.values[b_idx as usize] + } else if op == OpCode::Powi { + F::from(b_idx).unwrap_or_else(|| F::zero()) + } else { + F::zero() + }; + self.values[i] = opcode::eval_forward(op, a, b); + } + } + } + } + + /// Forward sweep with nonsmooth branch tracking. + /// + /// Calls [`forward`](Self::forward) to evaluate the tape, then scans for + /// nonsmooth operations and records which branch was taken at each one. + /// + /// Tracked operations: + /// - `Abs`, `Min`, `Max` — kinks with nontrivial subdifferentials + /// - `Signum`, `Floor`, `Ceil`, `Round`, `Trunc` — step-function + /// discontinuities (zero derivative on both sides, tracked for proximity + /// detection only) + /// + /// Returns [`crate::NonsmoothInfo`] containing all kink entries in tape order. + pub fn forward_nonsmooth(&mut self, inputs: &[F]) -> crate::nonsmooth::NonsmoothInfo { + self.forward(inputs); + + let mut kinks = Vec::new(); + for i in 0..self.opcodes.len() { + let op = self.opcodes[i]; + if !opcode::is_nonsmooth(op) { + continue; + } + + let [a_idx, b_idx] = self.arg_indices[i]; + let a = self.values[a_idx as usize]; + + match op { + OpCode::Abs => { + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: a, + branch: if a >= F::zero() { 1 } else { -1 }, + }); + } + OpCode::Max => { + let b = self.values[b_idx as usize]; + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: a - b, + branch: if a >= b { 1 } else { -1 }, + }); + } + OpCode::Min => { + let b = self.values[b_idx as usize]; + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: a - b, + branch: if a <= b { 1 } else { -1 }, + }); + } + OpCode::Signum => { + // Kink at x = 0 (same as Abs). + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: a, + branch: if a >= F::zero() { 1 } else { -1 }, + }); + } + OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => { + // Kink at integer values. switching_value = distance to + // nearest integer: zero exactly at kink points, works + // symmetrically for both approach directions. + kinks.push(crate::nonsmooth::KinkEntry { + tape_index: i as u32, + opcode: op, + switching_value: a - a.round(), + branch: if a - a.floor() < F::from(0.5).unwrap() { + 1 + } else { + -1 + }, + }); + } + _ => unreachable!(), + } + } + + crate::nonsmooth::NonsmoothInfo { kinks } + } + + /// Forward evaluation into an external buffer. + /// + /// Reads opcodes, constants, and argument indices from `self`, but writes + /// computed values into `values_buf` instead of `self.values`. This allows + /// parallel evaluation of the same tape at different inputs without cloning. + pub fn forward_into(&self, inputs: &[F], values_buf: &mut Vec) { + assert_eq!( + inputs.len(), + self.num_inputs as usize, + "wrong number of inputs" + ); + + let n = self.num_variables as usize; + values_buf.clear(); + values_buf.resize(n, F::zero()); + + // Copy constant values from the tape, then overwrite inputs. + values_buf.copy_from_slice(&self.values[..n]); + for (i, &v) in inputs.iter().enumerate() { + values_buf[i] = v; + } + + // Re-evaluate all non-Input, non-Const ops. + for i in 0..self.opcodes.len() { + match self.opcodes[i] { + OpCode::Input | OpCode::Const => continue, + OpCode::Custom => { + let [a_idx, cb_idx] = self.arg_indices[i]; + let a = values_buf[a_idx as usize]; + let b = self + .custom_second_args + .get(&(i as u32)) + .map(|&bi| values_buf[bi as usize]) + .unwrap_or(F::zero()); + values_buf[i] = self.custom_ops[cb_idx as usize].eval(a, b); + } + op => { + let [a_idx, b_idx] = self.arg_indices[i]; + let a = values_buf[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + values_buf[b_idx as usize] + } else if op == OpCode::Powi { + F::from(b_idx).unwrap_or_else(|| F::zero()) + } else { + F::zero() + }; + values_buf[i] = opcode::eval_forward(op, a, b); + } + } + } + } +} diff --git a/src/bytecode_tape/jacobian.rs b/src/bytecode_tape/jacobian.rs new file mode 100644 index 0000000..f698902 --- /dev/null +++ b/src/bytecode_tape/jacobian.rs @@ -0,0 +1,183 @@ +use std::collections::HashMap; + +use crate::dual::Dual; +use crate::float::Float; +use crate::opcode; + +impl super::BytecodeTape { + /// Compute the full Jacobian of a multi-output tape via reverse mode. + /// + /// Performs `m` reverse sweeps (one per output). Returns `J[i][j] = ∂f_i/∂x_j`. + pub fn jacobian(&mut self, inputs: &[F]) -> Vec> { + self.forward(inputs); + + let out_indices = self.all_output_indices(); + + let ni = self.num_inputs as usize; + let mut jac = Vec::with_capacity(out_indices.len()); + + for &out_idx in out_indices { + let adjoints = self.reverse(out_idx); + jac.push(adjoints[..ni].to_vec()); + } + + jac + } + + /// Vector-Jacobian product for a multi-output tape. + /// + /// Computes `wᵀ · J` where `J` is the Jacobian. More efficient than + /// computing the full Jacobian when only the weighted combination is needed. + pub fn vjp_multi(&mut self, inputs: &[F], weights: &[F]) -> Vec { + self.forward(inputs); + self.reverse_seeded(weights) + } + + /// Compute a Jacobian with forced branch choices at specified tape indices. + /// + /// For each `(tape_index, sign)` in `forced_signs`, the reverse sweep uses + /// [`forced_reverse_partials`](opcode::forced_reverse_partials) instead of the + /// standard partials at that index. + /// + /// This is the building block for Clarke subdifferential enumeration. + pub fn jacobian_limiting(&mut self, inputs: &[F], forced_signs: &[(u32, i8)]) -> Vec> { + self.forward(inputs); + + let sign_map: HashMap = forced_signs.iter().copied().collect(); + let out_indices = self.all_output_indices(); + + let ni = self.num_inputs as usize; + let mut jac = Vec::with_capacity(out_indices.len()); + + for &out_idx in out_indices { + let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map); + jac.push(adjoints[..ni].to_vec()); + } + + jac + } + + /// Compute the Clarke generalized Jacobian via limiting Jacobian enumeration. + /// + /// 1. Runs `forward_nonsmooth` to detect all kink operations and their branches. + /// 2. Identifies "active" kinks (|switching_value| < `tol`). + /// 3. Enumerates all 2^k sign combinations for the k active kinks. + /// 4. For each combination, computes a limiting Jacobian via forced reverse sweeps. + /// + /// Returns the nonsmooth info and a vector of limiting Jacobians. + /// + /// # Errors + /// + /// Returns [`crate::ClarkeError::TooManyKinks`] if the number of active kinks exceeds + /// the limit (default 20, overridden by `max_active_kinks`). + pub fn clarke_jacobian( + &mut self, + inputs: &[F], + tol: F, + max_active_kinks: Option, + ) -> Result<(crate::nonsmooth::NonsmoothInfo, Vec>>), crate::nonsmooth::ClarkeError> + { + #![allow(clippy::type_complexity)] + let info = self.forward_nonsmooth(inputs); + let active: Vec<&crate::nonsmooth::KinkEntry> = info + .active_kinks(tol) + .into_iter() + .filter(|k| opcode::has_nontrivial_subdifferential(k.opcode)) + .collect(); + let k = active.len(); + let limit = max_active_kinks.unwrap_or(20); + + if k > limit { + return Err(crate::nonsmooth::ClarkeError::TooManyKinks { count: k, limit }); + } + + let active_indices: Vec = active.iter().map(|e| e.tape_index).collect(); + + // Build sign_map from all (non-active) kinks using their natural branches, + // then override active kinks per combination. + let base_signs: HashMap = info + .kinks + .iter() + .map(|e| (e.tape_index, e.branch)) + .collect(); + + let out_indices = self.all_output_indices(); + let ni = self.num_inputs as usize; + + let num_combos = 1usize << k; + let mut jacobians = Vec::with_capacity(num_combos); + + for combo in 0..num_combos { + let mut sign_map = base_signs.clone(); + for (bit, &idx) in active_indices.iter().enumerate() { + let sign: i8 = if (combo >> bit) & 1 == 0 { 1 } else { -1 }; + sign_map.insert(idx, sign); + } + + let mut jac = Vec::with_capacity(out_indices.len()); + for &out_idx in out_indices { + let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map); + jac.push(adjoints[..ni].to_vec()); + } + jacobians.push(jac); + } + + Ok((info, jacobians)) + } + + /// Dense Jacobian via forward mode (one forward-tangent pass per input). + /// + /// More efficient than reverse mode when `num_inputs < num_outputs`. + pub fn jacobian_forward(&self, x: &[F]) -> Vec> { + let n = self.num_inputs as usize; + + let out_indices = self.all_output_indices(); + let m = out_indices.len(); + + let mut jac = vec![vec![F::zero(); n]; m]; + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf: Vec> = Vec::new(); + + // Indexing by `col` is clearer than enumerate here: col seeds the tangent direction + #[allow(clippy::needless_range_loop)] + for col in 0..n { + dual_input_buf.clear(); + dual_input_buf.extend( + (0..n).map(|i| Dual::new(x[i], if i == col { F::one() } else { F::zero() })), + ); + + self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); + + for (row_idx, &out_idx) in out_indices.iter().enumerate() { + jac[row_idx][col] = dual_vals_buf[out_idx as usize].eps; + } + } + + jac + } + + /// Dense Jacobian via cross-country (vertex) elimination. + /// + /// Builds a linearized DAG from the tape, then eliminates intermediate + /// vertices in Markowitz order. For functions where `m ≈ n` and the + /// graph has moderate connectivity, this can require fewer operations + /// than either pure forward mode (`n` passes) or reverse mode (`m` passes). + pub fn jacobian_cross_country(&mut self, inputs: &[F]) -> Vec> { + self.forward(inputs); + + let out_indices = self.all_output_indices(); + + let mut graph = crate::cross_country::LinearizedGraph::from_tape( + &self.opcodes, + &self.arg_indices, + &self.values, + self.num_inputs as usize, + out_indices, + &self.custom_ops, + &self.custom_second_args, + ); + + graph.eliminate_all(); + graph.extract_jacobian() + } +} diff --git a/src/bytecode_tape/mod.rs b/src/bytecode_tape/mod.rs index 27dbbdd..7b4666c 100644 --- a/src/bytecode_tape/mod.rs +++ b/src/bytecode_tape/mod.rs @@ -11,20 +11,30 @@ //! branches (`if x > 0 { ... } else { ... }`), re-evaluating at inputs that //! take a different branch produces incorrect results. -use std::cell::Cell; use std::collections::HashMap; use std::sync::Arc; use crate::dual::Dual; -use crate::dual_vec::DualVec; use crate::float::Float; use crate::opcode::{self, OpCode, UNUSED}; -use crate::float::IsAllZero; -use num_traits::Float as NumFloat; +// Submodules — each adds impl blocks to BytecodeTape +mod forward; +mod jacobian; +mod optimize; +mod reverse; +mod sparse; +mod tangent; #[cfg(feature = "taylor")] -use crate::taylor::Taylor; +mod taylor; +#[cfg(feature = "parallel")] +mod parallel; +#[cfg(feature = "serde")] +mod serde_support; + +mod thread_local; +pub use self::thread_local::{with_active_btape, BtapeGuard, BtapeThreadLocal}; /// Sentinel index for constant entries (not tracked). pub const CONSTANT: u32 = u32::MAX; @@ -93,19 +103,19 @@ pub struct CustomOpHandle(pub(crate) u16); /// Created via [`crate::api::record`]. After recording, call [`forward`](Self::forward) /// to re-evaluate and [`reverse`](Self::reverse) to compute adjoints. pub struct BytecodeTape { - opcodes: Vec, - arg_indices: Vec<[u32; 2]>, - values: Vec, - num_inputs: u32, - num_variables: u32, - output_index: u32, + pub(crate) opcodes: Vec, + pub(crate) arg_indices: Vec<[u32; 2]>, + pub(crate) values: Vec, + pub(crate) num_inputs: u32, + pub(crate) num_variables: u32, + pub(crate) output_index: u32, /// Indices of multiple output variables (empty = single-output mode). - output_indices: Vec, + pub(crate) output_indices: Vec, /// Registered custom operations (callback table). - custom_ops: Vec>>, + pub(crate) custom_ops: Vec>>, /// Second operand index for binary custom ops (sparse side table). /// Maps tape index → second operand index. - custom_second_args: HashMap, + pub(crate) custom_second_args: HashMap, } impl BytecodeTape { @@ -462,2144 +472,10 @@ impl BytecodeTape { pub fn has_custom_ops(&self) -> bool { !self.custom_ops.is_empty() } - - /// Reverse sweep with weighted seeds for multiple outputs. - /// - /// Computes `∑_i weights[i] * ∂output_i/∂x` — a vector-Jacobian product. - /// - /// Returns the gradient with respect to all inputs (length [`num_inputs`](Self::num_inputs)). - pub fn reverse_seeded(&self, seeds: &[F]) -> Vec { - let out_indices = self.all_output_indices(); - - assert_eq!( - seeds.len(), - out_indices.len(), - "seeds length must match number of outputs" - ); - - let ni = self.num_inputs as usize; - let adjoints = self.reverse_seeded_full(seeds, out_indices); - adjoints[..ni].to_vec() - } - - /// Compute the full Jacobian of a multi-output tape via reverse mode. - /// - /// Performs `m` reverse sweeps (one per output). Returns `J[i][j] = ∂f_i/∂x_j`. - pub fn jacobian(&mut self, inputs: &[F]) -> Vec> { - self.forward(inputs); - - let out_indices = self.all_output_indices(); - - let ni = self.num_inputs as usize; - let mut jac = Vec::with_capacity(out_indices.len()); - - for &out_idx in out_indices { - let adjoints = self.reverse(out_idx); - jac.push(adjoints[..ni].to_vec()); - } - - jac - } - - /// Vector-Jacobian product for a multi-output tape. - /// - /// Computes `wᵀ · J` where `J` is the Jacobian. More efficient than - /// computing the full Jacobian when only the weighted combination is needed. - pub fn vjp_multi(&mut self, inputs: &[F], weights: &[F]) -> Vec { - self.forward(inputs); - self.reverse_seeded(weights) - } - - /// Compute a Jacobian with forced branch choices at specified tape indices. - /// - /// For each `(tape_index, sign)` in `forced_signs`, the reverse sweep uses - /// [`forced_reverse_partials`](opcode::forced_reverse_partials) instead of the - /// standard partials at that index. - /// - /// This is the building block for Clarke subdifferential enumeration. - pub fn jacobian_limiting(&mut self, inputs: &[F], forced_signs: &[(u32, i8)]) -> Vec> { - self.forward(inputs); - - let sign_map: HashMap = forced_signs.iter().copied().collect(); - let out_indices = self.all_output_indices(); - - let ni = self.num_inputs as usize; - let mut jac = Vec::with_capacity(out_indices.len()); - - for &out_idx in out_indices { - let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map); - jac.push(adjoints[..ni].to_vec()); - } - - jac - } - - /// Compute the Clarke generalized Jacobian via limiting Jacobian enumeration. - /// - /// 1. Runs `forward_nonsmooth` to detect all kink operations and their branches. - /// 2. Identifies "active" kinks (|switching_value| < `tol`). - /// 3. Enumerates all 2^k sign combinations for the k active kinks. - /// 4. For each combination, computes a limiting Jacobian via forced reverse sweeps. - /// - /// Returns the nonsmooth info and a vector of limiting Jacobians. - /// - /// # Errors - /// - /// Returns [`crate::ClarkeError::TooManyKinks`] if the number of active kinks exceeds - /// the limit (default 20, overridden by `max_active_kinks`). - pub fn clarke_jacobian( - &mut self, - inputs: &[F], - tol: F, - max_active_kinks: Option, - ) -> Result<(crate::nonsmooth::NonsmoothInfo, Vec>>), crate::nonsmooth::ClarkeError> - { - #![allow(clippy::type_complexity)] - let info = self.forward_nonsmooth(inputs); - let active: Vec<&crate::nonsmooth::KinkEntry> = info - .active_kinks(tol) - .into_iter() - .filter(|k| opcode::has_nontrivial_subdifferential(k.opcode)) - .collect(); - let k = active.len(); - let limit = max_active_kinks.unwrap_or(20); - - if k > limit { - return Err(crate::nonsmooth::ClarkeError::TooManyKinks { count: k, limit }); - } - - let active_indices: Vec = active.iter().map(|e| e.tape_index).collect(); - - // Build sign_map from all (non-active) kinks using their natural branches, - // then override active kinks per combination. - let base_signs: HashMap = info - .kinks - .iter() - .map(|e| (e.tape_index, e.branch)) - .collect(); - - let out_indices = self.all_output_indices(); - let ni = self.num_inputs as usize; - - let num_combos = 1usize << k; - let mut jacobians = Vec::with_capacity(num_combos); - - for combo in 0..num_combos { - let mut sign_map = base_signs.clone(); - for (bit, &idx) in active_indices.iter().enumerate() { - let sign: i8 = if (combo >> bit) & 1 == 0 { 1 } else { -1 }; - sign_map.insert(idx, sign); - } - - let mut jac = Vec::with_capacity(out_indices.len()); - for &out_idx in out_indices { - let adjoints = self.reverse_with_forced_signs(out_idx, &sign_map); - jac.push(adjoints[..ni].to_vec()); - } - jacobians.push(jac); - } - - Ok((info, jacobians)) - } - - /// Re-evaluate the tape at new inputs (forward sweep). - /// - /// Overwrites `values` in-place — no allocation. - pub fn forward(&mut self, inputs: &[F]) { - assert_eq!( - inputs.len(), - self.num_inputs as usize, - "wrong number of inputs" - ); - - // Overwrite input values. - for (i, &v) in inputs.iter().enumerate() { - self.values[i] = v; - } - - // Re-evaluate all non-Input, non-Const ops. - for i in 0..self.opcodes.len() { - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b = self - .custom_second_args - .get(&(i as u32)) - .map(|&bi| self.values[bi as usize]) - .unwrap_or(F::zero()); - self.values[i] = self.custom_ops[cb_idx as usize].eval(a, b); - } - op => { - let [a_idx, b_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - self.values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - self.values[i] = opcode::eval_forward(op, a, b); - } - } - } - } - - /// Forward sweep with nonsmooth branch tracking. - /// - /// Calls [`forward`](Self::forward) to evaluate the tape, then scans for - /// nonsmooth operations and records which branch was taken at each one. - /// - /// Tracked operations: - /// - `Abs`, `Min`, `Max` — kinks with nontrivial subdifferentials - /// - `Signum`, `Floor`, `Ceil`, `Round`, `Trunc` — step-function - /// discontinuities (zero derivative on both sides, tracked for proximity - /// detection only) - /// - /// Returns [`crate::NonsmoothInfo`] containing all kink entries in tape order. - pub fn forward_nonsmooth(&mut self, inputs: &[F]) -> crate::nonsmooth::NonsmoothInfo { - self.forward(inputs); - - let mut kinks = Vec::new(); - for i in 0..self.opcodes.len() { - let op = self.opcodes[i]; - if !opcode::is_nonsmooth(op) { - continue; - } - - let [a_idx, b_idx] = self.arg_indices[i]; - let a = self.values[a_idx as usize]; - - match op { - OpCode::Abs => { - kinks.push(crate::nonsmooth::KinkEntry { - tape_index: i as u32, - opcode: op, - switching_value: a, - branch: if a >= F::zero() { 1 } else { -1 }, - }); - } - OpCode::Max => { - let b = self.values[b_idx as usize]; - kinks.push(crate::nonsmooth::KinkEntry { - tape_index: i as u32, - opcode: op, - switching_value: a - b, - branch: if a >= b { 1 } else { -1 }, - }); - } - OpCode::Min => { - let b = self.values[b_idx as usize]; - kinks.push(crate::nonsmooth::KinkEntry { - tape_index: i as u32, - opcode: op, - switching_value: a - b, - branch: if a <= b { 1 } else { -1 }, - }); - } - OpCode::Signum => { - // Kink at x = 0 (same as Abs). - kinks.push(crate::nonsmooth::KinkEntry { - tape_index: i as u32, - opcode: op, - switching_value: a, - branch: if a >= F::zero() { 1 } else { -1 }, - }); - } - OpCode::Floor | OpCode::Ceil | OpCode::Round | OpCode::Trunc => { - // Kink at integer values. switching_value = distance to - // nearest integer: zero exactly at kink points, works - // symmetrically for both approach directions. - kinks.push(crate::nonsmooth::KinkEntry { - tape_index: i as u32, - opcode: op, - switching_value: a - a.round(), - branch: if a - a.floor() < F::from(0.5).unwrap() { - 1 - } else { - -1 - }, - }); - } - _ => unreachable!(), - } - } - - crate::nonsmooth::NonsmoothInfo { kinks } - } - - /// Core reverse sweep loop shared by all scalar reverse sweep variants. - /// - /// Expects `adjoints` to be pre-seeded by the caller (length = `num_variables`). - /// Reads primal values from `values` (either `self.values` or an external buffer). - /// When `forced_signs` is `Some`, uses forced partials at matching tape indices. - fn reverse_sweep_core( - &self, - adjoints: &mut [F], - values: &[F], - forced_signs: Option<&HashMap>, - ) { - for i in (0..self.opcodes.len()).rev() { - let adj = adjoints[i]; - if adj == F::zero() { - continue; - } - - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - adjoints[i] = F::zero(); - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = values[a_idx as usize]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let b = b_idx_opt.map(|bi| values[bi as usize]).unwrap_or(F::zero()); - let r = values[i]; - let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r); - adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if let Some(bi) = b_idx_opt { - adjoints[bi as usize] = adjoints[bi as usize] + db * adj; - } - } - op => { - adjoints[i] = F::zero(); - let [a_idx, b_idx] = self.arg_indices[i]; - let a = values[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - values[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - let r = values[i]; - - let (da, db) = match forced_signs.and_then(|fs| fs.get(&(i as u32))) { - Some(&sign) => opcode::forced_reverse_partials(op, a, b, r, sign), - None => opcode::reverse_partials(op, a, b, r), - }; - - adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { - adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj; - } - } - } - } - } - - /// Reverse sweep: compute adjoints seeded at the output. - /// - /// Returns the full adjoint vector (length = `num_variables`). - pub fn reverse(&self, seed_index: u32) -> Vec { - let n = self.num_variables as usize; - let mut adjoints = vec![F::zero(); n]; - adjoints[seed_index as usize] = F::one(); - self.reverse_sweep_core(&mut adjoints, &self.values, None); - adjoints - } - - /// Reverse sweep with forced branch choices at specified tape indices. - fn reverse_with_forced_signs( - &self, - seed_index: u32, - forced_signs: &HashMap, - ) -> Vec { - let n = self.num_variables as usize; - let mut adjoints = vec![F::zero(); n]; - adjoints[seed_index as usize] = F::one(); - self.reverse_sweep_core(&mut adjoints, &self.values, Some(forced_signs)); - adjoints - } - - /// Forward evaluation into an external buffer. - /// - /// Reads opcodes, constants, and argument indices from `self`, but writes - /// computed values into `values_buf` instead of `self.values`. This allows - /// parallel evaluation of the same tape at different inputs without cloning. - pub fn forward_into(&self, inputs: &[F], values_buf: &mut Vec) { - assert_eq!( - inputs.len(), - self.num_inputs as usize, - "wrong number of inputs" - ); - - let n = self.num_variables as usize; - values_buf.clear(); - values_buf.resize(n, F::zero()); - - // Copy constant values from the tape, then overwrite inputs. - values_buf.copy_from_slice(&self.values[..n]); - for (i, &v) in inputs.iter().enumerate() { - values_buf[i] = v; - } - - // Re-evaluate all non-Input, non-Const ops. - for i in 0..self.opcodes.len() { - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - let [a_idx, cb_idx] = self.arg_indices[i]; - let a = values_buf[a_idx as usize]; - let b = self - .custom_second_args - .get(&(i as u32)) - .map(|&bi| values_buf[bi as usize]) - .unwrap_or(F::zero()); - values_buf[i] = self.custom_ops[cb_idx as usize].eval(a, b); - } - op => { - let [a_idx, b_idx] = self.arg_indices[i]; - let a = values_buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - values_buf[b_idx as usize] - } else if op == OpCode::Powi { - F::from(b_idx).unwrap_or_else(|| F::zero()) - } else { - F::zero() - }; - values_buf[i] = opcode::eval_forward(op, a, b); - } - } - } - } - - /// Reverse sweep reading from an external values buffer. - /// - /// Like [`reverse`](Self::reverse) but reads primal values from `values` - /// instead of `self.values`. Pair with [`forward_into`](Self::forward_into) - /// for parallel evaluation. - pub fn reverse_from(&self, values: &[F], seed_index: u32) -> Vec { - let n = self.num_variables as usize; - assert_eq!(values.len(), n, "values buffer has wrong length"); - let mut adjoints = vec![F::zero(); n]; - adjoints[seed_index as usize] = F::one(); - self.reverse_sweep_core(&mut adjoints, values, None); - adjoints - } - - /// Forward + reverse: compute the gradient at new inputs. - /// - /// Returns only the input adjoints (indices `0..num_inputs`). - pub fn gradient(&mut self, inputs: &[F]) -> Vec { - self.forward(inputs); - let adjoints = self.reverse(self.output_index); - adjoints[..self.num_inputs as usize].to_vec() - } - - /// Like [`gradient`](Self::gradient) but reuses a caller-provided buffer - /// for the adjoint vector, avoiding allocation on repeated calls. - pub fn gradient_with_buf(&mut self, inputs: &[F], adjoint_buf: &mut Vec) -> Vec { - self.forward(inputs); - - let n = self.num_variables as usize; - adjoint_buf.clear(); - adjoint_buf.resize(n, F::zero()); - adjoint_buf[self.output_index as usize] = F::one(); - - self.reverse_sweep_core(adjoint_buf, &self.values, None); - adjoint_buf[..self.num_inputs as usize].to_vec() - } - - // ── Forward-over-reverse (second-order) ── - - /// Forward sweep with tangent-carrying numbers. Reads opcodes and constants - /// from `self`, writing results into `buf`. Does not mutate the tape. - /// - /// Generic over `T: NumFloat` so it works with both `Dual` and - /// `DualVec`. - pub fn forward_tangent(&self, inputs: &[T], buf: &mut Vec) { - self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| { - // First-order chain rule: evaluate on primals, convert partials to T. - let [a_idx, cb_idx] = self.arg_indices[i]; - let a_primal = self.values[a_idx as usize]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let b_primal = b_idx_opt - .map(|bi| self.values[bi as usize]) - .unwrap_or(F::zero()); - let result = self.custom_ops[cb_idx as usize].eval(a_primal, b_primal); - let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, result); - let result_t = T::from(result).unwrap(); - let da_t = T::from(da).unwrap(); - let db_t = T::from(db).unwrap(); - let a_re_t = T::from(a_primal).unwrap(); - let b_re_t = T::from(b_primal).unwrap(); - result_t + da_t * (a_t - a_re_t) + db_t * (b_t - b_re_t) - }); - } - - /// Forward sweep specialized for `Dual`, calling [`CustomOp::eval_dual`] - /// so that custom ops propagate tangent information for second-order derivatives. - fn forward_tangent_dual(&self, inputs: &[Dual], buf: &mut Vec>) { - self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| { - let [_a_idx, cb_idx] = self.arg_indices[i]; - self.custom_ops[cb_idx as usize].eval_dual(a_t, b_t) - }); - } - - /// Common forward-tangent loop. The `handle_custom` closure receives - /// `(tape_index, a_value, b_value)` for custom op slots and returns - /// the result to store. - fn forward_tangent_inner( - &self, - inputs: &[T], - buf: &mut Vec, - handle_custom: impl Fn(usize, T, T) -> T, - ) { - assert_eq!( - inputs.len(), - self.num_inputs as usize, - "wrong number of inputs" - ); - - let n = self.num_variables as usize; - buf.clear(); - buf.resize(n, T::zero()); - - let mut input_idx = 0usize; - for i in 0..self.opcodes.len() { - match self.opcodes[i] { - OpCode::Input => { - buf[i] = inputs[input_idx]; - input_idx += 1; - } - OpCode::Const => { - buf[i] = T::from(self.values[i]).unwrap(); - } - OpCode::Custom => { - let [a_idx, _cb_idx] = self.arg_indices[i]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let a_t = buf[a_idx as usize]; - let b_t = b_idx_opt.map(|bi| buf[bi as usize]).unwrap_or(T::zero()); - buf[i] = handle_custom(i, a_t, b_t); - } - op => { - let [a_idx, b_idx] = self.arg_indices[i]; - let a = buf[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - buf[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) - } else { - T::zero() - }; - buf[i] = opcode::eval_forward(op, a, b); - } - } - } - } - - /// Reverse sweep with tangent-carrying adjoints. Uses values from - /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to - /// safely skip zero adjoints without dropping tangent contributions. - fn reverse_tangent(&self, tangent_vals: &[T], buf: &mut Vec) { - self.reverse_tangent_inner(tangent_vals, buf, |i| { - // First-order: convert primal-float partials to T. - let [a_idx, cb_idx] = self.arg_indices[i]; - let a_primal = self.values[a_idx as usize]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let b_primal = b_idx_opt - .map(|bi| self.values[bi as usize]) - .unwrap_or(F::zero()); - let r_primal = self.values[i]; - let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, r_primal); - (T::from(da).unwrap(), T::from(db).unwrap()) - }); - } - - /// Reverse sweep specialized for `Dual`, calling [`CustomOp::partials_dual`] - /// so that custom op partials carry tangent information for second-order derivatives. - fn reverse_tangent_dual(&self, tangent_vals: &[Dual], buf: &mut Vec>) { - self.reverse_tangent_inner(tangent_vals, buf, |i| { - let [a_idx, cb_idx] = self.arg_indices[i]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let a_dual = tangent_vals[a_idx as usize]; - let b_dual = b_idx_opt - .map(|bi| tangent_vals[bi as usize]) - .unwrap_or(Dual::constant(F::zero())); - let r_dual = tangent_vals[i]; - self.custom_ops[cb_idx as usize].partials_dual(a_dual, b_dual, r_dual) - }); - } - - /// Common reverse-tangent loop. The `custom_partials` closure receives - /// `tape_index` for custom op slots and returns `(da, db)` as T-valued partials. - fn reverse_tangent_inner( - &self, - tangent_vals: &[T], - buf: &mut Vec, - custom_partials: impl Fn(usize) -> (T, T), - ) { - let n = self.num_variables as usize; - buf.clear(); - buf.resize(n, T::zero()); - buf[self.output_index as usize] = T::one(); - - for i in (0..self.opcodes.len()).rev() { - match self.opcodes[i] { - OpCode::Input | OpCode::Const => continue, - OpCode::Custom => { - let adj = buf[i]; - if adj.is_all_zero() { - continue; - } - buf[i] = T::zero(); - - let [a_idx, _cb_idx] = self.arg_indices[i]; - let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); - let (da_t, db_t) = custom_partials(i); - buf[a_idx as usize] = buf[a_idx as usize] + da_t * adj; - if let Some(bi) = b_idx_opt { - buf[bi as usize] = buf[bi as usize] + db_t * adj; - } - } - op => { - let adj = buf[i]; - if adj.is_all_zero() { - continue; - } - buf[i] = T::zero(); - - let [a_idx, b_idx] = self.arg_indices[i]; - let a = tangent_vals[a_idx as usize]; - let b = if b_idx != UNUSED && op != OpCode::Powi { - tangent_vals[b_idx as usize] - } else if op == OpCode::Powi { - T::from(b_idx).unwrap_or_else(|| T::zero()) - } else { - T::zero() - }; - let r = tangent_vals[i]; - let (da, db) = opcode::reverse_partials(op, a, b, r); - - buf[a_idx as usize] = buf[a_idx as usize] + da * adj; - if b_idx != UNUSED && op != OpCode::Powi { - buf[b_idx as usize] = buf[b_idx as usize] + db * adj; - } - } - } - } - } - - /// Hessian-vector product via forward-over-reverse. - /// - /// Returns `(gradient, H·v)` where both are `Vec` of length - /// [`num_inputs`](Self::num_inputs). The tape is not mutated. - pub fn hvp(&self, x: &[F], v: &[F]) -> (Vec, Vec) { - let mut dual_vals = Vec::new(); - let mut adjoint_buf = Vec::new(); - self.hvp_with_buf(x, v, &mut dual_vals, &mut adjoint_buf) - } - - /// Like [`hvp`](Self::hvp) but reuses caller-provided buffers to avoid - /// allocation on repeated calls (e.g. inside [`hessian`](Self::hessian)). - pub fn hvp_with_buf( - &self, - x: &[F], - v: &[F], - dual_vals_buf: &mut Vec>, - adjoint_buf: &mut Vec>, - ) -> (Vec, Vec) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - assert_eq!(v.len(), n, "wrong number of directions"); - - let dual_inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| Dual::new(xi, vi)) - .collect(); - - self.forward_tangent_dual(&dual_inputs, dual_vals_buf); - self.reverse_tangent_dual(dual_vals_buf, adjoint_buf); - - let gradient: Vec = (0..n).map(|i| adjoint_buf[i].re).collect(); - let hvp: Vec = (0..n).map(|i| adjoint_buf[i].eps).collect(); - (gradient, hvp) - } - - /// Like [`hvp_with_buf`](Self::hvp_with_buf) but also reuses a caller-provided - /// input buffer, eliminating all allocations on repeated calls. - fn hvp_with_all_bufs( - &self, - x: &[F], - v: &[F], - dual_input_buf: &mut Vec>, - dual_vals_buf: &mut Vec>, - adjoint_buf: &mut Vec>, - ) { - // Reuse the input buffer instead of allocating - dual_input_buf.clear(); - dual_input_buf.extend(x.iter().zip(v.iter()).map(|(&xi, &vi)| Dual::new(xi, vi))); - - self.forward_tangent_dual(dual_input_buf, dual_vals_buf); - self.reverse_tangent_dual(dual_vals_buf, adjoint_buf); - } - - /// Full Hessian matrix via `n` Hessian-vector products. - /// - /// Returns `(value, gradient, hessian)` where `hessian[i][j] = ∂²f/∂x_i∂x_j`. - /// The tape is not mutated. - pub fn hessian(&self, x: &[F]) -> (F, Vec, Vec>) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - let mut hessian = vec![vec![F::zero(); n]; n]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - for j in 0..n { - // Reuse input buffer - dual_input_buf.clear(); - dual_input_buf - .extend((0..n).map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() }))); - - self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf); - self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf); - - if j == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - for (row, adj) in hessian.iter_mut().zip(adjoint_buf.iter()) { - row[j] = adj.eps; - } - } - - (value, gradient, hessian) - } - - /// Full Hessian matrix via batched forward-over-reverse. - /// - /// Processes `ceil(n/N)` batches instead of `n` individual HVPs, - /// computing N Hessian columns simultaneously. - pub fn hessian_vec(&self, x: &[F]) -> (F, Vec, Vec>) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf: Vec> = Vec::new(); - let mut adjoint_buf: Vec> = Vec::new(); - let mut hessian = vec![vec![F::zero(); n]; n]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - let num_batches = n.div_ceil(N); - for batch in 0..num_batches { - let base = batch * N; - - // Reuse input buffer - dual_input_buf.clear(); - dual_input_buf.extend((0..n).map(|i| { - let eps = std::array::from_fn(|lane| { - let col = base + lane; - if col < n && i == col { - F::one() - } else { - F::zero() - } - }); - DualVec::new(x[i], eps) - })); - - self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); - self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf); - - if batch == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - for lane in 0..N { - let col = base + lane; - if col >= n { - break; - } - for i in 0..n { - hessian[i][col] = adjoint_buf[i].eps[lane]; - } - } - } - - (value, gradient, hessian) - } - - /// Detect the structural sparsity pattern of the Hessian. - /// - /// Walks the tape forward propagating input-dependency bitsets. - /// At nonlinear operations, marks cross-pairs as potential Hessian interactions. - pub fn detect_sparsity(&self) -> crate::sparse::SparsityPattern { - crate::sparse::detect_sparsity_impl( - &self.opcodes, - &self.arg_indices, - self.num_inputs as usize, - self.num_variables as usize, - ) - } - - /// Detect the structural sparsity pattern of the Jacobian. - /// - /// Walks the tape forward propagating input-dependency bitsets (first-order). - /// For each output, determines which inputs it depends on. - pub fn detect_jacobian_sparsity(&self) -> crate::sparse::JacobianSparsityPattern { - let out_indices = self.all_output_indices(); - crate::sparse::detect_jacobian_sparsity_impl( - &self.opcodes, - &self.arg_indices, - self.num_inputs as usize, - self.num_variables as usize, - out_indices, - ) - } - - /// Compute a sparse Hessian using structural sparsity detection and graph coloring. - /// - /// Returns `(value, gradient, pattern, hessian_values)` where - /// `hessian_values[k]` corresponds to `(pattern.rows[k], pattern.cols[k])`. - /// - /// For problems with sparse Hessians, this requires only `chromatic_number` - /// HVP calls instead of `n`, which can be dramatically fewer for banded - /// or sparse interaction structures. - pub fn sparse_hessian(&self, x: &[F]) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let pattern = self.detect_sparsity(); - let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - let mut hessian_values = vec![F::zero(); pattern.nnz()]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - let mut v = vec![F::zero(); n]; - - for color in 0..num_colors { - // Form direction vector: v[i] = 1 if colors[i] == color, else 0 - for i in 0..n { - v[i] = if colors[i] == color { - F::one() - } else { - F::zero() - }; - } - - self.hvp_with_all_bufs( - x, - &v, - &mut dual_input_buf, - &mut dual_vals_buf, - &mut adjoint_buf, - ); - - if color == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - // Extract Hessian entries for this color. - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - if colors[col as usize] == color { - hessian_values[k] = adjoint_buf[row as usize].eps; - } - } - } - - (value, gradient, pattern, hessian_values) - } - - /// Batched sparse Hessian: packs N colors per sweep using DualVec. - /// - /// Reduces the number of forward+reverse sweeps from `num_colors` to - /// `ceil(num_colors / N)`. Each sweep processes N colors simultaneously. - /// - /// Returns `(value, gradient, pattern, hessian_values)`. - pub fn sparse_hessian_vec( - &self, - x: &[F], - ) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let pattern = self.detect_sparsity(); - let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - let mut hessian_values = vec![F::zero(); pattern.nnz()]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf: Vec> = Vec::new(); - let mut adjoint_buf: Vec> = Vec::new(); - - let num_batches = (num_colors as usize).div_ceil(N); - for batch in 0..num_batches { - let base_color = (batch * N) as u32; - - // Build DualVec inputs: lane k has v[i]=1 if colors[i] == base_color+k - dual_input_buf.clear(); - dual_input_buf.extend((0..n).map(|i| { - let eps = std::array::from_fn(|lane| { - let target_color = base_color + lane as u32; - if target_color < num_colors && colors[i] == target_color { - F::one() - } else { - F::zero() - } - }); - DualVec::new(x[i], eps) - })); - - self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); - self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf); - - if batch == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - // Extract Hessian entries: for entry (row, col) with colors[col] == base_color+lane, - // read adjoint_buf[row].eps[lane] - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let col_color = colors[col as usize]; - if col_color >= base_color && col_color < base_color + N as u32 { - let lane = (col_color - base_color) as usize; - hessian_values[k] = adjoint_buf[row as usize].eps[lane]; - } - } - } - - (value, gradient, pattern, hessian_values) - } - - // ── Batch evaluation ── - - /// Evaluate the gradient at multiple input points. - /// - /// Returns one gradient vector per input point. - pub fn gradient_batch(&mut self, inputs: &[&[F]]) -> Vec> { - inputs.iter().map(|x| self.gradient(x)).collect() - } - - // ── Higher-order derivatives ── - - /// Third-order directional derivative: `∑_{jk} (∂³f/∂x_i∂x_j∂x_k) v1_j v2_k`. - /// - /// Given directions `v1` and `v2`, computes: - /// - `gradient`: `∇f(x)` - /// - `hvp`: `H(x) · v1` (Hessian-vector product) - /// - `third`: `(∂/∂v2)(H · v1)` (third-order tensor contracted with v1 and v2) - /// - /// Uses `Dual>` (nested dual numbers): inner tangent for `v1`, - /// outer tangent for `v2`. - pub fn third_order_hvvp(&self, x: &[F], v1: &[F], v2: &[F]) -> (Vec, Vec, Vec) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - assert_eq!(v1.len(), n, "wrong v1 length"); - assert_eq!(v2.len(), n, "wrong v2 length"); - - // Build Dual> inputs: - // inner.re = x[i], inner.eps = v1[i] (for HVP direction) - // outer.re = inner, outer.eps = v2[i] (for third-order direction) - // - // outer = Dual { re: Dual(x[i], v1[i]), eps: Dual(v2[i], 0) } - let dd_inputs: Vec>> = (0..n) - .map(|i| Dual { - re: Dual::new(x[i], v1[i]), - eps: Dual::new(v2[i], F::zero()), - }) - .collect(); - - let mut dd_vals: Vec>> = Vec::new(); - let mut dd_adj: Vec>> = Vec::new(); - - self.forward_tangent(&dd_inputs, &mut dd_vals); - self.reverse_tangent(&dd_vals, &mut dd_adj); - - let gradient: Vec = (0..n).map(|i| dd_adj[i].re.re).collect(); - let hvp: Vec = (0..n).map(|i| dd_adj[i].re.eps).collect(); - let third: Vec = (0..n).map(|i| dd_adj[i].eps.eps).collect(); - - (gradient, hvp, third) - } - - /// Forward-reverse Taylor pass for gradient + higher-order directional adjoints. - /// - /// Builds Taylor inputs `x_i(t) = x_i + v_i * t` (with zero higher coefficients), - /// runs `forward_tangent`, then `reverse_tangent` to get Taylor-valued adjoints. - /// - /// Returns `(output, adjoints)` where: - /// - `output` is the Taylor expansion of `f` along direction `v` - /// - `adjoints[i].coeff(0)` = `∂f/∂x_i` (gradient) - /// - `adjoints[i].coeff(1)` = `Σ_j (∂²f/∂x_i∂x_j) v_j` (HVP) - /// - `adjoints[i].derivative(k)` = k-th order directional adjoint - /// - /// For K=2, the HVP component is equivalent to [`hvp`](Self::hvp). - /// For K≥3, yields additional higher-order information in the same pass. - /// - /// Like [`hvp`](Self::hvp), takes `&self` and does not call `forward(x)` - /// before the Taylor pass. Custom ops will use primal values from recording time. - #[cfg(feature = "taylor")] - pub fn taylor_grad( - &self, - x: &[F], - v: &[F], - ) -> (Taylor, Vec>) { - let mut fwd_buf = Vec::new(); - let mut adj_buf = Vec::new(); - self.taylor_grad_with_buf(x, v, &mut fwd_buf, &mut adj_buf) - } - - /// Like [`taylor_grad`](Self::taylor_grad) but reuses caller-provided buffers - /// to avoid allocation on repeated calls. - #[cfg(feature = "taylor")] - pub fn taylor_grad_with_buf( - &self, - x: &[F], - v: &[F], - fwd_buf: &mut Vec>, - adj_buf: &mut Vec>, - ) -> (Taylor, Vec>) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - assert_eq!(v.len(), n, "wrong number of directions"); - - // Build Taylor inputs: x_i(t) = x_i + v_i * t - let taylor_inputs: Vec> = x - .iter() - .zip(v.iter()) - .map(|(&xi, &vi)| { - let mut coeffs = [F::zero(); K]; - coeffs[0] = xi; - if K > 1 { - coeffs[1] = vi; - } - Taylor::new(coeffs) - }) - .collect(); - - self.forward_tangent(&taylor_inputs, fwd_buf); - let output = fwd_buf[self.output_index as usize]; - self.reverse_tangent(fwd_buf, adj_buf); - - (output, adj_buf[..n].to_vec()) - } - - // ── ODE Taylor integration ── - - /// Compute the Taylor expansion of the ODE solution `y(t)` to order K. - /// - /// Given a tape representing the right-hand side `f: R^n → R^n` of the ODE - /// `y' = f(y)`, and an initial condition `y(0) = y0`, computes the Taylor - /// coefficients `y_0, y_1, ..., y_{K-1}` such that - /// `y(t) ≈ y_0 + y_1·t + y_2·t² + ... + y_{K-1}·t^{K-1}`. - /// - /// The tape must have `num_outputs == num_inputs` (autonomous ODE: f maps R^n → R^n). - /// - /// Returns one `Taylor` per state variable. Use [`Taylor::eval_at`] to - /// evaluate at a step size `h`, or inspect coefficients for error estimation. - #[cfg(feature = "taylor")] - pub fn ode_taylor_step(&self, y0: &[F]) -> Vec> { - let mut buf = Vec::new(); - self.ode_taylor_step_with_buf(y0, &mut buf) - } - - /// Like [`ode_taylor_step`](Self::ode_taylor_step) but reuses a caller-provided - /// buffer to avoid allocation on repeated calls. - #[cfg(feature = "taylor")] - pub fn ode_taylor_step_with_buf( - &self, - y0: &[F], - buf: &mut Vec>, - ) -> Vec> { - let n = self.num_inputs as usize; - assert_eq!(y0.len(), n, "y0 length must match num_inputs"); - assert_eq!( - self.num_outputs(), - n, - "ODE tape must have num_outputs == num_inputs (f: R^n -> R^n)" - ); - - let out_indices = self.all_output_indices(); - - let mut y_coeffs = vec![[F::zero(); K]; n]; - for i in 0..n { - y_coeffs[i][0] = y0[i]; - } - - for k in 0..K - 1 { - let inputs: Vec> = (0..n).map(|i| Taylor::new(y_coeffs[i])).collect(); - - self.forward_tangent(&inputs, buf); - - let divisor = F::from(k + 1).unwrap(); - for i in 0..n { - y_coeffs[i][k + 1] = buf[out_indices[i] as usize].coeff(k) / divisor; - } - } - - (0..n).map(|i| Taylor::new(y_coeffs[i])).collect() - } - - // ── Tape optimizations ── - - /// Eliminate dead (unreachable) entries from the tape. - /// - /// Walks backward from all outputs, marks reachable entries, then compacts - /// the tape in-place with an index remap. Inputs are never removed. - /// Core DCE: reachability walk from `seeds`, compact tape, return index remap. - /// - /// Shared by [`dead_code_elimination`](Self::dead_code_elimination) and - /// [`dead_code_elimination_for_outputs`](Self::dead_code_elimination_for_outputs). - /// Callers handle output index updates after compaction. - fn dce_compact(&mut self, seeds: &[u32]) -> Vec { - let n = self.opcodes.len(); - let mut reachable = vec![false; n]; - - // Mark all inputs as reachable. - for flag in reachable.iter_mut().take(self.num_inputs as usize) { - *flag = true; - } - - let mut stack: Vec = seeds.to_vec(); - - while let Some(idx) = stack.pop() { - let i = idx as usize; - if reachable[i] { - continue; - } - reachable[i] = true; - let [a, b] = self.arg_indices[i]; - if a != UNUSED { - stack.push(a); - } - if b != UNUSED && self.opcodes[i] != OpCode::Powi { - stack.push(b); - } - } - - // Build remap: old index -> new index. - let mut remap = vec![0u32; n]; - let mut new_idx = 0u32; - for i in 0..n { - if reachable[i] { - remap[i] = new_idx; - new_idx += 1; - } - } - let new_len = new_idx as usize; - - // Compact in-place. - let mut write = 0; - for (read, &is_reachable) in reachable.iter().enumerate().take(n) { - if is_reachable { - self.opcodes[write] = self.opcodes[read]; - self.values[write] = self.values[read]; - let [a, b] = self.arg_indices[read]; - let ra = if a != UNUSED { - remap[a as usize] - } else { - UNUSED - }; - let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi { - remap[b as usize] - } else { - b - }; - self.arg_indices[write] = [ra, rb]; - write += 1; - } - } - - self.opcodes.truncate(new_len); - self.arg_indices.truncate(new_len); - self.values.truncate(new_len); - self.num_variables = new_len as u32; - - remap - } - - pub fn dead_code_elimination(&mut self) { - let mut seeds = vec![self.output_index]; - seeds.extend_from_slice(&self.output_indices); - let remap = self.dce_compact(&seeds); - self.output_index = remap[self.output_index as usize]; - for oi in &mut self.output_indices { - *oi = remap[*oi as usize]; - } - } - - /// Eliminate dead code, keeping only the specified outputs alive. - /// - /// Like [`dead_code_elimination`](Self::dead_code_elimination) but seeds - /// reachability only from `active_outputs`. After compaction, - /// `output_indices` contains only the active outputs (remapped), and - /// `output_index` is set to the first active output. - /// - /// **Note**: Like `dead_code_elimination`, this does not remap - /// `custom_second_args` (pre-existing limitation). - /// - /// # Panics - /// Panics if `active_outputs` is empty. - pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) { - assert!( - !active_outputs.is_empty(), - "active_outputs must not be empty" - ); - let remap = self.dce_compact(active_outputs); - self.output_indices = active_outputs - .iter() - .map(|&oi| remap[oi as usize]) - .collect(); - self.output_index = self.output_indices[0]; - } - - /// Common subexpression elimination. - /// - /// Deduplicates identical `(OpCode, arg0, arg1)` triples, normalising - /// argument order for commutative ops. Finishes with a DCE pass to - /// remove the now-dead duplicates. - pub fn cse(&mut self) { - use std::collections::HashMap; - - let n = self.opcodes.len(); - // Maps canonical (op, arg0, arg1) -> first index that computed it. - let mut seen: HashMap<(OpCode, u32, u32), u32> = HashMap::new(); - // remap[i] = canonical index for entry i (identity by default). - let mut remap: Vec = (0..n as u32).collect(); - - let is_commutative = |op: OpCode| -> bool { - matches!( - op, - OpCode::Add | OpCode::Mul | OpCode::Max | OpCode::Min | OpCode::Hypot - ) - }; - - for i in 0..n { - let op = self.opcodes[i]; - match op { - OpCode::Input | OpCode::Const => continue, - _ => {} - } - - let [mut a, mut b] = self.arg_indices[i]; - // Apply remap to args (except Powi exponent in b). - a = remap[a as usize]; - if b != UNUSED && op != OpCode::Powi { - b = remap[b as usize]; - } - // Update arg_indices with remapped values. - self.arg_indices[i] = [a, b]; - - // Build the canonical key. - let key = if b == UNUSED { - // Unary: hash (op, arg0) only; use UNUSED as placeholder. - (op, a, UNUSED) - } else if is_commutative(op) { - let (lo, hi) = if a <= b { (a, b) } else { (b, a) }; - (op, lo, hi) - } else { - (op, a, b) - }; - - if let Some(&canonical) = seen.get(&key) { - remap[i] = canonical; - } else { - seen.insert(key, i as u32); - } - } - - // Apply remap to all arg_indices (for entries that reference CSE'd nodes). - for i in 0..n { - let op = self.opcodes[i]; - if matches!(op, OpCode::Input | OpCode::Const) { - continue; - } - let [a, b] = self.arg_indices[i]; - let ra = remap[a as usize]; - let rb = if b != UNUSED && op != OpCode::Powi { - remap[b as usize] - } else { - b - }; - self.arg_indices[i] = [ra, rb]; - } - - // Update output indices. - self.output_index = remap[self.output_index as usize]; - for oi in &mut self.output_indices { - *oi = remap[*oi as usize]; - } - - // DCE removes the now-unreachable duplicate entries. - self.dead_code_elimination(); - } - - // ── Sparse Jacobian ── - - /// Compute a sparse Jacobian using structural sparsity detection and graph coloring. - /// - /// Auto-selects forward-mode (column compression) or reverse-mode (row compression) - /// based on which requires fewer sweeps. - /// - /// Returns `(output_values, pattern, jacobian_values)`. - pub fn sparse_jacobian( - &mut self, - x: &[F], - ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { - self.forward(x); - let pattern = self.detect_jacobian_sparsity(); - let (col_colors, num_col_colors) = crate::sparse::column_coloring(&pattern); - let (row_colors, num_row_colors) = crate::sparse::row_coloring(&pattern); - - if num_col_colors <= num_row_colors { - let jac_values = - self.sparse_jacobian_forward_impl(x, &pattern, &col_colors, num_col_colors); - let outputs = self.output_values(); - (outputs, pattern, jac_values) - } else { - let jac_values = - self.sparse_jacobian_reverse_impl(x, &pattern, &row_colors, num_row_colors); - let outputs = self.output_values(); - (outputs, pattern, jac_values) - } - } - - /// Sparse Jacobian via forward-mode (column compression). - pub fn sparse_jacobian_forward( - &mut self, - x: &[F], - ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { - self.forward(x); - let pattern = self.detect_jacobian_sparsity(); - let (colors, num_colors) = crate::sparse::column_coloring(&pattern); - let jac_values = self.sparse_jacobian_forward_impl(x, &pattern, &colors, num_colors); - let outputs = self.output_values(); - (outputs, pattern, jac_values) - } - - /// Sparse Jacobian via reverse-mode (row compression). - pub fn sparse_jacobian_reverse( - &mut self, - x: &[F], - ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { - self.forward(x); - let pattern = self.detect_jacobian_sparsity(); - let (colors, num_colors) = crate::sparse::row_coloring(&pattern); - let jac_values = self.sparse_jacobian_reverse_impl(x, &pattern, &colors, num_colors); - let outputs = self.output_values(); - (outputs, pattern, jac_values) - } - - /// Sparse Jacobian with a precomputed sparsity pattern and coloring. - /// - /// Skips re-detection of sparsity on repeated calls. Use `column_coloring` colors - /// for forward mode or `row_coloring` colors for reverse mode. The `forward` flag - /// selects the mode. - pub fn sparse_jacobian_with_pattern( - &mut self, - x: &[F], - pattern: &crate::sparse::JacobianSparsityPattern, - colors: &[u32], - num_colors: u32, - forward_mode: bool, - ) -> (Vec, Vec) { - self.forward(x); - let jac_values = if forward_mode { - self.sparse_jacobian_forward_impl(x, pattern, colors, num_colors) - } else { - self.sparse_jacobian_reverse_impl(x, pattern, colors, num_colors) - }; - let outputs = self.output_values(); - (outputs, jac_values) - } - - /// Forward-mode sparse Jacobian implementation (column compression). - /// - /// Each color group seeds a forward pass with tangent 1.0 in the columns - /// sharing that color. The resulting tangent at each output gives J[row][col]. - fn sparse_jacobian_forward_impl( - &self, - x: &[F], - pattern: &crate::sparse::JacobianSparsityPattern, - colors: &[u32], - num_colors: u32, - ) -> Vec { - let n = self.num_inputs as usize; - let mut jac_values = vec![F::zero(); pattern.nnz()]; - - let out_indices = self.all_output_indices(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf: Vec> = Vec::new(); - - for color in 0..num_colors { - // Build Dual inputs: tangent = 1 for inputs with this color - dual_input_buf.clear(); - dual_input_buf.extend((0..n).map(|i| { - Dual::new( - x[i], - if colors[i] == color { - F::one() - } else { - F::zero() - }, - ) - })); - - self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); - - // Extract Jacobian entries: for entry (row, col) with colors[col] == color, - // the tangent at output_indices[row] gives J[row][col] - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - if colors[col as usize] == color { - jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps; - } - } - } - - jac_values - } - - /// Reverse-mode sparse Jacobian implementation (row compression). - /// - /// Each color group seeds a reverse pass with adjoint 1.0 at the outputs - /// sharing that color. - fn sparse_jacobian_reverse_impl( - &self, - _x: &[F], - pattern: &crate::sparse::JacobianSparsityPattern, - colors: &[u32], - num_colors: u32, - ) -> Vec { - let m = self.num_outputs(); - let mut jac_values = vec![F::zero(); pattern.nnz()]; - - let out_indices = self.all_output_indices(); - - for color in 0..num_colors { - // Build seeds: weight = 1 for outputs with this color - let seeds: Vec = (0..m) - .map(|i| { - if colors[i] == color { - F::one() - } else { - F::zero() - } - }) - .collect(); - - let adjoints = self.reverse_seeded_full(&seeds, out_indices); - - // Extract Jacobian entries: for entry (row, col) with colors[row] == color, - // adjoint[col] gives J[row][col] - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - if colors[row as usize] == color { - jac_values[k] = adjoints[col as usize]; - } - } - } - - jac_values - } - - /// Reverse sweep with weighted seeds, returning full adjoint vector. - fn reverse_seeded_full(&self, seeds: &[F], out_indices: &[u32]) -> Vec { - let n = self.num_variables as usize; - let mut adjoints = vec![F::zero(); n]; - - for (&out_idx, &weight) in out_indices.iter().zip(seeds.iter()) { - if weight == F::zero() { - continue; - } - adjoints[out_idx as usize] = adjoints[out_idx as usize] + weight; - } - - self.reverse_sweep_core(&mut adjoints, &self.values, None); - adjoints - } - - /// Batched sparse Jacobian: packs N colors per forward sweep using DualVec. - /// - /// Reduces the number of forward sweeps from `num_colors` to - /// `ceil(num_colors / N)`. - pub fn sparse_jacobian_vec( - &mut self, - x: &[F], - ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { - self.forward(x); - let pattern = self.detect_jacobian_sparsity(); - let (colors, num_colors) = crate::sparse::column_coloring(&pattern); - - let n = self.num_inputs as usize; - let mut jac_values = vec![F::zero(); pattern.nnz()]; - - let out_indices = self.all_output_indices(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf: Vec> = Vec::new(); - - let num_batches = (num_colors as usize).div_ceil(N); - for batch in 0..num_batches { - let base_color = (batch * N) as u32; - - dual_input_buf.clear(); - dual_input_buf.extend((0..n).map(|i| { - let eps = std::array::from_fn(|lane| { - let target_color = base_color + lane as u32; - if target_color < num_colors && colors[i] == target_color { - F::one() - } else { - F::zero() - } - }); - DualVec::new(x[i], eps) - })); - - self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); - - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let col_color = colors[col as usize]; - if col_color >= base_color && col_color < base_color + N as u32 { - let lane = (col_color - base_color) as usize; - jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps[lane]; - } - } - } - - let outputs = self.output_values(); - (outputs, pattern, jac_values) - } - - /// Dense Jacobian via forward mode (one forward-tangent pass per input). - /// - /// More efficient than reverse mode when `num_inputs < num_outputs`. - pub fn jacobian_forward(&self, x: &[F]) -> Vec> { - let n = self.num_inputs as usize; - - let out_indices = self.all_output_indices(); - let m = out_indices.len(); - - let mut jac = vec![vec![F::zero(); n]; m]; - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf: Vec> = Vec::new(); - - // Indexing by `col` is clearer than enumerate here: col seeds the tangent direction - #[allow(clippy::needless_range_loop)] - for col in 0..n { - dual_input_buf.clear(); - dual_input_buf.extend( - (0..n).map(|i| Dual::new(x[i], if i == col { F::one() } else { F::zero() })), - ); - - self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); - - for (row_idx, &out_idx) in out_indices.iter().enumerate() { - jac[row_idx][col] = dual_vals_buf[out_idx as usize].eps; - } - } - - jac - } - - /// Dense Jacobian via cross-country (vertex) elimination. - /// - /// Builds a linearized DAG from the tape, then eliminates intermediate - /// vertices in Markowitz order. For functions where `m ≈ n` and the - /// graph has moderate connectivity, this can require fewer operations - /// than either pure forward mode (`n` passes) or reverse mode (`m` passes). - pub fn jacobian_cross_country(&mut self, inputs: &[F]) -> Vec> { - self.forward(inputs); - - let out_indices = self.all_output_indices(); - - let mut graph = crate::cross_country::LinearizedGraph::from_tape( - &self.opcodes, - &self.arg_indices, - &self.values, - self.num_inputs as usize, - out_indices, - &self.custom_ops, - &self.custom_second_args, - ); - - graph.eliminate_all(); - graph.extract_jacobian() - } - - /// Sparse Hessian with a precomputed sparsity pattern and coloring. - /// - /// Skips re-detection on repeated calls (e.g. in solver loops). - pub fn sparse_hessian_with_pattern( - &self, - x: &[F], - pattern: &crate::sparse::SparsityPattern, - colors: &[u32], - num_colors: u32, - ) -> (F, Vec, Vec) { - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let mut hessian_values = vec![F::zero(); pattern.nnz()]; - let mut gradient = vec![F::zero(); n]; - let mut value = F::zero(); - - let mut dual_input_buf: Vec> = Vec::with_capacity(n); - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - let mut v = vec![F::zero(); n]; - - for color in 0..num_colors { - for i in 0..n { - v[i] = if colors[i] == color { - F::one() - } else { - F::zero() - }; - } - - self.hvp_with_all_bufs( - x, - &v, - &mut dual_input_buf, - &mut dual_vals_buf, - &mut adjoint_buf, - ); - - if color == 0 { - value = dual_vals_buf[self.output_index as usize].re; - for i in 0..n { - gradient[i] = adjoint_buf[i].re; - } - } - - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - if colors[col as usize] == color { - hessian_values[k] = adjoint_buf[row as usize].eps; - } - } - } - - (value, gradient, hessian_values) - } - - /// Run all tape optimizations: CSE followed by DCE. - /// - /// In debug builds, validates internal consistency after optimization. - pub fn optimize(&mut self) { - self.cse(); - self.dead_code_elimination(); - - // Validate internal consistency in debug builds. - #[cfg(debug_assertions)] - { - let n = self.opcodes.len(); - // All arg_indices must point to valid entries. - for i in 0..n { - let [a, b] = self.arg_indices[i]; - match self.opcodes[i] { - OpCode::Input | OpCode::Const => { - assert_eq!(a, UNUSED, "Input/Const should have UNUSED args"); - assert_eq!(b, UNUSED, "Input/Const should have UNUSED args"); - } - OpCode::Powi => { - assert!( - (a as usize) < n, - "Powi arg0 {} out of bounds (tape len {})", - a, - n - ); - } - _ => { - assert!( - (a as usize) < i, - "arg0 {} not before op {} (tape len {})", - a, - i, - n - ); - if b != UNUSED { - assert!( - (b as usize) < i, - "arg1 {} not before op {} (tape len {})", - b, - i, - n - ); - } - } - } - } - // output_index must be valid. - assert!( - (self.output_index as usize) < n, - "output_index {} out of bounds (tape len {})", - self.output_index, - n - ); - for &oi in &self.output_indices { - assert!( - (oi as usize) < n, - "output_indices entry {} out of bounds (tape len {})", - oi, - n - ); - } - // num_inputs must be preserved. - let input_count = self - .opcodes - .iter() - .filter(|&&op| op == OpCode::Input) - .count(); - assert_eq!( - input_count, self.num_inputs as usize, - "num_inputs mismatch after optimization" - ); - } - } -} +} impl Default for BytecodeTape { fn default() -> Self { Self::new() } } - -// ══════════════════════════════════════════════ -// Parallel methods (rayon) -// ══════════════════════════════════════════════ - -#[cfg(feature = "parallel")] -impl BytecodeTape { - /// Parallel gradient: forward + reverse using external buffers. - /// - /// Takes `&self` instead of `&mut self`, enabling shared access across threads. - pub fn gradient_par(&self, inputs: &[F]) -> Vec { - let mut values_buf = Vec::new(); - self.forward_into(inputs, &mut values_buf); - let adjoints = self.reverse_from(&values_buf, self.output_index); - adjoints[..self.num_inputs as usize].to_vec() - } - - /// Parallel Jacobian: one reverse sweep per output, parallelized. - /// - /// Returns `J[i][j] = ∂f_i/∂x_j`. - pub fn jacobian_par(&self, inputs: &[F]) -> Vec> { - use rayon::prelude::*; - - let mut values_buf = Vec::new(); - self.forward_into(inputs, &mut values_buf); - - let out_indices = self.all_output_indices(); - - let ni = self.num_inputs as usize; - out_indices - .par_iter() - .map(|&out_idx| { - let adjoints = self.reverse_from(&values_buf, out_idx); - adjoints[..ni].to_vec() - }) - .collect() - } - - /// Parallel Hessian: one HVP per column, parallelized over columns. - /// - /// Returns `(value, gradient, hessian)`. - pub fn hessian_par(&self, x: &[F]) -> (F, Vec, Vec>) { - use rayon::prelude::*; - - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - // Compute gradient and value from column 0 (serial). - let dual_input_buf: Vec> = (0..n) - .map(|i| Dual::new(x[i], if i == 0 { F::one() } else { F::zero() })) - .collect(); - let mut dual_vals_buf = Vec::new(); - let mut adjoint_buf = Vec::new(); - self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf); - self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf); - - let value = dual_vals_buf[self.output_index as usize].re; - let gradient: Vec = (0..n).map(|i| adjoint_buf[i].re).collect(); - let col0: Vec = (0..n).map(|i| adjoint_buf[i].eps).collect(); - - // Parallelize remaining columns. - let other_cols: Vec> = (1..n) - .into_par_iter() - .map(|j| { - let inputs: Vec> = (0..n) - .map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() })) - .collect(); - let mut dv = Vec::new(); - let mut ab = Vec::new(); - self.forward_tangent_dual(&inputs, &mut dv); - self.reverse_tangent_dual(&dv, &mut ab); - (0..n).map(|i| ab[i].eps).collect() - }) - .collect(); - - let mut hessian = vec![vec![F::zero(); n]; n]; - for i in 0..n { - hessian[i][0] = col0[i]; - } - for (j_minus_1, col) in other_cols.iter().enumerate() { - let j = j_minus_1 + 1; - for i in 0..n { - hessian[i][j] = col[i]; - } - } - - (value, gradient, hessian) - } - - /// Parallel sparse Hessian: parallelized over colors. - /// - /// Returns `(value, gradient, pattern, hessian_values)`. - pub fn sparse_hessian_par( - &self, - x: &[F], - ) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { - use rayon::prelude::*; - - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let pattern = self.detect_sparsity(); - let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - // Compute value/gradient from color 0 (serial). - let mut v0 = vec![F::zero(); n]; - for i in 0..n { - v0[i] = if colors[i] == 0 { F::one() } else { F::zero() }; - } - let di: Vec> = (0..n).map(|i| Dual::new(x[i], v0[i])).collect(); - let mut dv = Vec::new(); - let mut ab = Vec::new(); - self.forward_tangent_dual(&di, &mut dv); - self.reverse_tangent_dual(&dv, &mut ab); - let value = dv[self.output_index as usize].re; - let gradient: Vec = (0..n).map(|i| ab[i].re).collect(); - - // Collect all color results in parallel. - let color_results: Vec>> = (0..num_colors) - .into_par_iter() - .map(|color| { - let mut v = vec![F::zero(); n]; - for i in 0..n { - v[i] = if colors[i] == color { - F::one() - } else { - F::zero() - }; - } - let inputs: Vec> = (0..n).map(|i| Dual::new(x[i], v[i])).collect(); - let mut dv_local = Vec::new(); - let mut ab_local = Vec::new(); - self.forward_tangent_dual(&inputs, &mut dv_local); - self.reverse_tangent_dual(&dv_local, &mut ab_local); - ab_local - }) - .collect(); - - let mut hessian_values = vec![F::zero(); pattern.nnz()]; - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let color = colors[col as usize] as usize; - hessian_values[k] = color_results[color][row as usize].eps; - } - - (value, gradient, pattern, hessian_values) - } - - /// Parallel sparse Jacobian: parallelized over colors. - /// - /// Auto-selects forward (column compression) or reverse (row compression) - /// based on `num_outputs` vs `num_inputs`. - pub fn sparse_jacobian_par( - &self, - x: &[F], - ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { - use rayon::prelude::*; - - let n = self.num_inputs as usize; - assert_eq!(x.len(), n, "wrong number of inputs"); - - let mut values_buf = Vec::new(); - self.forward_into(x, &mut values_buf); - - let out_indices = self.all_output_indices(); - let m = out_indices.len(); - let outputs: Vec = out_indices - .iter() - .map(|&oi| values_buf[oi as usize]) - .collect(); - - let jac_pattern = self.detect_jacobian_sparsity(); - let ni = self.num_inputs as usize; - - if m <= n { - // Row compression (reverse mode) - let (row_colors, num_colors) = crate::sparse::row_coloring(&jac_pattern); - - let color_results: Vec> = (0..num_colors) - .into_par_iter() - .map(|color| { - let n_vars = self.num_variables as usize; - let mut adjoints = vec![F::zero(); n_vars]; - for (i, &oi) in out_indices.iter().enumerate() { - if row_colors[i] == color { - adjoints[oi as usize] = F::one(); - } - } - - self.reverse_sweep_core(&mut adjoints, &values_buf, None); - adjoints[..ni].to_vec() - }) - .collect(); - - let mut jac_values = vec![F::zero(); jac_pattern.nnz()]; - for (k, (&row, &col)) in jac_pattern - .rows - .iter() - .zip(jac_pattern.cols.iter()) - .enumerate() - { - let color = row_colors[row as usize] as usize; - jac_values[k] = color_results[color][col as usize]; - } - - (outputs, jac_pattern, jac_values) - } else { - // Column compression (forward mode) — parallelize forward tangent sweeps - let (col_colors, num_colors) = crate::sparse::column_coloring(&jac_pattern); - - let color_results: Vec> = (0..num_colors) - .into_par_iter() - .map(|color| { - let dir: Vec = (0..n) - .map(|i| { - if col_colors[i] == color { - F::one() - } else { - F::zero() - } - }) - .collect(); - // Forward tangent with F (not Dual) direction - // We need to do forward_into first, then forward tangent - // But forward_tangent uses self.values, so we need a workaround. - // Use Dual with tangent = direction. - let inputs: Vec> = (0..n).map(|i| Dual::new(x[i], dir[i])).collect(); - let mut dv = Vec::new(); - self.forward_tangent(&inputs, &mut dv); - out_indices.iter().map(|&oi| dv[oi as usize].eps).collect() - }) - .collect(); - - let mut jac_values = vec![F::zero(); jac_pattern.nnz()]; - for (k, (&row, &col)) in jac_pattern - .rows - .iter() - .zip(jac_pattern.cols.iter()) - .enumerate() - { - let color = col_colors[col as usize] as usize; - jac_values[k] = color_results[color][row as usize]; - } - - (outputs, jac_pattern, jac_values) - } - } - - /// Evaluate the gradient at multiple input points in parallel. - /// - /// Uses `forward_into` + `reverse_from` with per-thread buffers. - pub fn gradient_batch_par(&self, inputs: &[&[F]]) -> Vec> { - use rayon::prelude::*; - - let ni = self.num_inputs as usize; - let out_idx = self.output_index; - - inputs - .par_iter() - .map(|x| { - let mut values_buf = Vec::new(); - self.forward_into(x, &mut values_buf); - let adjoints = self.reverse_from(&values_buf, out_idx); - adjoints[..ni].to_vec() - }) - .collect() - } - - /// Compute Hessian at multiple input points in parallel. - /// - /// Returns `(value, gradient, hessian)` for each input point. - pub fn hessian_batch_par(&self, inputs: &[&[F]]) -> Vec<(F, Vec, Vec>)> { - use rayon::prelude::*; - - inputs.par_iter().map(|x| self.hessian_par(x)).collect() - } -} - -// ══════════════════════════════════════════════ -// Thread-local active bytecode tape -// ══════════════════════════════════════════════ -// Serde support -// ══════════════════════════════════════════════ - -#[cfg(feature = "serde")] -mod serde_support { - use super::*; - use serde::ser::SerializeStruct; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - - impl Serialize for BytecodeTape { - fn serialize(&self, serializer: S) -> Result { - if !self.custom_ops.is_empty() { - return Err(serde::ser::Error::custom( - "cannot serialize a BytecodeTape containing custom ops; \ - custom ops must be re-registered after deserialization", - )); - } - let mut s = serializer.serialize_struct("BytecodeTape", 8)?; - s.serialize_field("opcodes", &self.opcodes)?; - s.serialize_field("arg_indices", &self.arg_indices)?; - s.serialize_field("values", &self.values)?; - s.serialize_field("num_inputs", &self.num_inputs)?; - s.serialize_field("num_variables", &self.num_variables)?; - s.serialize_field("output_index", &self.output_index)?; - s.serialize_field("output_indices", &self.output_indices)?; - s.serialize_field("custom_second_args", &self.custom_second_args)?; - s.end() - } - } - - impl<'de, F: Float + Deserialize<'de>> Deserialize<'de> for BytecodeTape { - fn deserialize>(deserializer: D) -> Result { - #[derive(Deserialize)] - struct TapeData { - opcodes: Vec, - arg_indices: Vec<[u32; 2]>, - values: Vec, - num_inputs: u32, - num_variables: u32, - output_index: u32, - #[serde(default)] - output_indices: Vec, - #[serde(default)] - custom_second_args: HashMap, - } - - let data = TapeData::::deserialize(deserializer)?; - Ok(BytecodeTape { - opcodes: data.opcodes, - arg_indices: data.arg_indices, - values: data.values, - num_inputs: data.num_inputs, - num_variables: data.num_variables, - output_index: data.output_index, - output_indices: data.output_indices, - custom_ops: Vec::new(), - custom_second_args: data.custom_second_args, - }) - } - } -} - -// ══════════════════════════════════════════════ - -thread_local! { - static BTAPE_F32: Cell<*mut BytecodeTape> = const { Cell::new(std::ptr::null_mut()) }; - static BTAPE_F64: Cell<*mut BytecodeTape> = const { Cell::new(std::ptr::null_mut()) }; - static BTAPE_DUAL_F32: Cell<*mut BytecodeTape>> = const { Cell::new(std::ptr::null_mut()) }; - static BTAPE_DUAL_F64: Cell<*mut BytecodeTape>> = const { Cell::new(std::ptr::null_mut()) }; -} - -/// Trait to select the correct thread-local for a given float type. -/// -/// Implemented for `f32`, `f64`, `Dual`, and `Dual`, enabling -/// `BReverse` to be used with these base types. -pub trait BtapeThreadLocal: Float { - fn btape_cell() -> &'static std::thread::LocalKey>>; -} - -impl BtapeThreadLocal for f32 { - fn btape_cell() -> &'static std::thread::LocalKey>> { - &BTAPE_F32 - } -} - -impl BtapeThreadLocal for f64 { - fn btape_cell() -> &'static std::thread::LocalKey>> { - &BTAPE_F64 - } -} - -impl BtapeThreadLocal for Dual { - fn btape_cell() -> &'static std::thread::LocalKey>> { - &BTAPE_DUAL_F32 - } -} - -impl BtapeThreadLocal for Dual { - fn btape_cell() -> &'static std::thread::LocalKey>> { - &BTAPE_DUAL_F64 - } -} - -/// Access the active bytecode tape for the current thread. -/// Panics if no tape is active. -#[inline] -pub fn with_active_btape(f: impl FnOnce(&mut BytecodeTape) -> R) -> R { - F::btape_cell().with(|cell| { - let ptr = cell.get(); - assert!( - !ptr.is_null(), - "No active bytecode tape. Use echidna::record() to record a function." - ); - // SAFETY: BtapeGuard guarantees validity for the duration of the - // recording scope, single-threaded via thread-local. - let tape = unsafe { &mut *ptr }; - f(tape) - }) -} - -/// RAII guard that sets a bytecode tape as the thread-local active tape. -pub struct BtapeGuard { - prev: *mut BytecodeTape, -} - -impl BtapeGuard { - /// Activate `tape` as the thread-local bytecode tape. - pub fn new(tape: &mut BytecodeTape) -> Self { - let prev = F::btape_cell().with(|cell| { - let prev = cell.get(); - cell.set(tape as *mut BytecodeTape); - prev - }); - BtapeGuard { prev } - } -} - -impl Drop for BtapeGuard { - fn drop(&mut self) { - F::btape_cell().with(|cell| { - cell.set(self.prev); - }); - } -} diff --git a/src/bytecode_tape/optimize.rs b/src/bytecode_tape/optimize.rs new file mode 100644 index 0000000..9e66e3e --- /dev/null +++ b/src/bytecode_tape/optimize.rs @@ -0,0 +1,271 @@ +use std::collections::HashMap; + +use crate::float::Float; +use crate::opcode::{OpCode, UNUSED}; + +impl super::BytecodeTape { + /// Eliminate dead (unreachable) entries from the tape. + /// + /// Walks backward from all outputs, marks reachable entries, then compacts + /// the tape in-place with an index remap. Inputs are never removed. + /// Core DCE: reachability walk from `seeds`, compact tape, return index remap. + /// + /// Shared by [`dead_code_elimination`](Self::dead_code_elimination) and + /// [`dead_code_elimination_for_outputs`](Self::dead_code_elimination_for_outputs). + /// Callers handle output index updates after compaction. + fn dce_compact(&mut self, seeds: &[u32]) -> Vec { + let n = self.opcodes.len(); + let mut reachable = vec![false; n]; + + // Mark all inputs as reachable. + for flag in reachable.iter_mut().take(self.num_inputs as usize) { + *flag = true; + } + + let mut stack: Vec = seeds.to_vec(); + + while let Some(idx) = stack.pop() { + let i = idx as usize; + if reachable[i] { + continue; + } + reachable[i] = true; + let [a, b] = self.arg_indices[i]; + if a != UNUSED { + stack.push(a); + } + if b != UNUSED && self.opcodes[i] != OpCode::Powi { + stack.push(b); + } + } + + // Build remap: old index -> new index. + let mut remap = vec![0u32; n]; + let mut new_idx = 0u32; + for i in 0..n { + if reachable[i] { + remap[i] = new_idx; + new_idx += 1; + } + } + let new_len = new_idx as usize; + + // Compact in-place. + let mut write = 0; + for (read, &is_reachable) in reachable.iter().enumerate().take(n) { + if is_reachable { + self.opcodes[write] = self.opcodes[read]; + self.values[write] = self.values[read]; + let [a, b] = self.arg_indices[read]; + let ra = if a != UNUSED { + remap[a as usize] + } else { + UNUSED + }; + let rb = if b != UNUSED && self.opcodes[read] != OpCode::Powi { + remap[b as usize] + } else { + b + }; + self.arg_indices[write] = [ra, rb]; + write += 1; + } + } + + self.opcodes.truncate(new_len); + self.arg_indices.truncate(new_len); + self.values.truncate(new_len); + self.num_variables = new_len as u32; + + remap + } + + pub fn dead_code_elimination(&mut self) { + let mut seeds = vec![self.output_index]; + seeds.extend_from_slice(&self.output_indices); + let remap = self.dce_compact(&seeds); + self.output_index = remap[self.output_index as usize]; + for oi in &mut self.output_indices { + *oi = remap[*oi as usize]; + } + } + + /// Eliminate dead code, keeping only the specified outputs alive. + /// + /// Like [`dead_code_elimination`](Self::dead_code_elimination) but seeds + /// reachability only from `active_outputs`. After compaction, + /// `output_indices` contains only the active outputs (remapped), and + /// `output_index` is set to the first active output. + /// + /// **Note**: Like `dead_code_elimination`, this does not remap + /// `custom_second_args` (pre-existing limitation). + /// + /// # Panics + /// Panics if `active_outputs` is empty. + pub fn dead_code_elimination_for_outputs(&mut self, active_outputs: &[u32]) { + assert!( + !active_outputs.is_empty(), + "active_outputs must not be empty" + ); + let remap = self.dce_compact(active_outputs); + self.output_indices = active_outputs + .iter() + .map(|&oi| remap[oi as usize]) + .collect(); + self.output_index = self.output_indices[0]; + } + + /// Common subexpression elimination. + /// + /// Deduplicates identical `(OpCode, arg0, arg1)` triples, normalising + /// argument order for commutative ops. Finishes with a DCE pass to + /// remove the now-dead duplicates. + pub fn cse(&mut self) { + let n = self.opcodes.len(); + // Maps canonical (op, arg0, arg1) -> first index that computed it. + let mut seen: HashMap<(OpCode, u32, u32), u32> = HashMap::new(); + // remap[i] = canonical index for entry i (identity by default). + let mut remap: Vec = (0..n as u32).collect(); + + let is_commutative = |op: OpCode| -> bool { + matches!( + op, + OpCode::Add | OpCode::Mul | OpCode::Max | OpCode::Min | OpCode::Hypot + ) + }; + + for i in 0..n { + let op = self.opcodes[i]; + match op { + OpCode::Input | OpCode::Const => continue, + _ => {} + } + + let [mut a, mut b] = self.arg_indices[i]; + // Apply remap to args (except Powi exponent in b). + a = remap[a as usize]; + if b != UNUSED && op != OpCode::Powi { + b = remap[b as usize]; + } + // Update arg_indices with remapped values. + self.arg_indices[i] = [a, b]; + + // Build the canonical key. + let key = if b == UNUSED { + // Unary: hash (op, arg0) only; use UNUSED as placeholder. + (op, a, UNUSED) + } else if is_commutative(op) { + let (lo, hi) = if a <= b { (a, b) } else { (b, a) }; + (op, lo, hi) + } else { + (op, a, b) + }; + + if let Some(&canonical) = seen.get(&key) { + remap[i] = canonical; + } else { + seen.insert(key, i as u32); + } + } + + // Apply remap to all arg_indices (for entries that reference CSE'd nodes). + for i in 0..n { + let op = self.opcodes[i]; + if matches!(op, OpCode::Input | OpCode::Const) { + continue; + } + let [a, b] = self.arg_indices[i]; + let ra = remap[a as usize]; + let rb = if b != UNUSED && op != OpCode::Powi { + remap[b as usize] + } else { + b + }; + self.arg_indices[i] = [ra, rb]; + } + + // Update output indices. + self.output_index = remap[self.output_index as usize]; + for oi in &mut self.output_indices { + *oi = remap[*oi as usize]; + } + + // DCE removes the now-unreachable duplicate entries. + self.dead_code_elimination(); + } + + /// Run all tape optimizations: CSE followed by DCE. + /// + /// In debug builds, validates internal consistency after optimization. + pub fn optimize(&mut self) { + self.cse(); + self.dead_code_elimination(); + + // Validate internal consistency in debug builds. + #[cfg(debug_assertions)] + { + let n = self.opcodes.len(); + // All arg_indices must point to valid entries. + for i in 0..n { + let [a, b] = self.arg_indices[i]; + match self.opcodes[i] { + OpCode::Input | OpCode::Const => { + assert_eq!(a, UNUSED, "Input/Const should have UNUSED args"); + assert_eq!(b, UNUSED, "Input/Const should have UNUSED args"); + } + OpCode::Powi => { + assert!( + (a as usize) < n, + "Powi arg0 {} out of bounds (tape len {})", + a, + n + ); + } + _ => { + assert!( + (a as usize) < i, + "arg0 {} not before op {} (tape len {})", + a, + i, + n + ); + if b != UNUSED { + assert!( + (b as usize) < i, + "arg1 {} not before op {} (tape len {})", + b, + i, + n + ); + } + } + } + } + // output_index must be valid. + assert!( + (self.output_index as usize) < n, + "output_index {} out of bounds (tape len {})", + self.output_index, + n + ); + for &oi in &self.output_indices { + assert!( + (oi as usize) < n, + "output_indices entry {} out of bounds (tape len {})", + oi, + n + ); + } + // num_inputs must be preserved. + let input_count = self + .opcodes + .iter() + .filter(|&&op| op == OpCode::Input) + .count(); + assert_eq!( + input_count, self.num_inputs as usize, + "num_inputs mismatch after optimization" + ); + } + } +} diff --git a/src/bytecode_tape/parallel.rs b/src/bytecode_tape/parallel.rs new file mode 100644 index 0000000..bf30336 --- /dev/null +++ b/src/bytecode_tape/parallel.rs @@ -0,0 +1,263 @@ +use rayon::prelude::*; + +use crate::dual::Dual; +use crate::float::Float; + +impl super::BytecodeTape { + /// Parallel gradient: forward + reverse using external buffers. + /// + /// Takes `&self` instead of `&mut self`, enabling shared access across threads. + pub fn gradient_par(&self, inputs: &[F]) -> Vec { + let mut values_buf = Vec::new(); + self.forward_into(inputs, &mut values_buf); + let adjoints = self.reverse_from(&values_buf, self.output_index); + adjoints[..self.num_inputs as usize].to_vec() + } + + /// Parallel Jacobian: one reverse sweep per output, parallelized. + /// + /// Returns `J[i][j] = ∂f_i/∂x_j`. + pub fn jacobian_par(&self, inputs: &[F]) -> Vec> { + let mut values_buf = Vec::new(); + self.forward_into(inputs, &mut values_buf); + + let out_indices = self.all_output_indices(); + + let ni = self.num_inputs as usize; + out_indices + .par_iter() + .map(|&out_idx| { + let adjoints = self.reverse_from(&values_buf, out_idx); + adjoints[..ni].to_vec() + }) + .collect() + } + + /// Parallel Hessian: one HVP per column, parallelized over columns. + /// + /// Returns `(value, gradient, hessian)`. + pub fn hessian_par(&self, x: &[F]) -> (F, Vec, Vec>) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + // Compute gradient and value from column 0 (serial). + let dual_input_buf: Vec> = (0..n) + .map(|i| Dual::new(x[i], if i == 0 { F::one() } else { F::zero() })) + .collect(); + let mut dual_vals_buf = Vec::new(); + let mut adjoint_buf = Vec::new(); + self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf); + self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf); + + let value = dual_vals_buf[self.output_index as usize].re; + let gradient: Vec = (0..n).map(|i| adjoint_buf[i].re).collect(); + let col0: Vec = (0..n).map(|i| adjoint_buf[i].eps).collect(); + + // Parallelize remaining columns. + let other_cols: Vec> = (1..n) + .into_par_iter() + .map(|j| { + let inputs: Vec> = (0..n) + .map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() })) + .collect(); + let mut dv = Vec::new(); + let mut ab = Vec::new(); + self.forward_tangent_dual(&inputs, &mut dv); + self.reverse_tangent_dual(&dv, &mut ab); + (0..n).map(|i| ab[i].eps).collect() + }) + .collect(); + + let mut hessian = vec![vec![F::zero(); n]; n]; + for i in 0..n { + hessian[i][0] = col0[i]; + } + for (j_minus_1, col) in other_cols.iter().enumerate() { + let j = j_minus_1 + 1; + for i in 0..n { + hessian[i][j] = col[i]; + } + } + + (value, gradient, hessian) + } + + /// Parallel sparse Hessian: parallelized over colors. + /// + /// Returns `(value, gradient, pattern, hessian_values)`. + pub fn sparse_hessian_par( + &self, + x: &[F], + ) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let pattern = self.detect_sparsity(); + let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); + + // Compute value/gradient from color 0 (serial). + let mut v0 = vec![F::zero(); n]; + for i in 0..n { + v0[i] = if colors[i] == 0 { F::one() } else { F::zero() }; + } + let di: Vec> = (0..n).map(|i| Dual::new(x[i], v0[i])).collect(); + let mut dv = Vec::new(); + let mut ab = Vec::new(); + self.forward_tangent_dual(&di, &mut dv); + self.reverse_tangent_dual(&dv, &mut ab); + let value = dv[self.output_index as usize].re; + let gradient: Vec = (0..n).map(|i| ab[i].re).collect(); + + // Collect all color results in parallel. + let color_results: Vec>> = (0..num_colors) + .into_par_iter() + .map(|color| { + let mut v = vec![F::zero(); n]; + for i in 0..n { + v[i] = if colors[i] == color { + F::one() + } else { + F::zero() + }; + } + let inputs: Vec> = (0..n).map(|i| Dual::new(x[i], v[i])).collect(); + let mut dv_local = Vec::new(); + let mut ab_local = Vec::new(); + self.forward_tangent_dual(&inputs, &mut dv_local); + self.reverse_tangent_dual(&dv_local, &mut ab_local); + ab_local + }) + .collect(); + + let mut hessian_values = vec![F::zero(); pattern.nnz()]; + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + let color = colors[col as usize] as usize; + hessian_values[k] = color_results[color][row as usize].eps; + } + + (value, gradient, pattern, hessian_values) + } + + /// Parallel sparse Jacobian: parallelized over colors. + /// + /// Auto-selects forward (column compression) or reverse (row compression) + /// based on `num_outputs` vs `num_inputs`. + pub fn sparse_jacobian_par( + &self, + x: &[F], + ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let mut values_buf = Vec::new(); + self.forward_into(x, &mut values_buf); + + let out_indices = self.all_output_indices(); + let m = out_indices.len(); + let outputs: Vec = out_indices + .iter() + .map(|&oi| values_buf[oi as usize]) + .collect(); + + let jac_pattern = self.detect_jacobian_sparsity(); + let ni = self.num_inputs as usize; + + if m <= n { + // Row compression (reverse mode) + let (row_colors, num_colors) = crate::sparse::row_coloring(&jac_pattern); + + let color_results: Vec> = (0..num_colors) + .into_par_iter() + .map(|color| { + let n_vars = self.num_variables as usize; + let mut adjoints = vec![F::zero(); n_vars]; + for (i, &oi) in out_indices.iter().enumerate() { + if row_colors[i] == color { + adjoints[oi as usize] = F::one(); + } + } + + self.reverse_sweep_core(&mut adjoints, &values_buf, None); + adjoints[..ni].to_vec() + }) + .collect(); + + let mut jac_values = vec![F::zero(); jac_pattern.nnz()]; + for (k, (&row, &col)) in jac_pattern + .rows + .iter() + .zip(jac_pattern.cols.iter()) + .enumerate() + { + let color = row_colors[row as usize] as usize; + jac_values[k] = color_results[color][col as usize]; + } + + (outputs, jac_pattern, jac_values) + } else { + // Column compression (forward mode) — parallelize forward tangent sweeps + let (col_colors, num_colors) = crate::sparse::column_coloring(&jac_pattern); + + let color_results: Vec> = (0..num_colors) + .into_par_iter() + .map(|color| { + let dir: Vec = (0..n) + .map(|i| { + if col_colors[i] == color { + F::one() + } else { + F::zero() + } + }) + .collect(); + // Forward tangent with F (not Dual) direction + // We need to do forward_into first, then forward tangent + // But forward_tangent uses self.values, so we need a workaround. + // Use Dual with tangent = direction. + let inputs: Vec> = (0..n).map(|i| Dual::new(x[i], dir[i])).collect(); + let mut dv = Vec::new(); + self.forward_tangent(&inputs, &mut dv); + out_indices.iter().map(|&oi| dv[oi as usize].eps).collect() + }) + .collect(); + + let mut jac_values = vec![F::zero(); jac_pattern.nnz()]; + for (k, (&row, &col)) in jac_pattern + .rows + .iter() + .zip(jac_pattern.cols.iter()) + .enumerate() + { + let color = col_colors[col as usize] as usize; + jac_values[k] = color_results[color][row as usize]; + } + + (outputs, jac_pattern, jac_values) + } + } + + /// Evaluate the gradient at multiple input points in parallel. + /// + /// Uses `forward_into` + `reverse_from` with per-thread buffers. + pub fn gradient_batch_par(&self, inputs: &[&[F]]) -> Vec> { + let ni = self.num_inputs as usize; + let out_idx = self.output_index; + + inputs + .par_iter() + .map(|x| { + let mut values_buf = Vec::new(); + self.forward_into(x, &mut values_buf); + let adjoints = self.reverse_from(&values_buf, out_idx); + adjoints[..ni].to_vec() + }) + .collect() + } + + /// Compute Hessian at multiple input points in parallel. + /// + /// Returns `(value, gradient, hessian)` for each input point. + pub fn hessian_batch_par(&self, inputs: &[&[F]]) -> Vec<(F, Vec, Vec>)> { + inputs.par_iter().map(|x| self.hessian_par(x)).collect() + } +} diff --git a/src/bytecode_tape/reverse.rs b/src/bytecode_tape/reverse.rs new file mode 100644 index 0000000..a1be6af --- /dev/null +++ b/src/bytecode_tape/reverse.rs @@ -0,0 +1,170 @@ +use std::collections::HashMap; + +use crate::float::Float; +use crate::opcode::{self, OpCode, UNUSED}; + +impl super::BytecodeTape { + /// Reverse sweep with weighted seeds for multiple outputs. + /// + /// Computes `∑_i weights[i] * ∂output_i/∂x` — a vector-Jacobian product. + /// + /// Returns the gradient with respect to all inputs (length [`num_inputs`](Self::num_inputs)). + pub fn reverse_seeded(&self, seeds: &[F]) -> Vec { + let out_indices = self.all_output_indices(); + + assert_eq!( + seeds.len(), + out_indices.len(), + "seeds length must match number of outputs" + ); + + let ni = self.num_inputs as usize; + let adjoints = self.reverse_seeded_full(seeds, out_indices); + adjoints[..ni].to_vec() + } + + /// Core reverse sweep loop shared by all scalar reverse sweep variants. + /// + /// Expects `adjoints` to be pre-seeded by the caller (length = `num_variables`). + /// Reads primal values from `values` (either `self.values` or an external buffer). + /// When `forced_signs` is `Some`, uses forced partials at matching tape indices. + pub(super) fn reverse_sweep_core( + &self, + adjoints: &mut [F], + values: &[F], + forced_signs: Option<&HashMap>, + ) { + for i in (0..self.opcodes.len()).rev() { + let adj = adjoints[i]; + if adj == F::zero() { + continue; + } + + match self.opcodes[i] { + OpCode::Input | OpCode::Const => continue, + OpCode::Custom => { + adjoints[i] = F::zero(); + let [a_idx, cb_idx] = self.arg_indices[i]; + let a = values[a_idx as usize]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let b = b_idx_opt.map(|bi| values[bi as usize]).unwrap_or(F::zero()); + let r = values[i]; + let (da, db) = self.custom_ops[cb_idx as usize].partials(a, b, r); + adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; + if let Some(bi) = b_idx_opt { + adjoints[bi as usize] = adjoints[bi as usize] + db * adj; + } + } + op => { + adjoints[i] = F::zero(); + let [a_idx, b_idx] = self.arg_indices[i]; + let a = values[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + values[b_idx as usize] + } else if op == OpCode::Powi { + F::from(b_idx).unwrap_or_else(|| F::zero()) + } else { + F::zero() + }; + let r = values[i]; + + let (da, db) = match forced_signs.and_then(|fs| fs.get(&(i as u32))) { + Some(&sign) => opcode::forced_reverse_partials(op, a, b, r, sign), + None => opcode::reverse_partials(op, a, b, r), + }; + + adjoints[a_idx as usize] = adjoints[a_idx as usize] + da * adj; + if b_idx != UNUSED && op != OpCode::Powi { + adjoints[b_idx as usize] = adjoints[b_idx as usize] + db * adj; + } + } + } + } + } + + /// Reverse sweep: compute adjoints seeded at the output. + /// + /// Returns the full adjoint vector (length = `num_variables`). + pub fn reverse(&self, seed_index: u32) -> Vec { + let n = self.num_variables as usize; + let mut adjoints = vec![F::zero(); n]; + adjoints[seed_index as usize] = F::one(); + self.reverse_sweep_core(&mut adjoints, &self.values, None); + adjoints + } + + /// Reverse sweep with forced branch choices at specified tape indices. + pub(super) fn reverse_with_forced_signs( + &self, + seed_index: u32, + forced_signs: &HashMap, + ) -> Vec { + let n = self.num_variables as usize; + let mut adjoints = vec![F::zero(); n]; + adjoints[seed_index as usize] = F::one(); + self.reverse_sweep_core(&mut adjoints, &self.values, Some(forced_signs)); + adjoints + } + + /// Reverse sweep reading from an external values buffer. + /// + /// Like [`reverse`](Self::reverse) but reads primal values from `values` + /// instead of `self.values`. Pair with [`forward_into`](Self::forward_into) + /// for parallel evaluation. + pub fn reverse_from(&self, values: &[F], seed_index: u32) -> Vec { + let n = self.num_variables as usize; + assert_eq!(values.len(), n, "values buffer has wrong length"); + let mut adjoints = vec![F::zero(); n]; + adjoints[seed_index as usize] = F::one(); + self.reverse_sweep_core(&mut adjoints, values, None); + adjoints + } + + /// Forward + reverse: compute the gradient at new inputs. + /// + /// Returns only the input adjoints (indices `0..num_inputs`). + pub fn gradient(&mut self, inputs: &[F]) -> Vec { + self.forward(inputs); + let adjoints = self.reverse(self.output_index); + adjoints[..self.num_inputs as usize].to_vec() + } + + /// Like [`gradient`](Self::gradient) but reuses a caller-provided buffer + /// for the adjoint vector, avoiding allocation on repeated calls. + pub fn gradient_with_buf(&mut self, inputs: &[F], adjoint_buf: &mut Vec) -> Vec { + self.forward(inputs); + + let n = self.num_variables as usize; + adjoint_buf.clear(); + adjoint_buf.resize(n, F::zero()); + adjoint_buf[self.output_index as usize] = F::one(); + + self.reverse_sweep_core(adjoint_buf, &self.values, None); + adjoint_buf[..self.num_inputs as usize].to_vec() + } + + /// Reverse sweep with weighted seeds, returning full adjoint vector. + pub(super) fn reverse_seeded_full(&self, seeds: &[F], out_indices: &[u32]) -> Vec { + let n = self.num_variables as usize; + let mut adjoints = vec![F::zero(); n]; + + for (&out_idx, &weight) in out_indices.iter().zip(seeds.iter()) { + if weight == F::zero() { + continue; + } + adjoints[out_idx as usize] = adjoints[out_idx as usize] + weight; + } + + self.reverse_sweep_core(&mut adjoints, &self.values, None); + adjoints + } + + // ── Batch evaluation ── + + /// Evaluate the gradient at multiple input points. + /// + /// Returns one gradient vector per input point. + pub fn gradient_batch(&mut self, inputs: &[&[F]]) -> Vec> { + inputs.iter().map(|x| self.gradient(x)).collect() + } +} diff --git a/src/bytecode_tape/serde_support.rs b/src/bytecode_tape/serde_support.rs new file mode 100644 index 0000000..b99288e --- /dev/null +++ b/src/bytecode_tape/serde_support.rs @@ -0,0 +1,61 @@ +use std::collections::HashMap; + +use serde::ser::SerializeStruct; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::float::Float; +use crate::opcode::OpCode; + +use super::BytecodeTape; + +impl Serialize for BytecodeTape { + fn serialize(&self, serializer: S) -> Result { + if !self.custom_ops.is_empty() { + return Err(serde::ser::Error::custom( + "cannot serialize a BytecodeTape containing custom ops; \ + custom ops must be re-registered after deserialization", + )); + } + let mut s = serializer.serialize_struct("BytecodeTape", 8)?; + s.serialize_field("opcodes", &self.opcodes)?; + s.serialize_field("arg_indices", &self.arg_indices)?; + s.serialize_field("values", &self.values)?; + s.serialize_field("num_inputs", &self.num_inputs)?; + s.serialize_field("num_variables", &self.num_variables)?; + s.serialize_field("output_index", &self.output_index)?; + s.serialize_field("output_indices", &self.output_indices)?; + s.serialize_field("custom_second_args", &self.custom_second_args)?; + s.end() + } +} + +impl<'de, F: Float + Deserialize<'de>> Deserialize<'de> for BytecodeTape { + fn deserialize>(deserializer: D) -> Result { + #[derive(Deserialize)] + struct TapeData { + opcodes: Vec, + arg_indices: Vec<[u32; 2]>, + values: Vec, + num_inputs: u32, + num_variables: u32, + output_index: u32, + #[serde(default)] + output_indices: Vec, + #[serde(default)] + custom_second_args: HashMap, + } + + let data = TapeData::::deserialize(deserializer)?; + Ok(BytecodeTape { + opcodes: data.opcodes, + arg_indices: data.arg_indices, + values: data.values, + num_inputs: data.num_inputs, + num_variables: data.num_variables, + output_index: data.output_index, + output_indices: data.output_indices, + custom_ops: Vec::new(), + custom_second_args: data.custom_second_args, + }) + } +} diff --git a/src/bytecode_tape/sparse.rs b/src/bytecode_tape/sparse.rs new file mode 100644 index 0000000..b0998c3 --- /dev/null +++ b/src/bytecode_tape/sparse.rs @@ -0,0 +1,435 @@ +use crate::dual::Dual; +use crate::dual_vec::DualVec; +use crate::float::Float; + +impl super::BytecodeTape { + /// Detect the structural sparsity pattern of the Hessian. + /// + /// Walks the tape forward propagating input-dependency bitsets. + /// At nonlinear operations, marks cross-pairs as potential Hessian interactions. + pub fn detect_sparsity(&self) -> crate::sparse::SparsityPattern { + crate::sparse::detect_sparsity_impl( + &self.opcodes, + &self.arg_indices, + self.num_inputs as usize, + self.num_variables as usize, + ) + } + + /// Detect the structural sparsity pattern of the Jacobian. + /// + /// Walks the tape forward propagating input-dependency bitsets (first-order). + /// For each output, determines which inputs it depends on. + pub fn detect_jacobian_sparsity(&self) -> crate::sparse::JacobianSparsityPattern { + let out_indices = self.all_output_indices(); + crate::sparse::detect_jacobian_sparsity_impl( + &self.opcodes, + &self.arg_indices, + self.num_inputs as usize, + self.num_variables as usize, + out_indices, + ) + } + + /// Compute a sparse Hessian using structural sparsity detection and graph coloring. + /// + /// Returns `(value, gradient, pattern, hessian_values)` where + /// `hessian_values[k]` corresponds to `(pattern.rows[k], pattern.cols[k])`. + /// + /// For problems with sparse Hessians, this requires only `chromatic_number` + /// HVP calls instead of `n`, which can be dramatically fewer for banded + /// or sparse interaction structures. + pub fn sparse_hessian(&self, x: &[F]) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let pattern = self.detect_sparsity(); + let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); + + let mut hessian_values = vec![F::zero(); pattern.nnz()]; + let mut gradient = vec![F::zero(); n]; + let mut value = F::zero(); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf = Vec::new(); + let mut adjoint_buf = Vec::new(); + let mut v = vec![F::zero(); n]; + + for color in 0..num_colors { + // Form direction vector: v[i] = 1 if colors[i] == color, else 0 + for i in 0..n { + v[i] = if colors[i] == color { + F::one() + } else { + F::zero() + }; + } + + self.hvp_with_all_bufs( + x, + &v, + &mut dual_input_buf, + &mut dual_vals_buf, + &mut adjoint_buf, + ); + + if color == 0 { + value = dual_vals_buf[self.output_index as usize].re; + for i in 0..n { + gradient[i] = adjoint_buf[i].re; + } + } + + // Extract Hessian entries for this color. + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + if colors[col as usize] == color { + hessian_values[k] = adjoint_buf[row as usize].eps; + } + } + } + + (value, gradient, pattern, hessian_values) + } + + /// Batched sparse Hessian: packs N colors per sweep using DualVec. + /// + /// Reduces the number of forward+reverse sweeps from `num_colors` to + /// `ceil(num_colors / N)`. Each sweep processes N colors simultaneously. + /// + /// Returns `(value, gradient, pattern, hessian_values)`. + pub fn sparse_hessian_vec( + &self, + x: &[F], + ) -> (F, Vec, crate::sparse::SparsityPattern, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let pattern = self.detect_sparsity(); + let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); + + let mut hessian_values = vec![F::zero(); pattern.nnz()]; + let mut gradient = vec![F::zero(); n]; + let mut value = F::zero(); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf: Vec> = Vec::new(); + let mut adjoint_buf: Vec> = Vec::new(); + + let num_batches = (num_colors as usize).div_ceil(N); + for batch in 0..num_batches { + let base_color = (batch * N) as u32; + + // Build DualVec inputs: lane k has v[i]=1 if colors[i] == base_color+k + dual_input_buf.clear(); + dual_input_buf.extend((0..n).map(|i| { + let eps = std::array::from_fn(|lane| { + let target_color = base_color + lane as u32; + if target_color < num_colors && colors[i] == target_color { + F::one() + } else { + F::zero() + } + }); + DualVec::new(x[i], eps) + })); + + self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); + self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf); + + if batch == 0 { + value = dual_vals_buf[self.output_index as usize].re; + for i in 0..n { + gradient[i] = adjoint_buf[i].re; + } + } + + // Extract Hessian entries: for entry (row, col) with colors[col] == base_color+lane, + // read adjoint_buf[row].eps[lane] + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + let col_color = colors[col as usize]; + if col_color >= base_color && col_color < base_color + N as u32 { + let lane = (col_color - base_color) as usize; + hessian_values[k] = adjoint_buf[row as usize].eps[lane]; + } + } + } + + (value, gradient, pattern, hessian_values) + } + + // ── Sparse Jacobian ── + + /// Compute a sparse Jacobian using structural sparsity detection and graph coloring. + /// + /// Auto-selects forward-mode (column compression) or reverse-mode (row compression) + /// based on which requires fewer sweeps. + /// + /// Returns `(output_values, pattern, jacobian_values)`. + pub fn sparse_jacobian( + &mut self, + x: &[F], + ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { + self.forward(x); + let pattern = self.detect_jacobian_sparsity(); + let (col_colors, num_col_colors) = crate::sparse::column_coloring(&pattern); + let (row_colors, num_row_colors) = crate::sparse::row_coloring(&pattern); + + if num_col_colors <= num_row_colors { + let jac_values = + self.sparse_jacobian_forward_impl(x, &pattern, &col_colors, num_col_colors); + let outputs = self.output_values(); + (outputs, pattern, jac_values) + } else { + let jac_values = + self.sparse_jacobian_reverse_impl(x, &pattern, &row_colors, num_row_colors); + let outputs = self.output_values(); + (outputs, pattern, jac_values) + } + } + + /// Sparse Jacobian via forward-mode (column compression). + pub fn sparse_jacobian_forward( + &mut self, + x: &[F], + ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { + self.forward(x); + let pattern = self.detect_jacobian_sparsity(); + let (colors, num_colors) = crate::sparse::column_coloring(&pattern); + let jac_values = self.sparse_jacobian_forward_impl(x, &pattern, &colors, num_colors); + let outputs = self.output_values(); + (outputs, pattern, jac_values) + } + + /// Sparse Jacobian via reverse-mode (row compression). + pub fn sparse_jacobian_reverse( + &mut self, + x: &[F], + ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { + self.forward(x); + let pattern = self.detect_jacobian_sparsity(); + let (colors, num_colors) = crate::sparse::row_coloring(&pattern); + let jac_values = self.sparse_jacobian_reverse_impl(x, &pattern, &colors, num_colors); + let outputs = self.output_values(); + (outputs, pattern, jac_values) + } + + /// Sparse Jacobian with a precomputed sparsity pattern and coloring. + /// + /// Skips re-detection of sparsity on repeated calls. Use `column_coloring` colors + /// for forward mode or `row_coloring` colors for reverse mode. The `forward` flag + /// selects the mode. + pub fn sparse_jacobian_with_pattern( + &mut self, + x: &[F], + pattern: &crate::sparse::JacobianSparsityPattern, + colors: &[u32], + num_colors: u32, + forward_mode: bool, + ) -> (Vec, Vec) { + self.forward(x); + let jac_values = if forward_mode { + self.sparse_jacobian_forward_impl(x, pattern, colors, num_colors) + } else { + self.sparse_jacobian_reverse_impl(x, pattern, colors, num_colors) + }; + let outputs = self.output_values(); + (outputs, jac_values) + } + + /// Forward-mode sparse Jacobian implementation (column compression). + /// + /// Each color group seeds a forward pass with tangent 1.0 in the columns + /// sharing that color. The resulting tangent at each output gives J[row][col]. + fn sparse_jacobian_forward_impl( + &self, + x: &[F], + pattern: &crate::sparse::JacobianSparsityPattern, + colors: &[u32], + num_colors: u32, + ) -> Vec { + let n = self.num_inputs as usize; + let mut jac_values = vec![F::zero(); pattern.nnz()]; + + let out_indices = self.all_output_indices(); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf: Vec> = Vec::new(); + + for color in 0..num_colors { + // Build Dual inputs: tangent = 1 for inputs with this color + dual_input_buf.clear(); + dual_input_buf.extend((0..n).map(|i| { + Dual::new( + x[i], + if colors[i] == color { + F::one() + } else { + F::zero() + }, + ) + })); + + self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); + + // Extract Jacobian entries: for entry (row, col) with colors[col] == color, + // the tangent at output_indices[row] gives J[row][col] + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + if colors[col as usize] == color { + jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps; + } + } + } + + jac_values + } + + /// Reverse-mode sparse Jacobian implementation (row compression). + /// + /// Each color group seeds a reverse pass with adjoint 1.0 at the outputs + /// sharing that color. + fn sparse_jacobian_reverse_impl( + &self, + _x: &[F], + pattern: &crate::sparse::JacobianSparsityPattern, + colors: &[u32], + num_colors: u32, + ) -> Vec { + let m = self.num_outputs(); + let mut jac_values = vec![F::zero(); pattern.nnz()]; + + let out_indices = self.all_output_indices(); + + for color in 0..num_colors { + // Build seeds: weight = 1 for outputs with this color + let seeds: Vec = (0..m) + .map(|i| { + if colors[i] == color { + F::one() + } else { + F::zero() + } + }) + .collect(); + + let adjoints = self.reverse_seeded_full(&seeds, out_indices); + + // Extract Jacobian entries: for entry (row, col) with colors[row] == color, + // adjoint[col] gives J[row][col] + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + if colors[row as usize] == color { + jac_values[k] = adjoints[col as usize]; + } + } + } + + jac_values + } + + /// Batched sparse Jacobian: packs N colors per forward sweep using DualVec. + /// + /// Reduces the number of forward sweeps from `num_colors` to + /// `ceil(num_colors / N)`. + pub fn sparse_jacobian_vec( + &mut self, + x: &[F], + ) -> (Vec, crate::sparse::JacobianSparsityPattern, Vec) { + self.forward(x); + let pattern = self.detect_jacobian_sparsity(); + let (colors, num_colors) = crate::sparse::column_coloring(&pattern); + + let n = self.num_inputs as usize; + let mut jac_values = vec![F::zero(); pattern.nnz()]; + + let out_indices = self.all_output_indices(); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf: Vec> = Vec::new(); + + let num_batches = (num_colors as usize).div_ceil(N); + for batch in 0..num_batches { + let base_color = (batch * N) as u32; + + dual_input_buf.clear(); + dual_input_buf.extend((0..n).map(|i| { + let eps = std::array::from_fn(|lane| { + let target_color = base_color + lane as u32; + if target_color < num_colors && colors[i] == target_color { + F::one() + } else { + F::zero() + } + }); + DualVec::new(x[i], eps) + })); + + self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); + + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + let col_color = colors[col as usize]; + if col_color >= base_color && col_color < base_color + N as u32 { + let lane = (col_color - base_color) as usize; + jac_values[k] = dual_vals_buf[out_indices[row as usize] as usize].eps[lane]; + } + } + } + + let outputs = self.output_values(); + (outputs, pattern, jac_values) + } + + /// Sparse Hessian with a precomputed sparsity pattern and coloring. + /// + /// Skips re-detection on repeated calls (e.g. in solver loops). + pub fn sparse_hessian_with_pattern( + &self, + x: &[F], + pattern: &crate::sparse::SparsityPattern, + colors: &[u32], + num_colors: u32, + ) -> (F, Vec, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let mut hessian_values = vec![F::zero(); pattern.nnz()]; + let mut gradient = vec![F::zero(); n]; + let mut value = F::zero(); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf = Vec::new(); + let mut adjoint_buf = Vec::new(); + let mut v = vec![F::zero(); n]; + + for color in 0..num_colors { + for i in 0..n { + v[i] = if colors[i] == color { + F::one() + } else { + F::zero() + }; + } + + self.hvp_with_all_bufs( + x, + &v, + &mut dual_input_buf, + &mut dual_vals_buf, + &mut adjoint_buf, + ); + + if color == 0 { + value = dual_vals_buf[self.output_index as usize].re; + for i in 0..n { + gradient[i] = adjoint_buf[i].re; + } + } + + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + if colors[col as usize] == color { + hessian_values[k] = adjoint_buf[row as usize].eps; + } + } + } + + (value, gradient, hessian_values) + } +} diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs new file mode 100644 index 0000000..a15433e --- /dev/null +++ b/src/bytecode_tape/tangent.rs @@ -0,0 +1,380 @@ +use crate::dual::Dual; +use crate::dual_vec::DualVec; +use crate::float::{Float, IsAllZero}; +use crate::opcode::{self, OpCode, UNUSED}; +use num_traits::Float as NumFloat; + +impl super::BytecodeTape { + // ── Forward-over-reverse (second-order) ── + + /// Forward sweep with tangent-carrying numbers. Reads opcodes and constants + /// from `self`, writing results into `buf`. Does not mutate the tape. + /// + /// Generic over `T: NumFloat` so it works with both `Dual` and + /// `DualVec`. + pub fn forward_tangent(&self, inputs: &[T], buf: &mut Vec) { + self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| { + // First-order chain rule: evaluate on primals, convert partials to T. + let [a_idx, cb_idx] = self.arg_indices[i]; + let a_primal = self.values[a_idx as usize]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let b_primal = b_idx_opt + .map(|bi| self.values[bi as usize]) + .unwrap_or(F::zero()); + let result = self.custom_ops[cb_idx as usize].eval(a_primal, b_primal); + let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, result); + let result_t = T::from(result).unwrap(); + let da_t = T::from(da).unwrap(); + let db_t = T::from(db).unwrap(); + let a_re_t = T::from(a_primal).unwrap(); + let b_re_t = T::from(b_primal).unwrap(); + result_t + da_t * (a_t - a_re_t) + db_t * (b_t - b_re_t) + }); + } + + /// Forward sweep specialized for `Dual`, calling [`CustomOp::eval_dual`] + /// so that custom ops propagate tangent information for second-order derivatives. + pub(super) fn forward_tangent_dual(&self, inputs: &[Dual], buf: &mut Vec>) { + self.forward_tangent_inner(inputs, buf, |i, a_t, b_t| { + let [_a_idx, cb_idx] = self.arg_indices[i]; + self.custom_ops[cb_idx as usize].eval_dual(a_t, b_t) + }); + } + + /// Common forward-tangent loop. The `handle_custom` closure receives + /// `(tape_index, a_value, b_value)` for custom op slots and returns + /// the result to store. + fn forward_tangent_inner( + &self, + inputs: &[T], + buf: &mut Vec, + handle_custom: impl Fn(usize, T, T) -> T, + ) { + assert_eq!( + inputs.len(), + self.num_inputs as usize, + "wrong number of inputs" + ); + + let n = self.num_variables as usize; + buf.clear(); + buf.resize(n, T::zero()); + + let mut input_idx = 0usize; + for i in 0..self.opcodes.len() { + match self.opcodes[i] { + OpCode::Input => { + buf[i] = inputs[input_idx]; + input_idx += 1; + } + OpCode::Const => { + buf[i] = T::from(self.values[i]).unwrap(); + } + OpCode::Custom => { + let [a_idx, _cb_idx] = self.arg_indices[i]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let a_t = buf[a_idx as usize]; + let b_t = b_idx_opt.map(|bi| buf[bi as usize]).unwrap_or(T::zero()); + buf[i] = handle_custom(i, a_t, b_t); + } + op => { + let [a_idx, b_idx] = self.arg_indices[i]; + let a = buf[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + buf[b_idx as usize] + } else if op == OpCode::Powi { + T::from(b_idx).unwrap_or_else(|| T::zero()) + } else { + T::zero() + }; + buf[i] = opcode::eval_forward(op, a, b); + } + } + } + } + + /// Reverse sweep with tangent-carrying adjoints. Uses values from + /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to + /// safely skip zero adjoints without dropping tangent contributions. + pub(super) fn reverse_tangent(&self, tangent_vals: &[T], buf: &mut Vec) { + self.reverse_tangent_inner(tangent_vals, buf, |i| { + // First-order: convert primal-float partials to T. + let [a_idx, cb_idx] = self.arg_indices[i]; + let a_primal = self.values[a_idx as usize]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let b_primal = b_idx_opt + .map(|bi| self.values[bi as usize]) + .unwrap_or(F::zero()); + let r_primal = self.values[i]; + let (da, db) = self.custom_ops[cb_idx as usize].partials(a_primal, b_primal, r_primal); + (T::from(da).unwrap(), T::from(db).unwrap()) + }); + } + + /// Reverse sweep specialized for `Dual`, calling [`CustomOp::partials_dual`] + /// so that custom op partials carry tangent information for second-order derivatives. + pub(super) fn reverse_tangent_dual(&self, tangent_vals: &[Dual], buf: &mut Vec>) { + self.reverse_tangent_inner(tangent_vals, buf, |i| { + let [a_idx, cb_idx] = self.arg_indices[i]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let a_dual = tangent_vals[a_idx as usize]; + let b_dual = b_idx_opt + .map(|bi| tangent_vals[bi as usize]) + .unwrap_or(Dual::constant(F::zero())); + let r_dual = tangent_vals[i]; + self.custom_ops[cb_idx as usize].partials_dual(a_dual, b_dual, r_dual) + }); + } + + /// Common reverse-tangent loop. The `custom_partials` closure receives + /// `tape_index` for custom op slots and returns `(da, db)` as T-valued partials. + fn reverse_tangent_inner( + &self, + tangent_vals: &[T], + buf: &mut Vec, + custom_partials: impl Fn(usize) -> (T, T), + ) { + let n = self.num_variables as usize; + buf.clear(); + buf.resize(n, T::zero()); + buf[self.output_index as usize] = T::one(); + + for i in (0..self.opcodes.len()).rev() { + match self.opcodes[i] { + OpCode::Input | OpCode::Const => continue, + OpCode::Custom => { + let adj = buf[i]; + if adj.is_all_zero() { + continue; + } + buf[i] = T::zero(); + + let [a_idx, _cb_idx] = self.arg_indices[i]; + let b_idx_opt = self.custom_second_args.get(&(i as u32)).copied(); + let (da_t, db_t) = custom_partials(i); + buf[a_idx as usize] = buf[a_idx as usize] + da_t * adj; + if let Some(bi) = b_idx_opt { + buf[bi as usize] = buf[bi as usize] + db_t * adj; + } + } + op => { + let adj = buf[i]; + if adj.is_all_zero() { + continue; + } + buf[i] = T::zero(); + + let [a_idx, b_idx] = self.arg_indices[i]; + let a = tangent_vals[a_idx as usize]; + let b = if b_idx != UNUSED && op != OpCode::Powi { + tangent_vals[b_idx as usize] + } else if op == OpCode::Powi { + T::from(b_idx).unwrap_or_else(|| T::zero()) + } else { + T::zero() + }; + let r = tangent_vals[i]; + let (da, db) = opcode::reverse_partials(op, a, b, r); + + buf[a_idx as usize] = buf[a_idx as usize] + da * adj; + if b_idx != UNUSED && op != OpCode::Powi { + buf[b_idx as usize] = buf[b_idx as usize] + db * adj; + } + } + } + } + } + + /// Hessian-vector product via forward-over-reverse. + /// + /// Returns `(gradient, H·v)` where both are `Vec` of length + /// [`num_inputs`](Self::num_inputs). The tape is not mutated. + pub fn hvp(&self, x: &[F], v: &[F]) -> (Vec, Vec) { + let mut dual_vals = Vec::new(); + let mut adjoint_buf = Vec::new(); + self.hvp_with_buf(x, v, &mut dual_vals, &mut adjoint_buf) + } + + /// Like [`hvp`](Self::hvp) but reuses caller-provided buffers to avoid + /// allocation on repeated calls (e.g. inside [`hessian`](Self::hessian)). + pub fn hvp_with_buf( + &self, + x: &[F], + v: &[F], + dual_vals_buf: &mut Vec>, + adjoint_buf: &mut Vec>, + ) -> (Vec, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + assert_eq!(v.len(), n, "wrong number of directions"); + + let dual_inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| Dual::new(xi, vi)) + .collect(); + + self.forward_tangent_dual(&dual_inputs, dual_vals_buf); + self.reverse_tangent_dual(dual_vals_buf, adjoint_buf); + + let gradient: Vec = (0..n).map(|i| adjoint_buf[i].re).collect(); + let hvp: Vec = (0..n).map(|i| adjoint_buf[i].eps).collect(); + (gradient, hvp) + } + + /// Like [`hvp_with_buf`](Self::hvp_with_buf) but also reuses a caller-provided + /// input buffer, eliminating all allocations on repeated calls. + pub(super) fn hvp_with_all_bufs( + &self, + x: &[F], + v: &[F], + dual_input_buf: &mut Vec>, + dual_vals_buf: &mut Vec>, + adjoint_buf: &mut Vec>, + ) { + // Reuse the input buffer instead of allocating + dual_input_buf.clear(); + dual_input_buf.extend(x.iter().zip(v.iter()).map(|(&xi, &vi)| Dual::new(xi, vi))); + + self.forward_tangent_dual(dual_input_buf, dual_vals_buf); + self.reverse_tangent_dual(dual_vals_buf, adjoint_buf); + } + + /// Full Hessian matrix via `n` Hessian-vector products. + /// + /// Returns `(value, gradient, hessian)` where `hessian[i][j] = ∂²f/∂x_i∂x_j`. + /// The tape is not mutated. + pub fn hessian(&self, x: &[F]) -> (F, Vec, Vec>) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf = Vec::new(); + let mut adjoint_buf = Vec::new(); + let mut hessian = vec![vec![F::zero(); n]; n]; + let mut gradient = vec![F::zero(); n]; + let mut value = F::zero(); + + for j in 0..n { + // Reuse input buffer + dual_input_buf.clear(); + dual_input_buf + .extend((0..n).map(|i| Dual::new(x[i], if i == j { F::one() } else { F::zero() }))); + + self.forward_tangent_dual(&dual_input_buf, &mut dual_vals_buf); + self.reverse_tangent_dual(&dual_vals_buf, &mut adjoint_buf); + + if j == 0 { + value = dual_vals_buf[self.output_index as usize].re; + for i in 0..n { + gradient[i] = adjoint_buf[i].re; + } + } + + for (row, adj) in hessian.iter_mut().zip(adjoint_buf.iter()) { + row[j] = adj.eps; + } + } + + (value, gradient, hessian) + } + + /// Full Hessian matrix via batched forward-over-reverse. + /// + /// Processes `ceil(n/N)` batches instead of `n` individual HVPs, + /// computing N Hessian columns simultaneously. + pub fn hessian_vec(&self, x: &[F]) -> (F, Vec, Vec>) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + + let mut dual_input_buf: Vec> = Vec::with_capacity(n); + let mut dual_vals_buf: Vec> = Vec::new(); + let mut adjoint_buf: Vec> = Vec::new(); + let mut hessian = vec![vec![F::zero(); n]; n]; + let mut gradient = vec![F::zero(); n]; + let mut value = F::zero(); + + let num_batches = n.div_ceil(N); + for batch in 0..num_batches { + let base = batch * N; + + // Reuse input buffer + dual_input_buf.clear(); + dual_input_buf.extend((0..n).map(|i| { + let eps = std::array::from_fn(|lane| { + let col = base + lane; + if col < n && i == col { + F::one() + } else { + F::zero() + } + }); + DualVec::new(x[i], eps) + })); + + self.forward_tangent(&dual_input_buf, &mut dual_vals_buf); + self.reverse_tangent(&dual_vals_buf, &mut adjoint_buf); + + if batch == 0 { + value = dual_vals_buf[self.output_index as usize].re; + for i in 0..n { + gradient[i] = adjoint_buf[i].re; + } + } + + for lane in 0..N { + let col = base + lane; + if col >= n { + break; + } + for i in 0..n { + hessian[i][col] = adjoint_buf[i].eps[lane]; + } + } + } + + (value, gradient, hessian) + } + + // ── Higher-order derivatives ── + + /// Third-order directional derivative: `∑_{jk} (∂³f/∂x_i∂x_j∂x_k) v1_j v2_k`. + /// + /// Given directions `v1` and `v2`, computes: + /// - `gradient`: `∇f(x)` + /// - `hvp`: `H(x) · v1` (Hessian-vector product) + /// - `third`: `(∂/∂v2)(H · v1)` (third-order tensor contracted with v1 and v2) + /// + /// Uses `Dual>` (nested dual numbers): inner tangent for `v1`, + /// outer tangent for `v2`. + pub fn third_order_hvvp(&self, x: &[F], v1: &[F], v2: &[F]) -> (Vec, Vec, Vec) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + assert_eq!(v1.len(), n, "wrong v1 length"); + assert_eq!(v2.len(), n, "wrong v2 length"); + + // Build Dual> inputs: + // inner.re = x[i], inner.eps = v1[i] (for HVP direction) + // outer.re = inner, outer.eps = v2[i] (for third-order direction) + // + // outer = Dual { re: Dual(x[i], v1[i]), eps: Dual(v2[i], 0) } + let dd_inputs: Vec>> = (0..n) + .map(|i| Dual { + re: Dual::new(x[i], v1[i]), + eps: Dual::new(v2[i], F::zero()), + }) + .collect(); + + let mut dd_vals: Vec>> = Vec::new(); + let mut dd_adj: Vec>> = Vec::new(); + + self.forward_tangent(&dd_inputs, &mut dd_vals); + self.reverse_tangent(&dd_vals, &mut dd_adj); + + let gradient: Vec = (0..n).map(|i| dd_adj[i].re.re).collect(); + let hvp: Vec = (0..n).map(|i| dd_adj[i].re.eps).collect(); + let third: Vec = (0..n).map(|i| dd_adj[i].eps.eps).collect(); + + (gradient, hvp, third) + } +} diff --git a/src/bytecode_tape/taylor.rs b/src/bytecode_tape/taylor.rs new file mode 100644 index 0000000..7047ceb --- /dev/null +++ b/src/bytecode_tape/taylor.rs @@ -0,0 +1,118 @@ +use crate::float::Float; +use crate::taylor::Taylor; + +impl super::BytecodeTape { + /// Forward-reverse Taylor pass for gradient + higher-order directional adjoints. + /// + /// Builds Taylor inputs `x_i(t) = x_i + v_i * t` (with zero higher coefficients), + /// runs `forward_tangent`, then `reverse_tangent` to get Taylor-valued adjoints. + /// + /// Returns `(output, adjoints)` where: + /// - `output` is the Taylor expansion of `f` along direction `v` + /// - `adjoints[i].coeff(0)` = `∂f/∂x_i` (gradient) + /// - `adjoints[i].coeff(1)` = `Σ_j (∂²f/∂x_i∂x_j) v_j` (HVP) + /// - `adjoints[i].derivative(k)` = k-th order directional adjoint + /// + /// For K=2, the HVP component is equivalent to [`hvp`](Self::hvp). + /// For K≥3, yields additional higher-order information in the same pass. + /// + /// Like [`hvp`](Self::hvp), takes `&self` and does not call `forward(x)` + /// before the Taylor pass. Custom ops will use primal values from recording time. + pub fn taylor_grad( + &self, + x: &[F], + v: &[F], + ) -> (Taylor, Vec>) { + let mut fwd_buf = Vec::new(); + let mut adj_buf = Vec::new(); + self.taylor_grad_with_buf(x, v, &mut fwd_buf, &mut adj_buf) + } + + /// Like [`taylor_grad`](Self::taylor_grad) but reuses caller-provided buffers + /// to avoid allocation on repeated calls. + pub fn taylor_grad_with_buf( + &self, + x: &[F], + v: &[F], + fwd_buf: &mut Vec>, + adj_buf: &mut Vec>, + ) -> (Taylor, Vec>) { + let n = self.num_inputs as usize; + assert_eq!(x.len(), n, "wrong number of inputs"); + assert_eq!(v.len(), n, "wrong number of directions"); + + // Build Taylor inputs: x_i(t) = x_i + v_i * t + let taylor_inputs: Vec> = x + .iter() + .zip(v.iter()) + .map(|(&xi, &vi)| { + let mut coeffs = [F::zero(); K]; + coeffs[0] = xi; + if K > 1 { + coeffs[1] = vi; + } + Taylor::new(coeffs) + }) + .collect(); + + self.forward_tangent(&taylor_inputs, fwd_buf); + let output = fwd_buf[self.output_index as usize]; + self.reverse_tangent(fwd_buf, adj_buf); + + (output, adj_buf[..n].to_vec()) + } + + // ── ODE Taylor integration ── + + /// Compute the Taylor expansion of the ODE solution `y(t)` to order K. + /// + /// Given a tape representing the right-hand side `f: R^n → R^n` of the ODE + /// `y' = f(y)`, and an initial condition `y(0) = y0`, computes the Taylor + /// coefficients `y_0, y_1, ..., y_{K-1}` such that + /// `y(t) ≈ y_0 + y_1·t + y_2·t² + ... + y_{K-1}·t^{K-1}`. + /// + /// The tape must have `num_outputs == num_inputs` (autonomous ODE: f maps R^n → R^n). + /// + /// Returns one `Taylor` per state variable. Use [`Taylor::eval_at`] to + /// evaluate at a step size `h`, or inspect coefficients for error estimation. + pub fn ode_taylor_step(&self, y0: &[F]) -> Vec> { + let mut buf = Vec::new(); + self.ode_taylor_step_with_buf(y0, &mut buf) + } + + /// Like [`ode_taylor_step`](Self::ode_taylor_step) but reuses a caller-provided + /// buffer to avoid allocation on repeated calls. + pub fn ode_taylor_step_with_buf( + &self, + y0: &[F], + buf: &mut Vec>, + ) -> Vec> { + let n = self.num_inputs as usize; + assert_eq!(y0.len(), n, "y0 length must match num_inputs"); + assert_eq!( + self.num_outputs(), + n, + "ODE tape must have num_outputs == num_inputs (f: R^n -> R^n)" + ); + + let out_indices = self.all_output_indices(); + + let mut y_coeffs = vec![[F::zero(); K]; n]; + for i in 0..n { + y_coeffs[i][0] = y0[i]; + } + + for k in 0..K - 1 { + let inputs: Vec> = (0..n).map(|i| Taylor::new(y_coeffs[i])).collect(); + + self.forward_tangent(&inputs, buf); + + let divisor = F::from(k + 1).unwrap(); + for i in 0..n { + y_coeffs[i][k + 1] = buf[out_indices[i] as usize].coeff(k) / divisor; + } + } + + (0..n).map(|i| Taylor::new(y_coeffs[i])).collect() + } +} diff --git a/src/bytecode_tape/thread_local.rs b/src/bytecode_tape/thread_local.rs new file mode 100644 index 0000000..32c0b65 --- /dev/null +++ b/src/bytecode_tape/thread_local.rs @@ -0,0 +1,87 @@ +use std::cell::Cell; + +use crate::dual::Dual; +use crate::float::Float; + +use super::BytecodeTape; + +thread_local! { + static BTAPE_F32: Cell<*mut BytecodeTape> = const { Cell::new(std::ptr::null_mut()) }; + static BTAPE_F64: Cell<*mut BytecodeTape> = const { Cell::new(std::ptr::null_mut()) }; + static BTAPE_DUAL_F32: Cell<*mut BytecodeTape>> = const { Cell::new(std::ptr::null_mut()) }; + static BTAPE_DUAL_F64: Cell<*mut BytecodeTape>> = const { Cell::new(std::ptr::null_mut()) }; +} + +/// Trait to select the correct thread-local for a given float type. +/// +/// Implemented for `f32`, `f64`, `Dual`, and `Dual`, enabling +/// `BReverse` to be used with these base types. +pub trait BtapeThreadLocal: Float { + fn btape_cell() -> &'static std::thread::LocalKey>>; +} + +impl BtapeThreadLocal for f32 { + fn btape_cell() -> &'static std::thread::LocalKey>> { + &BTAPE_F32 + } +} + +impl BtapeThreadLocal for f64 { + fn btape_cell() -> &'static std::thread::LocalKey>> { + &BTAPE_F64 + } +} + +impl BtapeThreadLocal for Dual { + fn btape_cell() -> &'static std::thread::LocalKey>> { + &BTAPE_DUAL_F32 + } +} + +impl BtapeThreadLocal for Dual { + fn btape_cell() -> &'static std::thread::LocalKey>> { + &BTAPE_DUAL_F64 + } +} + +/// Access the active bytecode tape for the current thread. +/// Panics if no tape is active. +#[inline] +pub fn with_active_btape(f: impl FnOnce(&mut BytecodeTape) -> R) -> R { + F::btape_cell().with(|cell| { + let ptr = cell.get(); + assert!( + !ptr.is_null(), + "No active bytecode tape. Use echidna::record() to record a function." + ); + // SAFETY: BtapeGuard guarantees validity for the duration of the + // recording scope, single-threaded via thread-local. + let tape = unsafe { &mut *ptr }; + f(tape) + }) +} + +/// RAII guard that sets a bytecode tape as the thread-local active tape. +pub struct BtapeGuard { + prev: *mut BytecodeTape, +} + +impl BtapeGuard { + /// Activate `tape` as the thread-local bytecode tape. + pub fn new(tape: &mut BytecodeTape) -> Self { + let prev = F::btape_cell().with(|cell| { + let prev = cell.get(); + cell.set(tape as *mut BytecodeTape); + prev + }); + BtapeGuard { prev } + } +} + +impl Drop for BtapeGuard { + fn drop(&mut self) { + F::btape_cell().with(|cell| { + cell.set(self.prev); + }); + } +} From af656de76f48cb4d4656135d5eacf431f580e1bf Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Thu, 26 Feb 2026 01:45:56 +0000 Subject: [PATCH 4/5] Bump to v0.4.0 and update docs BytecodeTape decomposition (10 submodules), reverse sweep deduplication, CUDA macro extraction, nalgebra 0.34, and accumulated doc/test improvements since v0.3.0. --- CHANGELOG.md | 16 +- CONTRIBUTING.md | 50 +- Cargo.toml | 6 +- README.md | 20 +- SECURITY.md | 6 +- docs/adr-deferred-work.md | 27 +- docs/algorithms.md | 11 +- docs/roadmap.md | 6 +- echidna-optim/Cargo.toml | 4 +- echidna-optim/tests/integration.rs | 4 +- echidna-optim/tests/solvers.rs | 651 ++++++++++++++++ src/gpu/cuda_backend.rs | 1147 ++++++++++------------------ src/gpu/mod.rs | 2 +- src/opcode.rs | 2 +- 14 files changed, 1185 insertions(+), 767 deletions(-) create mode 100644 echidna-optim/tests/solvers.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index b1e0909..054d493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.0] - 2026-02-26 + +### Changed + +#### Internal Architecture +- **BytecodeTape decomposition**: split 2,689-line monolithic `bytecode_tape.rs` into a directory module with 10 focused submodules (`forward.rs`, `reverse.rs`, `tangent.rs`, `jacobian.rs`, `sparse.rs`, `optimize.rs`, `taylor.rs`, `parallel.rs`, `serde_support.rs`, `thread_local.rs`). Zero public API changes; benchmarks confirm no performance impact. +- Deduplicated reverse sweep in `gradient_with_buf()` and `sparse_jacobian_par()` — both now call shared `reverse_sweep_core()` instead of inlining the loop. `gradient_with_buf` gains the zero-adjoint skip optimization it was previously missing. +- Bumped `nalgebra` dependency from 0.33 to 0.34 + +### Fixed +- Corrected opcode variant count in documentation (44 variants, not 38/43) +- Fixed CONTRIBUTING.md MSRV reference (1.93, not 1.80) + ## [0.3.0] - 2026-02-25 ### Added @@ -162,7 +175,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Forward-vs-reverse cross-validation on Rosenbrock, Beale, Ackley, Booth, and more - Criterion benchmarks for forward overhead and reverse gradient -[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.3.0...HEAD +[Unreleased]: https://github.com/Entrolution/echidna/compare/v0.4.0...HEAD +[0.4.0]: https://github.com/Entrolution/echidna/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/Entrolution/echidna/compare/v0.2.0...v0.3.0 [0.2.0]: https://github.com/Entrolution/echidna/compare/v0.1.0...v0.2.0 [0.1.0]: https://github.com/Entrolution/echidna/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9ad291..0620da4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ This project is governed by the [Contributor Covenant Code of Conduct](CODE_OF_C ### Prerequisites -- Rust 1.80 or later (install via [rustup](https://rustup.rs/)) +- Rust 1.93 or later (install via [rustup](https://rustup.rs/)) - Cargo (included with Rust) ### Building @@ -37,9 +37,19 @@ cargo build --release ### Testing ```bash -# Run all tests +# Run core tests (no feature flags) cargo test +# Run with all CPU features +cargo test --features "bytecode,taylor,laurent,stde,diffop,serde,faer,nalgebra,ndarray" + +# Run GPU tests (requires hardware) +cargo test --features "bytecode,gpu-wgpu" +cargo test --features "bytecode,gpu-cuda" + +# Run echidna-optim tests +cargo test -p echidna-optim + # Run tests with output cargo test -- --nocapture @@ -65,12 +75,29 @@ cargo doc --no-deps ### Benchmarks ```bash -# Run all benchmarks +# Run core benchmarks (forward + reverse) cargo bench -# Run a specific benchmark -cargo bench --bench forward -cargo bench --bench reverse +# Bytecode tape +cargo bench --features bytecode --bench bytecode + +# Taylor mode +cargo bench --features stde --bench taylor + +# STDE estimators +cargo bench --features stde --bench stde + +# Differential operators +cargo bench --features diffop --bench diffop + +# Cross-country, sparse, nonsmooth +cargo bench --features bytecode --bench advanced + +# GPU +cargo bench --features gpu-wgpu --bench gpu + +# Comparison vs other libraries +cargo bench --features bytecode --bench comparison ``` ### Security Audits @@ -136,7 +163,7 @@ src/ ├── api.rs # Public API: grad, jvp, vjp, jacobian, hessian, ... ├── breverse.rs # BReverse bytecode-tape reverse variable [bytecode] ├── bytecode_tape.rs # BytecodeTape SoA representation [bytecode] -├── opcode.rs # Opcode definitions and dispatch [bytecode] +├── opcode.rs # Opcode definitions and dispatch (44 opcodes) [bytecode] ├── sparse.rs # Sparsity detection and graph coloring [bytecode] ├── cross_country.rs # Markowitz vertex elimination [bytecode] ├── nonsmooth.rs # Branch tracking, Clarke Jacobian [bytecode] @@ -146,7 +173,14 @@ src/ ├── taylor_ops.rs # Shared Taylor propagation rules [taylor] ├── laurent.rs # Laurent singularity analysis [laurent] ├── stde.rs # Stochastic derivative estimators [stde] -├── gpu/ # GPU acceleration [gpu-wgpu, gpu-cuda] +├── diffop.rs # Arbitrary differential operators via jets [diffop] +├── gpu/ +│ ├── mod.rs # GpuBackend trait, GpuTapeData, GpuError +│ ├── wgpu_backend.rs # WgpuContext (Metal/Vulkan/DX12, f32) [gpu-wgpu] +│ ├── cuda_backend.rs # CudaContext (NVIDIA, f32+f64) [gpu-cuda] +│ ├── stde_gpu.rs # GPU-accelerated STDE functions [stde] +│ ├── shaders/ # 5 WGSL compute shaders [gpu-wgpu] +│ └── kernels/ # CUDA kernels (tape_eval.cu, taylor_eval.cu) [gpu-cuda] ├── faer_support.rs # faer integration [faer] ├── nalgebra_support.rs # nalgebra integration [nalgebra] ├── ndarray_support.rs # ndarray integration [ndarray] diff --git a/Cargo.toml b/Cargo.toml index f6dd079..03fa10a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = [".", "echidna-optim"] [package] name = "echidna" -version = "0.3.0" +version = "0.4.0" edition = "2021" rust-version = "1.93" description = "A high-performance automatic differentiation library for Rust" @@ -26,7 +26,7 @@ rayon = { version = "1", optional = true } serde = { version = "1", features = ["derive"], optional = true } ndarray = { version = "0.17", optional = true } faer = { version = "0.24", optional = true } -nalgebra = { version = "0.33", optional = true } +nalgebra = { version = "0.34", optional = true } wgpu = { version = "28", optional = true } bytemuck = { version = "1", features = ["derive"], optional = true } pollster = { version = "0.4", optional = true } @@ -37,7 +37,7 @@ criterion = { version = "0.8", features = ["html_reports"] } approx = "0.5" serde_json = "1" ciborium = "0.2" -nalgebra = "0.33" +nalgebra = "0.34" num-dual = "0.11" ad_trait = "0.2.2" tempfile = "3" diff --git a/README.md b/README.md index f0f28ac..4e95577 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Add to `Cargo.toml`: ```toml [dependencies] -echidna = "0.3" +echidna = "0.4" ``` ### Gradient via reverse mode @@ -157,7 +157,7 @@ assert!((d2_dxdy - 2.0).abs() < 1e-6); Enable features in `Cargo.toml`: ```toml -echidna = { version = "0.3", features = ["bytecode", "taylor"] } +echidna = { version = "0.4", features = ["bytecode", "taylor"] } ``` ## API @@ -200,12 +200,18 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] } | `tape.ode_taylor_step::(y0)` | ODE Taylor integration step | | `stde::laplacian(tape, x, dirs)` | Stochastic Laplacian estimate | | `stde::hessian_diagonal(tape, x)` | Stochastic Hessian diagonal | +| `stde::diagonal_kth_order(tape, x, k)` | Exact k-th order diagonal (dynamic, arena-based) | | `stde::diagonal_kth_order_const::(tape, x)` | Exact k-th order diagonal (const-generic, stack-allocated) | +| `stde::diagonal_kth_order_stochastic(tape, x, k, indices)` | Stochastic k-th order diagonal via subsampling | | `stde::dense_stde_2nd(tape, x, cholesky, z)` | Dense STDE for positive-definite 2nd-order operators | +| `stde::stde_sparse(tape, x, dist, indices)` | Sparse STDE for arbitrary differential operators | +| `stde::parabolic_diffusion(tape, x, sigma)` | Parabolic PDE σ-transform diffusion operator | | `stde::directional_derivatives(tape, x, dirs)` | Batched 1st and 2nd order directional derivatives | | `stde::laplacian_with_control(tape, x, dirs, ctrl)` | Laplacian with control variate variance reduction | | `stde::laplacian_hutchpp(tape, x, dirs_s, dirs_g)` | Hutch++ Laplacian estimator (lower variance) | | `stde::divergence(tape, x, dirs)` | Stochastic divergence (trace of Jacobian) estimator | +| `stde::estimate(estimator, tape, x, dirs)` | Run an `Estimator` with Welford statistics | +| `stde::estimate_weighted(estimator, tape, x, dirs, weights)` | Importance-weighted estimator (West's algorithm) | ### Differential Operators (requires `diffop`) @@ -215,6 +221,10 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] } | `diffop::hessian(tape, x)` | Full Hessian via jet extraction | | `JetPlan::plan(n, indices)` | Plan once for a set of multi-indices | | `diffop::eval_dyn(plan, tape, x)` | Evaluate a plan at a new point (reuses precomputed slot assignments) | +| `DiffOp::new(n, terms)` | Differential operator from `(coefficient, MultiIndex)` pairs | +| `DiffOp::laplacian(n)` / `biharmonic(n)` / `diagonal(n, k)` | Common operator constructors | +| `diffop.eval(tape, x)` | Evaluate operator at a point via `JetPlan` | +| `diffop.sparse_distribution()` | Build `SparseSamplingDistribution` for stochastic estimation | ### GPU (requires `gpu-wgpu` or `gpu-cuda`) @@ -229,6 +239,8 @@ echidna = { version = "0.3", features = ["bytecode", "taylor"] } | `stde_gpu::laplacian_gpu(ctx, bufs, x, dirs)` | GPU-accelerated Laplacian estimator | | `stde_gpu::hessian_diagonal_gpu(ctx, bufs, x)` | GPU-accelerated exact Hessian diagonal | | `stde_gpu::laplacian_with_control_gpu(ctx, bufs, x, dirs, ctrl)` | GPU Laplacian with control variate | +| `stde_gpu::laplacian_gpu_cuda(ctx, bufs, x, dirs)` | CUDA-accelerated Laplacian estimator | +| `stde_gpu::hessian_diagonal_gpu_cuda(ctx, bufs, x)` | CUDA-accelerated exact Hessian diagonal | CUDA additionally supports `_f64` variants of all methods. @@ -267,7 +279,7 @@ cargo bench # Forward + reverse cargo bench --features bytecode --bench bytecode # Bytecode tape cargo bench --features stde --bench taylor # Taylor mode cargo bench --features gpu-wgpu --bench gpu # GPU -cargo bench --features "bytecode,nalgebra" --bench comparison # vs other libraries +cargo bench --features bytecode --bench comparison # vs other libraries ``` ### Gradient benchmark (Rosenbrock, lower is better) @@ -296,7 +308,7 @@ The [`echidna-optim`](echidna-optim/) crate provides optimization solvers and im ```toml [dependencies] -echidna-optim = "0.3" +echidna-optim = "0.4" ``` Optional features: `parallel` (enables rayon parallelism via `echidna/parallel`), `sparse-implicit` (sparse implicit differentiation via faer). diff --git a/SECURITY.md b/SECURITY.md index 18b67f5..63c0c2f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,8 +4,8 @@ | Version | Supported | |---------|-----------| -| 0.3.x | Yes | -| < 0.3 | No | +| 0.4.x | Yes | +| < 0.4 | No | Only the latest minor release receives security updates. @@ -30,7 +30,7 @@ echidna uses `unsafe` in the following scoped contexts: | Location | Purpose | |----------|---------| -| `tape.rs`, `bytecode_tape.rs`, `taylor_dyn.rs` | Thread-local mutable pointer dereference for tape/arena access. Each is RAII-guarded: the pointer is valid for the lifetime of the enclosing scope guard. | +| `tape.rs`, `bytecode_tape/thread_local.rs`, `taylor_dyn.rs` | Thread-local mutable pointer dereference for tape/arena access. Each is RAII-guarded: the pointer is valid for the lifetime of the enclosing scope guard. | | `checkpoint.rs` | Byte-level transmutation (`&[F]` ↔ `&[u8]`) for disk-backed checkpoint serialisation. Relies on `F: Float` being `f32`/`f64` (IEEE 754, no padding). | | `gpu/cuda_backend.rs` | FFI kernel launches via `cudarc`. Each call passes validated buffer sizes and grid dimensions to the CUDA driver. | | `traits/simba_impls.rs` | `extract_unchecked` / `replace_unchecked` for simba's `SimdValue` trait. Scalar types have only one lane, so the index is always 0. | diff --git a/docs/adr-deferred-work.md b/docs/adr-deferred-work.md index abc8a6c..419d1f4 100644 --- a/docs/adr-deferred-work.md +++ b/docs/adr-deferred-work.md @@ -2,8 +2,9 @@ **Date**: 2026-02-25 **Status**: Accepted +**Last updated**: 2026-02-25 (added STDE deferred items) -All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captures every evaluated-but-not-implemented item with explicit reasoning, so future planning doesn't re-investigate the same paths. +All core roadmap items (R1–R13), Phases 1–5, and the STDE deferred items plan are complete. This ADR captures every evaluated-but-not-implemented item with explicit reasoning, so future planning doesn't re-investigate the same paths. --- @@ -12,12 +13,26 @@ All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captur | Item | Blocker | Revisit When | |------|---------|--------------| | RUSTSEC-2024-0436 (paste/simba) | Upstream simba must release with paste alternative | simba publishes a new release | -| nalgebra 0.33 → 0.34 | nalgebra 0.34 requires Rust 1.87+; MSRV is 1.80 | MSRV is raised to 1.87+ | + +--- + +## Deferred (valuable, not yet needed) + +| Item | Reasoning | Revisit When | +|------|-----------|--------------| +| nalgebra 0.33 → 0.34 | MSRV blocker resolved (now 1.93, nalgebra 0.34 requires 1.87+). May have API changes in `NalgebraVec` and sparse matrix types. | When convenient; test thoroughly for API changes. | +| Indefinite dense STDE | Eigendecomposition-based approach for indefinite coefficient matrices C (6 parameters, sign-splitting into C⁺ - C⁻) adds significant API complexity. The positive-definite Cholesky case (`dense_stde_2nd`) covers most PDE use cases (Fokker-Planck, Black-Scholes, HJB). Users with indefinite C can manually split into C⁺ - C⁻, compute Cholesky factors for each, and call `dense_stde_2nd` twice. | A concrete user need for indefinite C arises | +| General-K GPU Taylor kernels | GPU Taylor kernel is hardcoded to K=3 (second-order). Hardcoding allows complete loop unrolling — critical for GPU performance. General K would need dynamic loops or a family of K-specialized kernels. | Need for GPU-accelerated 3rd+ order derivatives | +| Chunked GPU Taylor dispatch | Working buffer is `3 * num_variables * batch_size * 4` bytes. WebGPU's 128 MB limit caps `num_variables * batch_size ≤ ~10M`. For larger problems, the dispatch function should chunk the batch and accumulate results. | Users hit the buffer limit in practice | +| CUDA `laplacian_with_control_gpu` | `laplacian_with_control_gpu` is currently wgpu-only. CUDA equivalent (`laplacian_with_control_gpu_cuda`) is straightforward to add — same CPU-side Welford aggregation, just dispatches through `CudaContext`. | CUDA users need variance-reduced Laplacian | +| `taylor_forward_2nd_batch` in `GpuBackend` trait | Currently an inherent method on each backend, not part of the `GpuBackend` trait. Adding to the trait would enable generic code over backends but requires an associated type for Taylor results. | Multiple backends need to be used generically for Taylor | --- ## Rejected (evaluated, explicit reasoning) +### Core AD + | Item | Reasoning | What exists instead | |------|-----------|---------------------| | Constant deduplication (5.2) | `FloatBits` orphan rule blocks impl for `Dual`. CSE handles ops over duplicate constants. | CSE pass in bytecode tape | @@ -28,3 +43,11 @@ All core roadmap items (R1–R13) and Phases 1–5 are complete. This ADR captur | Preaccumulation of straight-line segments | Superseded by cross-country Markowitz elimination (R5), which achieves the same goal more generally. | `jacobian_cross_country()` | | Trait impl macros for num_traits | ~3,200 lines are mechanical but readable. Macros hurt error messages, IDE support, and debuggability. Low maintenance cost since impls rarely change. | Manual trait impls | | DiffOperator trait abstraction | Concrete estimators work well as standalone functions. `Estimator` trait (5.1) already provides the needed abstraction. Another trait layer would over-abstract. | `Estimator` trait + concrete functions | + +### STDE Paper Items + +| Item | Paper Section | Reasoning | What exists instead | +|------|---------------|-----------|---------------------| +| Multi-pushforward correction | App F, Eq 48-52 | Current collision-free prime-window approach in `diagonal_kth_order` is simpler and equally effective for practical operators. The correction addresses collisions in multi-index sampling that don't occur with the prime-window design. | `diagonal_kth_order`, `diagonal_kth_order_const` | +| Amortized PINN training | §5.1, Eq 25 | Application-level integration — amortizing STDE samples across training steps is a training loop concern, not a core AD primitive. Users can implement this in their training code using existing `stde::*` functions. | `stde::laplacian`, `stde::stde_sparse` | +| Weight sharing across network layers | App G | Network architecture concern — sharing Taylor jet computation across layers with shared weights requires network-level orchestration, not AD-level support. | — | diff --git a/docs/algorithms.md b/docs/algorithms.md index 81b6522..7606e71 100644 --- a/docs/algorithms.md +++ b/docs/algorithms.md @@ -72,7 +72,7 @@ inputs. The tape uses a Structure-of-Arrays (SoA) layout: separate contiguous arrays for opcodes, operand indices, and constant values. This layout improves cache utilization — the opcode stream is compact enough to fit in L1 cache for moderate-sized tapes, and the separation allows independent vectorization of -forward and reverse sweeps. The tape currently defines 38 opcodes covering all +forward and reverse sweeps. The tape currently defines 44 opcodes covering all standard elemental operations (arithmetic, transcendental, power, comparison). A key advantage of the graph representation is that it enables optimization @@ -505,11 +505,12 @@ numerically intensive phase. Custom operations are rejected at tape upload time since they cannot be compiled to GPU code. The wgpu backend (feature `gpu-wgpu`) provides cross-platform GPU access via -Metal, Vulkan, and DX12. It implements four WGSL compute shaders: `forward` +Metal, Vulkan, and DX12. It implements five WGSL compute shaders: `forward` (batched forward evaluation), `reverse` (batched adjoint sweep), -`tangent_forward` (forward tangent for JVP and sparse Jacobian), and -`tangent_reverse` (forward-over-reverse for HVP and sparse Hessian). All 43 -opcodes are implemented in each shader. The backend is limited to f32 due to +`tangent_forward` (forward tangent for JVP and sparse Jacobian), +`tangent_reverse` (forward-over-reverse for HVP and sparse Hessian), and +`taylor_forward_2nd` (batched second-order Taylor forward propagation for STDE). +All 44 opcodes are implemented in each shader. The backend is limited to f32 due to WGSL's lack of native f64 support. The CUDA backend (feature `gpu-cuda`) targets NVIDIA GPUs and supports both f32 and f64. It uses a single templated CUDA kernel file compiled via NVRTC at runtime, instantiated for both `float` diff --git a/docs/roadmap.md b/docs/roadmap.md index af6e99d..80660c0 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -5,7 +5,7 @@ This roadmap synthesizes: - Deferred items from all implementation phases to date - Remaining phases from the [book breakdown](book-breakdown.md) -- Taylor mode AD and stochastic estimators from the STDE paper (Schatz et al., NeurIPS 2024) +- Taylor mode AD and stochastic estimators from the STDE paper (Shi et al., NeurIPS 2024) - Organic improvements identified during implementation --- @@ -70,7 +70,7 @@ Given `y' = f(y)`, computes the Taylor expansion of `y(t)` to order K via coeffi Key file: `src/bytecode_tape.rs`. ### R2. Stochastic Taylor Derivative Estimators (STDE) -**STDE paper (Schatz et al., NeurIPS 2024)** +**STDE paper (Shi et al., NeurIPS 2024)** Builds on R1. Uses random jets to estimate differential operators without computing full derivative tensors. @@ -219,7 +219,7 @@ Two GPU backends for batched tape evaluation: **CUDA backend** (`gpu-cuda` feature) — NVIDIA only, f32 + f64. Single templated CUDA kernel file `tape_eval.cu` with four kernels (`forward_eval`, `reverse_sweep`, `tangent_forward`, `tangent_reverse`), compiled via NVRTC at runtime for both float and double. `CudaContext` mirrors the wgpu API surface plus f64 variants: `forward_batch_f64`, `gradient_batch_f64`, `sparse_jacobian_f64`, `sparse_hessian_f64`, `hvp_batch_f64`. 6 CUDA tests (f32 + f64). -CPU-side sparsity detection and graph coloring drive the GPU tangent sweeps. Custom ops are rejected at upload time (`GpuTapeData::from_tape`). All 38 opcodes implemented in all four shader/kernel variants. +CPU-side sparsity detection and graph coloring drive the GPU tangent sweeps. Custom ops are rejected at upload time (`GpuTapeData::from_tape`). All 44 opcodes implemented in all five shader/kernel variants. Key files: `src/gpu/mod.rs`, `src/gpu/wgpu_backend.rs`, `src/gpu/cuda_backend.rs`, `src/gpu/shaders/forward.wgsl`, `src/gpu/shaders/reverse.wgsl`, `src/gpu/shaders/tangent_forward.wgsl`, `src/gpu/shaders/tangent_reverse.wgsl`, `src/gpu/kernels/tape_eval.cu`, `tests/gpu_wgpu_tests.rs`, `tests/gpu_cuda_tests.rs`. diff --git a/echidna-optim/Cargo.toml b/echidna-optim/Cargo.toml index 0f66d91..267f5ba 100644 --- a/echidna-optim/Cargo.toml +++ b/echidna-optim/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echidna-optim" -version = "0.3.0" +version = "0.4.0" edition = "2021" rust-version = "1.93" description = "Optimization solvers and implicit differentiation for echidna" @@ -12,7 +12,7 @@ keywords = ["optimization", "differentiation", "implicit", "gradient", "scientif categories = ["mathematics", "science", "algorithms"] [dependencies] -echidna = { version = "0.3.0", path = "..", features = ["bytecode"] } +echidna = { version = "0.4.0", path = "..", features = ["bytecode"] } num-traits = "0.2" faer = { version = "0.24", optional = true } diff --git a/echidna-optim/tests/integration.rs b/echidna-optim/tests/integration.rs index d52bf13..b586bf0 100644 --- a/echidna-optim/tests/integration.rs +++ b/echidna-optim/tests/integration.rs @@ -17,7 +17,7 @@ fn rosenbrock(x: &[T]) -> T { } fn make_tape() -> echidna::BytecodeTape { - let (tape, _) = echidna::record(|x| rosenbrock(x), &[0.0_f64, 0.0]); + let (tape, _) = echidna::record(rosenbrock, &[0.0_f64, 0.0]); tape } @@ -97,7 +97,7 @@ fn newton_fewer_iters_than_lbfgs() { #[test] fn optimized_tape_still_works() { - let (mut tape, _) = echidna::record(|x| rosenbrock(x), &[0.0_f64, 0.0]); + let (mut tape, _) = echidna::record(rosenbrock, &[0.0_f64, 0.0]); tape.optimize(); let mut obj = TapeObjective::new(tape); diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs new file mode 100644 index 0000000..b6e9d85 --- /dev/null +++ b/echidna-optim/tests/solvers.rs @@ -0,0 +1,651 @@ +use echidna::Scalar; +use echidna_optim::{ + lbfgs, newton, trust_region, ConvergenceParams, LbfgsConfig, NewtonConfig, Objective, + OptimResult, TapeObjective, TerminationReason, TrustRegionConfig, +}; + +// ============================================================ +// Test objectives +// ============================================================ + +/// f(x) = 0.5 * sum(x_i^2). Minimum at origin, value 0. +struct Quadratic { + dim: usize, +} + +impl Objective for Quadratic { + fn dim(&self) -> usize { + self.dim + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let f: f64 = 0.5 * x.iter().map(|&xi| xi * xi).sum::(); + let g: Vec = x.to_vec(); + (f, g) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let (f, g) = self.eval_grad(x); + let n = x.len(); + let mut h = vec![vec![0.0; n]; n]; + for (i, row) in h.iter_mut().enumerate() { + row[i] = 1.0; + } + (f, g, h) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let g = x.to_vec(); + let hv = v.to_vec(); // H = I, so Hv = v + (g, hv) + } +} + +/// f(x) = 0.5 * (a*x0^2 + b*x1^2). Ill-conditioned when a/b >> 1. +struct IllConditionedQuadratic { + a: f64, + b: f64, +} + +impl Objective for IllConditionedQuadratic { + fn dim(&self) -> usize { + 2 + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let f = 0.5 * (self.a * x[0] * x[0] + self.b * x[1] * x[1]); + let g = vec![self.a * x[0], self.b * x[1]]; + (f, g) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let (f, g) = self.eval_grad(x); + let h = vec![vec![self.a, 0.0], vec![0.0, self.b]]; + (f, g, h) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let g = vec![self.a * x[0], self.b * x[1]]; + let hv = vec![self.a * v[0], self.b * v[1]]; + (g, hv) + } +} + +/// Rosenbrock: f(x) = (1 - x0)^2 + 100*(x1 - x0^2)^2. Minimum at (1,1), value 0. +struct Rosenbrock2D; + +impl Objective for Rosenbrock2D { + fn dim(&self) -> usize { + 2 + } + + fn eval_grad(&mut self, x: &[f64]) -> (f64, Vec) { + let a = 1.0 - x[0]; + let b = x[1] - x[0] * x[0]; + let f = a * a + 100.0 * b * b; + let g0 = -2.0 * a - 400.0 * x[0] * b; + let g1 = 200.0 * b; + (f, vec![g0, g1]) + } + + fn eval_hessian(&mut self, x: &[f64]) -> (f64, Vec, Vec>) { + let a = 1.0 - x[0]; + let b = x[1] - x[0] * x[0]; + let f = a * a + 100.0 * b * b; + let g0 = -2.0 * a - 400.0 * x[0] * b; + let g1 = 200.0 * b; + + let h00 = 2.0 - 400.0 * (x[1] - 3.0 * x[0] * x[0]); + let h01 = -400.0 * x[0]; + let h11 = 200.0; + + (f, vec![g0, g1], vec![vec![h00, h01], vec![h01, h11]]) + } + + fn hvp(&mut self, x: &[f64], v: &[f64]) -> (Vec, Vec) { + let h00 = 2.0 - 400.0 * (x[1] - 3.0 * x[0] * x[0]); + let h01 = -400.0 * x[0]; + let h11 = 200.0; + + let g0 = -2.0 * (1.0 - x[0]) - 400.0 * x[0] * (x[1] - x[0] * x[0]); + let g1 = 200.0 * (x[1] - x[0] * x[0]); + + let hv0 = h00 * v[0] + h01 * v[1]; + let hv1 = h01 * v[0] + h11 * v[1]; + + (vec![g0, g1], vec![hv0, hv1]) + } +} + +fn assert_near_origin(result: &OptimResult, tol: f64) { + for (i, &xi) in result.x.iter().enumerate() { + assert!( + xi.abs() < tol, + "x[{}] = {}, expected ~0", + i, + xi + ); + } + assert!(result.value < tol, "f = {}, expected ~0", result.value); +} + +// ============================================================ +// Newton: simple quadratic +// ============================================================ + +#[test] +fn newton_quadratic_2d() { + let mut obj = Quadratic { dim: 2 }; + let config = NewtonConfig::default(); + let result = newton(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + // Newton on a quadratic converges in 1 iteration + assert_eq!(result.iterations, 1, "Newton should solve quadratic in 1 step"); + assert_near_origin(&result, 1e-10); +} + +#[test] +fn newton_quadratic_4d() { + let mut obj = Quadratic { dim: 4 }; + let config = NewtonConfig::default(); + let result = newton(&mut obj, &[10.0, -7.0, 3.0, -1.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 1); + assert_near_origin(&result, 1e-10); +} + +#[test] +fn newton_ill_conditioned() { + let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 }; + let config = NewtonConfig::default(); + let result = newton(&mut obj, &[5.0, -3.0], &config); + + // Newton with exact Hessian is unaffected by conditioning + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 1); + assert_near_origin(&result, 1e-10); +} + +#[test] +fn newton_at_optimum() { + let mut obj = Quadratic { dim: 2 }; + let config = NewtonConfig::default(); + let result = newton(&mut obj, &[0.0, 0.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 0); + assert_eq!(result.func_evals, 1); // single eval to check gradient +} + +#[test] +fn newton_max_iter_zero() { + let mut obj = Quadratic { dim: 2 }; + let config = NewtonConfig { + convergence: ConvergenceParams { + max_iter: 0, + ..Default::default() + }, + ..Default::default() + }; + let result = newton(&mut obj, &[1.0, 1.0], &config); + assert_eq!(result.termination, TerminationReason::NumericalError); + assert_eq!(result.iterations, 0); +} + +// ============================================================ +// L-BFGS: simple quadratic +// ============================================================ + +#[test] +fn lbfgs_quadratic_2d() { + let mut obj = Quadratic { dim: 2 }; + let config = LbfgsConfig::default(); + let result = lbfgs(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn lbfgs_quadratic_8d() { + let mut obj = Quadratic { dim: 8 }; + let config = LbfgsConfig::default(); + let result = lbfgs(&mut obj, &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn lbfgs_ill_conditioned() { + let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 }; + let config = LbfgsConfig::default(); + let result = lbfgs(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-4); +} + +#[test] +fn lbfgs_memory_1() { + let mut obj = Quadratic { dim: 2 }; + let config = LbfgsConfig { + memory: 1, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn lbfgs_at_optimum() { + let mut obj = Quadratic { dim: 3 }; + let config = LbfgsConfig::default(); + let result = lbfgs(&mut obj, &[0.0, 0.0, 0.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 0); +} + +#[test] +fn lbfgs_max_iter_zero() { + let mut obj = Quadratic { dim: 2 }; + let config = LbfgsConfig { + convergence: ConvergenceParams { + max_iter: 0, + ..Default::default() + }, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[1.0, 1.0], &config); + assert_eq!(result.termination, TerminationReason::NumericalError); +} + +#[test] +fn lbfgs_memory_zero() { + let mut obj = Quadratic { dim: 2 }; + let config = LbfgsConfig { + memory: 0, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[1.0, 1.0], &config); + assert_eq!(result.termination, TerminationReason::NumericalError); +} + +// ============================================================ +// Trust-region: simple quadratic +// ============================================================ + +#[test] +fn trust_region_quadratic_2d() { + let mut obj = Quadratic { dim: 2 }; + let config = TrustRegionConfig::default(); + let result = trust_region(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn trust_region_quadratic_8d() { + let mut obj = Quadratic { dim: 8 }; + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region( + &mut obj, + &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], + &config, + ); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn trust_region_ill_conditioned() { + let mut obj = IllConditionedQuadratic { a: 1000.0, b: 1.0 }; + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &[5.0, -3.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-4); +} + +#[test] +fn trust_region_at_optimum() { + let mut obj = Quadratic { dim: 2 }; + let config = TrustRegionConfig::default(); + let result = trust_region(&mut obj, &[0.0, 0.0], &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 0); +} + +#[test] +fn trust_region_small_radius() { + let mut obj = Quadratic { dim: 2 }; + let config = TrustRegionConfig { + initial_radius: 0.01, + max_radius: 100.0, + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &[5.0, -3.0], &config); + + // Should still converge, just take more iterations + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn trust_region_max_iter_zero() { + let mut obj = Quadratic { dim: 2 }; + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 0, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &[1.0, 1.0], &config); + assert_eq!(result.termination, TerminationReason::NumericalError); +} + +#[test] +fn trust_region_zero_radius() { + let mut obj = Quadratic { dim: 2 }; + let config = TrustRegionConfig { + initial_radius: 0.0, + ..Default::default() + }; + let result = trust_region(&mut obj, &[1.0, 1.0], &config); + assert_eq!(result.termination, TerminationReason::NumericalError); +} + +// ============================================================ +// Convergence criteria +// ============================================================ + +#[test] +fn func_tol_terminates_lbfgs() { + // Use Rosenbrock (takes many iterations) so func_tol can fire + let mut obj = Rosenbrock2D; + let config = LbfgsConfig { + convergence: ConvergenceParams { + max_iter: 1000, + grad_tol: 0.0, // disable gradient termination + step_tol: 0.0, // disable step termination + func_tol: 1e-2, // generous func_tol — fires when progress stalls + }, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[0.0, 0.0], &config); + + assert_eq!(result.termination, TerminationReason::FunctionChange); + // Should have made progress toward the minimum + assert!(result.value < 1.0, "f = {}", result.value); +} + +#[test] +fn max_iter_terminates_lbfgs() { + // Use Rosenbrock so the solver doesn't converge before max_iter + let mut obj = Rosenbrock2D; + let config = LbfgsConfig { + convergence: ConvergenceParams { + max_iter: 3, + grad_tol: 0.0, + step_tol: 0.0, + func_tol: 0.0, + }, + ..Default::default() + }; + let result = lbfgs(&mut obj, &[0.0, 0.0], &config); + + assert_eq!(result.termination, TerminationReason::MaxIterations); + assert_eq!(result.iterations, 3); +} + +#[test] +fn func_tol_terminates_newton() { + // Use Rosenbrock so Newton takes multiple iterations + let mut obj = Rosenbrock2D; + let config = NewtonConfig { + convergence: ConvergenceParams { + max_iter: 1000, + grad_tol: 0.0, + step_tol: 0.0, + func_tol: 1e-2, + }, + ..Default::default() + }; + let result = newton(&mut obj, &[0.0, 0.0], &config); + + assert_eq!( + result.termination, + TerminationReason::FunctionChange, + "terminated with {:?}", + result.termination + ); +} + +// ============================================================ +// Cross-solver agreement on quadratic +// ============================================================ + +#[test] +fn all_solvers_agree_quadratic() { + let x0 = &[3.0, -4.0, 1.0]; + + let mut obj1 = Quadratic { dim: 3 }; + let r1 = lbfgs(&mut obj1, x0, &LbfgsConfig::default()); + + let mut obj2 = Quadratic { dim: 3 }; + let r2 = newton(&mut obj2, x0, &NewtonConfig::default()); + + let mut obj3 = Quadratic { dim: 3 }; + let r3 = trust_region( + &mut obj3, + x0, + &TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }, + ); + + for (name, r) in [("L-BFGS", &r1), ("Newton", &r2), ("Trust-region", &r3)] { + assert_eq!( + r.termination, + TerminationReason::GradientNorm, + "{} did not converge: {:?}", + name, + r.termination + ); + assert_near_origin(r, 1e-6); + } +} + +// ============================================================ +// TapeObjective tests (higher-dimensional, tape-based) +// ============================================================ + +fn quadratic_tape(x: &[T]) -> T { + let half = T::from_f(::from_f64(0.5).unwrap()); + let mut sum = T::zero(); + for &xi in x { + sum = sum + xi * xi; + } + half * sum +} + +fn rosenbrock(x: &[T]) -> T { + let one = T::from_f(::from_f64(1.0).unwrap()); + let hundred = T::from_f(::from_f64(100.0).unwrap()); + let mut sum = T::zero(); + for i in 0..x.len() - 1 { + let t1 = one - x[i]; + let t2 = x[i + 1] - x[i] * x[i]; + sum = sum + t1 * t1 + hundred * t2 * t2; + } + sum +} + +#[test] +fn tape_lbfgs_quadratic_6d() { + let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0]; + let (tape, _) = echidna::record(quadratic_tape, &x0); + let mut obj = TapeObjective::new(tape); + let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default()); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn tape_newton_quadratic_6d() { + let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0]; + let (tape, _) = echidna::record(quadratic_tape, &x0); + let mut obj = TapeObjective::new(tape); + let result = newton(&mut obj, &x0, &NewtonConfig::default()); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_eq!(result.iterations, 1); + assert_near_origin(&result, 1e-10); +} + +#[test] +fn tape_trust_region_quadratic_6d() { + let x0 = [1.0_f64, -2.0, 3.0, -4.0, 5.0, -6.0]; + let (tape, _) = echidna::record(quadratic_tape, &x0); + let mut obj = TapeObjective::new(tape); + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 200, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &x0, &config); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +#[test] +fn tape_rosenbrock_4d_newton() { + let x0 = [0.0_f64, 0.0, 0.0, 0.0]; + let (tape, _) = echidna::record(rosenbrock, &x0); + let mut obj = TapeObjective::new(tape); + let result = newton(&mut obj, &x0, &NewtonConfig::default()); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + for i in 0..4 { + assert!( + (result.x[i] - 1.0).abs() < 1e-5, + "x[{}] = {}, expected 1.0", + i, + result.x[i] + ); + } +} + +#[test] +fn tape_rosenbrock_4d_lbfgs() { + let x0 = [0.0_f64, 0.0, 0.0, 0.0]; + let (tape, _) = echidna::record(rosenbrock, &x0); + let mut obj = TapeObjective::new(tape); + let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default()); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + for i in 0..4 { + assert!( + (result.x[i] - 1.0).abs() < 1e-4, + "x[{}] = {}, expected 1.0", + i, + result.x[i] + ); + } +} + +#[test] +fn tape_rosenbrock_4d_trust_region() { + let x0 = [0.0_f64, 0.0, 0.0, 0.0]; + let (tape, _) = echidna::record(rosenbrock, &x0); + let mut obj = TapeObjective::new(tape); + let config = TrustRegionConfig { + convergence: ConvergenceParams { + max_iter: 500, + ..Default::default() + }, + ..Default::default() + }; + let result = trust_region(&mut obj, &x0, &config); + + assert_eq!( + result.termination, + TerminationReason::GradientNorm, + "terminated with {:?} after {} iters", + result.termination, + result.iterations + ); + for i in 0..4 { + assert!( + (result.x[i] - 1.0).abs() < 1e-4, + "x[{}] = {}, expected 1.0", + i, + result.x[i] + ); + } +} + +// ============================================================ +// TapeObjective with optimized tape +// ============================================================ + +#[test] +fn optimized_tape_quadratic() { + let x0 = [2.0_f64, -3.0, 4.0]; + let (mut tape, _) = echidna::record(quadratic_tape, &x0); + tape.optimize(); + + let mut obj = TapeObjective::new(tape); + let result = lbfgs(&mut obj, &x0, &LbfgsConfig::default()); + + assert_eq!(result.termination, TerminationReason::GradientNorm); + assert_near_origin(&result, 1e-6); +} + +// ============================================================ +// func_evals tracking +// ============================================================ + +#[test] +fn tape_objective_counts_evals() { + let x0 = [1.0_f64, 1.0]; + let (tape, _) = echidna::record(quadratic_tape, &x0); + let mut obj = TapeObjective::new(tape); + assert_eq!(obj.func_evals(), 0); + + let _ = lbfgs(&mut obj, &x0, &LbfgsConfig::default()); + assert!(obj.func_evals() > 0, "should have counted evaluations"); +} diff --git a/src/gpu/cuda_backend.rs b/src/gpu/cuda_backend.rs index 32238ba..8807398 100644 --- a/src/gpu/cuda_backend.rs +++ b/src/gpu/cuda_backend.rs @@ -29,6 +29,394 @@ const KERNEL_SRC: &str = include_str!("kernels/tape_eval.cu"); const TAYLOR_KERNEL_SRC: &str = include_str!("kernels/taylor_eval.cu"); const BLOCK_SIZE: u32 = 256; +// ── Macros to deduplicate f32/f64 CUDA dispatch methods ── +// +// Each macro expands to a block expression returning the method's result type. +// Parameters are: self, tape buffer, kernel field, constants field, and float type. + +macro_rules! cuda_forward_batch_body { + ($self:expr, $tape:expr, $inputs:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{ + let s = &$self.stream; + let ni = $tape.num_inputs; + let nv = $tape.num_variables; + let no = $tape.num_outputs; + + assert_eq!($inputs.len(), ($batch_size * ni) as usize); + + let d_inputs = s.clone_htod($inputs).map_err(cuda_err)?; + let mut d_values = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_outputs = s + .alloc_zeros::<$F>(($batch_size * no) as usize) + .map_err(cuda_err)?; + + let cfg = LaunchConfig { + grid_dim: CudaContext::grid_dim($batch_size), + block_dim: CudaContext::block_dim(), + shared_mem_bytes: 0, + }; + + let mut builder = s.launch_builder(&$self.$kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&$tape.$constants); + builder.arg(&d_inputs); + builder.arg(&mut d_values); + builder.arg(&mut d_outputs); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&ni); + builder.arg(&nv); + builder.arg(&no); + builder.arg(&$batch_size); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + s.synchronize().map_err(cuda_err)?; + let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; + Ok(results) + }}; +} + +macro_rules! cuda_gradient_batch_body { + ($self:expr, $tape:expr, $inputs:expr, $batch_size:expr, $F:ty, $constants:ident, $fwd_kernel:ident, $rev_kernel:ident) => {{ + let s = &$self.stream; + let ni = $tape.num_inputs; + let nv = $tape.num_variables; + let no = $tape.num_outputs; + + assert_eq!($inputs.len(), ($batch_size * ni) as usize); + + let d_inputs = s.clone_htod($inputs).map_err(cuda_err)?; + let mut d_values = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_outputs = s + .alloc_zeros::<$F>(($batch_size * no) as usize) + .map_err(cuda_err)?; + + let cfg = LaunchConfig { + grid_dim: CudaContext::grid_dim($batch_size), + block_dim: CudaContext::block_dim(), + shared_mem_bytes: 0, + }; + + // Forward pass + let mut builder = s.launch_builder(&$self.$fwd_kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&$tape.$constants); + builder.arg(&d_inputs); + builder.arg(&mut d_values); + builder.arg(&mut d_outputs); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&ni); + builder.arg(&nv); + builder.arg(&no); + builder.arg(&$batch_size); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + // Reverse pass + let mut d_adjoints = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_grads = s + .alloc_zeros::<$F>(($batch_size * ni) as usize) + .map_err(cuda_err)?; + + let mut builder = s.launch_builder(&$self.$rev_kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&d_values); + builder.arg(&mut d_adjoints); + builder.arg(&mut d_grads); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&ni); + builder.arg(&nv); + builder.arg(&$batch_size); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + s.synchronize().map_err(cuda_err)?; + let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; + let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; + Ok((output_vals, grads)) + }}; +} + +macro_rules! cuda_hvp_batch_body { + ($self:expr, $tape:expr, $x:expr, $tangent_dirs:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{ + let s = &$self.stream; + let ni = $tape.num_inputs; + let nv = $tape.num_variables; + + assert_eq!($x.len(), ni as usize); + assert_eq!($tangent_dirs.len(), ($batch_size * ni) as usize); + + let mut primal_inputs = Vec::with_capacity(($batch_size * ni) as usize); + for _ in 0..$batch_size { + primal_inputs.extend_from_slice($x); + } + + let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?; + let d_seeds = s.clone_htod($tangent_dirs).map_err(cuda_err)?; + let mut d_primals = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_tans = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_adj_re = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_adj_eps = s + .alloc_zeros::<$F>(($batch_size * nv) as usize) + .map_err(cuda_err)?; + let mut d_grads = s + .alloc_zeros::<$F>(($batch_size * ni) as usize) + .map_err(cuda_err)?; + let mut d_hvps = s + .alloc_zeros::<$F>(($batch_size * ni) as usize) + .map_err(cuda_err)?; + + let cfg = LaunchConfig { + grid_dim: CudaContext::grid_dim($batch_size), + block_dim: CudaContext::block_dim(), + shared_mem_bytes: 0, + }; + + let mut builder = s.launch_builder(&$self.$kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&$tape.$constants); + builder.arg(&d_primal_in); + builder.arg(&d_seeds); + builder.arg(&mut d_primals); + builder.arg(&mut d_tans); + builder.arg(&mut d_adj_re); + builder.arg(&mut d_adj_eps); + builder.arg(&mut d_grads); + builder.arg(&mut d_hvps); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&ni); + builder.arg(&nv); + builder.arg(&$batch_size); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + s.synchronize().map_err(cuda_err)?; + let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; + let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?; + Ok((grads, hvps)) + }}; +} + +macro_rules! cuda_sparse_jacobian_body { + ($self:expr, $tape:expr, $tape_cpu:expr, $x:expr, $F:ty, $constants:ident, $tangent_fwd_kernel:ident) => {{ + let ni = $tape.num_inputs as usize; + let no = $tape.num_outputs as usize; + + let pattern = $tape_cpu.detect_jacobian_sparsity(); + let (colors, num_colors) = crate::sparse::column_coloring(&pattern); + + if num_colors == 0 { + $tape_cpu.forward($x); + let vals = $tape_cpu.output_values(); + return Ok((vals, pattern, vec![])); + } + + let batch = num_colors as u32; + let mut seeds = Vec::with_capacity(batch as usize * ni); + for c in 0..num_colors { + for i in 0..ni { + seeds.push(if colors[i] == c as u32 { + (1.0 as $F) + } else { + (0.0 as $F) + }); + } + } + + let s = &$self.stream; + let nv = $tape.num_variables; + + let d_primals_in = { + let mut replicated = Vec::with_capacity(batch as usize * ni); + for _ in 0..batch { + replicated.extend_from_slice($x); + } + s.clone_htod(&replicated).map_err(cuda_err)? + }; + let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?; + let mut d_primals = s + .alloc_zeros::<$F>((batch * nv) as usize) + .map_err(cuda_err)?; + let mut d_tangents = s + .alloc_zeros::<$F>((batch * nv) as usize) + .map_err(cuda_err)?; + let mut d_tangent_out = s + .alloc_zeros::<$F>((batch * $tape.num_outputs) as usize) + .map_err(cuda_err)?; + + let cfg = LaunchConfig { + grid_dim: CudaContext::grid_dim(batch), + block_dim: CudaContext::block_dim(), + shared_mem_bytes: 0, + }; + + let mut builder = s.launch_builder(&$self.$tangent_fwd_kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&$tape.$constants); + builder.arg(&d_primals_in); + builder.arg(&d_seeds); + builder.arg(&mut d_primals); + builder.arg(&mut d_tangents); + builder.arg(&mut d_tangent_out); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&$tape.num_inputs); + builder.arg(&nv); + builder.arg(&$tape.num_outputs); + builder.arg(&batch); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + s.synchronize().map_err(cuda_err)?; + let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?; + + $tape_cpu.forward($x); + let output_values = $tape_cpu.output_values(); + + let nnz = pattern.nnz(); + let mut jac_values = vec![(0.0 as $F); nnz]; + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + let c = colors[col as usize] as usize; + jac_values[k] = tangent_outs[c * no + row as usize]; + } + + Ok((output_values, pattern, jac_values)) + }}; +} + +macro_rules! cuda_sparse_hessian_body { + ($self:expr, $tape:expr, $tape_cpu:expr, $x:expr, $F:ty, $hvp_method:ident) => {{ + let ni = $tape.num_inputs as usize; + + let pattern = $tape_cpu.detect_sparsity(); + let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); + + if num_colors == 0 { + $tape_cpu.forward($x); + let val = $tape_cpu.output_value(); + let grad = $tape_cpu.gradient($x); + return Ok((val, grad, pattern, vec![])); + } + + let batch = num_colors as u32; + let mut tangent_dirs = Vec::with_capacity(batch as usize * ni); + for c in 0..num_colors { + for i in 0..ni { + tangent_dirs.push(if colors[i] == c as u32 { + (1.0 as $F) + } else { + (0.0 as $F) + }); + } + } + + let (grads, hvps) = $self.$hvp_method($tape, $x, &tangent_dirs, batch)?; + + let gradient: Vec<$F> = grads[..ni].to_vec(); + + let nnz = pattern.nnz(); + let mut hess_values = vec![(0.0 as $F); nnz]; + for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { + let c = colors[col as usize] as usize; + hess_values[k] = hvps[c * ni + row as usize]; + } + + $tape_cpu.forward($x); + let value = $tape_cpu.output_value(); + + Ok((value, gradient, pattern, hess_values)) + }}; +} + +macro_rules! cuda_taylor_fwd_2nd_body { + ($self:expr, $tape:expr, $primal_inputs:expr, $direction_seeds:expr, $batch_size:expr, $F:ty, $constants:ident, $kernel:ident) => {{ + let s = &$self.stream; + let ni = $tape.num_inputs; + let nv = $tape.num_variables; + let no = $tape.num_outputs; + let total_in = ($batch_size * ni) as usize; + + assert_eq!( + $primal_inputs.len(), + total_in, + "primal_inputs length mismatch" + ); + assert_eq!( + $direction_seeds.len(), + total_in, + "direction_seeds length mismatch" + ); + + let d_primals = s.clone_htod($primal_inputs).map_err(cuda_err)?; + let d_seeds = s.clone_htod($direction_seeds).map_err(cuda_err)?; + let mut d_jets = s + .alloc_zeros::<$F>(($batch_size * nv * 3) as usize) + .map_err(cuda_err)?; + let mut d_jet_out = s + .alloc_zeros::<$F>(($batch_size * no * 3) as usize) + .map_err(cuda_err)?; + + let cfg = LaunchConfig { + grid_dim: CudaContext::grid_dim($batch_size), + block_dim: CudaContext::block_dim(), + shared_mem_bytes: 0, + }; + + let mut builder = s.launch_builder(&$self.$kernel); + builder.arg(&$tape.opcodes); + builder.arg(&$tape.arg0); + builder.arg(&$tape.arg1); + builder.arg(&$tape.$constants); + builder.arg(&d_primals); + builder.arg(&d_seeds); + builder.arg(&mut d_jets); + builder.arg(&mut d_jet_out); + builder.arg(&$tape.output_indices); + builder.arg(&$tape.num_ops); + builder.arg(&ni); + builder.arg(&nv); + builder.arg(&no); + builder.arg(&$batch_size); + unsafe { builder.launch(cfg) }.map_err(cuda_err)?; + + s.synchronize().map_err(cuda_err)?; + let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?; + + // Deinterleave [c0, c1, c2, c0, c1, c2, ...] + let total_out = ($batch_size * no) as usize; + let mut values = Vec::with_capacity(total_out); + let mut c1s = Vec::with_capacity(total_out); + let mut c2s = Vec::with_capacity(total_out); + for i in 0..total_out { + values.push(raw[i * 3]); + c1s.push(raw[i * 3 + 1]); + c2s.push(raw[i * 3 + 2]); + } + + Ok(super::TaylorBatchResult { values, c1s, c2s }) + }}; +} + /// Convert any `Display` error into `GpuError::Other`. fn cuda_err(e: impl std::fmt::Display) -> GpuError { GpuError::Other(format!("{e}")) @@ -197,218 +585,27 @@ impl GpuBackend for CudaContext { inputs: &[f32], batch_size: u32, ) -> Result, GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; + cuda_forward_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32) + } - assert_eq!(inputs.len(), (batch_size * ni) as usize); + fn gradient_batch( + &self, + tape: &CudaTapeBuffers, + inputs: &[f32], + batch_size: u32, + ) -> Result<(Vec, Vec), GpuError> { + cuda_gradient_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32, reverse_f32) + } - let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?; - let mut d_values = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_outputs = s - .alloc_zeros::((batch_size * no) as usize) - .map_err(cuda_err)?; + fn sparse_jacobian( + &self, + tape: &CudaTapeBuffers, + tape_cpu: &mut crate::BytecodeTape, + x: &[f32], + ) -> Result<(Vec, crate::sparse::JacobianSparsityPattern, Vec), GpuError> { + cuda_sparse_jacobian_body!(self, tape, tape_cpu, x, f32, constants_f32, tangent_fwd_f32) + } - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.forward_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f32); - builder.arg(&d_inputs); - builder.arg(&mut d_values); - builder.arg(&mut d_outputs); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; - Ok(results) - } - - /// Batched gradient (forward + reverse) (f32). - fn gradient_batch( - &self, - tape: &CudaTapeBuffers, - inputs: &[f32], - batch_size: u32, - ) -> Result<(Vec, Vec), GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; - - assert_eq!(inputs.len(), (batch_size * ni) as usize); - - let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?; - let mut d_values = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_outputs = s - .alloc_zeros::((batch_size * no) as usize) - .map_err(cuda_err)?; - - // Forward pass - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.forward_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f32); - builder.arg(&d_inputs); - builder.arg(&mut d_values); - builder.arg(&mut d_outputs); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - // Reverse pass - let mut d_adjoints = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_grads = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - - let mut builder = s.launch_builder(&self.reverse_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&d_values); - builder.arg(&mut d_adjoints); - builder.arg(&mut d_grads); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; - let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; - Ok((output_vals, grads)) - } - - /// Sparse Jacobian using forward-mode tangent sweeps (f32). - fn sparse_jacobian( - &self, - tape: &CudaTapeBuffers, - tape_cpu: &mut crate::BytecodeTape, - x: &[f32], - ) -> Result<(Vec, crate::sparse::JacobianSparsityPattern, Vec), GpuError> { - let ni = tape.num_inputs as usize; - let no = tape.num_outputs as usize; - - let pattern = tape_cpu.detect_jacobian_sparsity(); - let (colors, num_colors) = crate::sparse::column_coloring(&pattern); - - if num_colors == 0 { - tape_cpu.forward(x); - let vals = tape_cpu.output_values(); - return Ok((vals, pattern, vec![])); - } - - // Build tangent seeds for each color - let batch = num_colors as u32; - let mut seeds = Vec::with_capacity(batch as usize * ni); - for c in 0..num_colors { - for i in 0..ni { - seeds.push(if colors[i] == c as u32 { - 1.0f32 - } else { - 0.0f32 - }); - } - } - - // Dispatch tangent forward kernel - let s = &self.stream; - let nv = tape.num_variables; - - let d_primals_in = { - let mut replicated = Vec::with_capacity(batch as usize * ni); - for _ in 0..batch { - replicated.extend_from_slice(x); - } - s.clone_htod(&replicated).map_err(cuda_err)? - }; - let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?; - let mut d_primals = s - .alloc_zeros::((batch * nv) as usize) - .map_err(cuda_err)?; - let mut d_tangents = s - .alloc_zeros::((batch * nv) as usize) - .map_err(cuda_err)?; - let mut d_tangent_out = s - .alloc_zeros::((batch * tape.num_outputs) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.tangent_fwd_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f32); - builder.arg(&d_primals_in); - builder.arg(&d_seeds); - builder.arg(&mut d_primals); - builder.arg(&mut d_tangents); - builder.arg(&mut d_tangent_out); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&tape.num_inputs); - builder.arg(&nv); - builder.arg(&tape.num_outputs); - builder.arg(&batch); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?; - - // Get output values from CPU - tape_cpu.forward(x); - let output_values = tape_cpu.output_values(); - - // Extract Jacobian entries - let nnz = pattern.nnz(); - let mut jac_values = vec![0.0f32; nnz]; - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let c = colors[col as usize] as usize; - jac_values[k] = tangent_outs[c * no + row as usize]; - } - - Ok((output_values, pattern, jac_values)) - } - - /// HVP batch (forward-over-reverse) (f32). fn hvp_batch( &self, tape: &CudaTapeBuffers, @@ -416,123 +613,20 @@ impl GpuBackend for CudaContext { tangent_dirs: &[f32], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - - assert_eq!(x.len(), ni as usize); - assert_eq!(tangent_dirs.len(), (batch_size * ni) as usize); - - let mut primal_inputs = Vec::with_capacity((batch_size * ni) as usize); - for _ in 0..batch_size { - primal_inputs.extend_from_slice(x); - } - - let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?; - let d_seeds = s.clone_htod(tangent_dirs).map_err(cuda_err)?; - let mut d_primals = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_tans = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_adj_re = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_adj_eps = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_grads = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - let mut d_hvps = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.tangent_rev_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f32); - builder.arg(&d_primal_in); - builder.arg(&d_seeds); - builder.arg(&mut d_primals); - builder.arg(&mut d_tans); - builder.arg(&mut d_adj_re); - builder.arg(&mut d_adj_eps); - builder.arg(&mut d_grads); - builder.arg(&mut d_hvps); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; - let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?; - Ok((grads, hvps)) + cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f32, constants_f32, tangent_rev_f32) } - /// Sparse Hessian using GPU HVP sweeps (f32). fn sparse_hessian( &self, tape: &CudaTapeBuffers, tape_cpu: &mut crate::BytecodeTape, x: &[f32], ) -> Result<(f32, Vec, crate::sparse::SparsityPattern, Vec), GpuError> { - let ni = tape.num_inputs as usize; - - let pattern = tape_cpu.detect_sparsity(); - let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - if num_colors == 0 { - tape_cpu.forward(x); - let val = tape_cpu.output_value(); - let grad = tape_cpu.gradient(x); - return Ok((val, grad, pattern, vec![])); - } - - let batch = num_colors as u32; - let mut tangent_dirs = Vec::with_capacity(batch as usize * ni); - for c in 0..num_colors { - for i in 0..ni { - tangent_dirs.push(if colors[i] == c as u32 { - 1.0f32 - } else { - 0.0f32 - }); - } - } - - let (grads, hvps) = self.hvp_batch(tape, x, &tangent_dirs, batch)?; - - let gradient: Vec = grads[..ni].to_vec(); - - let nnz = pattern.nnz(); - let mut hess_values = vec![0.0f32; nnz]; - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let c = colors[col as usize] as usize; - hess_values[k] = hvps[c * ni + row as usize]; - } - - tape_cpu.forward(x); - let value = tape_cpu.output_value(); - - Ok((value, gradient, pattern, hess_values)) + cuda_sparse_hessian_body!(self, tape, tape_cpu, x, f32, hvp_batch) } } impl CudaContext { - // ── Taylor forward 2nd-order (f32) ── - /// Batched second-order Taylor forward propagation (f32). /// /// Each batch element pushes one direction through the tape, producing @@ -545,74 +639,9 @@ impl CudaContext { direction_seeds: &[f32], batch_size: u32, ) -> Result, GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; - let total_in = (batch_size * ni) as usize; - - assert_eq!( - primal_inputs.len(), - total_in, - "primal_inputs length mismatch" - ); - assert_eq!( - direction_seeds.len(), - total_in, - "direction_seeds length mismatch" - ); - - let d_primals = s.clone_htod(primal_inputs).map_err(cuda_err)?; - let d_seeds = s.clone_htod(direction_seeds).map_err(cuda_err)?; - let mut d_jets = s - .alloc_zeros::((batch_size * nv * 3) as usize) - .map_err(cuda_err)?; - let mut d_jet_out = s - .alloc_zeros::((batch_size * no * 3) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.taylor_fwd_2nd_f32); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f32); - builder.arg(&d_primals); - builder.arg(&d_seeds); - builder.arg(&mut d_jets); - builder.arg(&mut d_jet_out); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?; - - // Deinterleave [c0, c1, c2, c0, c1, c2, ...] - let total_out = (batch_size * no) as usize; - let mut values = Vec::with_capacity(total_out); - let mut c1s = Vec::with_capacity(total_out); - let mut c2s = Vec::with_capacity(total_out); - for i in 0..total_out { - values.push(raw[i * 3]); - c1s.push(raw[i * 3 + 1]); - c2s.push(raw[i * 3 + 2]); - } - - Ok(super::TaylorBatchResult { values, c1s, c2s }) + cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f32, constants_f32, taylor_fwd_2nd_f32) } - // ── Taylor forward 2nd-order (f64) ── - /// Batched second-order Taylor forward propagation (f64, CUDA only). pub fn taylor_forward_2nd_batch_f64( &self, @@ -621,69 +650,7 @@ impl CudaContext { direction_seeds: &[f64], batch_size: u32, ) -> Result, GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; - let total_in = (batch_size * ni) as usize; - - assert_eq!( - primal_inputs.len(), - total_in, - "primal_inputs length mismatch" - ); - assert_eq!( - direction_seeds.len(), - total_in, - "direction_seeds length mismatch" - ); - - let d_primals = s.clone_htod(primal_inputs).map_err(cuda_err)?; - let d_seeds = s.clone_htod(direction_seeds).map_err(cuda_err)?; - let mut d_jets = s - .alloc_zeros::((batch_size * nv * 3) as usize) - .map_err(cuda_err)?; - let mut d_jet_out = s - .alloc_zeros::((batch_size * no * 3) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.taylor_fwd_2nd_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f64); - builder.arg(&d_primals); - builder.arg(&d_seeds); - builder.arg(&mut d_jets); - builder.arg(&mut d_jet_out); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let raw = s.clone_dtoh(&d_jet_out).map_err(cuda_err)?; - - let total_out = (batch_size * no) as usize; - let mut values = Vec::with_capacity(total_out); - let mut c1s = Vec::with_capacity(total_out); - let mut c2s = Vec::with_capacity(total_out); - for i in 0..total_out { - values.push(raw[i * 3]); - c1s.push(raw[i * 3 + 1]); - c2s.push(raw[i * 3 + 2]); - } - - Ok(super::TaylorBatchResult { values, c1s, c2s }) + cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f64, constants_f64, taylor_fwd_2nd_f64) } // ── f64 operations ── @@ -695,46 +662,7 @@ impl CudaContext { inputs: &[f64], batch_size: u32, ) -> Result, GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; - - assert_eq!(inputs.len(), (batch_size * ni) as usize); - - let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?; - let mut d_values = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_outputs = s - .alloc_zeros::((batch_size * no) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.forward_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f64); - builder.arg(&d_inputs); - builder.arg(&mut d_values); - builder.arg(&mut d_outputs); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let results = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; - Ok(results) + cuda_forward_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64) } /// Batched gradient (f64, CUDA only). @@ -744,70 +672,7 @@ impl CudaContext { inputs: &[f64], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - let no = tape.num_outputs; - - assert_eq!(inputs.len(), (batch_size * ni) as usize); - - let d_inputs = s.clone_htod(inputs).map_err(cuda_err)?; - let mut d_values = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_outputs = s - .alloc_zeros::((batch_size * no) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - // Forward - let mut builder = s.launch_builder(&self.forward_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f64); - builder.arg(&d_inputs); - builder.arg(&mut d_values); - builder.arg(&mut d_outputs); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&no); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - // Reverse - let mut d_adjoints = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_grads = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - - let mut builder = s.launch_builder(&self.reverse_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&d_values); - builder.arg(&mut d_adjoints); - builder.arg(&mut d_grads); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let output_vals = s.clone_dtoh(&d_outputs).map_err(cuda_err)?; - let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; - Ok((output_vals, grads)) + cuda_gradient_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64, reverse_f64) } /// Sparse Jacobian (f64, CUDA only). @@ -817,89 +682,7 @@ impl CudaContext { tape_cpu: &mut crate::BytecodeTape, x: &[f64], ) -> Result<(Vec, crate::sparse::JacobianSparsityPattern, Vec), GpuError> { - let ni = tape.num_inputs as usize; - let no = tape.num_outputs as usize; - - let pattern = tape_cpu.detect_jacobian_sparsity(); - let (colors, num_colors) = crate::sparse::column_coloring(&pattern); - - if num_colors == 0 { - tape_cpu.forward(x); - let vals = tape_cpu.output_values(); - return Ok((vals, pattern, vec![])); - } - - let batch = num_colors as u32; - let mut seeds = Vec::with_capacity(batch as usize * ni); - for c in 0..num_colors { - for i in 0..ni { - seeds.push(if colors[i] == c as u32 { - 1.0f64 - } else { - 0.0f64 - }); - } - } - - let s = &self.stream; - let nv = tape.num_variables; - - let d_primals_in = { - let mut replicated = Vec::with_capacity(batch as usize * ni); - for _ in 0..batch { - replicated.extend_from_slice(x); - } - s.clone_htod(&replicated).map_err(cuda_err)? - }; - let d_seeds = s.clone_htod(&seeds).map_err(cuda_err)?; - let mut d_primals = s - .alloc_zeros::((batch * nv) as usize) - .map_err(cuda_err)?; - let mut d_tangents = s - .alloc_zeros::((batch * nv) as usize) - .map_err(cuda_err)?; - let mut d_tangent_out = s - .alloc_zeros::((batch * tape.num_outputs) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.tangent_fwd_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f64); - builder.arg(&d_primals_in); - builder.arg(&d_seeds); - builder.arg(&mut d_primals); - builder.arg(&mut d_tangents); - builder.arg(&mut d_tangent_out); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&tape.num_inputs); - builder.arg(&nv); - builder.arg(&tape.num_outputs); - builder.arg(&batch); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let tangent_outs = s.clone_dtoh(&d_tangent_out).map_err(cuda_err)?; - - tape_cpu.forward(x); - let output_values = tape_cpu.output_values(); - - let nnz = pattern.nnz(); - let mut jac_values = vec![0.0f64; nnz]; - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let c = colors[col as usize] as usize; - jac_values[k] = tangent_outs[c * no + row as usize]; - } - - Ok((output_values, pattern, jac_values)) + cuda_sparse_jacobian_body!(self, tape, tape_cpu, x, f64, constants_f64, tangent_fwd_f64) } /// Sparse Hessian (f64, CUDA only). @@ -909,45 +692,7 @@ impl CudaContext { tape_cpu: &mut crate::BytecodeTape, x: &[f64], ) -> Result<(f64, Vec, crate::sparse::SparsityPattern, Vec), GpuError> { - let ni = tape.num_inputs as usize; - - let pattern = tape_cpu.detect_sparsity(); - let (colors, num_colors) = crate::sparse::greedy_coloring(&pattern); - - if num_colors == 0 { - tape_cpu.forward(x); - let val = tape_cpu.output_value(); - let grad = tape_cpu.gradient(x); - return Ok((val, grad, pattern, vec![])); - } - - let batch = num_colors as u32; - let mut tangent_dirs = Vec::with_capacity(batch as usize * ni); - for c in 0..num_colors { - for i in 0..ni { - tangent_dirs.push(if colors[i] == c as u32 { - 1.0f64 - } else { - 0.0f64 - }); - } - } - - let (grads, hvps) = self.hvp_batch_f64(tape, x, &tangent_dirs, batch)?; - - let gradient: Vec = grads[..ni].to_vec(); - - let nnz = pattern.nnz(); - let mut hess_values = vec![0.0f64; nnz]; - for (k, (&row, &col)) in pattern.rows.iter().zip(pattern.cols.iter()).enumerate() { - let c = colors[col as usize] as usize; - hess_values[k] = hvps[c * ni + row as usize]; - } - - tape_cpu.forward(x); - let value = tape_cpu.output_value(); - - Ok((value, gradient, pattern, hess_values)) + cuda_sparse_hessian_body!(self, tape, tape_cpu, x, f64, hvp_batch_f64) } /// HVP batch (f64, CUDA only). @@ -958,68 +703,6 @@ impl CudaContext { tangent_dirs: &[f64], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - let s = &self.stream; - let ni = tape.num_inputs; - let nv = tape.num_variables; - - assert_eq!(x.len(), ni as usize); - assert_eq!(tangent_dirs.len(), (batch_size * ni) as usize); - - let mut primal_inputs = Vec::with_capacity((batch_size * ni) as usize); - for _ in 0..batch_size { - primal_inputs.extend_from_slice(x); - } - - let d_primal_in = s.clone_htod(&primal_inputs).map_err(cuda_err)?; - let d_seeds = s.clone_htod(tangent_dirs).map_err(cuda_err)?; - let mut d_primals = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_tans = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_adj_re = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_adj_eps = s - .alloc_zeros::((batch_size * nv) as usize) - .map_err(cuda_err)?; - let mut d_grads = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - let mut d_hvps = s - .alloc_zeros::((batch_size * ni) as usize) - .map_err(cuda_err)?; - - let cfg = LaunchConfig { - grid_dim: Self::grid_dim(batch_size), - block_dim: Self::block_dim(), - shared_mem_bytes: 0, - }; - - let mut builder = s.launch_builder(&self.tangent_rev_f64); - builder.arg(&tape.opcodes); - builder.arg(&tape.arg0); - builder.arg(&tape.arg1); - builder.arg(&tape.constants_f64); - builder.arg(&d_primal_in); - builder.arg(&d_seeds); - builder.arg(&mut d_primals); - builder.arg(&mut d_tans); - builder.arg(&mut d_adj_re); - builder.arg(&mut d_adj_eps); - builder.arg(&mut d_grads); - builder.arg(&mut d_hvps); - builder.arg(&tape.output_indices); - builder.arg(&tape.num_ops); - builder.arg(&ni); - builder.arg(&nv); - builder.arg(&batch_size); - unsafe { builder.launch(cfg) }.map_err(cuda_err)?; - - s.synchronize().map_err(cuda_err)?; - let grads = s.clone_dtoh(&d_grads).map_err(cuda_err)?; - let hvps = s.clone_dtoh(&d_hvps).map_err(cuda_err)?; - Ok((grads, hvps)) + cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f64, constants_f64, tangent_rev_f64) } } diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs index 6942113..09acac8 100644 --- a/src/gpu/mod.rs +++ b/src/gpu/mod.rs @@ -31,7 +31,7 @@ //! - [`stde_gpu::laplacian_with_control_gpu`] — variance-reduced Laplacian with diagonal control variate //! //! The Taylor kernel propagates `(c0, c1, c2)` triples through the tape for -//! each batch element, where c2 = v^T H v / 2. All 43 opcodes are supported. +//! each batch element, where c2 = v^T H v / 2. All 44 opcodes are supported. use crate::bytecode_tape::BytecodeTape; use crate::opcode::OpCode; diff --git a/src/opcode.rs b/src/opcode.rs index fdbc96a..aa338d4 100644 --- a/src/opcode.rs +++ b/src/opcode.rs @@ -10,7 +10,7 @@ pub const UNUSED: u32 = u32::MAX; /// Elementary operation codes for the bytecode tape. /// -/// Fits in a `u8` (~38 variants). Binary ops use both `arg_indices` slots; +/// Fits in a `u8` (44 variants). Binary ops use both `arg_indices` slots; /// unary ops use slot 0 only (slot 1 = [`UNUSED`], except for [`OpCode::Powi`] /// which stores the `i32` exponent reinterpreted as `u32` in slot 1). #[repr(u8)] From 13f62304c8277c572325c1cbbddbc059b6ba08cb Mon Sep 17 00:00:00 2001 From: Greg von Nessi Date: Thu, 26 Feb 2026 01:50:58 +0000 Subject: [PATCH 5/5] Fix rustfmt formatting for macro invocations --- echidna-optim/tests/solvers.rs | 18 +++---- src/bytecode_tape/mod.rs | 4 +- src/bytecode_tape/tangent.rs | 6 ++- src/gpu/cuda_backend.rs | 86 ++++++++++++++++++++++++++++++---- 4 files changed, 95 insertions(+), 19 deletions(-) diff --git a/echidna-optim/tests/solvers.rs b/echidna-optim/tests/solvers.rs index b6e9d85..b92f368 100644 --- a/echidna-optim/tests/solvers.rs +++ b/echidna-optim/tests/solvers.rs @@ -119,12 +119,7 @@ impl Objective for Rosenbrock2D { fn assert_near_origin(result: &OptimResult, tol: f64) { for (i, &xi) in result.x.iter().enumerate() { - assert!( - xi.abs() < tol, - "x[{}] = {}, expected ~0", - i, - xi - ); + assert!(xi.abs() < tol, "x[{}] = {}, expected ~0", i, xi); } assert!(result.value < tol, "f = {}, expected ~0", result.value); } @@ -141,7 +136,10 @@ fn newton_quadratic_2d() { assert_eq!(result.termination, TerminationReason::GradientNorm); // Newton on a quadratic converges in 1 iteration - assert_eq!(result.iterations, 1, "Newton should solve quadratic in 1 step"); + assert_eq!( + result.iterations, 1, + "Newton should solve quadratic in 1 step" + ); assert_near_origin(&result, 1e-10); } @@ -212,7 +210,11 @@ fn lbfgs_quadratic_2d() { fn lbfgs_quadratic_8d() { let mut obj = Quadratic { dim: 8 }; let config = LbfgsConfig::default(); - let result = lbfgs(&mut obj, &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], &config); + let result = lbfgs( + &mut obj, + &[1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0], + &config, + ); assert_eq!(result.termination, TerminationReason::GradientNorm); assert_near_origin(&result, 1e-6); diff --git a/src/bytecode_tape/mod.rs b/src/bytecode_tape/mod.rs index 7b4666c..f08579c 100644 --- a/src/bytecode_tape/mod.rs +++ b/src/bytecode_tape/mod.rs @@ -26,12 +26,12 @@ mod reverse; mod sparse; mod tangent; -#[cfg(feature = "taylor")] -mod taylor; #[cfg(feature = "parallel")] mod parallel; #[cfg(feature = "serde")] mod serde_support; +#[cfg(feature = "taylor")] +mod taylor; mod thread_local; pub use self::thread_local::{with_active_btape, BtapeGuard, BtapeThreadLocal}; diff --git a/src/bytecode_tape/tangent.rs b/src/bytecode_tape/tangent.rs index a15433e..9f10d48 100644 --- a/src/bytecode_tape/tangent.rs +++ b/src/bytecode_tape/tangent.rs @@ -96,7 +96,11 @@ impl super::BytecodeTape { /// Reverse sweep with tangent-carrying adjoints. Uses values from /// [`forward_tangent`](Self::forward_tangent). Uses [`IsAllZero`] to /// safely skip zero adjoints without dropping tangent contributions. - pub(super) fn reverse_tangent(&self, tangent_vals: &[T], buf: &mut Vec) { + pub(super) fn reverse_tangent( + &self, + tangent_vals: &[T], + buf: &mut Vec, + ) { self.reverse_tangent_inner(tangent_vals, buf, |i| { // First-order: convert primal-float partials to T. let [a_idx, cb_idx] = self.arg_indices[i]; diff --git a/src/gpu/cuda_backend.rs b/src/gpu/cuda_backend.rs index 8807398..2d2be88 100644 --- a/src/gpu/cuda_backend.rs +++ b/src/gpu/cuda_backend.rs @@ -585,7 +585,15 @@ impl GpuBackend for CudaContext { inputs: &[f32], batch_size: u32, ) -> Result, GpuError> { - cuda_forward_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32) + cuda_forward_batch_body!( + self, + tape, + inputs, + batch_size, + f32, + constants_f32, + forward_f32 + ) } fn gradient_batch( @@ -594,7 +602,16 @@ impl GpuBackend for CudaContext { inputs: &[f32], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - cuda_gradient_batch_body!(self, tape, inputs, batch_size, f32, constants_f32, forward_f32, reverse_f32) + cuda_gradient_batch_body!( + self, + tape, + inputs, + batch_size, + f32, + constants_f32, + forward_f32, + reverse_f32 + ) } fn sparse_jacobian( @@ -613,7 +630,16 @@ impl GpuBackend for CudaContext { tangent_dirs: &[f32], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f32, constants_f32, tangent_rev_f32) + cuda_hvp_batch_body!( + self, + tape, + x, + tangent_dirs, + batch_size, + f32, + constants_f32, + tangent_rev_f32 + ) } fn sparse_hessian( @@ -639,7 +665,16 @@ impl CudaContext { direction_seeds: &[f32], batch_size: u32, ) -> Result, GpuError> { - cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f32, constants_f32, taylor_fwd_2nd_f32) + cuda_taylor_fwd_2nd_body!( + self, + tape, + primal_inputs, + direction_seeds, + batch_size, + f32, + constants_f32, + taylor_fwd_2nd_f32 + ) } /// Batched second-order Taylor forward propagation (f64, CUDA only). @@ -650,7 +685,16 @@ impl CudaContext { direction_seeds: &[f64], batch_size: u32, ) -> Result, GpuError> { - cuda_taylor_fwd_2nd_body!(self, tape, primal_inputs, direction_seeds, batch_size, f64, constants_f64, taylor_fwd_2nd_f64) + cuda_taylor_fwd_2nd_body!( + self, + tape, + primal_inputs, + direction_seeds, + batch_size, + f64, + constants_f64, + taylor_fwd_2nd_f64 + ) } // ── f64 operations ── @@ -662,7 +706,15 @@ impl CudaContext { inputs: &[f64], batch_size: u32, ) -> Result, GpuError> { - cuda_forward_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64) + cuda_forward_batch_body!( + self, + tape, + inputs, + batch_size, + f64, + constants_f64, + forward_f64 + ) } /// Batched gradient (f64, CUDA only). @@ -672,7 +724,16 @@ impl CudaContext { inputs: &[f64], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - cuda_gradient_batch_body!(self, tape, inputs, batch_size, f64, constants_f64, forward_f64, reverse_f64) + cuda_gradient_batch_body!( + self, + tape, + inputs, + batch_size, + f64, + constants_f64, + forward_f64, + reverse_f64 + ) } /// Sparse Jacobian (f64, CUDA only). @@ -703,6 +764,15 @@ impl CudaContext { tangent_dirs: &[f64], batch_size: u32, ) -> Result<(Vec, Vec), GpuError> { - cuda_hvp_batch_body!(self, tape, x, tangent_dirs, batch_size, f64, constants_f64, tangent_rev_f64) + cuda_hvp_batch_body!( + self, + tape, + x, + tangent_dirs, + batch_size, + f64, + constants_f64, + tangent_rev_f64 + ) } }