From aacc54a026f14e11f0cc58eb4b29f46fafbb9184 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 13 Jan 2026 12:11:42 +0000
Subject: [PATCH 1/2] Release v2.0.1: Code quality and Rust backend
 optimizations

Python improvements:
- Extract shared within-transformation logic to utils.py
  - New demean_by_group() for one-way fixed effects demeaning
  - New within_transform() for two-way (unit + time) FE transformation
  - Reduces code duplication across estimators.py, twfe.py, sun_abraham.py, bacon.py
  - Fix DataFrame fragmentation warning by building columns in batch

Rust backend optimizations:
- Use Cholesky factorization for matrix inversion in linalg.rs
  - More efficient for symmetric positive-definite matrices (X'X)
  - Approximately half the operations of general LU decomposition
- Reduce bootstrap allocations in bootstrap.rs
  - Allocate directly into pre-allocated buffer
  - Eliminates intermediate Vec<Vec<f64>> -> flatten -> Array2 pattern
  - Single allocation instead of two

Version bump: 2.0.0 -> 2.0.1
---
 TODO.md                  |   6 +-
 diff_diff/__init__.py    |   2 +-
 diff_diff/bacon.py       |  72 ++++----------------
 diff_diff/estimators.py  |  17 ++---
 diff_diff/sun_abraham.py |  24 +------
 diff_diff/twfe.py        |  20 +-----
 diff_diff/utils.py       | 137 +++++++++++++++++++++++++++++++++++++++
 pyproject.toml           |   2 +-
 rust/Cargo.toml          |   2 +-
 rust/src/bootstrap.rs    | 109 ++++++++++++++++---------------
 rust/src/linalg.rs       |  24 +++++--
 11 files changed, 242 insertions(+), 173 deletions(-)
diff --git a/TODO.md b/TODO.md
index 3fbd24f..2a3293c 100644
--- a/TODO.md
+++ b/TODO.md
@@ -25,7 +25,7 @@ Consolidation opportunities for cleaner maintenance:
 
 | Duplicate Code | Locations | Notes |
 |---------------|-----------|-------|
-| Within-transformation logic | `estimators.py:217-232`, `estimators.py:787-833`, `bacon.py:567-642` | Extract to utils.py |
+| ~~Within-transformation logic~~ | ~~Multiple files~~ | ✅ Extracted to `utils.py` as `demean_by_group()` and `within_transform()` (v2.0.1) |
 | Linear regression helper | `staggered.py:205-240`, `estimators.py:366-408` | Consider consolidation |
 
 ### Large Module Files
@@ -92,8 +92,8 @@ Enhancements for `honest_did.py`:
 
 Deferred from PR #58 code review (can be done post-merge):
 
-- [ ] **Matrix inversion efficiency** (`rust/src/linalg.rs:180-194`): Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve
-- [ ] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice. Should allocate directly into ndarray.
+- [x] **Matrix inversion efficiency** (`rust/src/linalg.rs`): ~~Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve~~ (completed in v2.0.1)
+- [x] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): ~~Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice.~~ Now allocates directly into pre-allocated buffer. (completed in v2.0.1)
 - [ ] **Consider static BLAS linking** (`rust/Cargo.toml`): Currently requires system BLAS libraries. Consider `openblas-static` or `intel-mkl-static` features for easier distribution.
 
 ---
diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py
index ffb0f07..ba5ef51 100644
--- a/diff_diff/__init__.py
+++ b/diff_diff/__init__.py
@@ -113,7 +113,7 @@
     plot_sensitivity,
 )
 
-__version__ = "2.0.0"
+__version__ = "2.0.1"
 __all__ = [
     # Estimators
     "DifferenceInDifferences",
diff --git a/diff_diff/bacon.py b/diff_diff/bacon.py
index ed94e73..8f09d9f 100644
--- a/diff_diff/bacon.py
+++ b/diff_diff/bacon.py
@@ -17,6 +17,8 @@
 import numpy as np
 import pandas as pd
 
+from diff_diff.utils import within_transform as _within_transform_util
+
 
 @dataclass
 class Comparison2x2:
@@ -573,66 +575,16 @@ def _compute_twfe(
         treat_col: str = '__bacon_treated_internal__',
     ) -> float:
         """Compute TWFE estimate using within-transformation."""
-        # Demean by unit and time
-        y = df[outcome].values
-        d = df[treat_col].astype(float).values
-
-        # Create unit and time dummies for demeaning
-        units = df[unit].values
-        times = df[time].values
-
-        # Unit means
-        unit_map = {u: i for i, u in enumerate(df[unit].unique())}
-        unit_idx = np.array([unit_map[u] for u in units])
-        n_units = len(unit_map)
-
-        # Time means
-        time_map = {t: i for i, t in enumerate(df[time].unique())}
-        time_idx = np.array([time_map[t] for t in times])
-        n_times = len(time_map)
-
-        # Compute means
-        y_unit_mean = np.zeros(n_units)
-        d_unit_mean = np.zeros(n_units)
-        unit_counts = np.zeros(n_units)
-
-        for i in range(len(y)):
-            u = unit_idx[i]
-            y_unit_mean[u] += y[i]
-            d_unit_mean[u] += d[i]
-            unit_counts[u] += 1
-
-        y_unit_mean /= np.maximum(unit_counts, 1)
-        d_unit_mean /= np.maximum(unit_counts, 1)
-
-        y_time_mean = np.zeros(n_times)
-        d_time_mean = np.zeros(n_times)
-        time_counts = np.zeros(n_times)
-
-        for i in range(len(y)):
-            t = time_idx[i]
-            y_time_mean[t] += y[i]
-            d_time_mean[t] += d[i]
-            time_counts[t] += 1
-
-        y_time_mean /= np.maximum(time_counts, 1)
-        d_time_mean /= np.maximum(time_counts, 1)
-
-        # Overall mean
-        y_mean = np.mean(y)
-        d_mean = np.mean(d)
-
-        # Within transformation: y_it - y_i - y_t + y
-        y_within = np.zeros(len(y))
-        d_within = np.zeros(len(d))
-
-        for i in range(len(y)):
-            u = unit_idx[i]
-            t = time_idx[i]
-            y_within[i] = y[i] - y_unit_mean[u] - y_time_mean[t] + y_mean
-            d_within[i] = d[i] - d_unit_mean[u] - d_time_mean[t] + d_mean
-
-        # OLS on demeaned data
+        # Apply two-way within transformation
+        df_dm = _within_transform_util(
+            df, [outcome, treat_col], unit, time, suffix="_within"
+        )
+
+        # Extract within-transformed values
+        y_within = df_dm[f"{outcome}_within"].values
+        d_within = df_dm[f"{treat_col}_within"].values
+
+        # OLS on demeaned data: beta = sum(d * y) / sum(d^2)
         d_var = np.sum(d_within ** 2)
         if d_var > 0:
             beta = np.sum(d_within * y_within) / d_var
diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py
index f896e50..40fd220 100644
--- a/diff_diff/estimators.py
+++ b/diff_diff/estimators.py
@@ -23,6 +23,7 @@
     WildBootstrapResults,
     compute_confidence_interval,
     compute_p_value,
+    demean_by_group,
     validate_binary,
     wild_bootstrap_se,
 )
@@ -227,10 +228,10 @@ def fit(
             # unit-invariant, so demeaning them would create multicollinearity
             vars_to_demean = [outcome] + (covariates or [])
             for ab_var in absorb:
-                n_absorbed_effects += working_data[ab_var].nunique() - 1
-                for var in vars_to_demean:
-                    group_means = working_data.groupby(ab_var)[var].transform("mean")
-                    working_data[var] = working_data[var] - group_means
+                working_data, n_fe = demean_by_group(
+                    working_data, vars_to_demean, ab_var, inplace=True
+                )
+                n_absorbed_effects += n_fe
                 absorbed_vars.append(ab_var)
 
         # Extract variables (may be demeaned if absorb was used)
@@ -828,10 +829,10 @@ def fit(  # type: ignore[override]
         if absorb:
             vars_to_demean = [outcome] + (covariates or [])
             for ab_var in absorb:
-                n_absorbed_effects += working_data[ab_var].nunique() - 1
-                for var in vars_to_demean:
-                    group_means = working_data.groupby(ab_var)[var].transform("mean")
-                    working_data[var] = working_data[var] - group_means
+                working_data, n_fe = demean_by_group(
+                    working_data, vars_to_demean, ab_var, inplace=True
+                )
+                n_absorbed_effects += n_fe
 
         # Extract outcome and treatment
         y = working_data[outcome].values.astype(float)
diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py
index e50247f..914b6ff 100644
--- a/diff_diff/sun_abraham.py
+++ b/diff_diff/sun_abraham.py
@@ -21,6 +21,7 @@
 from diff_diff.utils import (
     compute_confidence_interval,
     compute_p_value,
+    within_transform as _within_transform_util,
 )
 
 
@@ -789,28 +790,7 @@ def _within_transform(
 
         y_it - y_i. - y_.t + y_..
         """
-        df = df.copy()
-
-        # Build all demeaned columns at once to avoid fragmentation
-        demeaned_data = {}
-        for var in variables:
-            # Unit means
-            unit_means = df.groupby(unit)[var].transform("mean")
-            # Time means
-            time_means = df.groupby(time)[var].transform("mean")
-            # Grand mean
-            grand_mean = df[var].mean()
-
-            # Within transformation
-            demeaned_data[f"{var}_dm"] = (
-                df[var] - unit_means - time_means + grand_mean
-            ).values
-
-        # Add all demeaned columns at once
-        demeaned_df = pd.DataFrame(demeaned_data, index=df.index)
-        df = pd.concat([df, demeaned_df], axis=1)
-
-        return df
+        return _within_transform_util(df, variables, unit, time, suffix="_dm")
 
     def _compute_iw_effects(
         self,
diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py
index 46224f6..54d9dce 100644
--- a/diff_diff/twfe.py
+++ b/diff_diff/twfe.py
@@ -17,6 +17,7 @@
 from diff_diff.utils import (
     compute_confidence_interval,
     compute_p_value,
+    within_transform as _within_transform_util,
 )
 
 
@@ -211,25 +212,8 @@ def _within_transform(
         pd.DataFrame
             Data with demeaned variables.
         """
-        data = data.copy()
         variables = [outcome] + (covariates or [])
-
-        # Cache groupby objects for efficiency (avoids re-computing group indexes)
-        unit_grouper = data.groupby(unit, sort=False)
-        time_grouper = data.groupby(time, sort=False)
-
-        for var in variables:
-            # Unit means (using cached grouper)
-            unit_means = unit_grouper[var].transform("mean")
-            # Time means (using cached grouper)
-            time_means = time_grouper[var].transform("mean")
-            # Grand mean
-            grand_mean = data[var].mean()
-
-            # Within transformation
-            data[f"{var}_demeaned"] = data[var] - unit_means - time_means + grand_mean
-
-        return data
+        return _within_transform_util(data, variables, unit, time, suffix="_demeaned")
 
     def _check_staggered_treatment(
         self,
diff --git a/diff_diff/utils.py b/diff_diff/utils.py
index 600d9c2..6295824 100644
--- a/diff_diff/utils.py
+++ b/diff_diff/utils.py
@@ -1342,3 +1342,140 @@ def compute_placebo_effects(
         placebo_effects.append(placebo_tau)
 
     return np.asarray(placebo_effects)
+
+
+def demean_by_group(
+    data: pd.DataFrame,
+    variables: List[str],
+    group_var: str,
+    inplace: bool = False,
+    suffix: str = "",
+) -> Tuple[pd.DataFrame, int]:
+    """
+    Demean variables by a grouping variable (one-way within transformation).
+
+    For each variable, computes: x_ig - mean(x_g) where g is the group.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        DataFrame containing the variables to demean.
+    variables : list of str
+        Column names to demean.
+    group_var : str
+        Column name for the grouping variable.
+    inplace : bool, default False
+        If True, modifies the original columns. If False, leaves original
+        columns unchanged (demeaning is still applied to return value).
+    suffix : str, default ""
+        Suffix to add to demeaned column names (only used when inplace=False
+        and you want to keep both original and demeaned columns).
+
+    Returns
+    -------
+    data : pd.DataFrame
+        DataFrame with demeaned variables.
+    n_effects : int
+        Number of absorbed fixed effects (nunique - 1).
+
+    Examples
+    --------
+    >>> df, n_fe = demean_by_group(df, ['y', 'x1', 'x2'], 'unit')
+    >>> # df['y'], df['x1'], df['x2'] are now demeaned by unit
+    """
+    if not inplace:
+        data = data.copy()
+
+    # Count fixed effects (categories - 1 for identification)
+    n_effects = data[group_var].nunique() - 1
+
+    # Cache the groupby object for efficiency
+    grouper = data.groupby(group_var, sort=False)
+
+    for var in variables:
+        col_name = var if not suffix else f"{var}{suffix}"
+        group_means = grouper[var].transform("mean")
+        data[col_name] = data[var] - group_means
+
+    return data, n_effects
+
+
+def within_transform(
+    data: pd.DataFrame,
+    variables: List[str],
+    unit: str,
+    time: str,
+    inplace: bool = False,
+    suffix: str = "_demeaned",
+) -> pd.DataFrame:
+    """
+    Apply two-way within transformation to remove unit and time fixed effects.
+
+    Computes: y_it - y_i. - y_.t + y_.. for each variable.
+
+    This is the standard fixed effects transformation for panel data that
+    removes both unit-specific and time-specific effects.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Panel data containing the variables to transform.
+    variables : list of str
+        Column names to transform.
+    unit : str
+        Column name for unit identifier.
+    time : str
+        Column name for time period identifier.
+    inplace : bool, default False
+        If True, modifies the original columns. If False, creates new columns
+        with the specified suffix.
+    suffix : str, default "_demeaned"
+        Suffix for new column names when inplace=False.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with within-transformed variables.
+
+    Notes
+    -----
+    The within transformation removes variation that is constant within units
+    (unit fixed effects) and constant within time periods (time fixed effects).
+    The resulting estimates are equivalent to including unit and time dummies
+    but is computationally more efficient for large panels.
+
+    Examples
+    --------
+    >>> df = within_transform(df, ['y', 'x'], 'unit_id', 'year')
+    >>> # df now has 'y_demeaned' and 'x_demeaned' columns
+    """
+    if not inplace:
+        data = data.copy()
+
+    # Cache groupby objects for efficiency
+    unit_grouper = data.groupby(unit, sort=False)
+    time_grouper = data.groupby(time, sort=False)
+
+    if inplace:
+        # Modify columns in place
+        for var in variables:
+            unit_means = unit_grouper[var].transform("mean")
+            time_means = time_grouper[var].transform("mean")
+            grand_mean = data[var].mean()
+            data[var] = data[var] - unit_means - time_means + grand_mean
+    else:
+        # Build all demeaned columns at once to avoid DataFrame fragmentation
+        demeaned_data = {}
+        for var in variables:
+            unit_means = unit_grouper[var].transform("mean")
+            time_means = time_grouper[var].transform("mean")
+            grand_mean = data[var].mean()
+            demeaned_data[f"{var}{suffix}"] = (
+                data[var] - unit_means - time_means + grand_mean
+            ).values
+
+        # Add all columns at once
+        demeaned_df = pd.DataFrame(demeaned_data, index=data.index)
+        data = pd.concat([data, demeaned_df], axis=1)
+
+    return data
diff --git a/pyproject.toml b/pyproject.toml
index 2dbb55c..6ab44de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "diff-diff"
-version = "2.0.0"
+version = "2.0.1"
 description = "A library for Difference-in-Differences causal inference analysis"
 readme = "README.md"
 license = "MIT"
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index e60e893..71bb01a 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "diff_diff_rust"
-version = "2.0.0"
+version = "2.0.1"
 edition = "2021"
 description = "Rust backend for diff-diff DiD library"
 license = "MIT"
diff --git a/rust/src/bootstrap.rs b/rust/src/bootstrap.rs
index 21acca6..fb6d35c 100644
--- a/rust/src/bootstrap.rs
+++ b/rust/src/bootstrap.rs
@@ -51,19 +51,20 @@ pub fn generate_bootstrap_weights_batch<'py>(
 ///
 /// E[w] = 0, Var[w] = 1
 fn generate_rademacher_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<f64> {
-    // Generate weights in parallel using rayon
-    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
-        .into_par_iter()
-        .map(|i| {
+    // Pre-allocate flat array directly (single allocation instead of Vec<Vec<f64>> + flatten)
+    let total_size = n_bootstrap * n_units;
+    let mut flat = vec![0.0_f64; total_size];
+
+    // Generate weights in parallel, writing directly to pre-allocated buffer
+    flat.par_chunks_mut(n_units)
+        .enumerate()
+        .for_each(|(i, row)| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            (0..n_units)
-                .map(|_| if rng.gen::<bool>() { 1.0 } else { -1.0 })
-                .collect()
-        })
-        .collect();
-
-    // Convert to ndarray
-    let flat: Vec<f64> = rows.into_iter().flatten().collect();
+            for val in row.iter_mut() {
+                *val = if rng.gen::<bool>() { 1.0 } else { -1.0 };
+            }
+        });
+
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -83,23 +84,24 @@ fn generate_mammen_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array
     // Probability of negative value
     let prob_neg = (sqrt5 + 1.0) / (2.0 * sqrt5); // ≈ 0.724
 
-    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
-        .into_par_iter()
-        .map(|i| {
+    // Pre-allocate flat array directly (single allocation)
+    let total_size = n_bootstrap * n_units;
+    let mut flat = vec![0.0_f64; total_size];
+
+    // Generate weights in parallel, writing directly to pre-allocated buffer
+    flat.par_chunks_mut(n_units)
+        .enumerate()
+        .for_each(|(i, row)| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            (0..n_units)
-                .map(|_| {
-                    if rng.gen::<f64>() < prob_neg {
-                        val_neg
-                    } else {
-                        val_pos
-                    }
-                })
-                .collect()
-        })
-        .collect();
-
-    let flat: Vec<f64> = rows.into_iter().flatten().collect();
+            for val in row.iter_mut() {
+                *val = if rng.gen::<f64>() < prob_neg {
+                    val_neg
+                } else {
+                    val_pos
+                };
+            }
+        });
+
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -118,32 +120,33 @@ fn generate_webb_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<
     // Equal probability for each of 6 values: 1/6 each
     let prob = 1.0 / 6.0;
 
-    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
-        .into_par_iter()
-        .map(|i| {
+    // Pre-allocate flat array directly (single allocation)
+    let total_size = n_bootstrap * n_units;
+    let mut flat = vec![0.0_f64; total_size];
+
+    // Generate weights in parallel, writing directly to pre-allocated buffer
+    flat.par_chunks_mut(n_units)
+        .enumerate()
+        .for_each(|(i, row)| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            (0..n_units)
-                .map(|_| {
-                    let u = rng.gen::<f64>();
-                    if u < prob {
-                        -val1
-                    } else if u < 2.0 * prob {
-                        -val2
-                    } else if u < 3.0 * prob {
-                        -val3
-                    } else if u < 4.0 * prob {
-                        val3
-                    } else if u < 5.0 * prob {
-                        val2
-                    } else {
-                        val1
-                    }
-                })
-                .collect()
-        })
-        .collect();
-
-    let flat: Vec<f64> = rows.into_iter().flatten().collect();
+            for val in row.iter_mut() {
+                let u = rng.gen::<f64>();
+                *val = if u < prob {
+                    -val1
+                } else if u < 2.0 * prob {
+                    -val2
+                } else if u < 3.0 * prob {
+                    -val3
+                } else if u < 4.0 * prob {
+                    val3
+                } else if u < 5.0 * prob {
+                    val2
+                } else {
+                    val1
+                };
+            }
+        });
+
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
index 08c7b37..6020574 100644
--- a/rust/src/linalg.rs
+++ b/rust/src/linalg.rs
@@ -6,7 +6,7 @@
 //! - Cluster-robust variance-covariance estimation
 
 use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
-use ndarray_linalg::{LeastSquaresSvd, Solve};
+use ndarray_linalg::{Cholesky, LeastSquaresSvd, Solve, UPLO};
 use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
 use pyo3::prelude::*;
 use std::collections::HashMap;
@@ -190,18 +190,30 @@ fn compute_robust_vcov_internal(
     }
 }
 
-/// Invert a symmetric positive-definite matrix.
+/// Invert a symmetric positive-definite matrix using Cholesky factorization.
+///
+/// For symmetric positive-definite matrices like X'X, Cholesky factorization
+/// (A = L L') is more efficient than general LU decomposition, requiring
+/// approximately half the operations.
 fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
     let n = a.nrows();
-    let mut result = Array2::<f64>::zeros((n, n));
 
-    // Solve A * x_i = e_i for each column of the identity matrix
+    // Compute Cholesky factorization: A = L L' where L is lower triangular
+    let factorized = a.cholesky(UPLO::Lower)
+        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
+            format!("Cholesky factorization failed (matrix may not be positive-definite): {}", e)
+        ))?;
+
+    // Solve A * A^{-1} = I by solving for each column of the identity
+    let mut result = Array2::<f64>::zeros((n, n));
     for i in 0..n {
         let mut e_i = Array1::<f64>::zeros(n);
         e_i[i] = 1.0;
 
-        let col = a.solve(&e_i)
-            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("Matrix inversion failed: {}", e)))?;
+        let col = factorized.solve(&e_i)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                format!("Matrix inversion failed: {}", e)
+            ))?;
 
         result.column_mut(i).assign(&col);
     }

From 9e576c06fb41fa6e31be314bba41c73ae8319fb5 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 13 Jan 2026 12:15:47 +0000
Subject: [PATCH 2/2] Revert untested Rust backend changes

The Rust backend optimizations (Cholesky factorization, reduced allocations)
could not be tested in CI due to missing OpenBLAS library. Reverting these
changes to keep v2.0.1 focused on tested Python improvements only.

The Rust optimizations remain in TODO.md for future implementation when
proper testing infrastructure is available.
---
 TODO.md               |   4 +-
 rust/Cargo.toml       |   2 +-
 rust/src/bootstrap.rs | 109 ++++++++++++++++++++----------------------
 rust/src/linalg.rs    |  24 +++-------
 4 files changed, 62 insertions(+), 77 deletions(-)

diff --git a/TODO.md b/TODO.md
index 2a3293c..697d432 100644
--- a/TODO.md
+++ b/TODO.md
@@ -92,8 +92,8 @@ Enhancements for `honest_did.py`:
 
 Deferred from PR #58 code review (can be done post-merge):
 
-- [x] **Matrix inversion efficiency** (`rust/src/linalg.rs`): ~~Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve~~ (completed in v2.0.1)
-- [x] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): ~~Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice.~~ Now allocates directly into pre-allocated buffer. (completed in v2.0.1)
+- [ ] **Matrix inversion efficiency** (`rust/src/linalg.rs:180-194`): Use Cholesky factorization for symmetric positive-definite matrices instead of column-by-column solve
+- [ ] **Reduce bootstrap allocations** (`rust/src/bootstrap.rs`): Currently uses `Vec<Vec<f64>>` → flatten → `Array2` which allocates twice. Should allocate directly into ndarray.
 - [ ] **Consider static BLAS linking** (`rust/Cargo.toml`): Currently requires system BLAS libraries. Consider `openblas-static` or `intel-mkl-static` features for easier distribution.
 
 ---
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 71bb01a..e60e893 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "diff_diff_rust"
-version = "2.0.1"
+version = "2.0.0"
 edition = "2021"
 description = "Rust backend for diff-diff DiD library"
 license = "MIT"
diff --git a/rust/src/bootstrap.rs b/rust/src/bootstrap.rs
index fb6d35c..21acca6 100644
--- a/rust/src/bootstrap.rs
+++ b/rust/src/bootstrap.rs
@@ -51,20 +51,19 @@ pub fn generate_bootstrap_weights_batch<'py>(
 ///
 /// E[w] = 0, Var[w] = 1
 fn generate_rademacher_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<f64> {
-    // Pre-allocate flat array directly (single allocation instead of Vec<Vec<f64>> + flatten)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    // Generate weights in parallel using rayon
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                *val = if rng.gen::<bool>() { 1.0 } else { -1.0 };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| if rng.gen::<bool>() { 1.0 } else { -1.0 })
+                .collect()
+        })
+        .collect();
+
+    // Convert to ndarray
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -84,24 +83,23 @@ fn generate_mammen_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array
     // Probability of negative value
     let prob_neg = (sqrt5 + 1.0) / (2.0 * sqrt5); // ≈ 0.724
 
-    // Pre-allocate flat array directly (single allocation)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                *val = if rng.gen::<f64>() < prob_neg {
-                    val_neg
-                } else {
-                    val_pos
-                };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| {
+                    if rng.gen::<f64>() < prob_neg {
+                        val_neg
+                    } else {
+                        val_pos
+                    }
+                })
+                .collect()
+        })
+        .collect();
+
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
@@ -120,33 +118,32 @@ fn generate_webb_batch(n_bootstrap: usize, n_units: usize, seed: u64) -> Array2<
     // Equal probability for each of 6 values: 1/6 each
     let prob = 1.0 / 6.0;
 
-    // Pre-allocate flat array directly (single allocation)
-    let total_size = n_bootstrap * n_units;
-    let mut flat = vec![0.0_f64; total_size];
-
-    // Generate weights in parallel, writing directly to pre-allocated buffer
-    flat.par_chunks_mut(n_units)
-        .enumerate()
-        .for_each(|(i, row)| {
+    let rows: Vec<Vec<f64>> = (0..n_bootstrap)
+        .into_par_iter()
+        .map(|i| {
             let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed.wrapping_add(i as u64));
-            for val in row.iter_mut() {
-                let u = rng.gen::<f64>();
-                *val = if u < prob {
-                    -val1
-                } else if u < 2.0 * prob {
-                    -val2
-                } else if u < 3.0 * prob {
-                    -val3
-                } else if u < 4.0 * prob {
-                    val3
-                } else if u < 5.0 * prob {
-                    val2
-                } else {
-                    val1
-                };
-            }
-        });
-
+            (0..n_units)
+                .map(|_| {
+                    let u = rng.gen::<f64>();
+                    if u < prob {
+                        -val1
+                    } else if u < 2.0 * prob {
+                        -val2
+                    } else if u < 3.0 * prob {
+                        -val3
+                    } else if u < 4.0 * prob {
+                        val3
+                    } else if u < 5.0 * prob {
+                        val2
+                    } else {
+                        val1
+                    }
+                })
+                .collect()
+        })
+        .collect();
+
+    let flat: Vec<f64> = rows.into_iter().flatten().collect();
     Array2::from_shape_vec((n_bootstrap, n_units), flat).unwrap()
 }
 
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
index 6020574..08c7b37 100644
--- a/rust/src/linalg.rs
+++ b/rust/src/linalg.rs
@@ -6,7 +6,7 @@
 //! - Cluster-robust variance-covariance estimation
 
 use ndarray::{Array1, Array2, ArrayView1, ArrayView2};
-use ndarray_linalg::{Cholesky, LeastSquaresSvd, Solve, UPLO};
+use ndarray_linalg::{LeastSquaresSvd, Solve};
 use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
 use pyo3::prelude::*;
 use std::collections::HashMap;
@@ -190,30 +190,18 @@ fn compute_robust_vcov_internal(
     }
 }
 
-/// Invert a symmetric positive-definite matrix using Cholesky factorization.
-///
-/// For symmetric positive-definite matrices like X'X, Cholesky factorization
-/// (A = L L') is more efficient than general LU decomposition, requiring
-/// approximately half the operations.
+/// Invert a symmetric positive-definite matrix.
 fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
     let n = a.nrows();
-
-    // Compute Cholesky factorization: A = L L' where L is lower triangular
-    let factorized = a.cholesky(UPLO::Lower)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
-            format!("Cholesky factorization failed (matrix may not be positive-definite): {}", e)
-        ))?;
-
-    // Solve A * A^{-1} = I by solving for each column of the identity
     let mut result = Array2::<f64>::zeros((n, n));
+
+    // Solve A * x_i = e_i for each column of the identity matrix
     for i in 0..n {
         let mut e_i = Array1::<f64>::zeros(n);
         e_i[i] = 1.0;
 
-        let col = factorized.solve(&e_i)
-            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                format!("Matrix inversion failed: {}", e)
-            ))?;
+        let col = a.solve(&e_i)
+            .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("Matrix inversion failed: {}", e)))?;
 
         result.column_mut(i).assign(&col);
     }