From 379d7a3700d6aed2dbf7835f54e555fe7ececdc0 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Tue, 17 Mar 2026 22:21:50 +0000
Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Optimize=20dataframe=20iter?=
 =?UTF-8?q?ation=20in=20run=5Fbatch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: dhanush342 <187305764+dhanush342@users.noreply.github.com>
---
 .jules/bolt.md             |  3 +++
 web/streamlit_dashboard.py | 12 +++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)
 create mode 100644 .jules/bolt.md

diff --git a/.jules/bolt.md b/.jules/bolt.md
new file mode 100644
index 0000000..7541723
--- /dev/null
+++ b/.jules/bolt.md
@@ -0,0 +1,3 @@
+## 2024-05-15 - Pandas df.iterrows() Anti-pattern
+**Learning:** `run_batch` in `web/streamlit_dashboard.py` iterated over its dataframe with `df.iterrows()`, which is notoriously slow in pandas because it creates a new `Series` object for every row. Memory notes suggested replacing it with `df.itertuples()` or with `zip()` over the specific columns that are needed; both avoid the per-row `Series` and iterate much faster.
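+**Action:** Replace `df.iterrows()` with `df.itertuples(index=True, name='Pandas')` or a more specific `zip()` over the needed columns to reduce overhead in loops that process large dataframes. Prefer `zip()` when column names are only known at runtime: `itertuples()` renames fields that are not valid Python identifiers to positional names, so they cannot be looked up reliably by column name.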
diff --git a/web/streamlit_dashboard.py b/web/streamlit_dashboard.py
index c76da3a..7e27266 100644
--- a/web/streamlit_dashboard.py
+++ b/web/streamlit_dashboard.py
@@ -40,11 +40,13 @@ def run_batch(df, problem_col, answer_col, cot, temperature, top_p, max_new_toke
     total = len(df)
     progress = st.progress(0)
     correct = 0
-    for i, row in df.iterrows():
-        problem = str(row[problem_col])
-        reference = None
-        if answer_col and answer_col in df.columns:
-            reference = row[answer_col]
+    # ⚡ Bolt Optimization: replace slow df.iterrows() with iteration over parallel per-column lists.
+    # This avoids the high overhead of creating a Series object for every row and works with
+    # dynamic column names. Note: `i` is now the 0-based row position, not the DataFrame index label.
+    problem_series = df[problem_col].astype(str).tolist()
+    reference_series = df[answer_col].tolist() if answer_col and answer_col in df.columns else [None] * total
+
+    for i, problem, reference in zip(range(total), problem_series, reference_series):
         out = generate_solution(problem, cot=cot, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens, base_model=base_model, adapter_path=adapter_path)
         pred = extract_numeric(out)
         ref_num = None