diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..7541723 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,3 @@ +## 2024-05-15 - Pandas df.iterrows() Anti-pattern +**Learning:** Found an instance of `df.iterrows()` in `web/streamlit_dashboard.py` within `run_batch`. `iterrows()` is notoriously slow in pandas because it creates a new Series object for every row. Memory notes suggested replacing it with `df.itertuples()` or `zip()` on specific columns. Using `itertuples()` allows faster row iteration, while `zip()` over the selected columns also handles dynamic column names safely. +**Action:** Replace `df.iterrows()` with `df.itertuples(index=True, name='Pandas')` or a more specific `zip()` over the columns to reduce overhead in loops that process large dataframes. diff --git a/web/streamlit_dashboard.py b/web/streamlit_dashboard.py index c76da3a..7e27266 100644 --- a/web/streamlit_dashboard.py +++ b/web/streamlit_dashboard.py @@ -40,11 +40,13 @@ def run_batch(df, problem_col, answer_col, cot, temperature, top_p, max_new_toke total = len(df) progress = st.progress(0) correct = 0 - for i, row in df.iterrows(): - problem = str(row[problem_col]) - reference = None - if answer_col and answer_col in df.columns: - reference = row[answer_col] + # ⚡ Bolt Optimization: Replacing slow df.iterrows() with fast iteration over parallel arrays/series + # This avoids the high overhead of creating a Series object for every row + # and safely handles dynamic column names and indices. + problem_series = df[problem_col].astype(str).tolist() + reference_series = df[answer_col].tolist() if answer_col and answer_col in df.columns else [None] * total + + for i, problem, reference in zip(range(total), problem_series, reference_series): out = generate_solution(problem, cot=cot, temperature=temperature, top_p=top_p, max_new_tokens=max_new_tokens, base_model=base_model, adapter_path=adapter_path) pred = extract_numeric(out) ref_num = None