28 commits
590bd02
Create README.md
vsripada2423 Apr 1, 2026
8192bc9
Update README.md
vsripada2423 Apr 1, 2026
ba2bd8d
Add requirements.txt from project template
vsripada2423 Apr 1, 2026
91276c0
Add Dockerfile from project template
vsripada2423 Apr 1, 2026
e5f4062
Add docker_clean.sh from project template
vsripada2423 Apr 1, 2026
d737a78
Add template_utils.py from project template
vsripada2423 Apr 1, 2026
1ebefc6
Add template.example.py from project template
vsripada2423 Apr 1, 2026
9476552
Add docker_jupyter.sh from project template
vsripada2423 Apr 1, 2026
39ebe17
Add version.sh from project template
vsripada2423 Apr 1, 2026
99ad2c5
Create README.md
vsripada2423 Apr 1, 2026
4e45939
Add Dockerfile from project template
vsripada2423 Apr 1, 2026
4fa06e7
Add template_utils.py from project template
vsripada2423 Apr 1, 2026
d3dfad9
Add template.example.py from project template
vsripada2423 Apr 1, 2026
ef715f7
Add version.sh from project template
vsripada2423 Apr 1, 2026
80b605f
Add requirements.txt from project template
vsripada2423 Apr 1, 2026
3e9436c
Delete class_project/data605/Spring2026/Benchmarking_Data_Science_Age…
vsripada2423 Apr 1, 2026
72fca1b
Create README.md
vsripada2423 Apr 1, 2026
c85a761
Add requirements.txt from project template
vsripada2423 Apr 1, 2026
14bf141
Add Dockerfile from project template
vsripada2423 Apr 1, 2026
f845613
Add template_utils.py from project template
vsripada2423 Apr 1, 2026
487e0f5
Add version.sh from project template
vsripada2423 Apr 1, 2026
6adc237
Add template.example.py from project template
vsripada2423 Apr 1, 2026
671ec45
Delete class_project/data605/Spring2026/projects/version.sh
vsripada2423 Apr 1, 2026
88aa353
Delete class_project/data605/Spring2026/projects/Dockerfile
vsripada2423 Apr 1, 2026
bb17652
Delete class_project/data605/Spring2026/projects/README.md
vsripada2423 Apr 1, 2026
7384c22
Delete class_project/data605/Spring2026/projects/requirements.txt
vsripada2423 Apr 1, 2026
88a3e4c
Delete class_project/data605/Spring2026/projects/template.example.py
vsripada2423 Apr 1, 2026
4501962
Delete class_project/data605/Spring2026/projects/template_utils.py
vsripada2423 Apr 1, 2026
Dockerfile
@@ -0,0 +1,30 @@
# Use Python 3.12 slim (already has Python and pip).
FROM python:3.12-slim

# Avoid interactive prompts during apt operations.
ENV DEBIAN_FRONTEND=noninteractive

# Install CA certificates (needed for HTTPS).
RUN apt-get update && apt-get install -y \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Install project specific packages.
RUN mkdir -p /install
COPY requirements.txt /install/requirements.txt
RUN pip install --upgrade pip && \
    pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt

# Config.
COPY etc_sudoers /install/
COPY etc_sudoers /etc/sudoers
COPY bashrc /root/.bashrc

# Report package versions.
COPY version.sh /install/
RUN /install/version.sh 2>&1 | tee version.log

# Jupyter.
EXPOSE 8888

CMD ["/bin/bash"]
README.md
@@ -0,0 +1,53 @@
# Benchmarking Data Science Agents: A Comparative Study

## Team Members
- Venkata Sripada
- Amulya Grace Bandlamudi

## Project Overview
This project studies and compares publicly available data science benchmarks for AI agents, including DataSciBench, DSBench, MLE-Bench, GAIA, and SWE-bench. The goal is to understand how different benchmarks evaluate data science capabilities such as data analysis, machine learning engineering, and multi-step reasoning.

We will collect leaderboard data from these benchmarks, analyze benchmark design (task type, evaluation metric, and difficulty), and compare agent performance across benchmarks. The project will produce a reproducible data analysis pipeline, visualizations, and a comparative research report.

## Objectives
- Collect benchmark leaderboard data
- Analyze benchmark structure and evaluation metrics
- Compare performance across different AI agents
- Perform clustering and correlation analysis on benchmark results
- Visualize benchmark differences and capability gaps
- Produce a final research report

## Tools and Technologies
- Python
- Pandas
- NumPy
- Matplotlib / Seaborn
- Scikit-learn
- Jupyter Notebook
- (Optional) PySpark for large-scale data processing

## Methodology
1. Data collection from benchmark leaderboards
2. Data cleaning and dataset integration
3. Exploratory data analysis
4. Statistical analysis and correlation analysis
5. Clustering benchmarks and agent performance
6. Visualization and interpretation of results
7. Final report and documentation
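
The correlation and clustering steps (4 and 5 above) can be sketched as follows. The leaderboard table, benchmark columns, and agent scores below are hypothetical placeholders; the real data will come from the benchmark leaderboards collected in step 1.

```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Hypothetical leaderboard: one row per agent, one score column per benchmark.
rng = np.random.default_rng(0)
scores = pd.DataFrame(
    rng.uniform(0, 100, size=(8, 3)),
    columns=["DSBench", "MLE-Bench", "SWE-bench"],
    index=[f"agent_{i}" for i in range(8)],
)
# Step 4: correlation between benchmarks across agents.
corr = scores.corr()
# Step 5: cluster agents by their score profiles.
labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(scores)
print(corr.round(2))
print(labels)
```

With real leaderboard data, the cluster assignments would indicate which agents share similar capability profiles across benchmarks.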

## Expected Outcome
The project aims to identify differences between data science benchmarks and determine which benchmarks better represent real-world data science tasks. We expect to find capability gaps where some agents perform well on coding tasks but struggle with multi-step reasoning and workflow-based tasks.

## Repository Structure

```
Benchmarking_Data_Science_Agents/
├── README.md # Project overview and instructions
├── data/ # Collected benchmark datasets
├── notebooks/ # Jupyter notebooks for analysis
├── src/ # Python scripts for data processing and analysis
├── results/ # Output files, plots, and tables
├── report/ # Final report and documentation
└── references/ # Papers and benchmark documentation
```
requirements.txt
@@ -0,0 +1,4 @@
matplotlib
numpy
pandas
seaborn
template.example.py
@@ -0,0 +1,125 @@
# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.19.0
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# %% [markdown]
# # Template Example Notebook
#
# This is a template notebook. The first heading should be the title describing what the notebook is about. For example, for a project on a neo4j tutorial, the heading should be `Project Title`.
#
# - Add description of what the notebook does.
# - Point to references, e.g. (neo4j.example.md)
# - Add citations.
# - Keep the notebook flow clear.
# - Comments should be imperative and have a period at the end.
# - Your code should be well commented.
#
# The name of this notebook should be in the following format:
# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`
#
# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md

# %%
# %load_ext autoreload
# %autoreload 2
# %matplotlib inline

# %%
# Import libraries in this section.
# Avoid imports like `import *`, `from ... import ...`, and `from ... import *`.
import logging

import helpers.hdbg as hdbg
import helpers.hnotebook as hnotebo

# %%
hdbg.init_logger(verbosity=logging.INFO)

_LOG = logging.getLogger(__name__)

hnotebo.config_notebook()


# %% [markdown]
# ## Make the notebook flow clear
# Each notebook needs to follow a clear and logical flow, e.g.:
# - Load data
# - Compute stats
# - Clean data
# - Compute stats
# - Do analysis
# - Show results
#


# #############################################################################
# Template
# #############################################################################


# %%
class Template:
    """
    Brief imperative description of what the class does in one line, if needed.
    """

    def __init__(self):
        pass

    def method1(self, arg1: int) -> None:
        """
        Brief imperative description of what the method does in one line.

        You can elaborate more in the method docstring in this section, e.g.,
        explaining the formula/algorithm. Every method/function should have a
        docstring, type hints, and include the parameters and return as follows:

        :param arg1: description of arg1
        :return: description of return
        """
        # Code blocks go here.
        # Make sure to include comments to explain what the code is doing.
        # No empty lines between code blocks.
        pass


def template_function(arg1: int) -> None:
    """
    Brief imperative description of what the function does in one line.

    You can elaborate more in the function docstring in this section, e.g.,
    explaining the formula/algorithm. Every function should have a docstring,
    type hints, and include the parameters and return as follows:

    :param arg1: description of arg1
    :return: description of return
    """
    # Code blocks go here.
    # Make sure to include comments to explain what the code is doing.
    # No empty lines between code blocks.
    pass


# %% [markdown]
# ## The flow should be highlighted using headings in markdown
# ```
# # Level 1
# ## Level 2
# ### Level 3
# ```

# %%
template_utils.py
@@ -0,0 +1,72 @@
"""
template_utils.py

This file contains utility functions that support the tutorial notebooks.

- Notebooks should call these functions instead of writing raw logic inline.
- This helps keep the notebooks clean, modular, and easier to debug.
- Students should implement functions here for data preprocessing,
model setup, evaluation, or any reusable logic.

Import as:

import class_project.project_template.template_utils as cpptteut
"""

import logging

import pandas as pd
from pycaret.classification import compare_models
from sklearn.model_selection import train_test_split

# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# -----------------------------------------------------------------------------
# Example 1: Split the dataset into train and test sets
# -----------------------------------------------------------------------------


def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2):
"""
Split the dataset into training and testing sets.

:param df: full dataset
:param target_column: name of the target column
:param test_size: proportion of test data (default = 0.2)

:return: X_train, X_test, y_train, y_test
"""
logger.info("Splitting data into train and test sets")
X = df.drop(columns=[target_column])
y = df[target_column]
return train_test_split(X, y, test_size=test_size, random_state=42)
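

As a quick sanity check, the split performed by `split_data` can be exercised on a toy DataFrame (the column names here are made up for illustration); this reproduces the `train_test_split` call the function makes with its defaults:

```python
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataset with a hypothetical "label" target column.
df = pd.DataFrame({"f1": range(10), "f2": range(10, 20), "label": [0, 1] * 5})
X = df.drop(columns=["label"])
y = df["label"]
# Same call that split_data makes internally.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(len(X_train), len(X_test))  # 8 2
```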


# -----------------------------------------------------------------------------
# Example 2: PyCaret classification pipeline
# -----------------------------------------------------------------------------


def run_pycaret_classification(
df: pd.DataFrame, target_column: str
) -> pd.DataFrame:
"""
Run a basic PyCaret classification experiment.

:param df: dataset containing features and target
:param target_column: name of the target column

:return: comparison of top-performing models
"""
logger.info("Initializing PyCaret classification setup")
...

logger.info("Comparing models")
results = compare_models()
...

return results
version.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# """
# Display versions of installed tools and packages.
#
# This script prints version information for Python, pip, Jupyter, and all
# installed Python packages. Used for debugging and documentation purposes
# to verify the Docker container environment setup.
# """

# Display Python 3 version.
echo "# Python3"
python3 --version

# Display pip version.
echo "# pip3"
pip3 --version

# Display Jupyter version.
echo "# jupyter"
jupyter --version

# List all installed Python packages and their versions.
echo "# Python packages"
pip3 list

# Template for adding additional tool versions.
# echo "# mongo"
# mongod --version