diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/Dockerfile b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/Dockerfile new file mode 100644 index 000000000..f5c02c562 --- /dev/null +++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/Dockerfile @@ -0,0 +1,30 @@ +# Use Python 3.12 slim (already has Python and pip). +FROM python:3.12-slim + +# Avoid interactive prompts during apt operations. +ENV DEBIAN_FRONTEND=noninteractive + +# Install CA certificates (needed for HTTPS). +RUN apt-get update && apt-get install -y \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Install project specific packages. +RUN mkdir -p /install +COPY requirements.txt /install/requirements.txt +RUN pip install --upgrade pip && \ + pip install --no-cache-dir jupyterlab jupyterlab_vim jupytext -r /install/requirements.txt + +# Config. +COPY etc_sudoers /install/ +COPY etc_sudoers /etc/sudoers +COPY bashrc /root/.bashrc + +# Report package versions. +COPY version.sh /install/ +RUN /install/version.sh 2>&1 | tee version.log + +# Jupyter. +EXPOSE 8888 + +CMD ["/bin/bash"] diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/README.md b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/README.md new file mode 100644 index 000000000..f2f9aa91f --- /dev/null +++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/README.md @@ -0,0 +1,53 @@ +# Benchmarking Data Science Agents: A Comparative Study + +## Team Members +- Venkata Sripada +- Amulya Grace Bandlamudi + +## Project Overview +This project studies and compares publicly available data science benchmarks for AI agents, including DataSciBench, DSBench, MLE-Bench, GAIA, and SWE-bench. The goal is to understand how different benchmarks evaluate data science capabilities such as data analysis, machine learning engineering, and multi-step reasoning. 
+ +We will collect leaderboard data from these benchmarks, analyze benchmark design (task type, evaluation metric, and difficulty), and compare agent performance across benchmarks. The project will produce a reproducible data analysis pipeline, visualizations, and a comparative research report. + +## Objectives +- Collect benchmark leaderboard data +- Analyze benchmark structure and evaluation metrics +- Compare performance across different AI agents +- Perform clustering and correlation analysis on benchmark results +- Visualize benchmark differences and capability gaps +- Produce a final research report + +## Tools and Technologies +- Python +- Pandas +- NumPy +- Matplotlib / Seaborn +- Scikit-learn +- Jupyter Notebook +- (Optional) PySpark for large-scale data processing + +## Methodology +1. Data collection from benchmark leaderboards +2. Data cleaning and dataset integration +3. Exploratory data analysis +4. Statistical analysis and correlation analysis +5. Clustering benchmarks and agent performance +6. Visualization and interpretation of results +7. Final report and documentation + +## Expected Outcome +The project aims to identify differences between data science benchmarks and determine which benchmarks better represent real-world data science tasks. We expect to find capability gaps where some agents perform well on coding tasks but struggle with multi-step reasoning and workflow-based tasks. 
+
+## Repository Structure
+
+```
+Benchmarking_Data_Science_Agents/
+│
+├── README.md          # Project overview and instructions
+├── data/              # Collected benchmark datasets
+├── notebooks/         # Jupyter notebooks for analysis
+├── src/               # Python scripts for data processing and analysis
+├── results/           # Output files, plots, and tables
+├── report/            # Final report and documentation
+└── references/        # Papers and benchmark documentation
+```
diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/requirements.txt b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/requirements.txt
new file mode 100644
index 000000000..49aca3901
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/requirements.txt
@@ -0,0 +1,4 @@
+matplotlib
+numpy
+pandas
+seaborn
diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template.example.py b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template.example.py
new file mode 100644
index 000000000..b091b1369
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template.example.py
@@ -0,0 +1,125 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.0
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Template Example Notebook
+#
+# This is a template notebook. The first heading should be the title of what the notebook is about. For example, if it is a project on a neo4j tutorial, the heading should be `Project Title`.
+#
+# - Add description of what the notebook does.
+# - Point to references, e.g. (neo4j.example.md)
+# - Add citations.
+# - Keep the notebook flow clear.
+# - Comments should be imperative and have a period at the end.
+# - Your code should be well commented.
+#
+# The name of this notebook should be in the following format:
+# - if the notebook is exploring `pycaret API`, then it is `pycaret.example.ipynb`
+#
+# Follow the reference to write notebooks in a clear manner: https://github.com/causify-ai/helpers/blob/master/docs/coding/all.jupyter_notebook.how_to_guide.md

+# %%
+# %load_ext autoreload
+# %autoreload 2
+# %matplotlib inline
+
+# %%
+import logging
+# Import libraries in this section.
+# Avoid imports like import *, from ... import ..., from ... import *, etc.
+
+import helpers.hdbg as hdbg
+import helpers.hnotebook as hnotebo
+
+# %%
+hdbg.init_logger(verbosity=logging.INFO)
+
+_LOG = logging.getLogger(__name__)
+
+hnotebo.config_notebook()
+
+
+# %% [markdown]
+# ## Make the notebook flow clear
+# Each notebook needs to follow a clear and logical flow, e.g.:
+# - Load data
+# - Compute stats
+# - Clean data
+# - Compute stats
+# - Do analysis
+# - Show results
+#
+#
+#
+
+
+# #############################################################################
+# Template
+# #############################################################################
+
+
+# %%
+class Template:
+    """
+    Brief imperative description of what the class does in one line, if needed.
+    """
+
+    def __init__(self):
+        pass
+
+    def method1(self, arg1: int) -> None:
+        """
+        Brief imperative description of what the method does in one line.
+
+        You can elaborate more in the method docstring in this section, e.g. explaining
+        the formula/algorithm. Every method/function should have a docstring, typehints and include the
+        parameters and return as follows:
+
+        :param arg1: description of arg1
+        :return: description of return
+        """
+        # Code blocks go here.
+        # Make sure to include comments to explain what the code is doing.
+        # No empty lines between code blocks.
+        pass
+
+
+def template_function(arg1: int) -> None:
+    """
+    Brief imperative description of what the function does in one line.
+
+    You can elaborate more in the function docstring in this section, e.g. explaining
+    the formula/algorithm. Every function should have a docstring, typehints and include the
+    parameters and return as follows:
+
+    :param arg1: description of arg1
+    :return: description of return
+    """
+    # Code blocks go here.
+    # Make sure to include comments to explain what the code is doing.
+    # No empty lines between code blocks.
+    pass
+
+
+# %% [markdown]
+# ## The flow should be highlighted using headings in markdown
+# ```
+# # Level 1
+# ## Level 2
+# ### Level 3
+# ```
+
+# %%
diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template_utils.py b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template_utils.py
new file mode 100644
index 000000000..f8916102e
--- /dev/null
+++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/template_utils.py
@@ -0,0 +1,72 @@
+"""
+template_utils.py
+
+This file contains utility functions that support the tutorial notebooks.
+
+- Notebooks should call these functions instead of writing raw logic inline.
+- This helps keep the notebooks clean, modular, and easier to debug.
+- Students should implement functions here for data preprocessing,
+  model setup, evaluation, or any reusable logic.
+
+Import as:
+
+import class_project.data605.Spring2026.projects.Benchmarking_Data_Science_Agents.template_utils as cpptteut
+"""
+
+import pandas as pd
+import logging
+from sklearn.model_selection import train_test_split
+from pycaret.classification import compare_models
+
+# -----------------------------------------------------------------------------
+# Logging
+# -----------------------------------------------------------------------------
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# -----------------------------------------------------------------------------
+# Example 1: Split the dataset into train and test sets
+# -----------------------------------------------------------------------------
+
+
+def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2):
+    """
+    Split the dataset into training and testing sets.
+
+    :param df: full dataset
+    :param target_column: name of the target column
+    :param test_size: proportion of test data (default = 0.2)
+
+    :return: X_train, X_test, y_train, y_test
+    """
+    logger.info("Splitting data into train and test sets")
+    X = df.drop(columns=[target_column])
+    y = df[target_column]
+    return train_test_split(X, y, test_size=test_size, random_state=42)
+
+
+# -----------------------------------------------------------------------------
+# Example 2: PyCaret classification pipeline
+# -----------------------------------------------------------------------------
+
+
+def run_pycaret_classification(
+    df: pd.DataFrame, target_column: str
+) -> pd.DataFrame:
+    """
+    Run a basic PyCaret classification experiment.
+
+    :param df: dataset containing features and target
+    :param target_column: name of the target column
+
+    :return: comparison of top-performing models
+    """
+    logger.info("Initializing PyCaret classification setup")
+    ...
+
+    logger.info("Comparing models")
+    results = compare_models()
+    ...
+ + return results diff --git a/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/version.sh b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/version.sh new file mode 100644 index 000000000..c46ed254c --- /dev/null +++ b/class_project/data605/Spring2026/projects/Benchmarking_Data_Science_Agents/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. +# echo "# mongo" +# mongod --version