From 69fc103b0b9e7ee9b7d591efed8b897c6209dc12 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Mon, 30 Mar 2026 12:48:50 -0400 Subject: [PATCH 01/14] Add README and utils.py --- .../schema_agent/schema_agent_utils.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 research/agentic_data_science/schema_agent/schema_agent_utils.py diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py new file mode 100644 index 000000000..10856c555 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -0,0 +1,115 @@ +import helpers.hpandas_conversion as hpandas_conversion +import helpers.hpandas_stats as hpanstat +import helpers.hpandas_io as hpanio +import helpers.hlogging as hloggin + +import pandas as pd +import typing + + +_LOG = hloggin.getLogger(__name__) + +def load_employee_data(csv_path: str) -> pd.DataFrame: + """ + Load employee data from CSV. Raises FileNotFoundError if the file does not exist. + """ + try: + df = hpanio.read_csv_to_df(csv_path) + except FileNotFoundError: + _LOG.error("CSV not found at '%s'.", csv_path) + raise + return df + +def compute_llm_agent_stats( + tag_to_df: typing.Dict[str, pd.DataFrame], + categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, +) -> typing.Dict[str, typing.Any]: + """ + Computes a comprehensive statistical profile of dataframes for LLM context. + Produces: temporal boundaries, data quality (zeros/nans/infs), categorical + distributions, and a numeric summary — all formatted for LLM prompt injection. + """ + dataframe_stats: typing.Dict[str, typing.Any] = {} + + # 1. 
Temporal boundaries + try: + duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) + dataframe_stats["temporal_boundaries"] = duration_stats + print("\n=== Temporal Boundaries ===") + print(duration_stats.to_string()) + except Exception as e: + _LOG.warning("Skipping duration stats: %s", e) + dataframe_stats["temporal_boundaries"] = None + + # 2. Data quality profiling (zeros / nans / infs) + dataframe_stats["quality_reports"] = {} + for tag, df in tag_to_df.items(): + # Only numeric columns — report_zero_nan_inf_stats uses np.isnan/isinf + numeric_df = df.select_dtypes(include="number") + if numeric_df.empty: + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) + continue + df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) + try: + quality = hpanstat.report_zero_nan_inf_stats( + df_stamped, + zero_threshold=1e-9, + verbose=True, + as_txt=True, # plain text — avoids IPython display entirely + ) + dataframe_stats["quality_reports"][tag] = quality + print(f"\n=== Quality Report: {tag} ===") + print(quality.to_string()) + except Exception as e: + _LOG.warning("Quality report failed for '%s': %s", tag, e) + + # 3. Categorical distributions + dataframe_stats["categorical_distributions"] = {} + if categorical_cols_map: + for tag, cols in categorical_cols_map.items(): + if tag not in tag_to_df: + continue + dataframe_stats["categorical_distributions"][tag] = {} + for col in cols: + if col in tag_to_df[tag].columns: + dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) + dataframe_stats["categorical_distributions"][tag][col] = dist + print(f"\n=== Distribution: {tag} / {col} ===") + print(dist.to_string()) + + # 4. 
Numeric summary (mean / std / min / max / median) + dataframe_stats["numeric_summary"] = {} + for tag, df in tag_to_df.items(): + numeric_df = df.select_dtypes(include="number") + if not numeric_df.empty: + summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary.rename(columns={"50%": "median"}, inplace=True) + dataframe_stats["numeric_summary"][tag] = summary + print(f"\n=== Numeric Summary: {tag} ===") + print(summary.to_string()) + + return dataframe_stats + +def main(): + df = load_employee_data("global_ecommerce_forecasting.csv") + + # Dynamically convert datetime-like columns and set best index + + df_typed = hpandas_conversion.convert_df(df) + # df = convert_flexible_datetime(df) + + print(df_typed.dtypes) + # Select categorical columns excluding datetime + categorical_cols = df_typed.select_dtypes(include=["object", "category"]).columns.tolist() + + stats = compute_llm_agent_stats( + {"ecommerce_data": df_typed}, + categorical_cols_map={"ecommerce_data": categorical_cols}, + ) + + + print(df_typed.head()) + return df_typed, stats + +if __name__ == "__main__": + main() \ No newline at end of file From 46507006cc82b8cb223d5796d4e8171cd2a70d74 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 31 Mar 2026 12:26:19 -0400 Subject: [PATCH 02/14] Add LLM logic and update requirements.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent_utils.py | 213 ++++++++++++++++-- 1 file changed, 199 insertions(+), 14 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index 10856c555..e1bfc2f4a 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,13 +1,45 @@ +import logging +import sys +import os +import json +import typing 
+import pandas as pd +from openai import OpenAI +from dotenv import load_dotenv + +# LangChain Imports +from langchain_openai import ChatOpenAI +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import JsonOutputParser +from pydantic import BaseModel, Field +# Internal helper imports import helpers.hpandas_conversion as hpandas_conversion import helpers.hpandas_stats as hpanstat import helpers.hpandas_io as hpanio import helpers.hlogging as hloggin +import helpers.hllm_cli as hllmcli -import pandas as pd -import typing - +load_dotenv() +api_key = os.environ.get("OPENAI_API_KEY") +if not api_key: + print("Error: OPENAI_API_KEY not found.") + sys.exit(1) +client = OpenAI(api_key=api_key) _LOG = hloggin.getLogger(__name__) +_LOG.setLevel(logging.DEBUG) + +console_handler = logging.StreamHandler(sys.stdout) + +hloggin.set_v2_formatter( + ch=console_handler, + root_logger=_LOG, + force_no_warning=False, + force_print_format=False, + force_verbose_format=True, + report_memory_usage=True, + report_cpu_usage=True +) def load_employee_data(csv_path: str) -> pd.DataFrame: """ @@ -55,7 +87,7 @@ def compute_llm_agent_stats( df_stamped, zero_threshold=1e-9, verbose=True, - as_txt=True, # plain text — avoids IPython display entirely + as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality print(f"\n=== Quality Report: {tag} ===") @@ -90,25 +122,178 @@ def compute_llm_agent_stats( return dataframe_stats -def main(): - df = load_employee_data("global_ecommerce_forecasting.csv") +def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: + """Serializes stats into a prompt block with instructions for hypothesis generation.""" + prompt_segments = [ + "You are a Senior Data Scientist and Domain Expert.", + "Analyze the provided dataset statistics and generate a profile for each column.", + "For each column, provide 2-3 testable hypotheses. 
For example, if the column is 'Discount', " + "a hypothesis might be: 'Higher discount rates correlate with higher sales volume but lower profit margins.'", + "\n--- DATASET STATISTICS ---" + ] + + if "numeric_summary" in stats: + for tag, summary in stats["numeric_summary"].items(): + prompt_segments.append(f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}") + + if "categorical_distributions" in stats: + for tag, cols in stats["categorical_distributions"].items(): + for col_name, dist in cols.items(): + prompt_segments.append(f"\nDistribution for [{col_name}]:\n{dist.to_string()}") + + return "\n".join(prompt_segments) - # Dynamically convert datetime-like columns and set best index +# --- Structured Output Schema --- +class ColumnInsight(BaseModel): + semantic_meaning: str = Field(description="Brief description of what the data represents") + role: str = Field(description="One of [ID, Feature, Target, Timestamp]") + data_quality_notes: str = Field(description="Any concerns based on the stats (e.g. high nulls, outliers)") + hypotheses: typing.List[str] = Field( + description="A list of testable hypotheses about this column's relationship to the business outcome or target variable." + ) - df_typed = hpandas_conversion.convert_df(df) - # df = convert_flexible_datetime(df) +class DatasetInsights(BaseModel): + columns: typing.Dict[str, ColumnInsight] + +def get_llm_semantic_insights_langchain(prompt_text: str, model: str = "gpt-4o") -> typing.Dict[str, typing.Any]: + """ + Uses LangChain to process metadata and return structured insights. + """ + _LOG.info("Querying LLM via LangChain (%s)...", model) + + # 1. Initialize the Model + llm = ChatOpenAI(model=model, temperature=0) + + # 2. Set up the Parser and Prompt + parser = JsonOutputParser(pydantic_object=DatasetInsights) + + prompt = ChatPromptTemplate.from_messages([ + ("system", "You are a Senior Data Scientist. 
Answer in JSON format.\n{format_instructions}"), + ("user", "{metadata_stats}") + ]).partial(format_instructions=parser.get_format_instructions()) + + # 3. Create the Chain + chain = prompt | llm | parser + + # 4. Invoke + try: + insights = chain.invoke({"metadata_stats": prompt_text}) + return insights + except Exception as e: + _LOG.error("LangChain invocation failed: %s", e) + return {"error": str(e)} + +def merge_and_export_results( + stats: typing.Dict[str, typing.Any], + insights: typing.Dict[str, typing.Any], + output_path: str = "data_profile_report.json" +): + """ + Merges technical pandas stats with LangChain-generated semantic insights. - print(df_typed.dtypes) - # Select categorical columns excluding datetime - categorical_cols = df_typed.select_dtypes(include=["object", "category"]).columns.tolist() + :param stats: The dictionary returned by compute_llm_agent_stats (contains DataFrames) + :param insights: The dictionary returned by the LangChain invocation + :param output_path: Path to save the final JSON report + """ + _LOG.info("Merging technical stats with LLM insights...") + + # 1. Prepare the final structure + # We convert DataFrames to dicts/JSON-serializable formats within the 'stats' object + serializable_stats = {} + for key, value in stats.items(): + if isinstance(value, pd.DataFrame): + serializable_stats[key] = value.to_dict(orient="index") + elif isinstance(value, dict): + # Handle nested dictionaries that might contain DataFrames (like quality_reports) + inner_dict = {} + for k, v in value.items(): + inner_dict[k] = v.to_dict(orient="index") if isinstance(v, pd.DataFrame) else v + serializable_stats[key] = inner_dict + else: + serializable_stats[key] = value + + # 2. Combine into one master object + final_report = { + "report_metadata": { + "version": "1.0", + "agent": "LangChain-Data-Profiler" + }, + "technical_stats": serializable_stats, + "semantic_insights": insights + } + + # 3. 
Export to JSON + try: + with open(output_path, "w") as f: + json.dump(final_report, f, indent=4, default=str) + _LOG.info("Successfully exported merged profile to: %s", output_path) + except Exception as e: + _LOG.error("Failed to export results: %s", e) + +def generate_hypotheses_via_cli( + stats: typing.Dict[str, typing.Any], + model: str = "gpt-4o" +) -> typing.Dict[str, typing.Any]: + """ + Generates semantic insights and hypotheses using the underlying + logic of llm_cli (hllmcli). + """ + _LOG.info("Generating hypotheses via hllmcli logic...") + + # 1. Prepare the Schema + # We use Pydantic's schema to force the LLM into the correct JSON structure + schema_json = DatasetInsights.model_json_schema() + # 2. Build the Prompts + user_prompt = build_llm_prompt(stats) + + system_prompt = ( + "You are a Senior Data Scientist. Analyze the following data statistics.\n" + "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" + f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" + ) + + # 3. Call the library function used by llm_cli + try: + # apply_llm returns a Tuple[str, float] (response_text, cost) + response_text, cost = hllmcli.apply_llm( + input_str=user_prompt, + system_prompt=system_prompt, + model=model, + use_llm_executable=False # Use the Python library for better error handling + ) + + _LOG.info("LLM Call successful. Cost: $%.6f", cost) + + # 4. Parse the result + cleaned_response = response_text.strip().removeprefix("```json").removesuffix("```").strip() + parsed_data = json.loads(cleaned_response) + + return parsed_data + + except Exception as e: + _LOG.error("hllmcli call failed: %s", e) + return {"error": str(e)} + +# Update main to use the new CLI-based function if desired +def main(): + # 1. Load & Process Data + df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") + df_typed = hpandas_conversion.convert_df(df) + + # 2. 
Compute Deterministic Stats + cat_cols = df_typed.select_dtypes(include=["object", "category", "string"]).columns.tolist() stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, - categorical_cols_map={"ecommerce_data": categorical_cols}, + categorical_cols_map={"ecommerce_data": cat_cols}, ) + # 3. Call LLM via our new CLI-based helper + semantic_insights = generate_hypotheses_via_cli(stats) - print(df_typed.head()) + # 4. Export + merge_and_export_results(stats, semantic_insights) + return df_typed, stats if __name__ == "__main__": From be1defd543b5e00df1af3b7873758ce1bafa6119 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 10:49:12 -0400 Subject: [PATCH 03/14] Lint and Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 141 ++------ .../schema_agent/schema_agent_utils.py | 307 ++++++++++-------- 2 files changed, 194 insertions(+), 254 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index c3d1c3a13..777d42283 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,133 +2,52 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. 
-## Features

+## Setup and Usage

-- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats
-- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions
-- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses
-- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs
-- **Multi-Format Output:** JSON reports and Markdown summaries
-
-## Setup
-
-Go into the schema folder:
-```bash
-cd research/agentic_data_science/schema_agent
-```
-
-Install the requirements:
-```bash
-pip install -r requirements.txt
-```
-
-Set the `OPENAI_API_KEY` in your environment:
+To navigate to the agent directory:
 ```bash
-export OPENAI_API_KEY=sk-...
+cd research/agentic_data_science/schema_agent
 ```
+Set up the `OPENAI_API_KEY` in a `.env` file in this directory before running

-## Module Structure
-
-The agent is split into six focused modules:
-
-| Module | Responsibility |
-|--------|---------------|
-| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights |
-| `schema_agent_loader.py` | CSV loading, type inference, datetime detection |
-| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions |
-| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing |
-| `schema_agent_report.py` | Column profiles, JSON and Markdown export |
-| `schema_agent.py` | Pipeline orchestration and CLI entry point |
-
-## Usage
-
-### Basic
-
-```bash
-python schema_agent.py data.csv
-```
+## Current Files

-Outputs:
-- `data_profile_report.json` — Machine-readable report
-- `data_profile_summary.md` — Human-readable summary
+- **`requirements.txt`** – Lists the Python dependencies required to run the agent
+- **`schema_agent_utils.py`** – Contains functions for parsing data, computing column statistics, and 
preparing summaries for LLM-based analysis +- **`global_ecommerce_forecasting.csv`** – The dataset used for testing -### Advanced -```bash -# Multiple files with tags -python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 - -# Cost-optimized: only high-null columns -python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini - -# Custom metrics and output -python schema_agent.py data.csv --metrics mean std max --output-json my_report.json - -# LangChain backend -python schema_agent.py data.csv --use-langchain -``` - -## Command-Line Arguments - -| Argument | Default | Description | -|----------|---------|-------------| -| `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Tags for each CSV (must match count) | -| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | -| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | False | Use LangChain instead of hllmcli | -| `--output-json` | `data_profile_report.json` | JSON report path | -| `--output-md` | `data_profile_summary.md` | Markdown summary path | - -## LLM Scoping - -- **`all`** — Every column (highest cost, comprehensive) -- **`semantic`** — Non-numeric columns only -- **`nulls`** — Columns with >5% null values (cost-optimized) +## Setup -## Python API +### 1. Load CSV -### Full pipeline +- Read into a `pandas.DataFrame` +- Ensure the DataFrame is non-empty -```python -import schema_agent as radsasag -tag_to_df, stats = radsasag.run_pipeline( - csv_paths=["data.csv"], - model="gpt-4o-mini", - llm_scope="semantic" -) -``` +### 2. 
Compute Column Stats -### Individual modules +- Identify column types: numeric, categorical, datetime +- Compute per-column statistics: + - **Numeric**: min, max, mean, median + - **Categorical**: unique count, top values + - **Datetime**: ranges, durations +- Capture null percentages and sample values -Each module can be imported independently for exploratory use or testing: +### 3. Build LLM Prompt -```python -import schema_agent_loader as radsasal -import schema_agent_stats as radsasas -import schema_agent_llm as radsasal -import schema_agent_report as radsasar -``` +- Serialize per-column stats with optional user context +- Designed for efficient LLM input (summaries only, not full data) -## Output +### 4. LLM Analysis -### data_profile_report.json -Structured report with column profiles, technical stats, and LLM insights. +- Generate hypotheses about each column's meaning +- Suggest semantic roles (identifier, timestamp, category, etc.) +- Highlight data quality concerns -### data_profile_summary.md -Formatted table summary: Column | Meaning | Role | Quality | Hypotheses +### 5. Merge Results -## Troubleshooting - -**API Key Error:** -```bash -export OPENAI_API_KEY=sk-... -``` +- Combine pandas statistics and LLM output by column name -**Validation Errors:** -- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns -- Try `--model gpt-4o-mini` +### 6. Export -**Datetime Detection:** -Skipped automatically if no temporal columns detected. 
\ No newline at end of file +- JSON output for downstream automation or agents diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index e1bfc2f4a..262f42160 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,101 +1,140 @@ +""" +Import as: + +import research.agentic_data_science.schema_agent.schema_agent_utils as radsasau +""" + +import json import logging +import os import sys -import os -import json import typing + +import dotenv +import langchain_core.output_parsers as lcop +import langchain_core.prompts as lcpr +import langchain_openai as lco +import openai import pandas as pd -from openai import OpenAI -from dotenv import load_dotenv - -# LangChain Imports -from langchain_openai import ChatOpenAI -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.output_parsers import JsonOutputParser -from pydantic import BaseModel, Field -# Internal helper imports -import helpers.hpandas_conversion as hpandas_conversion -import helpers.hpandas_stats as hpanstat -import helpers.hpandas_io as hpanio -import helpers.hlogging as hloggin +import pydantic + import helpers.hllm_cli as hllmcli +import helpers.hlogging as hloggin +import helpers.hpandas_conversion as hpanconv +import helpers.hpandas_io as hpanio +import helpers.hpandas_stats as hpanstat -load_dotenv() +# --- Configuration & Logging --- +dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") if not api_key: print("Error: OPENAI_API_KEY not found.") sys.exit(1) -client = OpenAI(api_key=api_key) +client = openai.OpenAI(api_key=api_key) _LOG = hloggin.getLogger(__name__) -_LOG.setLevel(logging.DEBUG) +_LOG.setLevel(logging.DEBUG) console_handler = logging.StreamHandler(sys.stdout) - -hloggin.set_v2_formatter( +hloggin.set_v2_formatter( ch=console_handler, root_logger=_LOG, 
force_no_warning=False, - force_print_format=False, - force_verbose_format=True, + force_print_format=False, + force_verbose_format=True, report_memory_usage=True, - report_cpu_usage=True + report_cpu_usage=True, ) + +# ############################################################################# +# ColumnInsight +# ############################################################################# + + +# --- Schemas --- +class ColumnInsight(pydantic.BaseModel): + semantic_meaning: str = pydantic.Field( + description="Brief description of what the data represents" + ) + role: str = pydantic.Field( + description="One of [ID, Feature, Target, Timestamp]" + ) + data_quality_notes: str = pydantic.Field( + description="Any concerns based on the stats (e.g. high nulls, outliers)" + ) + hypotheses: typing.List[str] = pydantic.Field( + description="List of testable hypotheses regarding the column's relationship " + "to business outcomes." + ) + + +# ############################################################################# +# DatasetInsights +# ############################################################################# + + +class DatasetInsights(pydantic.BaseModel): + columns: typing.Dict[str, ColumnInsight] + + +# --- Core Logic --- def load_employee_data(csv_path: str) -> pd.DataFrame: """ - Load employee data from CSV. Raises FileNotFoundError if the file does not exist. + Load employee data from CSV with error handling for missing files. 
""" try: - df = hpanio.read_csv_to_df(csv_path) + return hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise - return df + def compute_llm_agent_stats( tag_to_df: typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, + categorical_cols_map: typing.Optional[ + typing.Dict[str, typing.List[str]] + ] = None, ) -> typing.Dict[str, typing.Any]: """ - Computes a comprehensive statistical profile of dataframes for LLM context. - Produces: temporal boundaries, data quality (zeros/nans/infs), categorical - distributions, and a numeric summary — all formatted for LLM prompt injection. + Compute a statistical profile including temporal boundaries, data quality, + categorical distributions, and numeric summaries for LLM injection. """ dataframe_stats: typing.Dict[str, typing.Any] = {} - # 1. Temporal boundaries + # 1. Temporal Analysis try: duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) dataframe_stats["temporal_boundaries"] = duration_stats - print("\n=== Temporal Boundaries ===") - print(duration_stats.to_string()) - except Exception as e: + print("\n=== Temporal Boundaries ===\n", duration_stats.to_string()) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Skipping duration stats: %s", e) dataframe_stats["temporal_boundaries"] = None - # 2. Data quality profiling (zeros / nans / infs) + # 2. 
Data Quality Profiling dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): - # Only numeric columns — report_zero_nan_inf_stats uses np.isnan/isinf numeric_df = df.select_dtypes(include="number") if numeric_df.empty: - _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) + _LOG.warning( + "No numeric columns in '%s'; skipping quality report", tag + ) continue + df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) try: quality = hpanstat.report_zero_nan_inf_stats( df_stamped, zero_threshold=1e-9, verbose=True, - as_txt=True, + as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===") - print(quality.to_string()) - except Exception as e: + print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Quality report failed for '%s': %s", tag, e) - # 3. Categorical distributions + # 3. Categorical Distributions dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): @@ -106,195 +145,177 @@ def compute_llm_agent_stats( if col in tag_to_df[tag].columns: dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) dataframe_stats["categorical_distributions"][tag][col] = dist - print(f"\n=== Distribution: {tag} / {col} ===") - print(dist.to_string()) + print( + f"\n=== Distribution: {tag} / {col} ===\n", + dist.to_string(), + ) - # 4. Numeric summary (mean / std / min / max / median) + # 4. 
Numeric Summary dataframe_stats["numeric_summary"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary = numeric_df.describe().T[ + ["mean", "std", "min", "50%", "max"] + ] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===") - print(summary.to_string()) + print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) return dataframe_stats + def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: - """Serializes stats into a prompt block with instructions for hypothesis generation.""" + """ + Serialize statistical data into a structured prompt for hypothesis + generation. + """ prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses. For example, if the column is 'Discount', " - "a hypothesis might be: 'Higher discount rates correlate with higher sales volume but lower profit margins.'", - "\n--- DATASET STATISTICS ---" + "For each column, provide 2-3 testable hypotheses. 
" + "Example: 'Higher discount rates correlate with higher volume but lower margins.'", + "\n--- DATASET STATISTICS ---", ] - if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): - prompt_segments.append(f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}") - + prompt_segments.append( + f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" + ) if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): - prompt_segments.append(f"\nDistribution for [{col_name}]:\n{dist.to_string()}") - + prompt_segments.append( + f"\nDistribution for [{col_name}]:\n{dist.to_string()}" + ) return "\n".join(prompt_segments) -# --- Structured Output Schema --- -class ColumnInsight(BaseModel): - semantic_meaning: str = Field(description="Brief description of what the data represents") - role: str = Field(description="One of [ID, Feature, Target, Timestamp]") - data_quality_notes: str = Field(description="Any concerns based on the stats (e.g. high nulls, outliers)") - hypotheses: typing.List[str] = Field( - description="A list of testable hypotheses about this column's relationship to the business outcome or target variable." - ) - -class DatasetInsights(BaseModel): - columns: typing.Dict[str, ColumnInsight] -def get_llm_semantic_insights_langchain(prompt_text: str, model: str = "gpt-4o") -> typing.Dict[str, typing.Any]: +def get_llm_semantic_insights_langchain( + prompt_text: str, model: str = "gpt-4o" +) -> typing.Dict[str, typing.Any]: """ - Uses LangChain to process metadata and return structured insights. + Process dataset metadata via LangChain to extract structured semantic + insights. """ _LOG.info("Querying LLM via LangChain (%s)...", model) - - # 1. Initialize the Model - llm = ChatOpenAI(model=model, temperature=0) - - # 2. 
Set up the Parser and Prompt - parser = JsonOutputParser(pydantic_object=DatasetInsights) - - prompt = ChatPromptTemplate.from_messages([ - ("system", "You are a Senior Data Scientist. Answer in JSON format.\n{format_instructions}"), - ("user", "{metadata_stats}") - ]).partial(format_instructions=parser.get_format_instructions()) - - # 3. Create the Chain + llm = lco.ChatOpenAI(model=model, temperature=0) + parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) + prompt = lcpr.ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a Senior Data Scientist. Answer in JSON format.\n" + "{format_instructions}", + ), + ("user", "{metadata_stats}"), + ] + ).partial(format_instructions=parser.get_format_instructions()) chain = prompt | llm | parser - - # 4. Invoke try: - insights = chain.invoke({"metadata_stats": prompt_text}) - return insights - except Exception as e: + result = chain.invoke({"metadata_stats": prompt_text}) + return typing.cast(typing.Dict[str, typing.Any], result) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("LangChain invocation failed: %s", e) return {"error": str(e)} + def merge_and_export_results( - stats: typing.Dict[str, typing.Any], + stats: typing.Dict[str, typing.Any], insights: typing.Dict[str, typing.Any], - output_path: str = "data_profile_report.json" -): + output_path: str = "data_profile_report.json", +) -> None: """ - Merges technical pandas stats with LangChain-generated semantic insights. - - :param stats: The dictionary returned by compute_llm_agent_stats (contains DataFrames) - :param insights: The dictionary returned by the LangChain invocation - :param output_path: Path to save the final JSON report + Merge pandas statistics with LLM insights and export to a JSON report. """ _LOG.info("Merging technical stats with LLM insights...") - # 1. 
Prepare the final structure - # We convert DataFrames to dicts/JSON-serializable formats within the 'stats' object serializable_stats = {} for key, value in stats.items(): if isinstance(value, pd.DataFrame): serializable_stats[key] = value.to_dict(orient="index") elif isinstance(value, dict): - # Handle nested dictionaries that might contain DataFrames (like quality_reports) inner_dict = {} for k, v in value.items(): - inner_dict[k] = v.to_dict(orient="index") if isinstance(v, pd.DataFrame) else v + inner_dict[k] = ( + v.to_dict(orient="index") + if isinstance(v, pd.DataFrame) + else v + ) serializable_stats[key] = inner_dict else: serializable_stats[key] = value - # 2. Combine into one master object final_report = { - "report_metadata": { - "version": "1.0", - "agent": "LangChain-Data-Profiler" - }, + "report_metadata": {"version": "1.0", "agent": "LangChain-Data-Profiler"}, "technical_stats": serializable_stats, - "semantic_insights": insights + "semantic_insights": insights, } - # 3. Export to JSON try: - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(final_report, f, indent=4, default=str) _LOG.info("Successfully exported merged profile to: %s", output_path) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("Failed to export results: %s", e) + def generate_hypotheses_via_cli( - stats: typing.Dict[str, typing.Any], - model: str = "gpt-4o" + stats: typing.Dict[str, typing.Any], model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Generates semantic insights and hypotheses using the underlying - logic of llm_cli (hllmcli). + Generate insights and hypotheses using internal hllmcli logic. """ _LOG.info("Generating hypotheses via hllmcli logic...") - # 1. Prepare the Schema - # We use Pydantic's schema to force the LLM into the correct JSON structure schema_json = DatasetInsights.model_json_schema() - - # 2. 
Build the Prompts user_prompt = build_llm_prompt(stats) - system_prompt = ( "You are a Senior Data Scientist. Analyze the following data statistics.\n" "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" + f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" ) - # 3. Call the library function used by llm_cli try: - # apply_llm returns a Tuple[str, float] (response_text, cost) response_text, cost = hllmcli.apply_llm( input_str=user_prompt, system_prompt=system_prompt, model=model, - use_llm_executable=False # Use the Python library for better error handling + use_llm_executable=False, ) - + _LOG.info("LLM Call successful. Cost: $%.6f", cost) + cleaned_response = ( + response_text.strip() + .removeprefix("```json") + .removesuffix("```") + .strip() + ) + parsed = json.loads(cleaned_response) + return typing.cast(typing.Dict[str, typing.Any], parsed) - # 4. Parse the result - cleaned_response = response_text.strip().removeprefix("```json").removesuffix("```").strip() - parsed_data = json.loads(cleaned_response) - - return parsed_data - - except Exception as e: + except Exception as e: _LOG.error("hllmcli call failed: %s", e) return {"error": str(e)} -# Update main to use the new CLI-based function if desired -def main(): - # 1. Load & Process Data - df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") - df_typed = hpandas_conversion.convert_df(df) - # 2. Compute Deterministic Stats - cat_cols = df_typed.select_dtypes(include=["object", "category", "string"]).columns.tolist() +def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Execute entry point for the data profiling pipeline. 
+ """ + df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") + df_typed = hpanconv.convert_df(df) + cat_cols = df_typed.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) - - # 3. Call LLM via our new CLI-based helper semantic_insights = generate_hypotheses_via_cli(stats) - - # 4. Export merge_and_export_results(stats, semantic_insights) - return df_typed, stats + if __name__ == "__main__": - main() \ No newline at end of file + main() From 090352830f7cf139f8665ce74e6701595d2e18e3 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 13:09:12 -0400 Subject: [PATCH 04/14] Add datetime convertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent_utils.py | 185 +++++++++++++++--- 1 file changed, 162 insertions(+), 23 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index 262f42160..a99e7d8b0 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -11,12 +11,12 @@ import typing import dotenv -import langchain_core.output_parsers as lcop -import langchain_core.prompts as lcpr -import langchain_openai as lco -import openai +import langchain_core.output_parsers as lcop +import langchain_core.prompts as lcpr +import langchain_openai as lco +import openai import pandas as pd -import pydantic +import pydantic import helpers.hllm_cli as hllmcli import helpers.hlogging as hloggin @@ -36,7 +36,7 @@ _LOG.setLevel(logging.DEBUG) console_handler = logging.StreamHandler(sys.stdout) -hloggin.set_v2_formatter( +hloggin.set_v2_formatter( ch=console_handler, root_logger=_LOG, force_no_warning=False, @@ 
-48,11 +48,10 @@ # ############################################################################# -# ColumnInsight +# ColumnInsight Schema # ############################################################################# -# --- Schemas --- class ColumnInsight(pydantic.BaseModel): semantic_meaning: str = pydantic.Field( description="Brief description of what the data represents" @@ -70,7 +69,7 @@ class ColumnInsight(pydantic.BaseModel): # ############################################################################# -# DatasetInsights +# DatasetInsights Schema # ############################################################################# @@ -79,6 +78,8 @@ class DatasetInsights(pydantic.BaseModel): # --- Core Logic --- + + def load_employee_data(csv_path: str) -> pd.DataFrame: """ Load employee data from CSV with error handling for missing files. @@ -155,9 +156,7 @@ def compute_llm_agent_stats( for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[ - ["mean", "std", "min", "50%", "max"] - ] + summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) @@ -167,8 +166,7 @@ def compute_llm_agent_stats( def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: """ - Serialize statistical data into a structured prompt for hypothesis - generation. + Serialize statistical data into a structured string prompt for LLM consumption. 
""" prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", @@ -177,17 +175,28 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] + + # Append datetime column metadata if available + if "datetime_columns" in stats and stats["datetime_columns"]: + prompt_segments.append( + f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" + ) + + # Append numeric summaries if available if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): prompt_segments.append( f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" ) + + # Append categorical distributions if available if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) + return "\n".join(prompt_segments) @@ -195,12 +204,13 @@ def get_llm_semantic_insights_langchain( prompt_text: str, model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic - insights. + Process dataset metadata via LangChain to extract structured semantic insights. + Uses LangChain's JsonOutputParser alongside the Pydantic schema. 
""" _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) + prompt = lcpr.ChatPromptTemplate.from_messages( [ ( @@ -211,6 +221,7 @@ def get_llm_semantic_insights_langchain( ("user", "{metadata_stats}"), ] ).partial(format_instructions=parser.get_format_instructions()) + chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) @@ -227,21 +238,32 @@ def merge_and_export_results( ) -> None: """ Merge pandas statistics with LLM insights and export to a JSON report. + Converts DataFrame objects into dictionaries to ensure JSON serialization. """ _LOG.info("Merging technical stats with LLM insights...") serializable_stats = {} + for key, value in stats.items(): if isinstance(value, pd.DataFrame): serializable_stats[key] = value.to_dict(orient="index") elif isinstance(value, dict): inner_dict = {} for k, v in value.items(): - inner_dict[k] = ( - v.to_dict(orient="index") - if isinstance(v, pd.DataFrame) - else v - ) + if isinstance(v, pd.DataFrame): + inner_dict[k] = v.to_dict(orient="index") + elif isinstance(v, dict): + # Handle nested dicts (e.g. categorical_distributions[tag][col]) + inner_inner = {} + for kk, vv in v.items(): + inner_inner[kk] = ( + vv.to_dict(orient="index") + if isinstance(vv, pd.DataFrame) + else vv + ) + inner_dict[k] = inner_inner + else: + inner_dict[k] = v serializable_stats[key] = inner_dict else: serializable_stats[key] = value @@ -285,6 +307,7 @@ def generate_hypotheses_via_cli( ) _LOG.info("LLM Call successful. 
Cost: $%.6f", cost) + cleaned_response = ( response_text.strip() .removeprefix("```json") @@ -294,28 +317,144 @@ def generate_hypotheses_via_cli( parsed = json.loads(cleaned_response) return typing.cast(typing.Dict[str, typing.Any], parsed) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("hllmcli call failed: %s", e) return {"error": str(e)} +def infer_and_convert_datetime_columns( + df: pd.DataFrame, + sample_size: int = 100, + threshold: float = 0.8, +) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Detect and convert date/datetime columns in a DataFrame. + Uses sampling to improve performance when checking format compliance. + + Returns: + - Updated DataFrame with converted columns + - Metadata dict with inference details per column + """ + from datetime import datetime + + COMMON_FORMATS = [ + "%Y-%m-%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%Y/%m/%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + "%d-%m-%Y %H:%M:%S", + "%m/%d/%Y %H:%M:%S", + ] + + metadata: typing.Dict[str, typing.Any] = {} + df_out = df.copy() + + for col in df.columns: + if not pd.api.types.is_object_dtype( + df[col] + ) and not pd.api.types.is_string_dtype(df[col]): + continue + + series = df[col].dropna().astype(str) + + if series.empty: + continue + + sample = series.head(sample_size) + + best_format = None + best_score = 0.0 + + for fmt in COMMON_FORMATS: + success = 0 + for val in sample: + try: + datetime.strptime(val, fmt) + success += 1 + except Exception: + continue + + score = success / len(sample) + + if score > best_score: + best_score = score + best_format = fmt + + if best_score >= threshold: + parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") + used_format = best_format + else: + parsed = pd.to_datetime(df[col], errors="coerce") + used_format = None + + confidence = parsed.notna().mean() + + if confidence < threshold: + continue + + has_time = (parsed.dt.time != 
pd.Timestamp("00:00:00").time()).any() + col_type = "datetime" if has_time else "date" + + df_out[col] = parsed + + metadata[col] = { + "semantic_type": "temporal", + "granularity": col_type, + "format": used_format, + "confidence": float(confidence), + } + + _LOG.info( + "Column '%s' detected as %s (format=%s, confidence=%.2f)", + col, + col_type, + used_format, + confidence, + ) + + return df_out, metadata + + def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ Execute entry point for the data profiling pipeline. + Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. """ df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") df_typed = hpanconv.convert_df(df) + + # Process temporal inference + df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) + + # Identify categorical columns to calculate their distributions cat_cols = df_typed.select_dtypes( include=["object", "category", "string"] ).columns.tolist() + + # Compute base statistics stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) + + # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM + stats["datetime_columns"] = datetime_meta + + print(df_typed.dtypes) + print(datetime_meta) + + # Send stats to LLM to generate testable hypotheses semantic_insights = generate_hypotheses_via_cli(stats) + + # Save the combined numerical stats and semantic insights to disk merge_and_export_results(stats, semantic_insights) + return df_typed, stats if __name__ == "__main__": - main() + main() \ No newline at end of file From 7bd7d9611eae0993f3b4de0b80e7e8b6cf84f03c Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 13:14:17 -0400 Subject: [PATCH 05/14] Update datetime function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- 
.../schema_agent/schema_agent_utils.py | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index a99e7d8b0..a235e5e13 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,9 +1,10 @@ """ Import as: -import research.agentic_data_science.schema_agent.schema_agent_utils as radsasau +import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau """ +import datetime import json import logging import os @@ -48,7 +49,7 @@ # ############################################################################# -# ColumnInsight Schema +# ColumnInsight # ############################################################################# @@ -69,7 +70,7 @@ class ColumnInsight(pydantic.BaseModel): # ############################################################################# -# DatasetInsights Schema +# DatasetInsights # ############################################################################# @@ -156,7 +157,9 @@ def compute_llm_agent_stats( for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary = numeric_df.describe().T[ + ["mean", "std", "min", "50%", "max"] + ] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) @@ -166,7 +169,8 @@ def compute_llm_agent_stats( def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: """ - Serialize statistical data into a structured string prompt for LLM consumption. + Serialize statistical data into a structured string prompt for LLM + consumption. 
""" prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", @@ -175,20 +179,17 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] - # Append datetime column metadata if available if "datetime_columns" in stats and stats["datetime_columns"]: prompt_segments.append( f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" ) - # Append numeric summaries if available if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): prompt_segments.append( f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" ) - # Append categorical distributions if available if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): @@ -196,7 +197,6 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) - return "\n".join(prompt_segments) @@ -204,7 +204,9 @@ def get_llm_semantic_insights_langchain( prompt_text: str, model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic insights. + Process dataset metadata via LangChain to extract structured semantic + insights. + Uses LangChain's JsonOutputParser alongside the Pydantic schema. """ _LOG.info("Querying LLM via LangChain (%s)...", model) @@ -238,7 +240,9 @@ def merge_and_export_results( ) -> None: """ Merge pandas statistics with LLM insights and export to a JSON report. - Converts DataFrame objects into dictionaries to ensure JSON serialization. + + Converts DataFrame objects into dictionaries to ensure JSON + serialization. 
""" _LOG.info("Merging technical stats with LLM insights...") @@ -328,14 +332,13 @@ def infer_and_convert_datetime_columns( threshold: float = 0.8, ) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ - Detect and convert date/datetime columns in a DataFrame. - Uses sampling to improve performance when checking format compliance. + Detect and convert date/datetime columns in a DataFrame. Uses sampling to + improve performance when checking format compliance. Returns: - Updated DataFrame with converted columns - Metadata dict with inference details per column """ - from datetime import datetime COMMON_FORMATS = [ "%Y-%m-%d", @@ -373,7 +376,7 @@ def infer_and_convert_datetime_columns( success = 0 for val in sample: try: - datetime.strptime(val, fmt) + datetime.datetime.strptime(val, fmt) success += 1 except Exception: continue @@ -422,39 +425,32 @@ def infer_and_convert_datetime_columns( def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ Execute entry point for the data profiling pipeline. + Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. 
""" df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") df_typed = hpanconv.convert_df(df) - # Process temporal inference df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) - # Identify categorical columns to calculate their distributions cat_cols = df_typed.select_dtypes( include=["object", "category", "string"] ).columns.tolist() - # Compute base statistics stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) - # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM stats["datetime_columns"] = datetime_meta - print(df_typed.dtypes) print(datetime_meta) - # Send stats to LLM to generate testable hypotheses semantic_insights = generate_hypotheses_via_cli(stats) - # Save the combined numerical stats and semantic insights to disk merge_and_export_results(stats, semantic_insights) - return df_typed, stats if __name__ == "__main__": - main() \ No newline at end of file + main() From a2759f1f9190a0920ba8d0661ee16a5a87c856ed Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Thu, 2 Apr 2026 12:43:51 -0400 Subject: [PATCH 06/14] Update readme.md and schema_agents script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 111 +- .../schema_agent/schema_agent_utils.py | 1024 +++++++++++++---- 2 files changed, 876 insertions(+), 259 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index 777d42283..99ff8eafc 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,52 +2,103 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. 
-## Setup and Usage +## Features -To navigate to the repository: +- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats +- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses +- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs +- **Multi-Format Output:** JSON reports and Markdown summaries + +## Setup + +Go into the schema folder: +```bash +> cd research/agentic_data_science/schema_agent +``` + +Install the requirements with the command: ```bash -cd research/agentic_data_science/schema_agent/schema_agent_utils.py +> pip install -r requirements.txt ``` -Setup the OpenAI key in your environment before running in a .env file +Set the OPENAI_API_KEY in the .env file: +```bash +> export OPENAI_API_KEY=sk-... +``` +## Usage -## Current Files +### Basic -- **`requirements.txt`** – Lists the Python dependencies required to run the agent -- **`schema_agent_utils.py`** – Contains functions for parsing data, computing column statistics, and preparing summaries for LLM-based analysis -- **`global_ecommerce_forecasting.csv`** – The dataset used for testing +```bash +python schema_agent_utils.py data.csv +``` +Outputs: +- `data_profile_report.json` — Machine-readable report +- `data_profile_summary.md` — Human-readable summary -## Setup +### Advanced -### 1. Load CSV +```bash +# Multiple files with tags +python schema_agent_utils.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 -- Read into a `pandas.DataFrame` -- Ensure the DataFrame is non-empty +# Cost-optimized: only high-null columns +python schema_agent_utils.py data.csv --llm-scope nulls --model gpt-4o-mini -### 2. 
Compute Column Stats +# Custom metrics and output +python schema_agent_utils.py data.csv --metrics mean std max --output-json my_report.json +``` + +## Command-Line Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `csv_paths` | Required | One or more CSV file paths | +| `--tags` | File stems | Tags for each CSV (must match count) | +| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | +| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | +| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | False | Use LangChain instead of hllmcli | +| `--output-json` | `data_profile_report.json` | JSON report path | +| `--output-md` | `data_profile_summary.md` | Markdown summary path | -- Identify column types: numeric, categorical, datetime -- Compute per-column statistics: - - **Numeric**: min, max, mean, median - - **Categorical**: unique count, top values - - **Datetime**: ranges, durations -- Capture null percentages and sample values +## LLM Scoping -### 3. Build LLM Prompt +- **`all`** — Every column (highest cost, comprehensive) +- **`semantic`** — Non-numeric columns only +- **`nulls`** — Columns with >5% null values (cost-optimized) -- Serialize per-column stats with optional user context -- Designed for efficient LLM input (summaries only, not full data) +## Python API -### 4. LLM Analysis +```python +import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau + +tag_to_df, stats = radssasau.run_pipeline( + csv_paths=["data.csv"], + model="gpt-4o-mini", + llm_scope="semantic" +) +``` -- Generate hypotheses about each column's meaning -- Suggest semantic roles (identifier, timestamp, category, etc.) -- Highlight data quality concerns +## Output -### 5. Merge Results +### data_profile_report.json +Structured report with column profiles, technical stats, and LLM insights. 
-- Combine pandas statistics and LLM output by column name +### data_profile_summary.md +Formatted table summary: Column | Meaning | Role | Quality | Hypotheses + +## Troubleshooting + +**API Key Error:** +```bash +export OPENAI_API_KEY=sk-... +``` -### 6. Export +**Validation Errors:** +- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns +- Try `--model gpt-4o-mini` -- JSON output for downstream automation or agents +**Datetime Detection:** +Skipped automatically if no temporal columns detected. diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index a235e5e13..ba9f36b46 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,9 +1,17 @@ """ +Data Profiler Agent — single-file implementation. + Import as: + import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau -import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau +CLI usage: + python schema_agent_utils.py data.csv + python schema_agent_utils.py data.csv --model gpt-4o-mini --llm-scope nulls + python schema_agent_utils.py data.csv --metrics mean std min max --output-json out.json + python schema_agent_utils.py data.csv data2.csv --tags sales inventory """ +import argparse import datetime import json import logging @@ -25,11 +33,15 @@ import helpers.hpandas_io as hpanio import helpers.hpandas_stats as hpanstat -# --- Configuration & Logging --- + +# ============================================================================= +# Configuration & Logging +# ============================================================================= + dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") if not api_key: - print("Error: OPENAI_API_KEY not found.") + print("Error: OPENAI_API_KEY not found in environment.") sys.exit(1) client = 
openai.OpenAI(api_key=api_key) @@ -47,10 +59,16 @@ report_cpu_usage=True, ) +# Allowed metric names for numeric summaries. +VALID_METRICS: typing.List[str] = ["mean", "std", "min", "25%", "50%", "75%", "max"] + +# Default metric subset shown in reports. +DEFAULT_METRICS: typing.List[str] = ["mean", "std", "min", "50%", "max"] + -# ############################################################################# -# ColumnInsight -# ############################################################################# +# ============================================================================= +# Pydantic schemas +# ============================================================================= class ColumnInsight(pydantic.BaseModel): @@ -69,42 +87,189 @@ class ColumnInsight(pydantic.BaseModel): ) -# ############################################################################# -# DatasetInsights -# ############################################################################# - - class DatasetInsights(pydantic.BaseModel): columns: typing.Dict[str, ColumnInsight] -# --- Core Logic --- +# ============================================================================= +# Data loading +# ============================================================================= -def load_employee_data(csv_path: str) -> pd.DataFrame: +def load_csv(csv_path: str) -> pd.DataFrame: """ - Load employee data from CSV with error handling for missing files. + Load a CSV into a DataFrame with clear error handling. + + Parameters + ---------- + csv_path : str + Path to the CSV file. 
+ + Returns + ------- + pd.DataFrame """ try: - return hpanio.read_csv_to_df(csv_path) + df = hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise + if df.empty: + raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") + _LOG.info("Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns)) + return df + + +# keep legacy name for backwards compatibility +load_employee_data = load_csv + + +# ============================================================================= +# Datetime inference +# ============================================================================= + + +def infer_and_convert_datetime_columns( + df: pd.DataFrame, + sample_size: int = 100, + threshold: float = 0.8, +) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Detect and convert date/datetime columns in a DataFrame. + + Uses sampling for performance. Returns the updated DataFrame and a + metadata dict with inference details per column. + + Parameters + ---------- + df : pd.DataFrame + sample_size : int + Number of rows to sample when testing format compliance. + threshold : float + Minimum fraction of parsed values required to accept a column as temporal. + + Returns + ------- + (pd.DataFrame, dict) + Updated DataFrame with converted columns + metadata per column. 
+ """ + COMMON_FORMATS = [ + "%Y-%m-%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%Y/%m/%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + "%d-%m-%Y %H:%M:%S", + "%m/%d/%Y %H:%M:%S", + ] + + metadata: typing.Dict[str, typing.Any] = {} + df_out = df.copy() + + for col in df.columns: + if not ( + pd.api.types.is_object_dtype(df[col]) + or pd.api.types.is_string_dtype(df[col]) + ): + continue + + series = df[col].dropna().astype(str) + if series.empty: + continue + + sample = series.head(sample_size) + best_format: typing.Optional[str] = None + best_score = 0.0 + + for fmt in COMMON_FORMATS: + success = sum( + 1 + for val in sample + if _try_strptime(val, fmt) + ) + score = success / len(sample) + if score > best_score: + best_score = score + best_format = fmt + + if best_score >= threshold: + parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") + used_format = best_format + else: + parsed = pd.to_datetime(df[col], errors="coerce") + used_format = None + + confidence = float(parsed.notna().mean()) + if confidence < threshold: + continue + + has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() + col_type = "datetime" if has_time else "date" + df_out[col] = parsed + + metadata[col] = { + "semantic_type": "temporal", + "granularity": col_type, + "format": used_format, + "confidence": confidence, + } + _LOG.info( + "Column '%s' detected as %s (format=%s, confidence=%.2f)", + col, + col_type, + used_format, + confidence, + ) + + return df_out, metadata + + +def _try_strptime(val: str, fmt: str) -> bool: + """Return True if val parses under fmt, False otherwise.""" + try: + datetime.datetime.strptime(val, fmt) + return True + except Exception: # pylint: disable=broad-exception-caught + return False + + +# ============================================================================= +# Stats computation +# ============================================================================= def compute_llm_agent_stats( tag_to_df: 
typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[ - typing.Dict[str, typing.List[str]] - ] = None, + categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, + metrics: typing.Optional[typing.List[str]] = None, ) -> typing.Dict[str, typing.Any]: """ Compute a statistical profile including temporal boundaries, data quality, categorical distributions, and numeric summaries for LLM injection. + + Parameters + ---------- + tag_to_df : dict + Mapping of dataset tag → DataFrame. Supports multiple datasets. + categorical_cols_map : dict, optional + Mapping of tag → list of categorical column names to profile. + metrics : list of str, optional + Subset of numeric metrics to include. Must be from VALID_METRICS. + Defaults to DEFAULT_METRICS. + + Returns + ------- + dict + Keys: temporal_boundaries, quality_reports, categorical_distributions, + numeric_summary. """ + metrics = _resolve_metrics(metrics) dataframe_stats: typing.Dict[str, typing.Any] = {} - # 1. Temporal Analysis + # 1. Temporal boundaries try: duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) dataframe_stats["temporal_boundaries"] = duration_stats @@ -113,16 +278,13 @@ def compute_llm_agent_stats( _LOG.warning("Skipping duration stats: %s", e) dataframe_stats["temporal_boundaries"] = None - # 2. Data Quality Profiling + # 2. Data quality dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if numeric_df.empty: - _LOG.warning( - "No numeric columns in '%s'; skipping quality report", tag - ) + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) try: quality = hpanstat.report_zero_nan_inf_stats( @@ -136,78 +298,258 @@ def compute_llm_agent_stats( except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Quality report failed for '%s': %s", tag, e) - # 3. 
Categorical Distributions + # 3. Categorical distributions dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): if tag not in tag_to_df: + _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) continue dataframe_stats["categorical_distributions"][tag] = {} for col in cols: - if col in tag_to_df[tag].columns: - dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) - dataframe_stats["categorical_distributions"][tag][col] = dist - print( - f"\n=== Distribution: {tag} / {col} ===\n", - dist.to_string(), - ) + if col not in tag_to_df[tag].columns: + _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) + continue + dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) + dataframe_stats["categorical_distributions"][tag][col] = dist + print(f"\n=== Distribution: {tag} / {col} ===\n", dist.to_string()) - # 4. Numeric Summary + # 4. Numeric summary (customisable metric subset) dataframe_stats["numeric_summary"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") - if not numeric_df.empty: - summary = numeric_df.describe().T[ - ["mean", "std", "min", "50%", "max"] - ] - summary.rename(columns={"50%": "median"}, inplace=True) - dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) + if numeric_df.empty: + continue + full_summary = numeric_df.describe().T + available = [m for m in metrics if m in full_summary.columns] + if not available: + _LOG.warning("None of the requested metrics %s are available.", metrics) + summary = full_summary[available].copy() + if "50%" in summary.columns: + summary = summary.rename(columns={"50%": "median"}) + dataframe_stats["numeric_summary"][tag] = summary + print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) return dataframe_stats -def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: +def _resolve_metrics(metrics: 
typing.Optional[typing.List[str]]) -> typing.List[str]: """ - Serialize statistical data into a structured string prompt for LLM - consumption. + Validate and return the metric list, falling back to DEFAULT_METRICS. + """ + if metrics is None: + return DEFAULT_METRICS + invalid = [m for m in metrics if m not in VALID_METRICS] + if invalid: + _LOG.warning( + "Unknown metrics %s will be ignored. Valid options: %s", + invalid, + VALID_METRICS, + ) + resolved = [m for m in metrics if m in VALID_METRICS] + return resolved if resolved else DEFAULT_METRICS + + +# ============================================================================= +# LLM scope filtering +# ============================================================================= + + +def _select_columns_for_llm( + df: pd.DataFrame, + scope: str, + null_threshold: float = 0.05, +) -> typing.List[str]: + """ + Return the list of column names that should be sent to the LLM. + + Parameters + ---------- + df : pd.DataFrame + scope : str + "all" — every column + "semantic" — non-numeric columns only (object / category / string) + "nulls" — columns with null fraction above null_threshold + null_threshold : float + Fraction of nulls required for "nulls" scope. Default 5 %. 
+ + Returns + ------- + list of str + """ + if scope == "all": + return list(df.columns) + + if scope == "semantic": + cols = df.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() + _LOG.info("LLM scope='semantic': %d columns selected.", len(cols)) + return cols + + if scope == "nulls": + cols = [ + col + for col in df.columns + if df[col].isnull().mean() > null_threshold + ] + _LOG.info( + "LLM scope='nulls' (threshold=%.0f%%): %d columns selected.", + null_threshold * 100, + len(cols), + ) + return cols + + _LOG.warning("Unknown LLM scope '%s'; falling back to 'all'.", scope) + return list(df.columns) + + +# ============================================================================= +# Prompt building +# ============================================================================= + + +def build_llm_prompt( + stats: typing.Dict[str, typing.Any], + columns_to_include: typing.Optional[typing.List[str]] = None, +) -> str: + """ + Serialize statistical data into a structured string prompt for LLM consumption. + + Parameters + ---------- + stats : dict + Output of compute_llm_agent_stats(). + columns_to_include : list of str, optional + Subset of column names to include in the prompt. None = all. + + Returns + ------- + str """ prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses. 
" + "For each column, provide 2-3 testable hypotheses.", "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] - # Append datetime column metadata if available + if "datetime_columns" in stats and stats["datetime_columns"]: prompt_segments.append( - f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" + f"\nDetected Datetime Columns:\n" + f"{json.dumps(stats['datetime_columns'], indent=2)}" ) - # Append numeric summaries if available + if "numeric_summary" in stats: - for tag, summary in stats["numeric_summary"].items(): + for tag, summary_df in stats["numeric_summary"].items(): + if columns_to_include is not None: + summary_df = summary_df[ + summary_df.index.isin(columns_to_include) + ] prompt_segments.append( - f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" + f"\nDataset [{tag}] Numeric Summary:\n{summary_df.to_string()}" ) - # Append categorical distributions if available + if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): + if columns_to_include is not None and col_name not in columns_to_include: + continue prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) + return "\n".join(prompt_segments) +# ============================================================================= +# LLM calls +# ============================================================================= + + +def generate_hypotheses_via_cli( + stats: typing.Dict[str, typing.Any], + model: str = "gpt-4o", + columns_to_include: typing.Optional[typing.List[str]] = None, +) -> typing.Dict[str, typing.Any]: + """ + Generate insights and hypotheses using internal hllmcli logic. + + Parses and Pydantic-validates the LLM response against DatasetInsights. 
+ + Parameters + ---------- + stats : dict + model : str + columns_to_include : list of str, optional + If provided, only these columns are sent to the LLM (cost control). + + Returns + ------- + dict — DatasetInsights-shaped dict, or {"error": ...} on failure. + """ + _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) + + schema_json = DatasetInsights.model_json_schema() + user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) + system_prompt = ( + "You are a Senior Data Scientist. Analyze the following data statistics.\n" + "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" + f"Return the output strictly in JSON matching this schema:\n" + f"{json.dumps(schema_json)}" + ) + + try: + response_text, cost = hllmcli.apply_llm( + input_str=user_prompt, + system_prompt=system_prompt, + model=model, + use_llm_executable=False, + ) + _LOG.info("LLM call successful. Estimated cost: $%.6f", cost) + + cleaned = ( + response_text.strip() + .removeprefix("```json") + .removesuffix("```") + .strip() + ) + raw = json.loads(cleaned) + + # Pydantic validation — raises ValidationError on schema mismatch. + validated = DatasetInsights.model_validate(raw) + return validated.model_dump() + + except pydantic.ValidationError as e: + _LOG.error("LLM output failed Pydantic validation: %s", e) + return {"error": str(e)} + except json.JSONDecodeError as e: + _LOG.error("LLM returned invalid JSON: %s", e) + return {"error": f"JSON parse error: {e}"} + except Exception as e: # pylint: disable=broad-exception-caught + _LOG.error("hllmcli call failed: %s", e) + return {"error": str(e)} + + def get_llm_semantic_insights_langchain( - prompt_text: str, model: str = "gpt-4o" + prompt_text: str, + model: str = "gpt-4o", ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic - insights. + Process dataset metadata via LangChain to extract structured semantic insights. 
+ + Uses JsonOutputParser alongside the Pydantic schema. Validates output. - Uses LangChain's JsonOutputParser alongside the Pydantic schema. + Parameters + ---------- + prompt_text : str + Serialized stats from build_llm_prompt(). + model : str + + Returns + ------- + dict """ _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) @@ -227,230 +569,454 @@ def get_llm_semantic_insights_langchain( chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) - return typing.cast(typing.Dict[str, typing.Any], result) + # Validate against Pydantic schema. + validated = DatasetInsights.model_validate(result) + return validated.model_dump() + except pydantic.ValidationError as e: + _LOG.error("LangChain output failed Pydantic validation: %s", e) + return {"error": str(e)} except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("LangChain invocation failed: %s", e) return {"error": str(e)} +# ============================================================================= +# Column profiles +# ============================================================================= + + +def build_column_profiles( + df: pd.DataFrame, + stats: typing.Dict[str, typing.Any], + insights: typing.Dict[str, typing.Any], +) -> typing.List[typing.Dict[str, typing.Any]]: + """ + Convert stat-centric structure into per-column profiles. + + Merges numeric stats, categorical distributions, datetime metadata, + and LLM semantic insights keyed on column name. + + Parameters + ---------- + df : pd.DataFrame + stats : dict + insights : dict — output of generate_hypotheses_via_cli() + + Returns + ------- + list of dict, one entry per column. 
+ """ + profiles: typing.List[typing.Dict[str, typing.Any]] = [] + + numeric_summary = stats.get("numeric_summary", {}) + categorical_stats = stats.get("categorical_distributions", {}) + datetime_meta = stats.get("datetime_columns", {}) + + for col in df.columns: + profile: typing.Dict[str, typing.Any] = { + "column": col, + "dtype": str(df[col].dtype), + "null_pct": float(df[col].isnull().mean()), + "unique_count": int(df[col].nunique()), + "sample_values": df[col].dropna().head(3).tolist(), + } + + # Numeric stats + for _, summary_df in numeric_summary.items(): + if col in summary_df.index: + col_stats = summary_df.loc[col] + for metric in col_stats.index: + profile[metric] = col_stats[metric] + + # Categorical top values + for _, cols in categorical_stats.items(): + if col in cols: + dist = cols[col] + try: + profile["top_values"] = ( + dist.head(3).to_dict() + if hasattr(dist, "head") + else dict(list(dist.items())[:3]) + ) + except Exception: # pylint: disable=broad-exception-caught + pass + + # Datetime metadata + if col in datetime_meta: + profile["temporal"] = datetime_meta[col] + + # LLM insights + if "columns" in insights and col in insights["columns"]: + insight = insights["columns"][col] + if hasattr(insight, "dict"): + insight = insight.dict() + profile.update( + { + "semantic_meaning": insight.get("semantic_meaning"), + "role": insight.get("role"), + "data_quality_notes": insight.get("data_quality_notes"), + "hypotheses": insight.get("hypotheses", []), + } + ) + + profiles.append(profile) + + return profiles + + +# ============================================================================= +# Export helpers +# ============================================================================= + + def merge_and_export_results( stats: typing.Dict[str, typing.Any], insights: typing.Dict[str, typing.Any], + column_profiles: typing.List[typing.Dict[str, typing.Any]], output_path: str = "data_profile_report.json", ) -> None: """ - Merge pandas statistics with 
LLM insights and export to a JSON report. - - Converts DataFrame objects into dictionaries to ensure JSON - serialization. + Merge stats + insights + column_profiles and export to JSON. + + Parameters + ---------- + stats : dict + insights : dict + column_profiles : list of dict + output_path : str """ - _LOG.info("Merging technical stats with LLM insights...") - - serializable_stats = {} - - for key, value in stats.items(): - if isinstance(value, pd.DataFrame): - serializable_stats[key] = value.to_dict(orient="index") - elif isinstance(value, dict): - inner_dict = {} - for k, v in value.items(): - if isinstance(v, pd.DataFrame): - inner_dict[k] = v.to_dict(orient="index") - elif isinstance(v, dict): - # Handle nested dicts (e.g. categorical_distributions[tag][col]) - inner_inner = {} - for kk, vv in v.items(): - inner_inner[kk] = ( - vv.to_dict(orient="index") - if isinstance(vv, pd.DataFrame) - else vv - ) - inner_dict[k] = inner_inner - else: - inner_dict[k] = v - serializable_stats[key] = inner_dict - else: - serializable_stats[key] = value + _LOG.info("Merging results...") + serializable_stats = _make_serializable(stats) final_report = { - "report_metadata": {"version": "1.0", "agent": "LangChain-Data-Profiler"}, + "report_metadata": { + "version": "1.2", + "agent": "Data-Profiler-Agent", + "generated_at": datetime.datetime.utcnow().isoformat() + "Z", + }, + "column_profiles": column_profiles, "technical_stats": serializable_stats, "semantic_insights": insights, } - try: - with open(output_path, "w", encoding="utf-8") as f: - json.dump(final_report, f, indent=4, default=str) - _LOG.info("Successfully exported merged profile to: %s", output_path) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("Failed to export results: %s", e) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(final_report, f, indent=4, default=str) + _LOG.info("Exported JSON report to '%s'.", output_path) -def generate_hypotheses_via_cli( - stats: 
typing.Dict[str, typing.Any], model: str = "gpt-4o" -) -> typing.Dict[str, typing.Any]: + +def _make_serializable(obj: typing.Any) -> typing.Any: """ - Generate insights and hypotheses using internal hllmcli logic. + Recursively convert DataFrames and nested dicts to JSON-safe structures. """ - _LOG.info("Generating hypotheses via hllmcli logic...") - - schema_json = DatasetInsights.model_json_schema() - user_prompt = build_llm_prompt(stats) - system_prompt = ( - "You are a Senior Data Scientist. Analyze the following data statistics.\n" - "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" - ) - - try: - response_text, cost = hllmcli.apply_llm( - input_str=user_prompt, - system_prompt=system_prompt, - model=model, - use_llm_executable=False, - ) - - _LOG.info("LLM Call successful. Cost: $%.6f", cost) - - cleaned_response = ( - response_text.strip() - .removeprefix("```json") - .removesuffix("```") - .strip() - ) - parsed = json.loads(cleaned_response) - return typing.cast(typing.Dict[str, typing.Any], parsed) - - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("hllmcli call failed: %s", e) - return {"error": str(e)} - + if isinstance(obj, pd.DataFrame): + return obj.to_dict(orient="index") + if isinstance(obj, dict): + return {k: _make_serializable(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_make_serializable(v) for v in obj] + return obj + + +def export_markdown_from_profiles( + column_profiles: typing.List[typing.Dict[str, typing.Any]], + numeric_stats: typing.Optional[typing.Dict[str, pd.DataFrame]] = None, + output_path: str = "data_profile_summary.md", +) -> None: + """ + Generate a readable Markdown report from column profiles and numeric stats. 
-def infer_and_convert_datetime_columns( - df: pd.DataFrame, - sample_size: int = 100, - threshold: float = 0.8, -) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + Parameters + ---------- + column_profiles : list of dict + numeric_stats : dict of str → DataFrame, optional + output_path : str """ - Detect and convert date/datetime columns in a DataFrame. Uses sampling to - improve performance when checking format compliance. - Returns: - - Updated DataFrame with converted columns - - Metadata dict with inference details per column + def _clean(val: typing.Any) -> str: + if val is None: + return "" + return str(val).replace("|", "\\|").replace("\n", " ") + + def _fmt(val: typing.Any) -> str: + if isinstance(val, int): + return str(val) + if isinstance(val, float): + return f"{val:,.2f}" if abs(val) >= 1 else f"{val:.4f}" + return str(val) + + lines = ["# Data Profile Summary\n"] + + # Column profiles table + lines.append("## Column Profiles\n") + lines.append("| Column | Meaning | Role | Quality | Hypotheses |") + lines.append("|--------|---------|------|---------|------------|") + + for p in column_profiles: + hypotheses = p.get("hypotheses", []) + if isinstance(hypotheses, list) and hypotheses: + hyp_str = "
".join( + f"{i+1}. {_clean(h)}" for i, h in enumerate(hypotheses[:3]) + ) + else: + hyp_str = _clean(hypotheses) or "N/A" + + row = [ + _clean(p.get("column")), + _clean(p.get("semantic_meaning")), + _clean(p.get("role")), + _clean(p.get("data_quality_notes")), + hyp_str, + ] + lines.append("| " + " | ".join(row) + " |") + + # Numeric stats table + if numeric_stats: + lines.append("\n## Numeric Column Statistics\n") + for tag, df in numeric_stats.items(): + lines.append(f"### {tag}\n") + lines.append("| Column | Metric | Value |") + lines.append("|--------|--------|-------|") + for col_name in df.index: + for metric in df.columns: + val = df.loc[col_name, metric] + lines.append(f"| {col_name} | {metric} | {_fmt(val)} |") + lines.append("") + + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines) + "\n") + + _LOG.info("Exported Markdown report to '%s'.", output_path) + + +# ============================================================================= +# Pipeline +# ============================================================================= + + +def run_pipeline( + csv_paths: typing.List[str], + tags: typing.Optional[typing.List[str]] = None, + model: str = "gpt-4o", + metrics: typing.Optional[typing.List[str]] = None, + llm_scope: str = "all", + output_json: str = "data_profile_report.json", + output_md: str = "data_profile_summary.md", + use_langchain: bool = False, +) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: """ + Execute the full data profiling pipeline over one or more CSV files. + + Parameters + ---------- + csv_paths : list of str + One or more CSV file paths to profile. + tags : list of str, optional + Human-readable tag for each CSV. Defaults to filename stems. + model : str + LLM model name passed to OpenAI / hllmcli. + metrics : list of str, optional + Numeric metrics to include. Defaults to DEFAULT_METRICS. 
+ llm_scope : str + "all", "semantic", or "nulls" — controls which columns are LLM-profiled. + output_json : str + Path for the merged JSON report. + output_md : str + Path for the Markdown summary. + use_langchain : bool + Use LangChain chain instead of hllmcli for LLM calls. + + Returns + ------- + (dict of tag → df, stats dict) + """ + if tags is None: + tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] + if len(tags) != len(csv_paths): + raise ValueError( + f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." + ) - metadata: typing.Dict[str, typing.Any] = {} - df_out = df.copy() + # --- Load & type-coerce --- + tag_to_df: typing.Dict[str, pd.DataFrame] = {} + for path, tag in zip(csv_paths, tags): + df = load_csv(path) + df = hpanconv.convert_df(df) + df, datetime_meta_partial = infer_and_convert_datetime_columns(df) + tag_to_df[tag] = df + + # Merge datetime metadata across all DataFrames (using the last loaded tag + # as the primary df for single-dataset runs; full merge for multi). 
+ _, datetime_meta = infer_and_convert_datetime_columns( + pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + ) - for col in df.columns: - if not pd.api.types.is_object_dtype( - df[col] - ) and not pd.api.types.is_string_dtype(df[col]): - continue + # --- Categorical column map --- + cat_cols_map: typing.Dict[str, typing.List[str]] = { + tag: df.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() + for tag, df in tag_to_df.items() + } - series = df[col].dropna().astype(str) + # --- Compute stats --- + stats = compute_llm_agent_stats( + tag_to_df, + categorical_cols_map=cat_cols_map, + metrics=metrics, + ) + stats["datetime_columns"] = datetime_meta - if series.empty: - continue + # --- LLM scope --- + # Use the concatenated DataFrame to decide which columns to send. + combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + columns_for_llm = _select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( + "LLM will profile %d / %d columns (scope=%s).", + len(columns_for_llm), + len(combined_df.columns), + llm_scope, + ) - sample = series.head(sample_size) + # --- LLM call --- + if use_langchain: + prompt_text = build_llm_prompt(stats, columns_to_include=columns_for_llm) + semantic_insights = get_llm_semantic_insights_langchain( + prompt_text, model=model + ) + else: + semantic_insights = generate_hypotheses_via_cli( + stats, + model=model, + columns_to_include=columns_for_llm, + ) - best_format = None - best_score = 0.0 + # --- Build column profiles (use first / primary df for column ordering) --- + primary_df = list(tag_to_df.values())[0] + column_profiles = build_column_profiles( + df=primary_df, + stats=stats, + insights=semantic_insights, + ) - for fmt in COMMON_FORMATS: - success = 0 - for val in sample: - try: - datetime.datetime.strptime(val, fmt) - success += 1 - except Exception: - continue + # --- Export --- + merge_and_export_results( + stats=stats, + insights=semantic_insights, + 
column_profiles=column_profiles, + output_path=output_json, + ) + export_markdown_from_profiles( + column_profiles, + numeric_stats=stats.get("numeric_summary", {}), + output_path=output_md, + ) - score = success / len(sample) + return tag_to_df, stats - if score > best_score: - best_score = score - best_format = fmt - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None +# ============================================================================= +# CLI +# ============================================================================= - confidence = parsed.notna().mean() - if confidence < threshold: - continue +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="schema_agent_utils", + description="Data Profiler Agent — statistical + LLM column profiling", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" + # --- Inputs --- + parser.add_argument( + "csv_paths", + nargs="+", + metavar="CSV", + help="One or more CSV file paths to profile.", + ) + parser.add_argument( + "--tags", + nargs="+", + metavar="TAG", + help="Human-readable tag for each CSV (must match number of csv_paths).", + ) - df_out[col] = parsed + # --- LLM options --- + parser.add_argument( + "--model", + default="gpt-4o", + help="LLM model for semantic analysis.", + ) + parser.add_argument( + "--llm-scope", + choices=["all", "semantic", "nulls"], + default="all", + dest="llm_scope", + help=( + "Which columns to send to the LLM. " + "'all'=every column, 'semantic'=non-numeric only, " + "'nulls'=high-null columns only (saves cost)." 
+ ), + ) + parser.add_argument( + "--use-langchain", + action="store_true", + dest="use_langchain", + help="Use LangChain pipeline instead of hllmcli for LLM calls.", + ) - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": float(confidence), - } + # --- Stat options --- + parser.add_argument( + "--metrics", + nargs="+", + choices=VALID_METRICS, + default=None, + metavar="METRIC", + help=( + f"Numeric metrics to include in the summary. " + f"Valid: {', '.join(VALID_METRICS)}. " + f"Default: {', '.join(DEFAULT_METRICS)}." + ), + ) - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) + # --- Output options --- + parser.add_argument( + "--output-json", + default="data_profile_report.json", + dest="output_json", + metavar="PATH", + help="Output path for the merged JSON report.", + ) + parser.add_argument( + "--output-md", + default="data_profile_summary.md", + dest="output_md", + metavar="PATH", + help="Output path for the Markdown summary.", + ) - return df_out, metadata + return parser -def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: +def main() -> None: """ - Execute entry point for the data profiling pipeline. - - Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. + CLI entry point. Parses arguments and delegates to run_pipeline(). 
""" - df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") - df_typed = hpanconv.convert_df(df) - # Process temporal inference - df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) - # Identify categorical columns to calculate their distributions - cat_cols = df_typed.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - # Compute base statistics - stats = compute_llm_agent_stats( - {"ecommerce_data": df_typed}, - categorical_cols_map={"ecommerce_data": cat_cols}, + parser = _build_arg_parser() + args = parser.parse_args() + + run_pipeline( + csv_paths=args.csv_paths, + tags=args.tags, + model=args.model, + metrics=args.metrics, + llm_scope=args.llm_scope, + output_json=args.output_json, + output_md=args.output_md, + use_langchain=args.use_langchain, ) - # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM - stats["datetime_columns"] = datetime_meta - print(df_typed.dtypes) - print(datetime_meta) - # Send stats to LLM to generate testable hypotheses - semantic_insights = generate_hypotheses_via_cli(stats) - # Save the combined numerical stats and semantic insights to disk - merge_and_export_results(stats, semantic_insights) - return df_typed, stats if __name__ == "__main__": - main() + main() \ No newline at end of file From 52aaa71457339b35674b6c4b243872187cdd90fb Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Fri, 3 Apr 2026 12:22:44 -0400 Subject: [PATCH 07/14] Update README.md, modularize and lint code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 60 +- .../schema_agent/schema_agent_utils.py | 1022 ----------------- 2 files changed, 45 insertions(+), 1037 deletions(-) delete mode 100644 research/agentic_data_science/schema_agent/schema_agent_utils.py diff --git a/research/agentic_data_science/schema_agent/README.md 
b/research/agentic_data_science/schema_agent/README.md index 99ff8eafc..c8d1b5119 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -12,25 +12,40 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datase ## Setup -Go into the schema folder: -```bash -> cd research/agentic_data_science/schema_agent +Go into the schema folder: +```bash +cd research/agentic_data_science/schema_agent ``` -Install the requirements with the command: +Install the requirements: ```bash -> pip install -r requirements.txt +pip install -r requirements.txt ``` -Set the OPENAI_API_KEY in the .env file: -```bash -> export OPENAI_API_KEY=sk-... + +Set the `OPENAI_API_KEY` in your environment: +```bash +export OPENAI_API_KEY=sk-... ``` + +## Module Structure + +The agent is split into six focused modules: + +| Module | Responsibility | +|--------|---------------| +| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, datetime detection | +| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions | +| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing | +| `schema_agent_report.py` | Column profiles, JSON and Markdown export | +| `schema_agent.py` | Pipeline orchestration and CLI entry point | + ## Usage ### Basic ```bash -python schema_agent_utils.py data.csv +python schema_agent.py data.csv ``` Outputs: @@ -41,13 +56,16 @@ Outputs: ```bash # Multiple files with tags -python schema_agent_utils.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 # Cost-optimized: only high-null columns -python schema_agent_utils.py data.csv --llm-scope nulls --model gpt-4o-mini +python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini # Custom metrics and output 
-python schema_agent_utils.py data.csv --metrics mean std max --output-json my_report.json +python schema_agent.py data.csv --metrics mean std max --output-json my_report.json + +# LangChain backend +python schema_agent.py data.csv --use-langchain ``` ## Command-Line Arguments @@ -71,16 +89,28 @@ python schema_agent_utils.py data.csv --metrics mean std max --output-json my_re ## Python API -```python -import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau +### Full pipeline -tag_to_df, stats = radssasau.run_pipeline( +```python +import schema_agent as radsasag +tag_to_df, stats = radsasag.run_pipeline( csv_paths=["data.csv"], model="gpt-4o-mini", llm_scope="semantic" ) ``` +### Individual modules + +Each module can be imported independently for exploratory use or testing: + +```python +import schema_agent_loader as radsasal +import schema_agent_stats as radsasas +import schema_agent_llm as radsasal +import schema_agent_report as radsasar +``` + ## Output ### data_profile_report.json diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py deleted file mode 100644 index ba9f36b46..000000000 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ /dev/null @@ -1,1022 +0,0 @@ -""" -Data Profiler Agent — single-file implementation. 
- -Import as: - import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau - -CLI usage: - python schema_agent_utils.py data.csv - python schema_agent_utils.py data.csv --model gpt-4o-mini --llm-scope nulls - python schema_agent_utils.py data.csv --metrics mean std min max --output-json out.json - python schema_agent_utils.py data.csv data2.csv --tags sales inventory -""" - -import argparse -import datetime -import json -import logging -import os -import sys -import typing - -import dotenv -import langchain_core.output_parsers as lcop -import langchain_core.prompts as lcpr -import langchain_openai as lco -import openai -import pandas as pd -import pydantic - -import helpers.hllm_cli as hllmcli -import helpers.hlogging as hloggin -import helpers.hpandas_conversion as hpanconv -import helpers.hpandas_io as hpanio -import helpers.hpandas_stats as hpanstat - - -# ============================================================================= -# Configuration & Logging -# ============================================================================= - -dotenv.load_dotenv() -api_key = os.environ.get("OPENAI_API_KEY") -if not api_key: - print("Error: OPENAI_API_KEY not found in environment.") - sys.exit(1) - -client = openai.OpenAI(api_key=api_key) -_LOG = hloggin.getLogger(__name__) -_LOG.setLevel(logging.DEBUG) - -console_handler = logging.StreamHandler(sys.stdout) -hloggin.set_v2_formatter( - ch=console_handler, - root_logger=_LOG, - force_no_warning=False, - force_print_format=False, - force_verbose_format=True, - report_memory_usage=True, - report_cpu_usage=True, -) - -# Allowed metric names for numeric summaries. -VALID_METRICS: typing.List[str] = ["mean", "std", "min", "25%", "50%", "75%", "max"] - -# Default metric subset shown in reports. 
-DEFAULT_METRICS: typing.List[str] = ["mean", "std", "min", "50%", "max"] - - -# ============================================================================= -# Pydantic schemas -# ============================================================================= - - -class ColumnInsight(pydantic.BaseModel): - semantic_meaning: str = pydantic.Field( - description="Brief description of what the data represents" - ) - role: str = pydantic.Field( - description="One of [ID, Feature, Target, Timestamp]" - ) - data_quality_notes: str = pydantic.Field( - description="Any concerns based on the stats (e.g. high nulls, outliers)" - ) - hypotheses: typing.List[str] = pydantic.Field( - description="List of testable hypotheses regarding the column's relationship " - "to business outcomes." - ) - - -class DatasetInsights(pydantic.BaseModel): - columns: typing.Dict[str, ColumnInsight] - - -# ============================================================================= -# Data loading -# ============================================================================= - - -def load_csv(csv_path: str) -> pd.DataFrame: - """ - Load a CSV into a DataFrame with clear error handling. - - Parameters - ---------- - csv_path : str - Path to the CSV file. 
- - Returns - ------- - pd.DataFrame - """ - try: - df = hpanio.read_csv_to_df(csv_path) - except FileNotFoundError: - _LOG.error("CSV not found at '%s'.", csv_path) - raise - if df.empty: - raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") - _LOG.info("Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns)) - return df - - -# keep legacy name for backwards compatibility -load_employee_data = load_csv - - -# ============================================================================= -# Datetime inference -# ============================================================================= - - -def infer_and_convert_datetime_columns( - df: pd.DataFrame, - sample_size: int = 100, - threshold: float = 0.8, -) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: - """ - Detect and convert date/datetime columns in a DataFrame. - - Uses sampling for performance. Returns the updated DataFrame and a - metadata dict with inference details per column. - - Parameters - ---------- - df : pd.DataFrame - sample_size : int - Number of rows to sample when testing format compliance. - threshold : float - Minimum fraction of parsed values required to accept a column as temporal. - - Returns - ------- - (pd.DataFrame, dict) - Updated DataFrame with converted columns + metadata per column. 
- """ - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] - - metadata: typing.Dict[str, typing.Any] = {} - df_out = df.copy() - - for col in df.columns: - if not ( - pd.api.types.is_object_dtype(df[col]) - or pd.api.types.is_string_dtype(df[col]) - ): - continue - - series = df[col].dropna().astype(str) - if series.empty: - continue - - sample = series.head(sample_size) - best_format: typing.Optional[str] = None - best_score = 0.0 - - for fmt in COMMON_FORMATS: - success = sum( - 1 - for val in sample - if _try_strptime(val, fmt) - ) - score = success / len(sample) - if score > best_score: - best_score = score - best_format = fmt - - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None - - confidence = float(parsed.notna().mean()) - if confidence < threshold: - continue - - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" - df_out[col] = parsed - - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": confidence, - } - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) - - return df_out, metadata - - -def _try_strptime(val: str, fmt: str) -> bool: - """Return True if val parses under fmt, False otherwise.""" - try: - datetime.datetime.strptime(val, fmt) - return True - except Exception: # pylint: disable=broad-exception-caught - return False - - -# ============================================================================= -# Stats computation -# ============================================================================= - - -def compute_llm_agent_stats( - 
tag_to_df: typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, - metrics: typing.Optional[typing.List[str]] = None, -) -> typing.Dict[str, typing.Any]: - """ - Compute a statistical profile including temporal boundaries, data quality, - categorical distributions, and numeric summaries for LLM injection. - - Parameters - ---------- - tag_to_df : dict - Mapping of dataset tag → DataFrame. Supports multiple datasets. - categorical_cols_map : dict, optional - Mapping of tag → list of categorical column names to profile. - metrics : list of str, optional - Subset of numeric metrics to include. Must be from VALID_METRICS. - Defaults to DEFAULT_METRICS. - - Returns - ------- - dict - Keys: temporal_boundaries, quality_reports, categorical_distributions, - numeric_summary. - """ - metrics = _resolve_metrics(metrics) - dataframe_stats: typing.Dict[str, typing.Any] = {} - - # 1. Temporal boundaries - try: - duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) - dataframe_stats["temporal_boundaries"] = duration_stats - print("\n=== Temporal Boundaries ===\n", duration_stats.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.warning("Skipping duration stats: %s", e) - dataframe_stats["temporal_boundaries"] = None - - # 2. 
Data quality - dataframe_stats["quality_reports"] = {} - for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") - if numeric_df.empty: - _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) - continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) - try: - quality = hpanstat.report_zero_nan_inf_stats( - df_stamped, - zero_threshold=1e-9, - verbose=True, - as_txt=True, - ) - dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.warning("Quality report failed for '%s': %s", tag, e) - - # 3. Categorical distributions - dataframe_stats["categorical_distributions"] = {} - if categorical_cols_map: - for tag, cols in categorical_cols_map.items(): - if tag not in tag_to_df: - _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) - continue - dataframe_stats["categorical_distributions"][tag] = {} - for col in cols: - if col not in tag_to_df[tag].columns: - _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) - continue - dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) - dataframe_stats["categorical_distributions"][tag][col] = dist - print(f"\n=== Distribution: {tag} / {col} ===\n", dist.to_string()) - - # 4. 
Numeric summary (customisable metric subset) - dataframe_stats["numeric_summary"] = {} - for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") - if numeric_df.empty: - continue - full_summary = numeric_df.describe().T - available = [m for m in metrics if m in full_summary.columns] - if not available: - _LOG.warning("None of the requested metrics %s are available.", metrics) - summary = full_summary[available].copy() - if "50%" in summary.columns: - summary = summary.rename(columns={"50%": "median"}) - dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) - - return dataframe_stats - - -def _resolve_metrics(metrics: typing.Optional[typing.List[str]]) -> typing.List[str]: - """ - Validate and return the metric list, falling back to DEFAULT_METRICS. - """ - if metrics is None: - return DEFAULT_METRICS - invalid = [m for m in metrics if m not in VALID_METRICS] - if invalid: - _LOG.warning( - "Unknown metrics %s will be ignored. Valid options: %s", - invalid, - VALID_METRICS, - ) - resolved = [m for m in metrics if m in VALID_METRICS] - return resolved if resolved else DEFAULT_METRICS - - -# ============================================================================= -# LLM scope filtering -# ============================================================================= - - -def _select_columns_for_llm( - df: pd.DataFrame, - scope: str, - null_threshold: float = 0.05, -) -> typing.List[str]: - """ - Return the list of column names that should be sent to the LLM. - - Parameters - ---------- - df : pd.DataFrame - scope : str - "all" — every column - "semantic" — non-numeric columns only (object / category / string) - "nulls" — columns with null fraction above null_threshold - null_threshold : float - Fraction of nulls required for "nulls" scope. Default 5 %. 
- - Returns - ------- - list of str - """ - if scope == "all": - return list(df.columns) - - if scope == "semantic": - cols = df.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - _LOG.info("LLM scope='semantic': %d columns selected.", len(cols)) - return cols - - if scope == "nulls": - cols = [ - col - for col in df.columns - if df[col].isnull().mean() > null_threshold - ] - _LOG.info( - "LLM scope='nulls' (threshold=%.0f%%): %d columns selected.", - null_threshold * 100, - len(cols), - ) - return cols - - _LOG.warning("Unknown LLM scope '%s'; falling back to 'all'.", scope) - return list(df.columns) - - -# ============================================================================= -# Prompt building -# ============================================================================= - - -def build_llm_prompt( - stats: typing.Dict[str, typing.Any], - columns_to_include: typing.Optional[typing.List[str]] = None, -) -> str: - """ - Serialize statistical data into a structured string prompt for LLM consumption. - - Parameters - ---------- - stats : dict - Output of compute_llm_agent_stats(). - columns_to_include : list of str, optional - Subset of column names to include in the prompt. None = all. 
- - Returns - ------- - str - """ - prompt_segments = [ - "You are a Senior Data Scientist and Domain Expert.", - "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses.", - "Example: 'Higher discount rates correlate with higher volume but lower margins.'", - "\n--- DATASET STATISTICS ---", - ] - - if "datetime_columns" in stats and stats["datetime_columns"]: - prompt_segments.append( - f"\nDetected Datetime Columns:\n" - f"{json.dumps(stats['datetime_columns'], indent=2)}" - ) - - if "numeric_summary" in stats: - for tag, summary_df in stats["numeric_summary"].items(): - if columns_to_include is not None: - summary_df = summary_df[ - summary_df.index.isin(columns_to_include) - ] - prompt_segments.append( - f"\nDataset [{tag}] Numeric Summary:\n{summary_df.to_string()}" - ) - - if "categorical_distributions" in stats: - for tag, cols in stats["categorical_distributions"].items(): - for col_name, dist in cols.items(): - if columns_to_include is not None and col_name not in columns_to_include: - continue - prompt_segments.append( - f"\nDistribution for [{col_name}]:\n{dist.to_string()}" - ) - - return "\n".join(prompt_segments) - - -# ============================================================================= -# LLM calls -# ============================================================================= - - -def generate_hypotheses_via_cli( - stats: typing.Dict[str, typing.Any], - model: str = "gpt-4o", - columns_to_include: typing.Optional[typing.List[str]] = None, -) -> typing.Dict[str, typing.Any]: - """ - Generate insights and hypotheses using internal hllmcli logic. - - Parses and Pydantic-validates the LLM response against DatasetInsights. - - Parameters - ---------- - stats : dict - model : str - columns_to_include : list of str, optional - If provided, only these columns are sent to the LLM (cost control). 
- - Returns - ------- - dict — DatasetInsights-shaped dict, or {"error": ...} on failure. - """ - _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) - - schema_json = DatasetInsights.model_json_schema() - user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) - system_prompt = ( - "You are a Senior Data Scientist. Analyze the following data statistics.\n" - "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema:\n" - f"{json.dumps(schema_json)}" - ) - - try: - response_text, cost = hllmcli.apply_llm( - input_str=user_prompt, - system_prompt=system_prompt, - model=model, - use_llm_executable=False, - ) - _LOG.info("LLM call successful. Estimated cost: $%.6f", cost) - - cleaned = ( - response_text.strip() - .removeprefix("```json") - .removesuffix("```") - .strip() - ) - raw = json.loads(cleaned) - - # Pydantic validation — raises ValidationError on schema mismatch. - validated = DatasetInsights.model_validate(raw) - return validated.model_dump() - - except pydantic.ValidationError as e: - _LOG.error("LLM output failed Pydantic validation: %s", e) - return {"error": str(e)} - except json.JSONDecodeError as e: - _LOG.error("LLM returned invalid JSON: %s", e) - return {"error": f"JSON parse error: {e}"} - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("hllmcli call failed: %s", e) - return {"error": str(e)} - - -def get_llm_semantic_insights_langchain( - prompt_text: str, - model: str = "gpt-4o", -) -> typing.Dict[str, typing.Any]: - """ - Process dataset metadata via LangChain to extract structured semantic insights. - - Uses JsonOutputParser alongside the Pydantic schema. Validates output. - - Parameters - ---------- - prompt_text : str - Serialized stats from build_llm_prompt(). 
- model : str - - Returns - ------- - dict - """ - _LOG.info("Querying LLM via LangChain (%s)...", model) - llm = lco.ChatOpenAI(model=model, temperature=0) - parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) - - prompt = lcpr.ChatPromptTemplate.from_messages( - [ - ( - "system", - "You are a Senior Data Scientist. Answer in JSON format.\n" - "{format_instructions}", - ), - ("user", "{metadata_stats}"), - ] - ).partial(format_instructions=parser.get_format_instructions()) - - chain = prompt | llm | parser - try: - result = chain.invoke({"metadata_stats": prompt_text}) - # Validate against Pydantic schema. - validated = DatasetInsights.model_validate(result) - return validated.model_dump() - except pydantic.ValidationError as e: - _LOG.error("LangChain output failed Pydantic validation: %s", e) - return {"error": str(e)} - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("LangChain invocation failed: %s", e) - return {"error": str(e)} - - -# ============================================================================= -# Column profiles -# ============================================================================= - - -def build_column_profiles( - df: pd.DataFrame, - stats: typing.Dict[str, typing.Any], - insights: typing.Dict[str, typing.Any], -) -> typing.List[typing.Dict[str, typing.Any]]: - """ - Convert stat-centric structure into per-column profiles. - - Merges numeric stats, categorical distributions, datetime metadata, - and LLM semantic insights keyed on column name. - - Parameters - ---------- - df : pd.DataFrame - stats : dict - insights : dict — output of generate_hypotheses_via_cli() - - Returns - ------- - list of dict, one entry per column. 
- """ - profiles: typing.List[typing.Dict[str, typing.Any]] = [] - - numeric_summary = stats.get("numeric_summary", {}) - categorical_stats = stats.get("categorical_distributions", {}) - datetime_meta = stats.get("datetime_columns", {}) - - for col in df.columns: - profile: typing.Dict[str, typing.Any] = { - "column": col, - "dtype": str(df[col].dtype), - "null_pct": float(df[col].isnull().mean()), - "unique_count": int(df[col].nunique()), - "sample_values": df[col].dropna().head(3).tolist(), - } - - # Numeric stats - for _, summary_df in numeric_summary.items(): - if col in summary_df.index: - col_stats = summary_df.loc[col] - for metric in col_stats.index: - profile[metric] = col_stats[metric] - - # Categorical top values - for _, cols in categorical_stats.items(): - if col in cols: - dist = cols[col] - try: - profile["top_values"] = ( - dist.head(3).to_dict() - if hasattr(dist, "head") - else dict(list(dist.items())[:3]) - ) - except Exception: # pylint: disable=broad-exception-caught - pass - - # Datetime metadata - if col in datetime_meta: - profile["temporal"] = datetime_meta[col] - - # LLM insights - if "columns" in insights and col in insights["columns"]: - insight = insights["columns"][col] - if hasattr(insight, "dict"): - insight = insight.dict() - profile.update( - { - "semantic_meaning": insight.get("semantic_meaning"), - "role": insight.get("role"), - "data_quality_notes": insight.get("data_quality_notes"), - "hypotheses": insight.get("hypotheses", []), - } - ) - - profiles.append(profile) - - return profiles - - -# ============================================================================= -# Export helpers -# ============================================================================= - - -def merge_and_export_results( - stats: typing.Dict[str, typing.Any], - insights: typing.Dict[str, typing.Any], - column_profiles: typing.List[typing.Dict[str, typing.Any]], - output_path: str = "data_profile_report.json", -) -> None: - """ - Merge stats + 
insights + column_profiles and export to JSON. - - Parameters - ---------- - stats : dict - insights : dict - column_profiles : list of dict - output_path : str - """ - _LOG.info("Merging results...") - serializable_stats = _make_serializable(stats) - - final_report = { - "report_metadata": { - "version": "1.2", - "agent": "Data-Profiler-Agent", - "generated_at": datetime.datetime.utcnow().isoformat() + "Z", - }, - "column_profiles": column_profiles, - "technical_stats": serializable_stats, - "semantic_insights": insights, - } - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(final_report, f, indent=4, default=str) - - _LOG.info("Exported JSON report to '%s'.", output_path) - - -def _make_serializable(obj: typing.Any) -> typing.Any: - """ - Recursively convert DataFrames and nested dicts to JSON-safe structures. - """ - if isinstance(obj, pd.DataFrame): - return obj.to_dict(orient="index") - if isinstance(obj, dict): - return {k: _make_serializable(v) for k, v in obj.items()} - if isinstance(obj, list): - return [_make_serializable(v) for v in obj] - return obj - - -def export_markdown_from_profiles( - column_profiles: typing.List[typing.Dict[str, typing.Any]], - numeric_stats: typing.Optional[typing.Dict[str, pd.DataFrame]] = None, - output_path: str = "data_profile_summary.md", -) -> None: - """ - Generate a readable Markdown report from column profiles and numeric stats. 
- - Parameters - ---------- - column_profiles : list of dict - numeric_stats : dict of str → DataFrame, optional - output_path : str - """ - - def _clean(val: typing.Any) -> str: - if val is None: - return "" - return str(val).replace("|", "\\|").replace("\n", " ") - - def _fmt(val: typing.Any) -> str: - if isinstance(val, int): - return str(val) - if isinstance(val, float): - return f"{val:,.2f}" if abs(val) >= 1 else f"{val:.4f}" - return str(val) - - lines = ["# Data Profile Summary\n"] - - # Column profiles table - lines.append("## Column Profiles\n") - lines.append("| Column | Meaning | Role | Quality | Hypotheses |") - lines.append("|--------|---------|------|---------|------------|") - - for p in column_profiles: - hypotheses = p.get("hypotheses", []) - if isinstance(hypotheses, list) and hypotheses: - hyp_str = "
".join( - f"{i+1}. {_clean(h)}" for i, h in enumerate(hypotheses[:3]) - ) - else: - hyp_str = _clean(hypotheses) or "N/A" - - row = [ - _clean(p.get("column")), - _clean(p.get("semantic_meaning")), - _clean(p.get("role")), - _clean(p.get("data_quality_notes")), - hyp_str, - ] - lines.append("| " + " | ".join(row) + " |") - - # Numeric stats table - if numeric_stats: - lines.append("\n## Numeric Column Statistics\n") - for tag, df in numeric_stats.items(): - lines.append(f"### {tag}\n") - lines.append("| Column | Metric | Value |") - lines.append("|--------|--------|-------|") - for col_name in df.index: - for metric in df.columns: - val = df.loc[col_name, metric] - lines.append(f"| {col_name} | {metric} | {_fmt(val)} |") - lines.append("") - - with open(output_path, "w", encoding="utf-8") as f: - f.write("\n".join(lines) + "\n") - - _LOG.info("Exported Markdown report to '%s'.", output_path) - - -# ============================================================================= -# Pipeline -# ============================================================================= - - -def run_pipeline( - csv_paths: typing.List[str], - tags: typing.Optional[typing.List[str]] = None, - model: str = "gpt-4o", - metrics: typing.Optional[typing.List[str]] = None, - llm_scope: str = "all", - output_json: str = "data_profile_report.json", - output_md: str = "data_profile_summary.md", - use_langchain: bool = False, -) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: - """ - Execute the full data profiling pipeline over one or more CSV files. - - Parameters - ---------- - csv_paths : list of str - One or more CSV file paths to profile. - tags : list of str, optional - Human-readable tag for each CSV. Defaults to filename stems. - model : str - LLM model name passed to OpenAI / hllmcli. - metrics : list of str, optional - Numeric metrics to include. Defaults to DEFAULT_METRICS. 
- llm_scope : str - "all", "semantic", or "nulls" — controls which columns are LLM-profiled. - output_json : str - Path for the merged JSON report. - output_md : str - Path for the Markdown summary. - use_langchain : bool - Use LangChain chain instead of hllmcli for LLM calls. - - Returns - ------- - (dict of tag → df, stats dict) - """ - if tags is None: - tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - - if len(tags) != len(csv_paths): - raise ValueError( - f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." - ) - - # --- Load & type-coerce --- - tag_to_df: typing.Dict[str, pd.DataFrame] = {} - for path, tag in zip(csv_paths, tags): - df = load_csv(path) - df = hpanconv.convert_df(df) - df, datetime_meta_partial = infer_and_convert_datetime_columns(df) - tag_to_df[tag] = df - - # Merge datetime metadata across all DataFrames (using the last loaded tag - # as the primary df for single-dataset runs; full merge for multi). - _, datetime_meta = infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) - - # --- Categorical column map --- - cat_cols_map: typing.Dict[str, typing.List[str]] = { - tag: df.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - for tag, df in tag_to_df.items() - } - - # --- Compute stats --- - stats = compute_llm_agent_stats( - tag_to_df, - categorical_cols_map=cat_cols_map, - metrics=metrics, - ) - stats["datetime_columns"] = datetime_meta - - # --- LLM scope --- - # Use the concatenated DataFrame to decide which columns to send. 
- combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - columns_for_llm = _select_columns_for_llm(combined_df, scope=llm_scope) - _LOG.info( - "LLM will profile %d / %d columns (scope=%s).", - len(columns_for_llm), - len(combined_df.columns), - llm_scope, - ) - - # --- LLM call --- - if use_langchain: - prompt_text = build_llm_prompt(stats, columns_to_include=columns_for_llm) - semantic_insights = get_llm_semantic_insights_langchain( - prompt_text, model=model - ) - else: - semantic_insights = generate_hypotheses_via_cli( - stats, - model=model, - columns_to_include=columns_for_llm, - ) - - # --- Build column profiles (use first / primary df for column ordering) --- - primary_df = list(tag_to_df.values())[0] - column_profiles = build_column_profiles( - df=primary_df, - stats=stats, - insights=semantic_insights, - ) - - # --- Export --- - merge_and_export_results( - stats=stats, - insights=semantic_insights, - column_profiles=column_profiles, - output_path=output_json, - ) - export_markdown_from_profiles( - column_profiles, - numeric_stats=stats.get("numeric_summary", {}), - output_path=output_md, - ) - - return tag_to_df, stats - - -# ============================================================================= -# CLI -# ============================================================================= - - -def _build_arg_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prog="schema_agent_utils", - description="Data Profiler Agent — statistical + LLM column profiling", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # --- Inputs --- - parser.add_argument( - "csv_paths", - nargs="+", - metavar="CSV", - help="One or more CSV file paths to profile.", - ) - parser.add_argument( - "--tags", - nargs="+", - metavar="TAG", - help="Human-readable tag for each CSV (must match number of csv_paths).", - ) - - # --- LLM options --- - parser.add_argument( - "--model", - default="gpt-4o", - help="LLM model for 
semantic analysis.", - ) - parser.add_argument( - "--llm-scope", - choices=["all", "semantic", "nulls"], - default="all", - dest="llm_scope", - help=( - "Which columns to send to the LLM. " - "'all'=every column, 'semantic'=non-numeric only, " - "'nulls'=high-null columns only (saves cost)." - ), - ) - parser.add_argument( - "--use-langchain", - action="store_true", - dest="use_langchain", - help="Use LangChain pipeline instead of hllmcli for LLM calls.", - ) - - # --- Stat options --- - parser.add_argument( - "--metrics", - nargs="+", - choices=VALID_METRICS, - default=None, - metavar="METRIC", - help=( - f"Numeric metrics to include in the summary. " - f"Valid: {', '.join(VALID_METRICS)}. " - f"Default: {', '.join(DEFAULT_METRICS)}." - ), - ) - - # --- Output options --- - parser.add_argument( - "--output-json", - default="data_profile_report.json", - dest="output_json", - metavar="PATH", - help="Output path for the merged JSON report.", - ) - parser.add_argument( - "--output-md", - default="data_profile_summary.md", - dest="output_md", - metavar="PATH", - help="Output path for the Markdown summary.", - ) - - return parser - - -def main() -> None: - """ - CLI entry point. Parses arguments and delegates to run_pipeline(). 
- """ - parser = _build_arg_parser() - args = parser.parse_args() - - run_pipeline( - csv_paths=args.csv_paths, - tags=args.tags, - model=args.model, - metrics=args.metrics, - llm_scope=args.llm_scope, - output_json=args.output_json, - output_md=args.output_md, - use_langchain=args.use_langchain, - ) - - -if __name__ == "__main__": - main() \ No newline at end of file From ad5364f1043918dc645445feb2d5e180158d1600 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 7 Apr 2026 12:14:15 -0400 Subject: [PATCH 08/14] Add schema_agent_api.ipnb, docker, dassert and update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/Dockerfile | 30 + .../schema_agent/README.md | 23 +- .../schema_agent/data_profile_report.json | 608 ++++++++++++++++++ .../schema_agent/data_profile_summary.md | 102 +++ .../schema_agent/docker_bash.sh | 34 + .../schema_agent/docker_build.sh | 40 ++ .../schema_agent/docker_build.version.log | 166 +++++ .../schema_agent/docker_clean.sh | 26 + .../schema_agent/docker_cmd.sh | 41 ++ .../schema_agent/docker_exec.sh | 25 + .../schema_agent/docker_jupyter.sh | 37 ++ .../schema_agent/docker_name.sh | 12 + .../schema_agent/docker_push.sh | 25 + .../schema_agent/requirements.txt | 4 +- .../schema_agent/run_jupyter.sh | 36 ++ .../schema_agent/schema_agent.py | 60 +- .../schema_agent/schema_agent_hllmcli.py | 91 ++- .../schema_agent/schema_agent_loader.py | 61 +- .../schema_agent/schema_agent_report.py | 16 +- .../schema_agent/schema_agent_stats.py | 21 +- .../schema_agent/scmea_agent_example.ipynb | 371 +++++++++++ .../schema_agent/utils.sh | 504 +++++++++++++++ .../schema_agent/version.sh | 28 + 23 files changed, 2225 insertions(+), 136 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/Dockerfile create mode 100644 research/agentic_data_science/schema_agent/data_profile_report.json create mode 100644 
research/agentic_data_science/schema_agent/data_profile_summary.md create mode 100755 research/agentic_data_science/schema_agent/docker_bash.sh create mode 100755 research/agentic_data_science/schema_agent/docker_build.sh create mode 100644 research/agentic_data_science/schema_agent/docker_build.version.log create mode 100755 research/agentic_data_science/schema_agent/docker_clean.sh create mode 100755 research/agentic_data_science/schema_agent/docker_cmd.sh create mode 100755 research/agentic_data_science/schema_agent/docker_exec.sh create mode 100755 research/agentic_data_science/schema_agent/docker_jupyter.sh create mode 100644 research/agentic_data_science/schema_agent/docker_name.sh create mode 100755 research/agentic_data_science/schema_agent/docker_push.sh create mode 100755 research/agentic_data_science/schema_agent/run_jupyter.sh create mode 100644 research/agentic_data_science/schema_agent/scmea_agent_example.ipynb create mode 100644 research/agentic_data_science/schema_agent/utils.sh create mode 100755 research/agentic_data_science/schema_agent/version.sh diff --git a/research/agentic_data_science/schema_agent/Dockerfile b/research/agentic_data_science/schema_agent/Dockerfile new file mode 100644 index 000000000..a7e060b4a --- /dev/null +++ b/research/agentic_data_science/schema_agent/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.12-slim + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH="/opt/venv/bin:$PATH" + +# This allows 'import helpers' to work if helpers is inside /git_root/helpers_root +ENV PYTHONPATH="/git_root/research/agentic_data_science/schema_agent:/git_root/helpers_root:${PYTHONPATH:-}" + +RUN apt-get update && apt-get install -y \ + ca-certificates build-essential curl sudo gnupg git vim \ + libgl1 libglib2.0-0 libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -Ls https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +RUN uv venv /opt/venv + +# Requirements installation +COPY requirements.txt /install/requirements.txt 
+RUN uv pip install --python /opt/venv/bin/python --no-cache -r /install/requirements.txt jupyterlab + +# Create the skeleton directory structure +WORKDIR /git_root + +# Address reviewer feedback: We assume schema_agent.py is in the context +# We will chmod it inside the container during build or via the mount script +EXPOSE 8888 + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index c8d1b5119..020f63e53 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -14,19 +14,22 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datase Go into the schema folder: ```bash -cd research/agentic_data_science/schema_agent +> cd research/agentic_data_science/schema_agent ``` Install the requirements: ```bash -pip install -r requirements.txt +> pip install -r requirements.txt ``` Set the `OPENAI_API_KEY` in your environment: ```bash -export OPENAI_API_KEY=sk-... +> export OPENAI_API_KEY=sk-... 
+``` +Make the script executable +```bash +> chmod +x schema_agent.py ``` - ## Module Structure The agent is split into six focused modules: @@ -45,7 +48,7 @@ The agent is split into six focused modules: ### Basic ```bash -python schema_agent.py data.csv +> ./schema_agent.py data.csv ``` Outputs: @@ -56,16 +59,16 @@ Outputs: ```bash # Multiple files with tags -python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +> ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 # Cost-optimized: only high-null columns -python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini +> ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini # Custom metrics and output -python schema_agent.py data.csv --metrics mean std max --output-json my_report.json +> ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json # LangChain backend -python schema_agent.py data.csv --use-langchain +> ./schema_agent.py data.csv --use-langchain ``` ## Command-Line Arguments @@ -123,7 +126,7 @@ Formatted table summary: Column | Meaning | Role | Quality | Hypotheses **API Key Error:** ```bash -export OPENAI_API_KEY=sk-... +> export OPENAI_API_KEY=sk-... 
``` **Validation Errors:** diff --git a/research/agentic_data_science/schema_agent/data_profile_report.json b/research/agentic_data_science/schema_agent/data_profile_report.json new file mode 100644 index 000000000..df7adbef6 --- /dev/null +++ b/research/agentic_data_science/schema_agent/data_profile_report.json @@ -0,0 +1,608 @@ +{ + "report_metadata": { + "version": "1.2", + "agent": "Data-Profiler-Agent", + "generated_at": "2026-04-07T16:06:18.296448Z" + }, + "column_profiles": [ + { + "column": "order_datetime", + "dtype": "datetime64[us]", + "null_pct": 0.0, + "unique_count": 14903, + "sample_values": [ + "2009-12-01 07:45:00", + "2009-12-01 07:45:00", + "2009-12-01 09:06:00" + ] + }, + { + "column": "year", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 2, + "sample_values": [ + 2009, + 2009, + 2009 + ], + "mean": 2009.92928, + "std": 0.2563578334933183, + "min": 2009.0, + "median": 2010.0, + "max": 2010.0 + }, + { + "column": "month", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 12, + "sample_values": [ + 12, + 12, + 12 + ], + "mean": 7.37759, + "std": 3.456656661667856, + "min": 1.0, + "median": 8.0, + "max": 12.0 + }, + { + "column": "week_of_year", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 52, + "sample_values": [ + 49, + 49, + 49 + ], + "mean": 29.91514, + "std": 15.003268635903897, + "min": 1.0, + "median": 33.0, + "max": 52.0 + }, + { + "column": "day_of_week", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 7, + "sample_values": [ + 1, + 1, + 1 + ], + "mean": 2.58328, + "std": 1.9231592308007859, + "min": 0.0, + "median": 2.0, + "max": 6.0 + }, + { + "column": "order_hour", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 14, + "sample_values": [ + 7, + 7, + 9 + ], + "mean": 12.68047, + "std": 2.35158794833593, + "min": 7.0, + "median": 13.0, + "max": 20.0 + }, + { + "column": "is_weekend", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 2, + "sample_values": [ + 0, + 0, + 0 + ], + "mean": 
0.15396, + "std": 0.36091220674314933, + "min": 0.0, + "median": 0.0, + "max": 1.0 + }, + { + "column": "country", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 34, + "sample_values": [ + "United Kingdom", + "United Kingdom", + "United Kingdom" + ], + "top_values": { + "count": { + "United Kingdom": 64417, + "Ireland": 8507, + "Germany": 7654 + }, + "pct [%]": { + "United Kingdom": 64.417, + "Ireland": 8.507000000000001, + "Germany": 7.654 + } + }, + "semantic_meaning": "Represents the country where the transaction originated.", + "role": "Feature", + "data_quality_notes": "Data is well-distributed across several countries with a predominance in the United Kingdom.", + "hypotheses": [ + "Transactions from the United Kingdom have a higher total value than other countries.", + "Countries with a lower transaction count like Sweden have a higher average transaction value.", + "Country-specific marketing strategies positively impact sales volume." + ] + }, + { + "column": "country_code", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 34, + "sample_values": [ + "GBR", + "GBR", + "GBR" + ], + "top_values": { + "count": { + "GBR": 64417, + "IRL": 8507, + "DEU": 7654 + }, + "pct [%]": { + "GBR": 64.417, + "IRL": 8.507000000000001, + "DEU": 7.654 + } + }, + "semantic_meaning": "3-letter code representing the country of each transaction.", + "role": "Feature", + "data_quality_notes": "Consistent with country, providing coded labels for countries.", + "hypotheses": [ + "Country codes correlate strongly with country-specific purchasing patterns.", + "The use of certain country codes predicts higher shipping costs.", + "Country codes are better predictors for regional discounts than country names." 
+ ] + }, + { + "column": "product_id", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 3623, + "sample_values": [ + "21523", + "79323W", + "82582" + ], + "top_values": { + "count": { + "POST": 731, + "85123A": 615, + "21212": 438 + }, + "pct [%]": { + "POST": 0.731, + "85123A": 0.615, + "21212": 0.438 + } + }, + "semantic_meaning": "Unique identifier for each product sold.", + "role": "Feature", + "data_quality_notes": "Varied distribution across products indicates a potential for high product diversity.", + "hypotheses": [ + "Products with higher sale counts like 'POST' have a higher discount rate applied.", + "Products with lower counts have a higher average profit margin.", + "Rarely sold products are linked with specific promotional campaigns." + ] + }, + { + "column": "customer_id", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 4012, + "sample_values": [ + 13085, + 13085, + 13078 + ], + "mean": 14768.12664, + "std": 1799.1647503828826, + "min": 12346.0, + "median": 14646.0, + "max": 18287.0 + }, + { + "column": "unit_price_gbp", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 300, + "sample_values": [ + 5.95, + 6.75, + 2.1 + ], + "mean": 3.88915772, + "std": 59.75020429686513, + "min": 0.001, + "median": 1.95, + "max": 10953.5 + }, + { + "column": "quantity_sold", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 232, + "sample_values": [ + 10, + 12, + 12 + ], + "mean": 18.65779, + "std": 159.34650236322747, + "min": 1.0, + "median": 6.0, + "max": 19152.0 + }, + { + "column": "sales_amount_gbp", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 1541, + "sample_values": [ + 59.5, + 81.0, + 25.200000000000003 + ], + "mean": 26.948917120000004, + "std": 92.39021385230444, + "min": 0.001, + "median": 14.98, + "max": 10953.5 + }, + { + "column": "population_total", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 62276270.0, + 62276270.0, + 62276270.0 + ], + "mean": 54098116.95651, + 
"std": 26644482.35245398, + "min": 318041.0, + "median": 62766365.0, + "max": 309378227.0 + }, + { + "column": "gdp_current_usd", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 2412840006231.5, + 2412840006231.5, + 2412840006231.5 + ], + "mean": 2161192799869.4167, + "std": 1115049256125.8184, + "min": 9035824366.00804, + "median": 2485482596184.709, + "max": 15048971000000.0 + }, + { + "column": "gdp_growth_pct", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + -17.633975690892566, + -17.633975690892566, + -17.633975690892566 + ], + "mean": 0.46262588609441824, + "std": 6.134116051369821, + "min": -19.62987001225588, + "median": 3.010667502428644, + "max": 32.50404703691798 + }, + { + "column": "inflation_consumer_pct", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 1.89709031895291, + 1.89709031895291, + 1.89709031895291 + ], + "mean": 1.1042501219771699, + "std": 1.6555131180385045, + "min": -15.1829798865339, + "median": 1.58908069179591, + "max": 16.5278863640702 + } + ], + "technical_stats": { + "temporal_boundaries": null, + "quality_reports": {}, + "categorical_distributions": { + "ecommerce_data": { + "country": { + "United Kingdom": { + "count": 64417, + "pct [%]": 64.417 + }, + "Ireland": { + "count": 8507, + "pct [%]": 8.507000000000001 + }, + "Germany": { + "count": 7654, + "pct [%]": 7.654 + }, + "France": { + "count": 5470, + "pct [%]": 5.47 + }, + "Netherlands": { + "count": 2729, + "pct [%]": 2.7289999999999996 + }, + "Spain": { + "count": 1235, + "pct [%]": 1.2349999999999999 + }, + "Switzerland": { + "count": 1170, + "pct [%]": 1.17 + }, + "Belgium": { + "count": 1037, + "pct [%]": 1.0370000000000001 + }, + "Portugal": { + "count": 984, + "pct [%]": 0.984 + }, + "Sweden": { + "count": 868, + "pct [%]": 0.868 + } + }, + "country_code": { + "GBR": { + "count": 64417, + "pct [%]": 64.417 + }, + "IRL": { + "count": 8507, + "pct [%]": 
8.507000000000001 + }, + "DEU": { + "count": 7654, + "pct [%]": 7.654 + }, + "FRA": { + "count": 5470, + "pct [%]": 5.47 + }, + "NLD": { + "count": 2729, + "pct [%]": 2.7289999999999996 + }, + "ESP": { + "count": 1235, + "pct [%]": 1.2349999999999999 + }, + "CHE": { + "count": 1170, + "pct [%]": 1.17 + }, + "BEL": { + "count": 1037, + "pct [%]": 1.0370000000000001 + }, + "PRT": { + "count": 984, + "pct [%]": 0.984 + }, + "SWE": { + "count": 868, + "pct [%]": 0.868 + } + }, + "product_id": { + "POST": { + "count": 731, + "pct [%]": 0.731 + }, + "85123A": { + "count": 615, + "pct [%]": 0.615 + }, + "21212": { + "count": 438, + "pct [%]": 0.438 + }, + "22423": { + "count": 437, + "pct [%]": 0.437 + }, + "85099B": { + "count": 391, + "pct [%]": 0.391 + }, + "20725": { + "count": 334, + "pct [%]": 0.334 + }, + "84991": { + "count": 298, + "pct [%]": 0.298 + }, + "20914": { + "count": 295, + "pct [%]": 0.295 + }, + "21232": { + "count": 295, + "pct [%]": 0.295 + }, + "84879": { + "count": 285, + "pct [%]": 0.28500000000000003 + } + } + } + }, + "numeric_summary": { + "ecommerce_data": { + "year": { + "mean": 2009.92928, + "std": 0.2563578334933183, + "min": 2009.0, + "median": 2010.0, + "max": 2010.0 + }, + "month": { + "mean": 7.37759, + "std": 3.456656661667856, + "min": 1.0, + "median": 8.0, + "max": 12.0 + }, + "week_of_year": { + "mean": 29.91514, + "std": 15.003268635903897, + "min": 1.0, + "median": 33.0, + "max": 52.0 + }, + "day_of_week": { + "mean": 2.58328, + "std": 1.9231592308007859, + "min": 0.0, + "median": 2.0, + "max": 6.0 + }, + "order_hour": { + "mean": 12.68047, + "std": 2.35158794833593, + "min": 7.0, + "median": 13.0, + "max": 20.0 + }, + "is_weekend": { + "mean": 0.15396, + "std": 0.36091220674314933, + "min": 0.0, + "median": 0.0, + "max": 1.0 + }, + "customer_id": { + "mean": 14768.12664, + "std": 1799.1647503828826, + "min": 12346.0, + "median": 14646.0, + "max": 18287.0 + }, + "unit_price_gbp": { + "mean": 3.88915772, + "std": 
59.75020429686513, + "min": 0.001, + "median": 1.95, + "max": 10953.5 + }, + "quantity_sold": { + "mean": 18.65779, + "std": 159.34650236322747, + "min": 1.0, + "median": 6.0, + "max": 19152.0 + }, + "sales_amount_gbp": { + "mean": 26.948917120000004, + "std": 92.39021385230444, + "min": 0.001, + "median": 14.98, + "max": 10953.5 + }, + "population_total": { + "mean": 54098116.95651, + "std": 26644482.35245398, + "min": 318041.0, + "median": 62766365.0, + "max": 309378227.0 + }, + "gdp_current_usd": { + "mean": 2161192799869.4167, + "std": 1115049256125.8184, + "min": 9035824366.00804, + "median": 2485482596184.709, + "max": 15048971000000.0 + }, + "gdp_growth_pct": { + "mean": 0.46262588609441824, + "std": 6.134116051369821, + "min": -19.62987001225588, + "median": 3.010667502428644, + "max": 32.50404703691798 + }, + "inflation_consumer_pct": { + "mean": 1.1042501219771699, + "std": 1.6555131180385045, + "min": -15.1829798865339, + "median": 1.58908069179591, + "max": 16.5278863640702 + } + } + }, + "datetime_columns": {} + }, + "semantic_insights": { + "columns": { + "country": { + "semantic_meaning": "Represents the country where the transaction originated.", + "role": "Feature", + "data_quality_notes": "Data is well-distributed across several countries with a predominance in the United Kingdom.", + "hypotheses": [ + "Transactions from the United Kingdom have a higher total value than other countries.", + "Countries with a lower transaction count like Sweden have a higher average transaction value.", + "Country-specific marketing strategies positively impact sales volume." 
+ ] + }, + "country_code": { + "semantic_meaning": "3-letter code representing the country of each transaction.", + "role": "Feature", + "data_quality_notes": "Consistent with country, providing coded labels for countries.", + "hypotheses": [ + "Country codes correlate strongly with country-specific purchasing patterns.", + "The use of certain country codes predicts higher shipping costs.", + "Country codes are better predictors for regional discounts than country names." + ] + }, + "product_id": { + "semantic_meaning": "Unique identifier for each product sold.", + "role": "Feature", + "data_quality_notes": "Varied distribution across products indicates a potential for high product diversity.", + "hypotheses": [ + "Products with higher sale counts like 'POST' have a higher discount rate applied.", + "Products with lower counts have a higher average profit margin.", + "Rarely sold products are linked with specific promotional campaigns." + ] + } + } + } +} \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/data_profile_summary.md b/research/agentic_data_science/schema_agent/data_profile_summary.md new file mode 100644 index 000000000..5ba7c62fe --- /dev/null +++ b/research/agentic_data_science/schema_agent/data_profile_summary.md @@ -0,0 +1,102 @@ +# Data Profile Summary + +## Column Profiles + +| Column | Meaning | Role | Quality | Hypotheses | +|--------|---------|------|---------|------------| +| order_datetime | | | | [] | +| year | | | | [] | +| month | | | | [] | +| week_of_year | | | | [] | +| day_of_week | | | | [] | +| order_hour | | | | [] | +| is_weekend | | | | [] | +| country | Represents the country where the transaction originated. | Feature | Data is well-distributed across several countries with a predominance in the United Kingdom. | 1. Transactions from the United Kingdom have a higher total value than other countries.
2. Countries with a lower transaction count like Sweden have a higher average transaction value.
3. Country-specific marketing strategies positively impact sales volume. | +| country_code | 3-letter code representing the country of each transaction. | Feature | Consistent with country, providing coded labels for countries. | 1. Country codes correlate strongly with country-specific purchasing patterns.
2. The use of certain country codes predicts higher shipping costs.
3. Country codes are better predictors for regional discounts than country names. | +| product_id | Unique identifier for each product sold. | Feature | Varied distribution across products indicates a potential for high product diversity. | 1. Products with higher sale counts like 'POST' have a higher discount rate applied.
2. Products with lower counts have a higher average profit margin.
3. Rarely sold products are linked with specific promotional campaigns. | +| customer_id | | | | [] | +| unit_price_gbp | | | | [] | +| quantity_sold | | | | [] | +| sales_amount_gbp | | | | [] | +| population_total | | | | [] | +| gdp_current_usd | | | | [] | +| gdp_growth_pct | | | | [] | +| inflation_consumer_pct | | | | [] | + +## Numeric Column Statistics + +### ecommerce_data + +| Column | Metric | Value | +|--------|--------|-------| +| year | mean | 2,009.93 | +| year | std | 0.2564 | +| year | min | 2,009.00 | +| year | median | 2,010.00 | +| year | max | 2,010.00 | +| month | mean | 7.38 | +| month | std | 3.46 | +| month | min | 1.00 | +| month | median | 8.00 | +| month | max | 12.00 | +| week_of_year | mean | 29.92 | +| week_of_year | std | 15.00 | +| week_of_year | min | 1.00 | +| week_of_year | median | 33.00 | +| week_of_year | max | 52.00 | +| day_of_week | mean | 2.58 | +| day_of_week | std | 1.92 | +| day_of_week | min | 0.0000 | +| day_of_week | median | 2.00 | +| day_of_week | max | 6.00 | +| order_hour | mean | 12.68 | +| order_hour | std | 2.35 | +| order_hour | min | 7.00 | +| order_hour | median | 13.00 | +| order_hour | max | 20.00 | +| is_weekend | mean | 0.1540 | +| is_weekend | std | 0.3609 | +| is_weekend | min | 0.0000 | +| is_weekend | median | 0.0000 | +| is_weekend | max | 1.00 | +| customer_id | mean | 14,768.13 | +| customer_id | std | 1,799.16 | +| customer_id | min | 12,346.00 | +| customer_id | median | 14,646.00 | +| customer_id | max | 18,287.00 | +| unit_price_gbp | mean | 3.89 | +| unit_price_gbp | std | 59.75 | +| unit_price_gbp | min | 0.0010 | +| unit_price_gbp | median | 1.95 | +| unit_price_gbp | max | 10,953.50 | +| quantity_sold | mean | 18.66 | +| quantity_sold | std | 159.35 | +| quantity_sold | min | 1.00 | +| quantity_sold | median | 6.00 | +| quantity_sold | max | 19,152.00 | +| sales_amount_gbp | mean | 26.95 | +| sales_amount_gbp | std | 92.39 | +| sales_amount_gbp | min | 0.0010 | +| sales_amount_gbp | 
median | 14.98 | +| sales_amount_gbp | max | 10,953.50 | +| population_total | mean | 54,098,116.96 | +| population_total | std | 26,644,482.35 | +| population_total | min | 318,041.00 | +| population_total | median | 62,766,365.00 | +| population_total | max | 309,378,227.00 | +| gdp_current_usd | mean | 2,161,192,799,869.42 | +| gdp_current_usd | std | 1,115,049,256,125.82 | +| gdp_current_usd | min | 9,035,824,366.01 | +| gdp_current_usd | median | 2,485,482,596,184.71 | +| gdp_current_usd | max | 15,048,971,000,000.00 | +| gdp_growth_pct | mean | 0.4626 | +| gdp_growth_pct | std | 6.13 | +| gdp_growth_pct | min | -19.63 | +| gdp_growth_pct | median | 3.01 | +| gdp_growth_pct | max | 32.50 | +| inflation_consumer_pct | mean | 1.10 | +| inflation_consumer_pct | std | 1.66 | +| inflation_consumer_pct | min | -15.18 | +| inflation_consumer_pct | median | 1.59 | +| inflation_consumer_pct | max | 16.53 | + diff --git a/research/agentic_data_science/schema_agent/docker_bash.sh b/research/agentic_data_science/schema_agent/docker_bash.sh new file mode 100755 index 000000000..0025e81f4 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_bash.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# """ +# This script launches a Docker container with an interactive bash shell for +# development. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions from the project template. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List the available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" + +# Configure and run the Docker container with interactive bash shell. 
+# - Container is removed automatically on exit (--rm) +# - Interactive mode with TTY allocation (-ti) +# - Port forwarding for Jupyter or other services +# - Git root mounted to /git_root inside container +CONTAINER_NAME=${IMAGE_NAME}_bash +PORT= +DOCKER_CMD=$(get_docker_bash_command) +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" diff --git a/research/agentic_data_science/schema_agent/docker_build.sh b/research/agentic_data_science/schema_agent/docker_build.sh new file mode 100755 index 000000000..5b0957a99 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_build.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# """ +# Build a Docker container image for the project. +# +# This script sets up the build environment with error handling and command +# tracing, loads Docker configuration from docker_name.sh, and builds the +# Docker image using the build_container_image utility function. It supports +# both single-architecture and multi-architecture builds via the +# DOCKER_BUILD_MULTI_ARCH environment variable. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args are passed to the build. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure Docker build settings. +# Enable BuildKit for improved build performance and features. +export DOCKER_BUILDKIT=1 +#export DOCKER_BUILDKIT=0 + +# Configure single-architecture build (set to 1 for multi-arch build). 
+#export DOCKER_BUILD_MULTI_ARCH=1 +export DOCKER_BUILD_MULTI_ARCH=0 + +# Build the container image. +# Pass extra arguments (e.g., --no-cache) via command line after -v. +build_container_image "$@" diff --git a/research/agentic_data_science/schema_agent/docker_build.version.log b/research/agentic_data_science/schema_agent/docker_build.version.log new file mode 100644 index 000000000..d60536643 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_build.version.log @@ -0,0 +1,166 @@ +# Python3 +Python 3.12.13 +# pip3 +pip 26.0.1 from /opt/venv/lib/python3.12/site-packages/pip (python 3.12) +# jupyter +Selected Jupyter core packages... +IPython : 9.12.0 +ipykernel : 7.2.0 +ipywidgets : not installed +jupyter_client : 8.8.0 +jupyter_core : 5.9.1 +jupyter_server : 2.17.0 +jupyterlab : 4.5.6 +nbclient : 0.10.4 +nbconvert : 7.17.0 +nbformat : 5.10.4 +notebook : not installed +qtconsole : not installed +traitlets : 5.14.3 +# Python packages +Package Version +------------------------- ------------ +aiohappyeyeballs 2.6.1 +aiohttp 3.13.5 +aiosignal 1.4.0 +annotated-types 0.7.0 +anthropic 0.89.0 +anyio 4.13.0 +argon2-cffi 25.1.0 +argon2-cffi-bindings 25.1.0 +arrow 1.4.0 +asttokens 3.0.1 +async-lru 2.3.0 +attrs 26.1.0 +babel 2.18.0 +beautifulsoup4 4.14.3 +bleach 6.3.0 +certifi 2026.2.25 +cffi 2.0.0 +charset-normalizer 3.4.7 +click 8.3.2 +click-default-group 1.2.4 +comm 0.2.3 +condense-json 0.1.3 +debugpy 1.8.20 +decorator 5.2.1 +defusedxml 0.7.1 +distro 1.9.0 +docstring_parser 0.17.0 +dotenv 0.9.9 +executing 2.2.1 +fastjsonschema 2.21.2 +fqdn 1.5.1 +frozenlist 1.8.0 +h11 0.16.0 +httpcore 1.0.9 +httpx 0.28.1 +idna 3.11 +ipykernel 7.2.0 +ipython 9.12.0 +ipython_pygments_lexers 1.1.1 +isoduration 20.11.0 +jedi 0.19.2 +Jinja2 3.1.6 +jiter 0.13.0 +json5 0.14.0 +jsonpatch 1.33 +jsonpointer 3.1.1 +jsonschema 4.26.0 +jsonschema-specifications 2025.9.1 +jupyter_client 8.8.0 +jupyter_core 5.9.1 +jupyter-events 0.12.0 +jupyter-lsp 2.3.1 +jupyter_server 2.17.0 
+jupyter_server_terminals 0.5.4 +jupyterlab 4.5.6 +jupyterlab_pygments 0.3.0 +jupyterlab_server 2.28.0 +langchain-core 1.2.27 +langchain-openai 1.1.12 +langgraph 1.1.6 +langgraph-checkpoint 4.0.1 +langgraph-prebuilt 1.0.9 +langgraph-sdk 0.3.12 +langsmith 0.7.26 +lark 1.3.1 +llm 0.30 +MarkupSafe 3.0.3 +matplotlib-inline 0.2.1 +mistune 3.2.0 +multidict 6.7.1 +nbclient 0.10.4 +nbconvert 7.17.0 +nbformat 5.10.4 +nest-asyncio 1.6.0 +notebook_shim 0.2.4 +numpy 2.4.4 +openai 2.30.0 +orjson 3.11.8 +ormsgpack 1.12.2 +packaging 26.0 +pandas 3.0.2 +pandocfilters 1.5.1 +parso 0.8.6 +pexpect 4.9.0 +pip 26.0.1 +platformdirs 4.9.4 +pluggy 1.6.0 +prometheus_client 0.24.1 +prompt_toolkit 3.0.52 +propcache 0.4.1 +psutil 7.2.2 +ptyprocess 0.7.0 +pure_eval 0.2.3 +puremagic 2.1.1 +pycparser 3.0 +pydantic 2.12.5 +pydantic_core 2.41.5 +Pygments 2.20.0 +python-dateutil 2.9.0.post0 +python-dotenv 1.2.2 +python-json-logger 4.1.0 +python-ulid 3.1.0 +pytz 2026.1.post1 +PyYAML 6.0.3 +pyzmq 27.1.0 +referencing 0.37.0 +regex 2026.4.4 +requests 2.33.1 +requests-toolbelt 1.0.0 +rfc3339-validator 0.1.4 +rfc3986-validator 0.1.1 +rfc3987-syntax 1.1.0 +rpds-py 0.30.0 +Send2Trash 2.1.0 +setuptools 82.0.1 +six 1.17.0 +sniffio 1.3.1 +soupsieve 2.8.3 +sqlite-fts4 1.0.3 +sqlite-migrate 0.1b0 +sqlite-utils 3.39 +stack-data 0.6.3 +tabulate 0.10.0 +tenacity 9.1.4 +terminado 0.18.1 +tiktoken 0.12.0 +tinycss2 1.4.0 +tokencost 0.1.26 +tornado 6.5.5 +tqdm 4.67.3 +traitlets 5.14.3 +typing_extensions 4.15.0 +typing-inspection 0.4.2 +tzdata 2026.1 +uri-template 1.3.0 +urllib3 2.6.3 +uuid_utils 0.14.1 +wcwidth 0.6.0 +webcolors 25.10.0 +webencodings 0.5.1 +websocket-client 1.9.0 +xxhash 3.6.0 +yarl 1.23.0 +zstandard 0.25.0 diff --git a/research/agentic_data_science/schema_agent/docker_clean.sh b/research/agentic_data_science/schema_agent/docker_clean.sh new file mode 100755 index 000000000..7e40839ae --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_clean.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# """ +# 
Remove Docker container image for the project. +# +# This script cleans up Docker images by removing the container image +# matching the project configuration. Useful for freeing disk space or +# ensuring a fresh build. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Remove the container image. +remove_container_image diff --git a/research/agentic_data_science/schema_agent/docker_cmd.sh b/research/agentic_data_science/schema_agent/docker_cmd.sh new file mode 100755 index 000000000..906d7a77b --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_cmd.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# """ +# Execute a command in a Docker container. +# +# This script runs a specified command inside a new Docker container instance. +# The container is removed automatically after the command completes. The +# git root is mounted to /git_root inside the container. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args form the command. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Capture the command to execute from remaining arguments. +CMD="$@" +echo "Executing: '$CMD'" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images matching the expected image name. 
+run "docker image ls $FULL_IMAGE_NAME" +#(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Configure and run the Docker container with the specified command. +CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_cmd_command) +PORT="" +DOCKER_RUN_OPTS="" +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT $DOCKER_RUN_OPTS) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME bash -c '$CMD'" diff --git a/research/agentic_data_science/schema_agent/docker_exec.sh b/research/agentic_data_science/schema_agent/docker_exec.sh new file mode 100755 index 000000000..24f8e401a --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_exec.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Execute a bash shell in a running Docker container. +# +# This script connects to an already running Docker container and opens an +# interactive bash session for debugging or inspection purposes. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Execute bash shell in the running container. +exec_container diff --git a/research/agentic_data_science/schema_agent/docker_jupyter.sh b/research/agentic_data_science/schema_agent/docker_jupyter.sh new file mode 100755 index 000000000..6c7d09b13 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_jupyter.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# """ +# Execute Jupyter Lab in a Docker container. +# +# This script launches a Docker container running Jupyter Lab with +# configurable port, directory mounting, and vim bindings. It passes +# command-line options to the run_jupyter.sh script inside the container. 
+# +# Usage: +# > docker_jupyter.sh [options] +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse command-line options and set Jupyter configuration variables. +parse_docker_jupyter_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images and inspect architecture. +run "docker image ls $FULL_IMAGE_NAME" +(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Run the Docker container with Jupyter Lab. +CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") +CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_jupyter_command) +DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" diff --git a/research/agentic_data_science/schema_agent/docker_name.sh b/research/agentic_data_science/schema_agent/docker_name.sh new file mode 100644 index 000000000..1d6f8a55c --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_name.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# """ +# Docker image naming configuration. +# +# This file defines the repository name, image name, and full image name +# variables used by all docker_*.sh scripts in the project template. +# """ + +REPO_NAME=gpsaggese +# The file should be all lower case. +IMAGE_NAME=umd_schema_agent +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME diff --git a/research/agentic_data_science/schema_agent/docker_push.sh b/research/agentic_data_science/schema_agent/docker_push.sh new file mode 100755 index 000000000..27d752dd9 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_push.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Push Docker container image to Docker Hub or registry. 
+# +# This script authenticates with the Docker registry using credentials from +# ~/.docker/passwd.$REPO_NAME.txt and pushes the locally built container +# image to the remote repository. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker image naming configuration. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source $SCRIPT_DIR/docker_name.sh + +# Push the container image to the registry. +push_container_image diff --git a/research/agentic_data_science/schema_agent/requirements.txt b/research/agentic_data_science/schema_agent/requirements.txt index 0ce56331c..ed4078da6 100644 --- a/research/agentic_data_science/schema_agent/requirements.txt +++ b/research/agentic_data_science/schema_agent/requirements.txt @@ -3,4 +3,6 @@ langchain_core langchain_openai langgraph llm -tokencost \ No newline at end of file +tokencost +pytz +dotenv \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/run_jupyter.sh b/research/agentic_data_science/schema_agent/run_jupyter.sh new file mode 100755 index 000000000..342a73f79 --- /dev/null +++ b/research/agentic_data_science/schema_agent/run_jupyter.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# """ +# Launch Jupyter Lab server. +# +# This script starts Jupyter Lab on port 8888 with the following configuration: +# - No browser auto-launch (useful for Docker containers) +# - Accessible from any IP address (0.0.0.0) +# - Root user allowed (required for Docker environments) +# - No authentication token or password (for development convenience) +# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable +# """ + +# Exit immediately if any command exits with a non-zero status. 
+set -e + +# Print each command to stdout before executing it. +#set -x + +# Import the utility functions from /git_root. +GIT_ROOT=/git_root +source $GIT_ROOT/class_project/project_template/utils.sh + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure vim keybindings and notifications. +configure_jupyter_vim_keybindings +configure_jupyter_notifications + +# Initialize Jupyter Lab command with base configuration. +JUPYTER_ARGS=$(get_jupyter_args) + +# Start Jupyter Lab with development-friendly settings. +run "jupyter lab $JUPYTER_ARGS" diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index 14491b6f6..e5f995b56 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -19,14 +19,16 @@ import os import sys import typing +import pytz import dotenv import pandas as pd -import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah +import schema_agent_hllmcli as radsasah import schema_agent_loader as radsasal import schema_agent_report as radsasar import schema_agent_stats as radsasas +import helpers.hdbg as hdbg import helpers.hlogging as hloggin # ============================================================================= @@ -35,13 +37,14 @@ dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") -if not api_key: - print("Error: OPENAI_API_KEY not found in environment.") - sys.exit(1) + +# Use dassert to ensure the API key exists +hdbg.dassert(api_key, "OPENAI_API_KEY not found in environment.") _LOG = hloggin.getLogger(__name__) _LOG.setLevel(logging.DEBUG) +# Ensure sys is imported for the handler console_handler = logging.StreamHandler(sys.stdout) hloggin.set_v2_formatter( ch=console_handler, @@ -71,46 +74,23 @@ def run_pipeline( ) -> typing.Tuple[typing.Dict[str, 
pd.DataFrame], typing.Dict[str, typing.Any]]: """ Execute the full data profiling pipeline over one or more CSV files. - - Parameters - ---------- - csv_paths : list of str - One or more CSV file paths to profile. - tags : list of str, optional - Human-readable tag for each CSV. Defaults to filename stems. - model : str - LLM model name passed to OpenAI / hllmcli. - metrics : list of str, optional - Numeric metrics to include. Defaults to DEFAULT_METRICS. - llm_scope : str - "all", "semantic", or "nulls" — controls which columns are LLM-profiled. - output_json : str - Path for the merged JSON report. - output_md : str - Path for the Markdown summary. - use_langchain : bool - Use LangChain chain instead of hllmcli for LLM calls. - - Returns - ------- - (dict of tag → df, stats dict) """ if tags is None: tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - if len(tags) != len(csv_paths): - raise ValueError( - f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." - ) + # Use dassert_eq to check that the number of tags matches files + hdbg.dassert_eq( + len(tags), + len(csv_paths), + msg="Number of tags must match number of CSV paths" + ) # --- Load & type-coerce --- tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - # Merge datetime metadata across all DataFrames (using the last loaded tag - # as the primary df for single-dataset runs; full merge for multi). - _, datetime_meta = radsasal.infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) + # Merge datetime metadata + combined_for_dt = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + _, datetime_meta = radsasal.infer_and_convert_datetime_columns(combined_for_dt) # --- Compute stats --- stats = radsasas.compute_llm_agent_stats( @@ -121,9 +101,9 @@ def run_pipeline( stats["datetime_columns"] = datetime_meta # --- LLM scope --- - # Use the concatenated DataFrame to decide which columns to send. 
combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -146,8 +126,11 @@ def run_pipeline( columns_to_include=columns_for_llm, ) - # --- Build column profiles (use first / primary df for column ordering) --- + # --- Build column profiles --- + # Ensure tag_to_df is not empty before accessing + hdbg.dassert(tag_to_df, "No dataframes were loaded.") primary_df = list(tag_to_df.values())[0] + column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -169,7 +152,6 @@ def run_pipeline( return tag_to_df, stats - # ============================================================================= # CLI # ============================================================================= diff --git a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py index d2194684a..37b4c8f0e 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py +++ b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py @@ -10,9 +10,11 @@ import langchain_core.output_parsers as lcop import langchain_core.prompts as lcpr import langchain_openai as lco +import pandas as pd import pydantic import schema_agent_models as radsasam +import helpers.hdbg as hdbg import helpers.hllm_cli as hllmcli import helpers.hlogging as hloggin @@ -27,20 +29,19 @@ def _select_columns_for_llm( """ Return the list of column names that should be sent to the LLM. - Parameters - ---------- - df : pd.DataFrame - scope : str - "all" — every column - "semantic" — non-numeric columns only (object / category / string) - "nulls" — columns with null fraction above null_threshold - null_threshold : float - Fraction of nulls required for "nulls" scope. Default 5 %. 
- - Returns - ------- - list of str + :param df: Input dataframe. + :type df: pd.DataFrame + :param scope: "all" — every column, "semantic" — non-numeric columns + only, "nulls" — columns with high nulls. + :type scope: str + :param null_threshold: Fraction of nulls required for "nulls" scope. + Default 0.05. + :type null_threshold: float + :return: List of valid columns to process. + :rtype: typing.List[str] """ + hdbg.dassert_isinstance(df, pd.DataFrame) + if scope == "all": return list(df.columns) @@ -74,17 +75,16 @@ def build_llm_prompt( Serialize statistical data into a structured string prompt for LLM consumption. - Parameters - ---------- - stats : dict - Output of compute_llm_agent_stats(). - columns_to_include : list of str, optional - Subset of column names to include in the prompt. None = all. - - Returns - ------- - str + :param stats: Output of compute_llm_agent_stats(). + :type stats: typing.Dict[str, typing.Any] + :param columns_to_include: Subset of column names to include in the + prompt. None = all. + :type columns_to_include: typing.Optional[typing.List[str]] + :return: Formatted string prompt. + :rtype: str """ + hdbg.dassert_isinstance(stats, dict) + prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", @@ -132,20 +132,20 @@ def generate_hypotheses_via_cli( Parses and Pydantic-validates the LLM response against DatasetInsights. - Parameters - ---------- - stats : dict - model : str - columns_to_include : list of str, optional - If provided, only these columns are sent to the LLM (cost control). - - Returns - ------- - dict — DatasetInsights-shaped dict, or {"error": ...} on failure. + :param stats: Computed dataset statistics. + :type stats: typing.Dict[str, typing.Any] + :param model: The target LLM model. + :type model: str + :param columns_to_include: Subset of column names to include. 
+ :type columns_to_include: typing.Optional[typing.List[str]] + :return: DatasetInsights-shaped dict, or {"error": ...} on failure. + :rtype: typing.Dict[str, typing.Any] """ + hdbg.dassert_isinstance(stats, dict) + _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) - schema_json = radsasam.atasetInsights.model_json_schema() + schema_json = radsasam.DatasetInsights.model_json_schema() user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) system_prompt = ( "You are a Senior Data Scientist. Analyze the following data statistics.\n" @@ -171,7 +171,7 @@ def generate_hypotheses_via_cli( ) raw = json.loads(cleaned) - # Pydantic validation — raises ValidationError on schema mismatch. + # Pydantic validation validated = radsasam.DatasetInsights.model_validate(raw) return validated.model_dump() @@ -194,18 +194,18 @@ def get_llm_semantic_insights_langchain( Process dataset metadata via LangChain to extract structured semantic insights. - Uses JsonOutputParser alongside the Pydantic schema. Validates output. - - Parameters - ---------- - prompt_text : str - Serialized stats from build_llm_prompt(). - model : str + Uses JsonOutputParser alongside the Pydantic schema. Validates + output. - Returns - ------- - dict + :param prompt_text: Serialized stats from build_llm_prompt(). + :type prompt_text: str + :param model: The target LLM model. + :type model: str + :return: Validated insights dictionary. + :rtype: typing.Dict[str, typing.Any] """ + hdbg.dassert_isinstance(prompt_text, str) + _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) parser = lcop.JsonOutputParser(pydantic_object=radsasam.DatasetInsights) @@ -224,7 +224,6 @@ def get_llm_semantic_insights_langchain( chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) - # Validate against Pydantic schema. 
validated = radsasam.DatasetInsights.model_validate(result) return validated.model_dump() except pydantic.ValidationError as e: diff --git a/research/agentic_data_science/schema_agent/schema_agent_loader.py b/research/agentic_data_science/schema_agent/schema_agent_loader.py index 02fae3514..d8f649547 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_loader.py +++ b/research/agentic_data_science/schema_agent/schema_agent_loader.py @@ -13,6 +13,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin import helpers.hpandas_conversion as hpanconv import helpers.hpandas_io as hpanio @@ -24,22 +25,20 @@ def load_csv(csv_path: str) -> pd.DataFrame: """ Load a CSV into a DataFrame with clear error handling. - Parameters - ---------- - csv_path : str - Path to the CSV file. - - Returns - ------- - pd.DataFrame + :param csv_path: Path to the CSV file. + :type csv_path: str + :return: Loaded dataframe. + :rtype: pd.DataFrame """ + hdbg.dassert_isinstance(csv_path, str) try: df = hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise - if df.empty: - raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") + + hdbg.dassert_lt(0, len(df), "CSV at '%s' loaded as an empty DataFrame.", csv_path) + _LOG.info( "Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns) ) @@ -61,19 +60,17 @@ def infer_and_convert_datetime_columns( Uses sampling for performance. Returns the updated DataFrame and a metadata dict with inference details per column. - Parameters - ---------- - df : pd.DataFrame - sample_size : int - Number of rows to sample when testing format compliance. - threshold : float - Minimum fraction of parsed values required to accept a column as temporal. - - Returns - ------- - (pd.DataFrame, dict) - Updated DataFrame with converted columns + metadata per column. + :param df: Input DataFrame. 
+ :type df: pd.DataFrame + :param sample_size: Number of rows to sample when testing format compliance. + :type sample_size: int + :param threshold: Minimum fraction of parsed values required to accept a column as temporal. + :type threshold: float + :return: Updated DataFrame with converted columns + metadata per column. + :rtype: typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]] """ + hdbg.dassert_isinstance(df, pd.DataFrame) + COMMON_FORMATS = [ "%Y-%m-%d", "%d-%m-%Y", @@ -166,16 +163,16 @@ def prepare_dataframes( Applies type coercion, datetime inference, and categorical detection. - Parameters - ---------- - csv_paths : list of str - tags : list of str, optional - Human-readable tags; defaults to filename stems. - - Returns - ------- - (dict of tag → df, dict of tag → categorical_columns) + :param csv_paths: List of CSV file paths. + :type csv_paths: typing.List[str] + :param tags: Human-readable tags; defaults to filename stems. + :type tags: typing.Optional[typing.List[str]] + :return: A tuple containing a dict mapping tags to DataFrames, and a dict mapping tags to categorical columns. 
+ :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]]] """ + hdbg.dassert_isinstance(csv_paths, list) + hdbg.dassert_lt(0, len(csv_paths)) + tag_to_df: typing.Dict[str, pd.DataFrame] = {} cat_cols_map: typing.Dict[str, typing.List[str]] = {} @@ -189,4 +186,4 @@ def prepare_dataframes( include=["object", "category", "string"] ).columns.tolist() - return tag_to_df, cat_cols_map + return tag_to_df, cat_cols_map \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_report.py b/research/agentic_data_science/schema_agent/schema_agent_report.py index 2c377322e..e46a34b07 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_report.py +++ b/research/agentic_data_science/schema_agent/schema_agent_report.py @@ -14,6 +14,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin _LOG = hloggin.getLogger(__name__) @@ -42,6 +43,9 @@ def build_column_profiles( """ profiles: typing.List[typing.Dict[str, typing.Any]] = [] + hdbg.dassert_isinstance(df, pd.DataFrame) + hdbg.dassert_isinstance(stats, dict) + hdbg.dassert_isinstance(insights, dict) numeric_summary = stats.get("numeric_summary", {}) categorical_stats = stats.get("categorical_distributions", {}) datetime_meta = stats.get("datetime_columns", {}) @@ -115,6 +119,11 @@ def merge_and_export_results( output_path : str """ _LOG.info("Merging results...") + hdbg.dassert_isinstance(stats, dict) + hdbg.dassert_isinstance(insights, dict) + hdbg.dassert_isinstance(column_profiles, list) + hdbg.dassert_isinstance(output_path, str) + hdbg.dassert(output_path, "output_path must be a non-empty string.") serializable_stats = _make_serializable(stats) final_report = { @@ -167,6 +176,11 @@ def _clean(val: typing.Any) -> str: return "" return str(val).replace("|", "\\|").replace("\n", " ") + hdbg.dassert_isinstance(column_profiles, list) + hdbg.dassert_lt(0, len(column_profiles), "column_profiles must be non-empty.") 
+ hdbg.dassert_isinstance(output_path, str) + hdbg.dassert(output_path, "output_path must be a non-empty string.") + def _fmt(val: typing.Any) -> str: if isinstance(val, int): return str(val) @@ -215,4 +229,4 @@ def _fmt(val: typing.Any) -> str: with open(output_path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") - _LOG.info("Exported Markdown report to '%s'.", output_path) + _LOG.info("Exported Markdown report to '%s'.", output_path) \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_stats.py b/research/agentic_data_science/schema_agent/schema_agent_stats.py index 213c3d25f..24ab40857 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_stats.py +++ b/research/agentic_data_science/schema_agent/schema_agent_stats.py @@ -13,6 +13,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin import helpers.hpandas_stats as hpanstat @@ -61,6 +62,8 @@ def compute_llm_agent_stats( numeric_summary. """ metrics = _resolve_metrics(metrics) + hdbg.dassert_isinstance(tag_to_df, dict) + hdbg.dassert_lt(0, len(tag_to_df), "tag_to_df must be non-empty.") dataframe_stats: typing.Dict[str, typing.Any] = {} # 1. 
Temporal boundaries @@ -98,14 +101,18 @@ def compute_llm_agent_stats( dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): - if tag not in tag_to_df: - _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) - continue + hdbg.dassert_in( + tag, tag_to_df, "Tag '%s' not found in tag_to_df.", tag + ) dataframe_stats["categorical_distributions"][tag] = {} for col in cols: - if col not in tag_to_df[tag].columns: - _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) - continue + hdbg.dassert_in( + col, + tag_to_df[tag].columns, + "Column '%s' not found in dataset '%s'.", + col, + tag, + ) dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) dataframe_stats["categorical_distributions"][tag][col] = dist print( @@ -149,4 +156,4 @@ def _resolve_metrics( VALID_METRICS, ) resolved = [m for m in metrics if m in VALID_METRICS] - return resolved if resolved else DEFAULT_METRICS + return resolved if resolved else DEFAULT_METRICS \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb new file mode 100644 index 000000000..6104a4038 --- /dev/null +++ b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb @@ -0,0 +1,371 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "49bffb6b-9c87-4d5a-a32a-a2382c2b700c", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import argparse\n", + "import logging\n", + "import os\n", + "import typing\n", + "\n", + "import dotenv\n", + "import pandas as pd\n", + "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", + "import schema_agent_loader as radsasal\n", + "import schema_agent_report as radsasar\n", + "import schema_agent_stats as radsasas\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import 
helpers.hlogging as hloggin\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n", + "WARNING: Running in Jupyter\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "Skipping duration stats: 'int' object has no attribute 'tzinfo'\n", + "Quality report failed for 'ecommerce_data': 'RangeIndex' object has no attribute 'date'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct end_download_timestamp\n", + "0 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", + "1 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "99998 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", + "99999 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", + "\n", + "=== Distribution: ecommerce_data / country ===\n", + " count pct [%]\n", + "country \n", + "United Kingdom 64417 64.417\n", + "Ireland 8507 8.507\n", + "Germany 7654 7.654\n", + "France 5470 5.470\n", + "Netherlands 2729 2.729\n", + "Spain 1235 1.235\n", + "Switzerland 1170 1.170\n", + "Belgium 1037 1.037\n", + "Portugal 984 0.984\n", + "Sweden 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / country_code ===\n", + " count pct [%]\n", + "country_code \n", + "GBR 64417 64.417\n", + "IRL 8507 8.507\n", + "DEU 7654 7.654\n", + "FRA 5470 5.470\n", + "NLD 2729 2.729\n", + "ESP 1235 1.235\n", + "CHE 1170 1.170\n", + "BEL 1037 1.037\n", + "PRT 984 0.984\n", + "SWE 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / product_id ===\n", + " count pct [%]\n", + "product_id \n", + "POST 731 0.731\n", + "85123A 615 0.615\n", + "21212 438 0.438\n", + "22423 437 0.437\n", + "85099B 391 0.391\n", + "20725 334 0.334\n", + "84991 298 0.298\n", + "20914 295 0.295\n", + "21232 295 0.295\n", + "84879 285 0.285\n", + "\n", + "=== Numeric Summary: ecommerce_data ===\n", + " mean std min median max\n", + "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", + "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", + "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", + "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", + "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", + "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", + "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", + "unit_price_gbp 
3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", + "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", + "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", + "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", + "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", + "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", + "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", + "12:06:12 rss=0.234GB vms=1.655GB mem_pct=2% cpu=100% - \u001b[36mINFO \u001b[0m Task-43 schema_agent.py run_pipeline:107 LLM will profile 3 / 18 columns (scope=semantic).\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
02009-12-01 07:45:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
12009-12-01 07:45:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
22009-12-01 09:06:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
32009-12-01 09:06:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
42009-12-01 09:06:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", + "
" + ], + "text/plain": [ + " order_datetime year month week_of_year day_of_week order_hour \\\n", + "0 2009-12-01 07:45:00 2009 12 49 1 7 \n", + "1 2009-12-01 07:45:00 2009 12 49 1 7 \n", + "2 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "3 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "4 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "\n", + " is_weekend country country_code product_id customer_id \\\n", + "0 0 United Kingdom GBR 21523 13085 \n", + "1 0 United Kingdom GBR 79323W 13085 \n", + "2 0 United Kingdom GBR 82582 13078 \n", + "3 0 United Kingdom GBR 22111 13078 \n", + "4 0 United Kingdom GBR 21756 13078 \n", + "\n", + " unit_price_gbp quantity_sold sales_amount_gbp population_total \\\n", + "0 5.95 10 59.50 62276270.0 \n", + "1 6.75 12 81.00 62276270.0 \n", + "2 2.10 12 25.20 62276270.0 \n", + "3 4.25 24 102.00 62276270.0 \n", + "4 5.95 3 17.85 62276270.0 \n", + "\n", + " gdp_current_usd gdp_growth_pct inflation_consumer_pct \n", + "0 2.412840e+12 -17.633976 1.89709 \n", + "1 2.412840e+12 -17.633976 1.89709 \n", + "2 2.412840e+12 -17.633976 1.89709 \n", + "3 2.412840e+12 -17.633976 1.89709 \n", + "4 2.412840e+12 -17.633976 1.89709 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Make sure this is at the top of your notebook\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import schema_agent as radsasag\n", + "\n", + "# Now run the pipeline\n", + "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", + "tags = [\"ecommerce_data\"]\n", + "\n", + "tag_to_df, stats = radsasag.run_pipeline(\n", + " csv_paths=csv_files,\n", + " tags=tags,\n", + " model=\"gpt-4o\",\n", + " llm_scope=\"semantic\"\n", + ")\n", + "\n", + "display(tag_to_df[\"ecommerce_data\"].head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36f51050-4ead-49dc-9fec-cd430f24de6f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/utils.sh b/research/agentic_data_science/schema_agent/utils.sh new file mode 100644 index 000000000..67426f5d5 --- /dev/null +++ b/research/agentic_data_science/schema_agent/utils.sh @@ -0,0 +1,504 @@ +#!/bin/bash +# """ +# Utility functions for Docker container management. +# """ + + +# ############################################################################# +# General utilities +# ############################################################################# + + +run() { + # """ + # Execute a command with echo output. + # + # :param cmd: Command string to execute + # :return: Exit status of the executed command + # """ + cmd="$*" + echo "> $cmd" + eval "$cmd" +} + + +enable_verbose_mode() { + # """ + # Enable shell command tracing (set -x) when VERBOSE is set to 1. + # + # Reads the VERBOSE variable set by parse_docker_jupyter_args. + # Call this after parsing args to activate tracing for the rest of the script. + # """ + if [[ $VERBOSE == 1 ]]; then + set -x + fi +} + + +# ############################################################################# +# Argument parsing +# ############################################################################# + + +_print_default_help() { + # """ + # Print usage information and available default options for docker scripts. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Options:" + echo " -h Print this help message and exit" + echo " -v Enable verbose output (set -x)" +} + + +parse_default_args() { + # """ + # Parse default command-line arguments for docker scripts. 
+ # + # Sets VERBOSE variable in the caller's scope and enables set -x when -v + # is passed. Prints help and exits when -h is passed. + # Updates OPTIND so the caller can shift away processed arguments. + # + # :param @: command-line arguments forwarded from the calling script + # """ + VERBOSE=0 + while getopts "hv" flag; do + case "${flag}" in + h) _print_default_help; exit 0;; + v) VERBOSE=1;; + *) _print_default_help; exit 1;; + esac + done + enable_verbose_mode +} + + +_print_docker_jupyter_help() { + # """ + # Print usage information and available options for docker_jupyter.sh. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Launch Jupyter Lab inside a Docker container." + echo "" + echo "Options:" + echo " -h Print this help message and exit" + echo " -p PORT Host port to forward to Jupyter Lab (default: 8888)" + echo " -u Enable vim keybindings in Jupyter Lab" + echo " -v Enable verbose output (set -x)" +} + + +parse_docker_jupyter_args() { + # """ + # Parse command-line arguments for docker_jupyter.sh. + # + # Sets JUPYTER_HOST_PORT, JUPYTER_USE_VIM, TARGET_DIR, VERBOSE, and + # OLD_CMD_OPTS in the caller's scope. Enables set -x when -v is passed. + # Prints help and exits when -h is passed. + # + # :param @: command-line arguments forwarded from the calling script + # """ + # Set defaults. + JUPYTER_HOST_PORT=8888 + JUPYTER_USE_VIM=0 + VERBOSE=0 + # Save original args to pass through to run_jupyter.sh. + OLD_CMD_OPTS="$*" + # Parse options. + while getopts "hp:uv" flag; do + case "${flag}" in + h) _print_docker_jupyter_help; exit 0;; + p) JUPYTER_HOST_PORT=${OPTARG};; # Port for Jupyter Lab. + u) JUPYTER_USE_VIM=1;; # Enable vim bindings. + v) VERBOSE=1;; # Enable verbose output. + *) _print_docker_jupyter_help; exit 1;; + esac + done + # Enable command tracing if verbose mode is requested. 
+    enable_verbose_mode
+}
+
+
+# #############################################################################
+# Docker image management
+# #############################################################################
+
+
+get_docker_vars_script() {
+    # """
+    # Load Docker variables from docker_name.sh script.
+    #
+    # :param script_path: Path to the script to determine the Docker configuration directory
+    # :return: Sources REPO_NAME, IMAGE_NAME, and FULL_IMAGE_NAME variables
+    # """
+    local script_path=$1
+    # Find the name of the container.
+    SCRIPT_DIR=$(dirname $script_path)
+    DOCKER_NAME="$SCRIPT_DIR/docker_name.sh"
+    if [[ ! -e $DOCKER_NAME ]]; then
+        echo "Can't find $DOCKER_NAME"
+        exit -1
+    fi;
+    source $DOCKER_NAME
+}
+
+
+print_docker_vars() {
+    # """
+    # Print current Docker variables to stdout.
+    # """
+    echo "REPO_NAME=$REPO_NAME"
+    echo "IMAGE_NAME=$IMAGE_NAME"
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+}
+
+
+build_container_image() {
+    # """
+    # Build a Docker container image.
+    #
+    # Supports both single-architecture and multi-architecture builds.
+    # Creates temporary build directory, copies files, and builds the image.
+    #
+    # :param @: Additional options to pass to docker build/buildx build
+    # """
+    echo "# ${FUNCNAME[0]} ..."
+    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+    # Prepare build area.
+    #tar -czh . | docker build $OPTS -t $IMAGE_NAME -
+    DIR="../tmp.build"
+    if [[ -d $DIR ]]; then
+        rm -rf $DIR
+    fi;
+    cp -Lr . $DIR || true
+    # Build container.
+    echo "DOCKER_BUILDKIT=$DOCKER_BUILDKIT"
+    echo "DOCKER_BUILD_MULTI_ARCH=$DOCKER_BUILD_MULTI_ARCH"
+    if [[ $DOCKER_BUILD_MULTI_ARCH != 1 ]]; then
+        # Build for a single architecture.
+        echo "Building for current architecture..."
+        OPTS="--progress plain $@"
+        (cd $DIR; docker build $OPTS -t $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]})
+    else
+        # Build for multiple architectures.
+        echo "Building for multiple architectures..."
+ OPTS="$@" + export DOCKER_CLI_EXPERIMENTAL=enabled + # Create a new builder. + #docker buildx rm --all-inactive --force + #docker buildx create --name mybuilder + #docker buildx use mybuilder + # Use the default builder. + docker buildx use multiarch + docker buildx inspect --bootstrap + # Note that one needs to push to the repo since otherwise it is not + # possible to keep multiple. + (cd $DIR; docker buildx build --push --platform linux/arm64,linux/amd64 $OPTS --tag $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) + # Report the status. + docker buildx imagetools inspect $FULL_IMAGE_NAME + fi; + # Report build version. + if [ -f docker_build.version.log ]; then + rm docker_build.version.log + fi + (cd $DIR; docker run --rm -it -v $(pwd):/data $FULL_IMAGE_NAME bash -c "/data/version.sh") 2>&1 | tee docker_build.version.log + # + docker image ls $REPO_NAME/$IMAGE_NAME + rm -rf $DIR + echo "*****************************" + echo "SUCCESS" + echo "*****************************" +} + + +remove_container_image() { + # """ + # Remove Docker container image(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker image ls | grep $FULL_IMAGE_NAME + docker image ls | grep $FULL_IMAGE_NAME | awk '{print $1}' | xargs -n 1 -t docker image rm -f + docker image ls + echo "${FUNCNAME[0]} ... done" +} + + +push_container_image() { + # """ + # Push Docker container image to registry. + # + # Authenticates using credentials from ~/.docker/passwd.$REPO_NAME.txt. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker login --username $REPO_NAME --password-stdin <~/.docker/passwd.$REPO_NAME.txt + docker images $FULL_IMAGE_NAME + docker push $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... 
done" +} + + +pull_container_image() { + # """ + # Pull Docker container image from registry. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker pull $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker container management +# ############################################################################# + + +kill_container() { + # """ + # Kill and remove Docker container(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + if [[ ! -z $CONTAINER_ID ]]; then + docker container rm -f $CONTAINER_ID + docker container ls + fi; + echo "${FUNCNAME[0]} ... done" +} + + +exec_container() { + # """ + # Execute bash shell in running Docker container. + # + # Opens an interactive bash session in the first container matching the + # current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + docker exec -it $CONTAINER_ID bash + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker common options +# ############################################################################# + + +get_docker_common_options() { + # """ + # Return docker run options common to all container types. + # + # Includes volume mount for the git root, plus environment variables for + # PYTHONPATH and host OS name. 
+ # + # :return: docker run options string with volume mounts and env vars + # """ + echo "-v $GIT_ROOT:/git_root \ + -e PYTHONPATH=/git_root:/git_root/helpers_root:/git_root/msml610/tutorials \ + -e CSFY_GIT_ROOT_PATH=/git_root \ + -e CSFY_HOST_OS_NAME=$(uname -s) \ + -e CSFY_HOST_NAME=$(uname -n)" +} + + +# ############################################################################# +# Docker bash +# ############################################################################# + + +get_docker_bash_command() { + # """ + # Return the base docker run command for an interactive bash shell. + # + # :return: docker run command string with --rm and -ti flags + # """ + if [ -t 0 ]; then + echo "docker run --rm -ti" + else + echo "docker run --rm -i" + fi +} + + +get_docker_bash_options() { + # """ + # Return docker run options for a Docker container. + # + # :param container_name: Name for the Docker container + # :param port: Port number to forward (optional, skipped if empty) + # :param extra_opts: Additional docker run options (optional) + # :return: docker run options string with name, volume mounts, and env vars + # """ + local container_name=$1 + local port=$2 + local extra_opts=$3 + local port_opt="" + if [[ -n $port ]]; then + port_opt="-p $port:$port" + fi + echo "--name $container_name \ + $port_opt \ + $extra_opts \ + $(get_docker_common_options)" +} + + +# ############################################################################# +# Docker cmd +# ############################################################################# + + +get_docker_cmd_command() { + # """ + # Return the base docker run command for executing a non-interactive command. 
+ # + # :return: docker run command string with --rm and -i flags + # """ + echo "docker run --rm -i" +} + + +# ############################################################################# +# Docker Jupyter +# ############################################################################# + + +get_docker_jupyter_command() { + # """ + # Return the base docker run command for running Jupyter Lab interactively. + # + # :return: docker run command string with --rm and -ti flags + # """ + echo "docker run --rm -ti" +} + + +get_docker_jupyter_options() { + # """ + # Return docker run options for a Jupyter Lab container. + # + # :param container_name: Name for the Docker container + # :param host_port: Host port to forward to container port 8888 + # :param jupyter_use_vim: 0 or 1 to enable vim bindings + # :return: docker run options string + # """ + local container_name=$1 + local host_port=$2 + local jupyter_use_vim=$3 + # Run as the current user when user is saggese. + if [[ "$(whoami)" == "saggese" ]]; then + echo "Overwriting jupyter_use_vim since user='saggese'" + jupyter_use_vim=1 + fi + echo "--name $container_name \ + -p $host_port:8888 \ + $(get_docker_common_options) \ + -e JUPYTER_USE_VIM=$jupyter_use_vim" +} + + +configure_jupyter_vim_keybindings() { + # """ + # Configure JupyterLab vim keybindings based on JUPYTER_USE_VIM env var. + # + # Reads JUPYTER_USE_VIM; if 1, verifies jupyterlab_vim is installed and + # writes enabled settings; otherwise writes disabled settings. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@axlair/jupyterlab_vim + if [[ $JUPYTER_USE_VIM == 1 ]]; then + # Check that jupyterlab_vim is installed before trying to enable it. + if ! pip show jupyterlab_vim > /dev/null 2>&1; then + echo "ERROR: jupyterlab_vim is not installed but vim bindings were requested." + echo "Install it with: pip install jupyterlab_vim" + exit 1 + fi + echo "Enabling vim." 
+        cat >~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings <<'EOF'
+{
+  "enabled": true,
+  "enabledInEditors": true,
+  "extraKeybindings": []
+}
+EOF
+    else
+        echo "Disabling vim."
+        cat >~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings <<'EOF'
+{
+  "enabled": false,
+  "enabledInEditors": false,
+  "extraKeybindings": []
+}
+EOF
+    fi;
+}
+
+
+configure_jupyter_notifications() {
+    # """
+    # Disable JupyterLab news fetching and update checks.
+    # """
+    mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension
+    cat >~/.jupyter/lab/user-settings/\@jupyterlab/apputils-extension/notification.jupyterlab-settings <<'EOF'
+{
+  // Notifications
+  // @jupyterlab/apputils-extension:notification
+  // Notifications settings.
+
+  // Fetch official Jupyter news
+  // Whether to fetch news from the Jupyter news feed. If Always (`true`), it will make a request to a website.
+  "fetchNews": "false",
+  "checkForUpdates": false
+}
+EOF
+}
+
+
+get_jupyter_args() {
+    # """
+    # Print the standard Jupyter Lab command-line arguments.
+    #
+    # :return: space-separated Jupyter Lab args for port 8888 with no browser,
+    #   allow root, and no authentication
+    # """
+    echo "--port=8888 --no-browser --ip=0.0.0.0 --allow-root --ServerApp.token='' --ServerApp.password=''"
+}
+
+
+get_run_jupyter_cmd() {
+    # """
+    # Return the command to run run_jupyter.sh inside a container.
+    #
+    # Computes the script's path relative to GIT_ROOT and builds the
+    # corresponding /git_root/... path used inside the container.
+ # + # :param script_path: path of the calling script (pass ${BASH_SOURCE[0]}) + # :param cmd_opts: options to forward to run_jupyter.sh + # :return: full command string to run run_jupyter.sh + # """ + local script_path=$1 + local cmd_opts=$2 + local script_dir + script_dir=$(cd "$(dirname "$script_path")" && pwd) + local rel_dir="${script_dir#${GIT_ROOT}/}" + echo "/git_root/${rel_dir}/run_jupyter.sh $cmd_opts" +} diff --git a/research/agentic_data_science/schema_agent/version.sh b/research/agentic_data_science/schema_agent/version.sh new file mode 100755 index 000000000..c46ed254c --- /dev/null +++ b/research/agentic_data_science/schema_agent/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. 
+# echo "# mongo" +# mongod --version From 211344925bd59eb404b77254e6bb066670d8a676 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 7 Apr 2026 12:55:57 -0400 Subject: [PATCH 09/14] Update schema_agent.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent.py | 58 ++++++++++++------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index e5f995b56..356f8f157 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -4,10 +4,10 @@ Main pipeline and CLI orchestration for end-to-end data profiling. Usage: - python schema_agent.py data.csv - python schema_agent.py data.csv --model gpt-4o-mini --llm-scope nulls - python schema_agent.py data.csv --metrics mean std min max --output-json out.json - python schema_agent.py data.csv data2.csv --tags sales inventory + ./schema_agent.py data.csv + ./schema_agent.py data.csv --model gpt-4o-mini --llm-scope nulls + ./schema_agent.py data.csv --metrics mean std min max --output-json out.json + ./schema_agent.py data.csv data2.csv --tags sales inventory Import as: @@ -19,11 +19,10 @@ import os import sys import typing -import pytz import dotenv import pandas as pd -import schema_agent_hllmcli as radsasah +import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah import schema_agent_loader as radsasal import schema_agent_report as radsasar import schema_agent_stats as radsasas @@ -38,13 +37,11 @@ dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") -# Use dassert to ensure the API key exists hdbg.dassert(api_key, "OPENAI_API_KEY not found in environment.") _LOG = hloggin.getLogger(__name__) _LOG.setLevel(logging.DEBUG) -# Ensure sys is imported for the handler 
console_handler = logging.StreamHandler(sys.stdout) hloggin.set_v2_formatter( ch=console_handler, @@ -74,23 +71,47 @@ def run_pipeline( ) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: """ Execute the full data profiling pipeline over one or more CSV files. + + :param csv_paths: One or more CSV file paths to profile. + :type csv_paths: typing.List[str] + :param tags: Human-readable tag for each CSV. Defaults to filename stems. + :type tags: typing.Optional[typing.List[str]] + :param model: LLM model name passed to OpenAI / hllmcli. + :type model: str + :param metrics: Numeric metrics to include. Defaults to DEFAULT_METRICS. + :type metrics: typing.Optional[typing.List[str]] + :param llm_scope: "all", "semantic", or "nulls" — controls which columns are LLM-profiled. + :type llm_scope: str + :param output_json: Path for the merged JSON report. + :type output_json: str + :param output_md: Path for the Markdown summary. + :type output_md: str + :param use_langchain: Use LangChain chain instead of hllmcli for LLM calls. + :type use_langchain: bool + :return: A tuple containing a dict of tag -> df mappings, and a stats dict. 
+ :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]] """ + hdbg.dassert_isinstance(csv_paths, list) + hdbg.dassert_lt(0, len(csv_paths), "csv_paths must not be empty.") + if tags is None: tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - # Use dassert_eq to check that the number of tags matches files hdbg.dassert_eq( len(tags), len(csv_paths), - msg="Number of tags must match number of CSV paths" + "Length of tags (%d) must match csv_paths (%d).", + len(tags), + len(csv_paths) ) # --- Load & type-coerce --- tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - # Merge datetime metadata - combined_for_dt = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - _, datetime_meta = radsasal.infer_and_convert_datetime_columns(combined_for_dt) + # Merge datetime metadata across all DataFrames + _, datetime_meta = radsasal.infer_and_convert_datetime_columns( + pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + ) # --- Compute stats --- stats = radsasas.compute_llm_agent_stats( @@ -103,7 +124,6 @@ def run_pipeline( # --- LLM scope --- combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) - _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -127,10 +147,7 @@ def run_pipeline( ) # --- Build column profiles --- - # Ensure tag_to_df is not empty before accessing - hdbg.dassert(tag_to_df, "No dataframes were loaded.") primary_df = list(tag_to_df.values())[0] - column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -152,6 +169,7 @@ def run_pipeline( return tag_to_df, stats + # ============================================================================= # CLI # ============================================================================= @@ -232,9 +250,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: def main() -> 
None: """ - CLI entry point. - - Parses arguments and delegates to run_pipeline(). + CLI entry point. Parses arguments and delegates to run_pipeline(). """ parser = _build_arg_parser() args = parser.parse_args() @@ -251,4 +267,4 @@ def main() -> None: if __name__ == "__main__": - main() + main() \ No newline at end of file From cd350f32cceaac568aecb9f3eab917abcacf6ef7 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 8 Apr 2026 12:20:07 -0400 Subject: [PATCH 10/14] Add API.ipynb, example.ipynb files and update datetime function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent.py | 29 +- .../schema_agent/schema_agent_API.ipynb | 394 ++++++++++++++++ .../schema_agent/schema_agent_example.ipynb | 435 ++++++++++++++++++ .../schema_agent/schema_agent_loader.py | 147 +++--- .../schema_agent/schema_agent_stats.py | 17 +- .../schema_agent/scmea_agent_example.ipynb | 371 --------------- 6 files changed, 911 insertions(+), 482 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/schema_agent_API.ipynb create mode 100644 research/agentic_data_science/schema_agent/schema_agent_example.ipynb delete mode 100644 research/agentic_data_science/schema_agent/scmea_agent_example.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index 356f8f157..8dec89e21 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -23,9 +23,9 @@ import dotenv import pandas as pd import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah -import schema_agent_loader as radsasal -import schema_agent_report as radsasar -import schema_agent_stats as radsasas +import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal +import 
research.agentic_data_science.schema_agent.schema_agent_report as radsasar +import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas import helpers.hdbg as hdbg import helpers.hlogging as hloggin @@ -106,24 +106,28 @@ def run_pipeline( ) # --- Load & type-coerce --- - tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - - # Merge datetime metadata across all DataFrames - _, datetime_meta = radsasal.infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) + # UPDATED: We now capture datetime_meta during loading to ensure timezone + # consistency and avoid re-inference warnings. + tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags) # --- Compute stats --- + # The stats module now handles DatetimeIndex and filters out timestamp columns + # from math operations to prevent 'abs()' errors. stats = radsasas.compute_llm_agent_stats( tag_to_df, categorical_cols_map=cat_cols_map, metrics=metrics, ) + + # Inject captured datetime metadata into the stats object for the LLM. stats["datetime_columns"] = datetime_meta # --- LLM scope --- - combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + # Combine dataframes for column selection logic. + # Note: We preserve the DatetimeIndex by not using ignore_index=True. + combined_df = pd.concat(list(tag_to_df.values()), axis=0) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -147,7 +151,8 @@ def run_pipeline( ) # --- Build column profiles --- - primary_df = list(tag_to_df.values())[0] + # We use the primary dataframe (first tag) as the template for the profile. 
+ primary_df = tag_to_df[tags[0]] column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -161,6 +166,7 @@ def run_pipeline( column_profiles=column_profiles, output_path=output_json, ) + radsasar.export_markdown_from_profiles( column_profiles, numeric_stats=stats.get("numeric_summary", {}), @@ -169,7 +175,6 @@ def run_pipeline( return tag_to_df, stats - # ============================================================================= # CLI # ============================================================================= diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.ipynb b/research/agentic_data_science/schema_agent/schema_agent_API.ipynb new file mode 100644 index 000000000..4845fc9e1 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_API.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8881f77e-d668-4210-b5c5-06fad5f80608", + "metadata": {}, + "source": [ + "# API usage Notebook \n", + "- This notebook shows the implementation of each function from the respective libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3d4af97d-2052-4791-8b80-f9fa973b8233", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import dotenv\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Load environment variables (ensure OPENAI_API_KEY is set in your .env)\n", + "dotenv.load_dotenv()\n", + "\n", + "# Import the schema agent modules\n", + "import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal\n", + "import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas\n", + "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", + "import research.agentic_data_science.schema_agent.schema_agent_report as radsasar" + ] + }, + { + "cell_type": "markdown", + "id": "20ab6884-fddf-4205-8bb7-eab2704f6f1d", + "metadata": {}, + "source": [ + "## 1. Create dummy Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1ada4cd7-10bb-45c5-be2e-7cce4a5b4de1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created dummy dataset at: dummy_employees.csv\n" + ] + } + ], + "source": [ + "# 1. 
Create a dummy dataset\n", + "np.random.seed(42)\n", + "num_rows = 100\n", + "\n", + "dummy_data = pd.DataFrame({\n", + " \"employee_id\": range(1000, 1000 + num_rows),\n", + " \"department\": np.random.choice([\"Engineering\", \"Sales\", \"HR\", \"Marketing\"], num_rows),\n", + " \"salary\": np.random.normal(85000, 20000, num_rows),\n", + " \"satisfaction_score\": np.random.uniform(1.0, 5.0, num_rows),\n", + " \"hire_date\": pd.date_range(start=\"2018-01-01\", periods=num_rows, freq=\"W\").astype(str),\n", + " \"notes\": [\"Good performance\"] * 50 + [None] * 50 # 50% nulls\n", + "})\n", + "\n", + "# Inject some missing values into salary\n", + "dummy_data.loc[10:20, \"salary\"] = np.nan\n", + "\n", + "# Save to CSV\n", + "csv_path = \"dummy_employees.csv\"\n", + "dummy_data.to_csv(csv_path, index=False)\n", + "print(f\"Created dummy dataset at: {csv_path}\")\n", + "dummy_data.head()\n", + "\n", + "csv_paths = [csv_path]\n", + "tags = [\"dummy_employees\"]" + ] + }, + { + "cell_type": "markdown", + "id": "25a81ada-67c5-4a5e-a22e-453f2b222b06", + "metadata": {}, + "source": [ + "## 2. 
Load and Infer datatypes from the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e79d8059-c49d-438e-ad61-7a4c160370d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Loaded DataFrames ---\n", + "\n", + "DatetimeIndex: 100 entries, 2018-01-07 00:00:00+00:00 to 2019-12-01 00:00:00+00:00\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 employee_id 100 non-null int64 \n", + " 1 department 100 non-null str \n", + " 2 salary 89 non-null float64 \n", + " 3 satisfaction_score 100 non-null float64 \n", + " 4 hire_date 100 non-null datetime64[us, UTC]\n", + " 5 notes 50 non-null str \n", + "dtypes: datetime64[us, UTC](1), float64(2), int64(1), str(2)\n", + "memory usage: 5.5 KB\n", + "None\n", + "\n", + "--- Datetime Inference Metadata ---\n", + "{'hire_date': {'semantic_type': 'temporal', 'granularity': 'date', 'format': 'inferred', 'confidence': 1.0}}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" + ] + } + ], + "source": [ + "# 1. Load and prepare DataFrames - now receiving 3 variables\n", + "tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags)\n", + "\n", + "print(\"--- Loaded DataFrames ---\")\n", + "# The index will now show as a DatetimeIndex instead of a RangeIndex\n", + "print(tag_to_df[\"dummy_employees\"].info())\n", + "\n", + "# 2. 
Combine DataFrames while preserving the index\n", + "# We do NOT use ignore_index=True here because we want to keep the DatetimeIndex \n", + "# we just created in the loader.\n", + "updated_df = pd.concat(list(tag_to_df.values()), axis=0)\n", + "\n", + "print(\"\\n--- Datetime Inference Metadata ---\")\n", + "# This will now correctly show your temporal column info\n", + "print(datetime_meta)" + ] + }, + { + "cell_type": "markdown", + "id": "dc6c4466-d602-4c82-a2f6-b7ff7ada8e19", + "metadata": {}, + "source": [ + "## 3. Statistical Profiling" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1368b932-6299-43c1-a70c-c8e9489eb2b2", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Temporal Boundaries ===\n", + " min_index max_index min_valid_index max_valid_index\n", + "dummy_employees 2018-01-07 00:00:00+00:00 2019-12-01 00:00:00+00:00 2018-01-07 00:00:00+00:00 2018-12-16 00:00:00+00:00\n", + " employee_id salary satisfaction_score\n", + "2018-01-07 00:00:00+00:00 1000 99769.3316 4.268889\n", + "2018-01-14 00:00:00+00:00 1001 88427.365624 3.220803\n", + "... ... ... 
...\n", + "2019-11-24 00:00:00+00:00 1098 101270.344347 3.313121\n", + "2019-12-01 00:00:00+00:00 1099 60382.713671 1.143769\n", + " num_rows num_zeros zeros [%] num_nans nans [%] num_infs infs [%] num_valid valid [%]\n", + "employee_id 100 0 0.0 0 0.0 0 0.0 100 100.0\n", + "salary 100 0 0.0 11 11.0 0 0.0 89 89.0\n", + "satisfaction_score 100 0 0.0 0 0.0 0 0.0 100 100.0\n", + "\n", + "=== Quality Report: dummy_employees ===\n", + " num_rows num_zeros zeros [%] num_nans nans [%] \\\n", + "employee_id 100 0 0.0 0 0.0 \n", + "salary 100 0 0.0 11 11.0 \n", + "satisfaction_score 100 0 0.0 0 0.0 \n", + "\n", + " num_infs infs [%] num_valid valid [%] \n", + "employee_id 0 0.0 100 100.0 \n", + "salary 0 0.0 89 89.0 \n", + "satisfaction_score 0 0.0 100 100.0 \n", + "\n", + "=== Distribution: dummy_employees / department ===\n", + " count pct [%]\n", + "department \n", + "Marketing 30 30.0\n", + "Sales 26 26.0\n", + "HR 24 24.0\n", + "Engineering 20 20.0\n", + "\n", + "=== Distribution: dummy_employees / notes ===\n", + " count pct [%]\n", + "notes \n", + "Good performance 50 50.0\n", + "\n", + "=== Numeric Summary: dummy_employees ===\n", + " mean std min max\n", + "employee_id 1049.500000 29.011492 1000.000000 1099.000000\n", + "salary 83981.174276 19304.098590 32605.097918 134264.842250\n", + "satisfaction_score 3.197062 1.163419 1.020246 4.960215\n", + "\n", + "--- Stats Computation Complete ---\n", + "Calculated stats for tags: ['dummy_employees']\n" + ] + } + ], + "source": [ + "# We pass the metadata we just generated into the stats function\n", + "stats = radsasas.compute_llm_agent_stats(\n", + " tag_to_df=tag_to_df,\n", + " categorical_cols_map=cat_cols_map,\n", + " metrics=[\"mean\", \"std\", \"min\", \"max\"]\n", + ")\n", + "\n", + "# Manually ensure the datetime_columns key is populated for the LLM\n", + "stats[\"datetime_columns\"] = datetime_meta\n", + "\n", + "print(\"\\n--- Stats Computation Complete ---\")\n", + "print(f\"Calculated stats for tags: 
{list(stats['numeric_summary'].keys())}\")" + ] + }, + { + "cell_type": "markdown", + "id": "41cf298b-2e84-4d82-8073-79f7f0c07277", + "metadata": {}, + "source": [ + "## 4. Call LLM for column type inferencing" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "db84d7ee-715d-464a-94e9-fd05261e36a4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cache hit for apply_llm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected columns for LLM: ['employee_id', 'department', 'salary', 'satisfaction_score', 'hire_date', 'notes']\n", + "\n", + "--- LLM Prompt Snippet ---\n", + "You are a Senior Data Scientist and Domain Expert.\n", + "Analyze the provided dataset statistics and generate a profile for each column.\n", + "For each column, provide 2-3 testable hypotheses.\n", + "Example: 'Higher discount rates correlate with higher volume but lower margins.'\n", + "\n", + "--- DATASET STATISTICS ---\n", + "\n", + "Detected Datetime Columns:\n", + "{\n", + " \"hire_date\": {\n", + " \"semantic_type\": \"temporal\",\n", + " \"granularity\": \"date\",\n", + " \"format\": \"inferred\",\n", + " \"confidence\": 1.0\n", + " }\n", + "}\n", + "\n", + "Dataset [dummy_employees] Numeric Summary:\n", + " \n", + "...\n", + "\n", + "--- LLM Insights Retrieved Successfully ---\n" + ] + } + ], + "source": [ + "# 1. Select columns (e.g., let's just send everything)\n", + "columns_for_llm = radsasah._select_columns_for_llm(updated_df, scope=\"all\")\n", + "print(f\"Selected columns for LLM: {columns_for_llm}\\n\")\n", + "\n", + "# 2. Build the exact prompt string that goes to the LLM\n", + "prompt_text = radsasah.build_llm_prompt(stats, columns_to_include=columns_for_llm)\n", + "print(\"--- LLM Prompt Snippet ---\")\n", + "print(prompt_text[:500] + \"\\n...\\n\")\n", + "\n", + "# 3. 
Call the LLM to generate hypotheses (using gpt-4o as default)\n", + "# If you don't have an API key configured, you can mock this response by creating a static dict.\n", + "try:\n", + " semantic_insights = radsasah.generate_hypotheses_via_cli(\n", + " stats=stats,\n", + " model=\"gpt-4o\",\n", + " columns_to_include=columns_for_llm\n", + " )\n", + " print(\"--- LLM Insights Retrieved Successfully ---\")\n", + "except Exception as e:\n", + " print(f\"LLM call failed (Check API key): {e}\")\n", + " semantic_insights = {\"columns\": {}} # Fallback empty dict" + ] + }, + { + "cell_type": "markdown", + "id": "0b497a08-1e1f-47b3-8f7d-8a7038488c4d", + "metadata": {}, + "source": [ + "## 5. Export to JSON and Markdown" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3cd0c87-5951-4da3-b2ca-22abecebe626", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Pipeline complete! Check your directory for:\n", + "1. dummy_profile_report.json\n", + "2. dummy_profile_summary.md\n" + ] + } + ], + "source": [ + "# 1. Build structured column profiles\n", + "primary_df = list(tag_to_df.values())[0]\n", + "column_profiles = radsasar.build_column_profiles(\n", + " df=primary_df,\n", + " stats=stats,\n", + " insights=semantic_insights\n", + ")\n", + "\n", + "# 2. Export to JSON\n", + "json_out = \"dummy_profile_report.json\"\n", + "radsasar.merge_and_export_results(\n", + " stats=stats,\n", + " insights=semantic_insights,\n", + " column_profiles=column_profiles,\n", + " output_path=json_out\n", + ")\n", + "\n", + "# 3. Export to Markdown\n", + "md_out = \"dummy_profile_summary.md\"\n", + "radsasar.export_markdown_from_profiles(\n", + " column_profiles=column_profiles,\n", + " numeric_stats=stats.get(\"numeric_summary\", {}),\n", + " output_path=md_out\n", + ")\n", + "\n", + "print(f\"\\nPipeline complete! Check your directory for:\")\n", + "print(f\"1. {json_out}\")\n", + "print(f\"2. 
{md_out}\")\n", + "\n", + "# Clean up dummy CSV if desired\n", + "# os.remove(csv_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb new file mode 100644 index 000000000..44b96491c --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b6e62e00-6cb3-45ef-8b7d-3a8ce84eb825", + "metadata": {}, + "source": [ + "# Schema Parser example \n", + "- This implementation in the notebook utilizes a suite of pre-existing functions to parse a single Excel (or CSV) file, automatically inferring data types and capturing temporal metadata for downstream analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Temporal Boundaries ===\n", + " min_index max_index min_valid_index max_valid_index\n", + "ecommerce_data 2009-12-01 07:45:00+00:00 2010-12-09 20:01:00+00:00 2009-12-01 07:45:00+00:00 2010-12-09 20:01:00+00:00\n", + " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct\n", + "2009-12-01 07:45:00+00:00 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709\n", + "2009-12-01 07:45:00+00:00 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "2010-12-09 20:01:00+00:00 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081\n", + "2010-12-09 20:01:00+00:00 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081\n", + " num_rows num_zeros zeros [%] num_nans nans [%] num_infs infs [%] num_valid valid [%]\n", + "year 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "month 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "... ... ... ... ... ... ... ... ... ...\n", + "gdp_growth_pct 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "inflation_consumer_pct 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "\n", + "=== Quality Report: ecommerce_data ===\n", + " num_rows num_zeros zeros [%] num_nans nans [%] \\\n", + "year 100000 0 0.0 0 0.0 \n", + "month 100000 0 0.0 0 0.0 \n", + "week_of_year 100000 0 0.0 0 0.0 \n", + "day_of_week 100000 16298 16.3 0 0.0 \n", + "order_hour 100000 0 0.0 0 0.0 \n", + "is_weekend 100000 84604 84.6 0 0.0 \n", + "customer_id 100000 0 0.0 0 0.0 \n", + "unit_price_gbp 100000 0 0.0 0 0.0 \n", + "quantity_sold 100000 0 0.0 0 0.0 \n", + "sales_amount_gbp 100000 0 0.0 0 0.0 \n", + "population_total 100000 0 0.0 0 0.0 \n", + "gdp_current_usd 100000 0 0.0 0 0.0 \n", + "gdp_growth_pct 100000 0 0.0 0 0.0 \n", + "inflation_consumer_pct 100000 0 0.0 0 0.0 \n", + "\n", + " num_infs infs [%] num_valid valid [%] \n", + "year 0 0.0 100000 100.0 \n", + "month 0 0.0 100000 100.0 \n", + "week_of_year 0 0.0 100000 100.0 \n", + "day_of_week 0 0.0 83702 83.7 \n", + "order_hour 0 0.0 100000 100.0 \n", + "is_weekend 0 0.0 15396 15.4 \n", + "customer_id 0 0.0 100000 100.0 \n", + "unit_price_gbp 0 0.0 100000 100.0 \n", + "quantity_sold 0 0.0 100000 100.0 \n", + "sales_amount_gbp 0 0.0 100000 100.0 \n", + "population_total 0 0.0 100000 100.0 \n", + "gdp_current_usd 0 0.0 100000 100.0 \n", + "gdp_growth_pct 0 0.0 100000 100.0 \n", + "inflation_consumer_pct 0 0.0 100000 100.0 \n", + "\n", + "=== Distribution: ecommerce_data / country ===\n", + " count pct [%]\n", 
+ "country \n", + "United Kingdom 64417 64.417\n", + "Ireland 8507 8.507\n", + "Germany 7654 7.654\n", + "France 5470 5.470\n", + "Netherlands 2729 2.729\n", + "Spain 1235 1.235\n", + "Switzerland 1170 1.170\n", + "Belgium 1037 1.037\n", + "Portugal 984 0.984\n", + "Sweden 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / country_code ===\n", + " count pct [%]\n", + "country_code \n", + "GBR 64417 64.417\n", + "IRL 8507 8.507\n", + "DEU 7654 7.654\n", + "FRA 5470 5.470\n", + "NLD 2729 2.729\n", + "ESP 1235 1.235\n", + "CHE 1170 1.170\n", + "BEL 1037 1.037\n", + "PRT 984 0.984\n", + "SWE 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / product_id ===\n", + " count pct [%]\n", + "product_id \n", + "POST 731 0.731\n", + "85123A 615 0.615\n", + "21212 438 0.438\n", + "22423 437 0.437\n", + "85099B 391 0.391\n", + "20725 334 0.334\n", + "84991 298 0.298\n", + "20914 295 0.295\n", + "21232 295 0.295\n", + "84879 285 0.285\n", + "\n", + "=== Numeric Summary: ecommerce_data ===\n", + " mean std min median max\n", + "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", + "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", + "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", + "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", + "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", + "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", + "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", + "unit_price_gbp 3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", + "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", + "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", + "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", + "gdp_current_usd 2.161193e+12 
1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", + "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", + "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", + "12:11:45 rss=0.267GB vms=1.690GB mem_pct=2% cpu=0% - \u001b[36mINFO \u001b[0m Task-193 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cache hit for apply_llm\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
order_datetime
2009-12-01 07:45:00+00:002009-12-01 07:45:00+00:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
2009-12-01 07:45:00+00:002009-12-01 07:45:00+00:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", + "
" + ], + "text/plain": [ + " order_datetime year month \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 2009-12-01 07:45:00+00:00 2009 12 \n", + "2009-12-01 07:45:00+00:00 2009-12-01 07:45:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "\n", + " week_of_year day_of_week order_hour is_weekend \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 49 1 7 0 \n", + "2009-12-01 07:45:00+00:00 49 1 7 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "\n", + " country country_code product_id \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 United Kingdom GBR 21523 \n", + "2009-12-01 07:45:00+00:00 United Kingdom GBR 79323W \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 82582 \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 22111 \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 21756 \n", + "\n", + " customer_id unit_price_gbp quantity_sold \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 13085 5.95 10 \n", + "2009-12-01 07:45:00+00:00 13085 6.75 12 \n", + "2009-12-01 09:06:00+00:00 13078 2.10 12 \n", + "2009-12-01 09:06:00+00:00 13078 4.25 24 \n", + "2009-12-01 09:06:00+00:00 13078 5.95 3 \n", + "\n", + " sales_amount_gbp population_total \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 59.50 62276270.0 \n", + "2009-12-01 07:45:00+00:00 81.00 62276270.0 \n", + "2009-12-01 09:06:00+00:00 25.20 62276270.0 \n", + "2009-12-01 09:06:00+00:00 102.00 62276270.0 \n", + "2009-12-01 09:06:00+00:00 17.85 62276270.0 \n", + "\n", + " gdp_current_usd gdp_growth_pct \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 07:45:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", 
+ "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", + "\n", + " inflation_consumer_pct \n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 1.89709 \n", + "2009-12-01 07:45:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import research.agentic_data_science.schema_agent.schema_agent as radsasag\n", + "\n", + "# Now run the pipeline\n", + "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", + "tags = [\"ecommerce_data\"]\n", + "\n", + "tag_to_df, stats = radsasag.run_pipeline(\n", + " csv_paths=csv_files,\n", + " tags=tags,\n", + " model=\"gpt-4o\",\n", + " llm_scope=\"semantic\"\n", + ")\n", + "\n", + "display(tag_to_df[\"ecommerce_data\"].head())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/schema_agent_loader.py b/research/agentic_data_science/schema_agent/schema_agent_loader.py index d8f649547..84a421b11 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_loader.py +++ b/research/agentic_data_science/schema_agent/schema_agent_loader.py @@ -45,99 +45,55 @@ def load_csv(csv_path: str) -> pd.DataFrame: return df -# keep legacy name for backwards compatibility -load_employee_data = load_csv - - def infer_and_convert_datetime_columns( df: pd.DataFrame, sample_size: int = 100, threshold: float = 0.8, ) -> 
typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: - """ - Detect and convert date/datetime columns in a DataFrame. - - Uses sampling for performance. Returns the updated DataFrame and a - metadata dict with inference details per column. - - :param df: Input DataFrame. - :type df: pd.DataFrame - :param sample_size: Number of rows to sample when testing format compliance. - :type sample_size: int - :param threshold: Minimum fraction of parsed values required to accept a column as temporal. - :type threshold: float - :return: Updated DataFrame with converted columns + metadata per column. - :rtype: typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]] - """ - hdbg.dassert_isinstance(df, pd.DataFrame) - - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] - metadata: typing.Dict[str, typing.Any] = {} df_out = df.copy() for col in df.columns: - if not ( - pd.api.types.is_object_dtype(df[col]) - or pd.api.types.is_string_dtype(df[col]) - ): + # 1. If it's already datetime, just ensure UTC awareness + if pd.api.types.is_datetime64_any_dtype(df[col]): + df_out[col] = pd.to_datetime(df[col], utc=True) + metadata[col] = { + "semantic_type": "temporal", + "granularity": "datetime", + "format": "pre-converted", + "confidence": 1.0, + } continue - series = df[col].dropna().astype(str) - if series.empty: + # 2. 
Only attempt conversion on strings/objects + if not (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])): continue - sample = series.head(sample_size) - best_format: typing.Optional[str] = None - best_score = 0.0 - - for fmt in COMMON_FORMATS: - success = sum(1 for val in sample if _try_strptime(val, fmt)) - score = success / len(sample) - if score > best_score: - best_score = score - best_format = fmt - - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None - - confidence = float(parsed.notna().mean()) - if confidence < threshold: + # Try to parse + try: + # We use errors="coerce" so non-dates become NaT + parsed = pd.to_datetime(df[col], errors="coerce", utc=True) + + valid_count = parsed.notna().sum() + if valid_count == 0: + continue + + confidence = float(valid_count / len(df[col])) + + # Only convert if it meets our confidence threshold + if confidence >= threshold: + df_out[col] = parsed + has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() + metadata[col] = { + "semantic_type": "temporal", + "granularity": "datetime" if has_time else "date", + "format": "inferred", + "confidence": confidence, + } + _LOG.info("Converted column '%s' to datetime", col) + except Exception: continue - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" - df_out[col] = parsed - - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": confidence, - } - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) - return df_out, metadata @@ -152,38 +108,47 @@ def _try_strptime(val: str, fmt: str) -> bool: return False + def prepare_dataframes( csv_paths: typing.List[str], tags: 
typing.Optional[typing.List[str]] = None, ) -> typing.Tuple[ - typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]] + typing.Dict[str, pd.DataFrame], + typing.Dict[str, typing.List[str]], + typing.Dict[str, typing.Any] # Added return type for metadata ]: """ Load and prepare all CSV files in one pass. - - Applies type coercion, datetime inference, and categorical detection. - - :param csv_paths: List of CSV file paths. - :type csv_paths: typing.List[str] - :param tags: Human-readable tags; defaults to filename stems. - :type tags: typing.Optional[typing.List[str]] - :return: A tuple containing a dict mapping tags to DataFrames, and a dict mapping tags to categorical columns. - :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]]] """ hdbg.dassert_isinstance(csv_paths, list) - hdbg.dassert_lt(0, len(csv_paths)) + if tags is None: + import os + tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] tag_to_df: typing.Dict[str, pd.DataFrame] = {} cat_cols_map: typing.Dict[str, typing.List[str]] = {} + combined_dt_meta: typing.Dict[str, typing.Any] = {} # Store metadata here for path, tag in zip(csv_paths, tags): + # 1. Load and perform initial type conversion df = load_csv(path) df = hpanconv.convert_df(df) - df, _ = infer_and_convert_datetime_columns(df) + + # 2. Perform datetime inference and CAPTURE metadata + df, dt_meta = infer_and_convert_datetime_columns(df) + combined_dt_meta.update(dt_meta) # Merge metadata + + # 3. FIX: Automatically promote the first detected temporal column to + # the Index for Quality and Duration reports. + temporal_cols = [c for c, m in dt_meta.items() if m.get("semantic_type") == "temporal"] + if temporal_cols: + df = df.set_index(temporal_cols[0], drop=False) + tag_to_df[tag] = df + # 4. 
Identify categorical/string columns cat_cols_map[tag] = df.select_dtypes( include=["object", "category", "string"] ).columns.tolist() - return tag_to_df, cat_cols_map \ No newline at end of file + return tag_to_df, cat_cols_map, combined_dt_meta \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_stats.py b/research/agentic_data_science/schema_agent/schema_agent_stats.py index 24ab40857..5c170fe17 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_stats.py +++ b/research/agentic_data_science/schema_agent/schema_agent_stats.py @@ -78,23 +78,24 @@ def compute_llm_agent_stats( # 2. Data quality dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") + # Select ONLY actual numeric columns for the quality math + numeric_df = df.select_dtypes(include=["int64", "float64"]) + if numeric_df.empty: - _LOG.warning( - "No numeric columns in '%s'; skipping quality report", tag - ) + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) + try: + # Pass ONLY the numeric dataframe here quality = hpanstat.report_zero_nan_inf_stats( - df_stamped, + numeric_df, zero_threshold=1e-9, verbose=True, as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught + print(f"\n=== Quality Report: {tag} ===\n", quality) + except Exception as e: _LOG.warning("Quality report failed for '%s': %s", tag, e) # 3. 
Categorical distributions diff --git a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb deleted file mode 100644 index 6104a4038..000000000 --- a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "49bffb6b-9c87-4d5a-a32a-a2382c2b700c", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import argparse\n", - "import logging\n", - "import os\n", - "import typing\n", - "\n", - "import dotenv\n", - "import pandas as pd\n", - "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", - "import schema_agent_loader as radsasal\n", - "import schema_agent_report as radsasar\n", - "import schema_agent_stats as radsasas\n", - "\n", - "import helpers.hdbg as hdbg\n", - "import helpers.hlogging as hloggin\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n", - "WARNING: Running in Jupyter\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "Skipping duration stats: 'int' object has no attribute 'tzinfo'\n", - "Quality report failed for 'ecommerce_data': 'RangeIndex' object has no attribute 'date'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct end_download_timestamp\n", - "0 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", - "1 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "99998 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", - "99999 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", - "\n", - "=== Distribution: ecommerce_data / country ===\n", - " count pct [%]\n", - "country \n", - "United Kingdom 64417 64.417\n", - "Ireland 8507 8.507\n", - "Germany 7654 7.654\n", - "France 5470 5.470\n", - "Netherlands 2729 2.729\n", - "Spain 1235 1.235\n", - "Switzerland 1170 1.170\n", - "Belgium 1037 1.037\n", - "Portugal 984 0.984\n", - "Sweden 868 0.868\n", - "\n", - "=== Distribution: ecommerce_data / country_code ===\n", - " count pct [%]\n", - "country_code \n", - "GBR 64417 64.417\n", - "IRL 8507 8.507\n", - "DEU 7654 7.654\n", - "FRA 5470 5.470\n", - "NLD 2729 2.729\n", - "ESP 1235 1.235\n", - "CHE 1170 1.170\n", - "BEL 1037 1.037\n", - "PRT 984 0.984\n", - "SWE 868 0.868\n", - "\n", - "=== Distribution: ecommerce_data / product_id ===\n", - " count pct [%]\n", - "product_id \n", - "POST 731 
0.731\n", - "85123A 615 0.615\n", - "21212 438 0.438\n", - "22423 437 0.437\n", - "85099B 391 0.391\n", - "20725 334 0.334\n", - "84991 298 0.298\n", - "20914 295 0.295\n", - "21232 295 0.295\n", - "84879 285 0.285\n", - "\n", - "=== Numeric Summary: ecommerce_data ===\n", - " mean std min median max\n", - "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", - "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", - "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", - "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", - "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", - "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", - "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", - "unit_price_gbp 3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", - "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", - "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", - "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", - "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", - "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", - "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", - "12:06:12 rss=0.234GB vms=1.655GB mem_pct=2% cpu=100% - \u001b[36mINFO \u001b[0m Task-43 schema_agent.py run_pipeline:107 LLM will profile 3 / 18 columns (scope=semantic).\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
02009-12-01 07:45:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
12009-12-01 07:45:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
22009-12-01 09:06:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
32009-12-01 09:06:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
42009-12-01 09:06:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", - "
" - ], - "text/plain": [ - " order_datetime year month week_of_year day_of_week order_hour \\\n", - "0 2009-12-01 07:45:00 2009 12 49 1 7 \n", - "1 2009-12-01 07:45:00 2009 12 49 1 7 \n", - "2 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "3 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "4 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "\n", - " is_weekend country country_code product_id customer_id \\\n", - "0 0 United Kingdom GBR 21523 13085 \n", - "1 0 United Kingdom GBR 79323W 13085 \n", - "2 0 United Kingdom GBR 82582 13078 \n", - "3 0 United Kingdom GBR 22111 13078 \n", - "4 0 United Kingdom GBR 21756 13078 \n", - "\n", - " unit_price_gbp quantity_sold sales_amount_gbp population_total \\\n", - "0 5.95 10 59.50 62276270.0 \n", - "1 6.75 12 81.00 62276270.0 \n", - "2 2.10 12 25.20 62276270.0 \n", - "3 4.25 24 102.00 62276270.0 \n", - "4 5.95 3 17.85 62276270.0 \n", - "\n", - " gdp_current_usd gdp_growth_pct inflation_consumer_pct \n", - "0 2.412840e+12 -17.633976 1.89709 \n", - "1 2.412840e+12 -17.633976 1.89709 \n", - "2 2.412840e+12 -17.633976 1.89709 \n", - "3 2.412840e+12 -17.633976 1.89709 \n", - "4 2.412840e+12 -17.633976 1.89709 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Make sure this is at the top of your notebook\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import schema_agent as radsasag\n", - "\n", - "# Now run the pipeline\n", - "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", - "tags = [\"ecommerce_data\"]\n", - "\n", - "tag_to_df, stats = radsasag.run_pipeline(\n", - " csv_paths=csv_files,\n", - " tags=tags,\n", - " model=\"gpt-4o\",\n", - " llm_scope=\"semantic\"\n", - ")\n", - "\n", - "display(tag_to_df[\"ecommerce_data\"].head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36f51050-4ead-49dc-9fec-cd430f24de6f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - 
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 49659579a9c3308637225dda7af8763799176f82 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 8 Apr 2026 12:29:40 -0400 Subject: [PATCH 11/14] update README.md --- .../schema_agent/README.md | 123 ++++++++++-------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index 020f63e53..c8c397510 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -1,46 +1,48 @@ # Data Profiler Agent -Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. +Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic classification, data quality assessment, and testable business hypotheses. 
## Features -- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats -- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions -- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses -- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs -- **Multi-Format Output:** JSON reports and Markdown summaries +- **Temporal detection** — Auto-detects and converts date/datetime columns across multiple formats +- **Statistical profiling** — Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses +- **Cost optimization** — Filter columns prior to LLM analysis to control token usage and API costs +- **Multi-format output** — JSON reports and Markdown summaries ## Setup -Go into the schema folder: +Navigate to the project directory: ```bash > cd research/agentic_data_science/schema_agent ``` -Install the requirements: +Install dependencies: ```bash > pip install -r requirements.txt ``` -Set the `OPENAI_API_KEY` in your environment: +Set your API key: ```bash > export OPENAI_API_KEY=sk-... 
``` -Make the script executable -```bash + +Make the entry point executable: +```bash > chmod +x schema_agent.py ``` + ## Module Structure -The agent is split into six focused modules: +The agent is organized into six focused modules: | Module | Responsibility | -|--------|---------------| -| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights | -| `schema_agent_loader.py` | CSV loading, type inference, datetime detection | -| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions | -| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing | -| `schema_agent_report.py` | Column profiles, JSON and Markdown export | +|--------|----------------| +| `schema_agent_models.py` | Pydantic schemas for type-safe column and dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | +| `schema_agent_stats.py` | Numeric summaries, quality reports, and categorical distributions | +| `schema_agent_llm.py` | Prompt construction, OpenAI/LangChain calls, and structured output parsing | +| `schema_agent_report.py` | Column profiles, JSON export, and Markdown export | | `schema_agent.py` | Pipeline orchestration and CLI entry point | ## Usage @@ -51,52 +53,58 @@ The agent is split into six focused modules: > ./schema_agent.py data.csv ``` -Outputs: -- `data_profile_report.json` — Machine-readable report -- `data_profile_summary.md` — Human-readable summary +Produces two output files: + +- `data_profile_report.json` — Machine-readable column profiles and statistics +- `data_profile_summary.md` — Human-readable summary table ### Advanced ```bash -# Multiple files with tags +# Profile multiple files with custom tags > ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 -# Cost-optimized: only high-null columns +# Cost-optimized: analyze only high-null columns > ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini -# 
Custom metrics and output +# Custom metrics and output path > ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json -# LangChain backend +# Use LangChain as the inference backend > ./schema_agent.py data.csv --use-langchain ``` -## Command-Line Arguments +## Command-Line Reference | Argument | Default | Description | |----------|---------|-------------| | `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Tags for each CSV (must match count) | -| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | -| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | False | Use LangChain instead of hllmcli | -| `--output-json` | `data_profile_report.json` | JSON report path | -| `--output-md` | `data_profile_summary.md` | Markdown summary path | +| `--tags` | File stems | Labels for each CSV (count must match `csv_paths`) | +| `--model` | `gpt-4o` | OpenAI model (`gpt-4o`, `gpt-4o-mini`, etc.) 
| +| `--llm-scope` | `all` | Column selection strategy: `all`, `semantic`, or `nulls` | +| `--metrics` | Subset | Numeric summary stats: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | `false` | Use LangChain instead of the default inference client | +| `--output-json` | `data_profile_report.json` | Output path for the JSON report | +| `--output-md` | `data_profile_summary.md` | Output path for the Markdown summary | ## LLM Scoping -- **`all`** — Every column (highest cost, comprehensive) -- **`semantic`** — Non-numeric columns only -- **`nulls`** — Columns with >5% null values (cost-optimized) +Control which columns are sent to the LLM to manage cost and latency: + +| Scope | Behavior | +|-------|----------| +| `all` | Profiles every column — most comprehensive, highest cost | +| `semantic` | Profiles non-numeric columns only | +| `nulls` | Profiles only columns with >5% null values — most cost-efficient | ## Python API ### Full pipeline ```python -import schema_agent as radsasag -tag_to_df, stats = radsasag.run_pipeline( +import schema_agent as agent + +tag_to_df, stats = agent.run_pipeline( csv_paths=["data.csv"], model="gpt-4o-mini", llm_scope="semantic" @@ -108,30 +116,39 @@ tag_to_df, stats = radsasag.run_pipeline( Each module can be imported independently for exploratory use or testing: ```python -import schema_agent_loader as radsasal -import schema_agent_stats as radsasas -import schema_agent_llm as radsasal -import schema_agent_report as radsasar +import schema_agent_loader as loader +import schema_agent_stats as stats +import schema_agent_llm as llm +import schema_agent_report as report ``` -## Output +## Output Reference + +### `data_profile_report.json` -### data_profile_report.json -Structured report with column profiles, technical stats, and LLM insights. +Structured report containing per-column profiles, statistical summaries, and LLM-generated insights. 
-### data_profile_summary.md -Formatted table summary: Column | Meaning | Role | Quality | Hypotheses +### `data_profile_summary.md` + +Formatted Markdown table with columns: **Column · Meaning · Role · Quality · Hypotheses** ## Troubleshooting -**API Key Error:** +**API key not set** + ```bash > export OPENAI_API_KEY=sk-... ``` -**Validation Errors:** -- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns -- Try `--model gpt-4o-mini` +**Validation or parsing errors** + +Reduce the number of columns sent to the LLM: + +```bash +> ./schema_agent.py data.csv --llm-scope nulls +> ./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini +``` + +**No datetime columns detected** -**Datetime Detection:** -Skipped automatically if no temporal columns detected. +Expected behavior — datetime detection is skipped automatically when no temporal columns are present in the dataset. \ No newline at end of file From be0094b169e4422feee16c74e15afcf4be6528db Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Thu, 9 Apr 2026 10:32:12 -0400 Subject: [PATCH 12/14] Use uv and lint *.ipynb files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 10 +- .../schema_agent/requirements.in | 18 ++ .../schema_agent/requirements.txt | 234 +++++++++++++++++- .../schema_agent/schema_agent_API.py | 163 ++++++++++++ .../schema_agent/schema_agent_example.ipynb | 13 +- .../schema_agent/schema_agent_example.py | 35 +++ .../schema_agent/schema_agent_hllmcli.py | 2 +- 7 files changed, 454 insertions(+), 21 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/requirements.in create mode 100644 research/agentic_data_science/schema_agent/schema_agent_API.py create mode 100644 research/agentic_data_science/schema_agent/schema_agent_example.py diff --git a/research/agentic_data_science/schema_agent/README.md 
b/research/agentic_data_science/schema_agent/README.md index c8c397510..dc7a08292 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -102,7 +102,7 @@ Control which columns are sent to the LLM to manage cost and latency: ### Full pipeline ```python -import schema_agent as agent +import research.agentic_data_science.schema_agent.schema_agent as agent tag_to_df, stats = agent.run_pipeline( csv_paths=["data.csv"], @@ -116,10 +116,10 @@ tag_to_df, stats = agent.run_pipeline( Each module can be imported independently for exploratory use or testing: ```python -import schema_agent_loader as loader -import schema_agent_stats as stats -import schema_agent_llm as llm -import schema_agent_report as report +import research.agentic_data_science.schema_agent.schema_agent_loader as loader +import research.agentic_data_science.schema_agent.schema_agent_stats as stats +import research.agentic_data_science.schema_agent.schema_agent_llm as llm +import research.agentic_data_science.schema_agent.schema_agent_report as report ``` ## Output Reference diff --git a/research/agentic_data_science/schema_agent/requirements.in b/research/agentic_data_science/schema_agent/requirements.in new file mode 100644 index 000000000..08dbfa79b --- /dev/null +++ b/research/agentic_data_science/schema_agent/requirements.in @@ -0,0 +1,18 @@ +pandas==3.0.2 +numpy==2.4.4 + +langchain-core==1.2.27 +langchain-openai==1.1.12 + +langgraph==1.1.6 +langgraph-checkpoint==4.0.1 +langgraph-prebuilt==1.0.9 +langgraph-sdk==0.3.12 + +llm==0.30 +tokencost==0.1.26 + +pytz==2026.1.post1 +python-dotenv==1.2.2 + +setuptools>=65.0.0 \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/requirements.txt b/research/agentic_data_science/schema_agent/requirements.txt index ed4078da6..3d14b9e25 100644 --- a/research/agentic_data_science/schema_agent/requirements.txt +++ 
b/research/agentic_data_science/schema_agent/requirements.txt @@ -1,8 +1,226 @@ -pandas -langchain_core -langchain_openai -langgraph -llm -tokencost -pytz -dotenv \ No newline at end of file +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile requirements.in +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.13.5 + # via tokencost +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anthropic==0.92.0 + # via tokencost +anyio==4.13.0 + # via + # anthropic + # httpx + # openai +attrs==26.1.0 + # via aiohttp +certifi==2026.2.25 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.4.7 + # via requests +click==8.3.2 + # via + # click-default-group + # llm + # sqlite-utils +click-default-group==1.2.4 + # via + # llm + # sqlite-utils +condense-json==0.1.3 + # via llm +distro==1.9.0 + # via + # anthropic + # openai +docstring-parser==0.17.0 + # via anthropic +frozenlist==1.8.0 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via httpcore +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # anthropic + # langgraph-sdk + # langsmith + # openai +idna==3.11 + # via + # anyio + # httpx + # requests + # yarl +jiter==0.13.0 + # via + # anthropic + # openai +jsonpatch==1.33 + # via langchain-core +jsonpointer==3.1.1 + # via jsonpatch +langchain-core==1.2.27 + # via + # -r requirements.in + # langchain-openai + # langgraph + # langgraph-checkpoint + # langgraph-prebuilt +langchain-openai==1.1.12 + # via -r requirements.in +langgraph==1.1.6 + # via -r requirements.in +langgraph-checkpoint==4.0.1 + # via + # -r requirements.in + # langgraph + # langgraph-prebuilt +langgraph-prebuilt==1.0.9 + # via + # -r requirements.in + # langgraph +langgraph-sdk==0.3.12 + # via + # -r requirements.in + # langgraph +langsmith==0.7.29 + # via langchain-core +llm==0.30 + # via -r requirements.in +multidict==6.7.1 + # via + # aiohttp + # yarl +numpy==2.4.4 + # via + # -r requirements.in + # 
pandas +openai==2.31.0 + # via + # langchain-openai + # llm +orjson==3.11.8 + # via + # langgraph-sdk + # langsmith +ormsgpack==1.12.2 + # via langgraph-checkpoint +packaging==26.0 + # via + # langchain-core + # langsmith +pandas==3.0.2 + # via -r requirements.in +pluggy==1.6.0 + # via + # llm + # sqlite-utils +propcache==0.4.1 + # via + # aiohttp + # yarl +puremagic==2.2.0 + # via llm +pydantic==2.12.5 + # via + # anthropic + # langchain-core + # langgraph + # langsmith + # llm + # openai +pydantic-core==2.41.5 + # via pydantic +python-dateutil==2.9.0.post0 + # via + # pandas + # sqlite-utils +python-dotenv==1.2.2 + # via -r requirements.in +python-ulid==3.1.0 + # via llm +pytz==2026.1.post1 + # via -r requirements.in +pyyaml==6.0.3 + # via + # langchain-core + # llm +regex==2026.4.4 + # via tiktoken +requests==2.33.1 + # via + # langsmith + # requests-toolbelt + # tiktoken +requests-toolbelt==1.0.0 + # via langsmith +six==1.17.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anthropic + # openai +sqlite-fts4==1.0.3 + # via sqlite-utils +sqlite-migrate==0.1b0 + # via llm +sqlite-utils==3.39 + # via + # llm + # sqlite-migrate +tabulate==0.10.0 + # via sqlite-utils +tenacity==9.1.4 + # via langchain-core +tiktoken==0.12.0 + # via + # langchain-openai + # tokencost +tokencost==0.1.26 + # via -r requirements.in +tqdm==4.67.3 + # via openai +typing-extensions==4.15.0 + # via + # aiosignal + # anthropic + # anyio + # langchain-core + # openai + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via pydantic +urllib3==2.6.3 + # via requests +uuid-utils==0.14.1 + # via + # langchain-core + # langsmith +xxhash==3.6.0 + # via + # langgraph + # langsmith +yarl==1.23.0 + # via aiohttp +zstandard==0.25.0 + # via langsmith + +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.py 
b/research/agentic_data_science/schema_agent/schema_agent_API.py new file mode 100644 index 000000000..939295380 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_API.py @@ -0,0 +1,163 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # API usage Notebook +# - This notebook shows the implementation of each function from the respective libraries. + +# %% +# %load_ext autoreload +# %autoreload 2 + +import dotenv +import os +import pandas as pd +import numpy as np + +# Load environment variables (ensure OPENAI_API_KEY is set in your .env) +dotenv.load_dotenv() + +# Import the schema agent modules +import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal +import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas +import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah +import research.agentic_data_science.schema_agent.schema_agent_report as radsasar + +# %% [markdown] +# ## 1. Create dummy Dataset + +# %% +# 1. 
Create a dummy dataset +np.random.seed(42) +num_rows = 100 + +dummy_data = pd.DataFrame({ + "employee_id": range(1000, 1000 + num_rows), + "department": np.random.choice(["Engineering", "Sales", "HR", "Marketing"], num_rows), + "salary": np.random.normal(85000, 20000, num_rows), + "satisfaction_score": np.random.uniform(1.0, 5.0, num_rows), + "hire_date": pd.date_range(start="2018-01-01", periods=num_rows, freq="W").astype(str), + "notes": ["Good performance"] * 50 + [None] * 50 # 50% nulls +}) + +# Inject some missing values into salary +dummy_data.loc[10:20, "salary"] = np.nan + +# Save to CSV +csv_path = "dummy_employees.csv" +dummy_data.to_csv(csv_path, index=False) +print(f"Created dummy dataset at: {csv_path}") +dummy_data.head() + +csv_paths = [csv_path] +tags = ["dummy_employees"] + +# %% [markdown] +# ## 2. Load and Infer datatypes from the columns + +# %% +# 1. Load and prepare DataFrames - now receiving 3 variables +tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags) + +print("--- Loaded DataFrames ---") +# The index will now show as a DatetimeIndex instead of a RangeIndex +print(tag_to_df["dummy_employees"].info()) + +# 2. Combine DataFrames while preserving the index +# We do NOT use ignore_index=True here because we want to keep the DatetimeIndex +# we just created in the loader. +updated_df = pd.concat(list(tag_to_df.values()), axis=0) + +print("\n--- Datetime Inference Metadata ---") +# This will now correctly show your temporal column info +print(datetime_meta) + +# %% [markdown] +# ## 3. 
Statistical Profiling + +# %% +# We pass the metadata we just generated into the stats function +stats = radsasas.compute_llm_agent_stats( + tag_to_df=tag_to_df, + categorical_cols_map=cat_cols_map, + metrics=["mean", "std", "min", "max"] +) + +# Manually ensure the datetime_columns key is populated for the LLM +stats["datetime_columns"] = datetime_meta + +print("\n--- Stats Computation Complete ---") +print(f"Calculated stats for tags: {list(stats['numeric_summary'].keys())}") + +# %% [markdown] +# ## 4. Call LLM for column type inferencing + +# %% +# 1. Select columns (e.g., let's just send everything) +columns_for_llm = radsasah._select_columns_for_llm(updated_df, scope="all") +print(f"Selected columns for LLM: {columns_for_llm}\n") + +# 2. Build the exact prompt string that goes to the LLM +prompt_text = radsasah.build_llm_prompt(stats, columns_to_include=columns_for_llm) +print("--- LLM Prompt Snippet ---") +print(prompt_text[:500] + "\n...\n") + +# 3. Call the LLM to generate hypotheses (using gpt-4o as default) +# If you don't have an API key configured, you can mock this response by creating a static dict. +try: + semantic_insights = radsasah.generate_hypotheses_via_cli( + stats=stats, + model="gpt-4o", + columns_to_include=columns_for_llm + ) + print("--- LLM Insights Retrieved Successfully ---") +except Exception as e: + print(f"LLM call failed (Check API key): {e}") + semantic_insights = {"columns": {}} # Fallback empty dict + +# %% [markdown] +# ## 5. Export to JSON and Markdown + +# %% +# 1. Build structured column profiles +primary_df = list(tag_to_df.values())[0] +column_profiles = radsasar.build_column_profiles( + df=primary_df, + stats=stats, + insights=semantic_insights +) + +# 2. Export to JSON +json_out = "dummy_profile_report.json" +radsasar.merge_and_export_results( + stats=stats, + insights=semantic_insights, + column_profiles=column_profiles, + output_path=json_out +) + +# 3. 
Export to Markdown +md_out = "dummy_profile_summary.md" +radsasar.export_markdown_from_profiles( + column_profiles=column_profiles, + numeric_stats=stats.get("numeric_summary", {}), + output_path=md_out +) + +print(f"\nPipeline complete! Check your directory for:") +print(f"1. {json_out}") +print(f"2. {md_out}") + +# Clean up dummy CSV if desired +# os.remove(csv_path) diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb index 44b96491c..0355550c0 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb +++ b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", "metadata": {}, "outputs": [ @@ -19,19 +19,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" + "WARNING: Running in Jupyter\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" ] }, @@ -147,7 +146,7 @@ "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", - "12:11:45 rss=0.267GB vms=1.690GB mem_pct=2% cpu=0% - \u001b[36mINFO \u001b[0m Task-193 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" + "10:24:44 rss=0.222GB vms=1.643GB mem_pct=1% cpu=100% - \u001b[36mINFO \u001b[0m Task-20 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" ] }, { diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.py b/research/agentic_data_science/schema_agent/schema_agent_example.py new file mode 100644 index 000000000..1c9f4a455 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_example.py @@ -0,0 +1,35 @@ +# --- +# jupyter: +# 
jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Schema Parser example +# - This implementation in the notebook utilizes a suite of pre-existing functions to parse a single Excel (or CSV) file, automatically inferring data types and capturing temporal metadata for downstream analysis. + +# %% +# %load_ext autoreload +# %autoreload 2 +import research.agentic_data_science.schema_agent.schema_agent as radsasag + +# Now run the pipeline +csv_files = ["global_ecommerce_forecasting.csv"] +tags = ["ecommerce_data"] + +tag_to_df, stats = radsasag.run_pipeline( + csv_paths=csv_files, + tags=tags, + model="gpt-4o", + llm_scope="semantic" +) + +display(tag_to_df["ecommerce_data"].head()) diff --git a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py index 37b4c8f0e..740dc6ce1 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py +++ b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py @@ -12,7 +12,7 @@ import langchain_openai as lco import pandas as pd import pydantic -import schema_agent_models as radsasam +import research.agentic_data_science.schema_agent.schema_agent_models as radsasam import helpers.hdbg as hdbg import helpers.hllm_cli as hllmcli From 15d6cc5ae87f247deb0ecd143fc78c69f1d202bb Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Fri, 10 Apr 2026 11:41:09 -0400 Subject: [PATCH 13/14] Update README.md, Blog and lint notebooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 168 +++++++++--------- ...agent_API.ipynb => schema_agent.API.ipynb} | 0 ...chema_agent_API.py => schema_agent.API.py} | 0 ...ample.ipynb => 
schema_agent.example.ipynb} | 0 ...ent_example.py => schema_agent.example.py} | 0 .../schema_agent/schema_agent.py | 1 + website/docs/blog/posts/draft.Schema_agent.md | 58 ++++++ 7 files changed, 147 insertions(+), 80 deletions(-) rename research/agentic_data_science/schema_agent/{schema_agent_API.ipynb => schema_agent.API.ipynb} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_API.py => schema_agent.API.py} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_example.ipynb => schema_agent.example.ipynb} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_example.py => schema_agent.example.py} (100%) mode change 100644 => 100755 research/agentic_data_science/schema_agent/schema_agent.py create mode 100644 website/docs/blog/posts/draft.Schema_agent.md diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index dc7a08292..7c33925bd 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,104 +2,96 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic classification, data quality assessment, and testable business hypotheses. 
-## Features +## Key Features -- **Temporal detection** — Auto-detects and converts date/datetime columns across multiple formats +- **Automatic temporal detection** — Identifies and converts date/datetime columns across multiple formats - **Statistical profiling** — Computes numeric summaries, data quality metrics, and categorical distributions -- **LLM semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses -- **Cost optimization** — Filter columns prior to LLM analysis to control token usage and API costs -- **Multi-format output** — JSON reports and Markdown summaries +- **LLM-powered semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and generates testable business hypotheses +- **Smart cost control** — Selectively analyze columns to optimize API usage and reduce costs +- **Flexible output formats** — Generate machine-readable JSON reports and human-friendly Markdown summaries -## Setup +## Quick Start -Navigate to the project directory: -```bash -> cd research/agentic_data_science/schema_agent -``` - -Install dependencies: -```bash -> pip install -r requirements.txt -``` +### Installation -Set your API key: -```bash -> export OPENAI_API_KEY=sk-... -``` +Navigate to the project directory and install dependencies: -Make the entry point executable: ```bash -> chmod +x schema_agent.py +cd research/agentic_data_science/schema_agent +pip install -r requirements.txt +export OPENAI_API_KEY=sk-... 
+chmod +x schema_agent.py ``` -## Module Structure - -The agent is organized into six focused modules: - -| Module | Responsibility | -|--------|----------------| -| `schema_agent_models.py` | Pydantic schemas for type-safe column and dataset insights | -| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | -| `schema_agent_stats.py` | Numeric summaries, quality reports, and categorical distributions | -| `schema_agent_llm.py` | Prompt construction, OpenAI/LangChain calls, and structured output parsing | -| `schema_agent_report.py` | Column profiles, JSON export, and Markdown export | -| `schema_agent.py` | Pipeline orchestration and CLI entry point | - -## Usage +### Basic Usage -### Basic +Profile a single CSV file: ```bash -> ./schema_agent.py data.csv +./schema_agent.py data.csv ``` -Produces two output files: - -- `data_profile_report.json` — Machine-readable column profiles and statistics -- `data_profile_summary.md` — Human-readable summary table +This generates two output files: +- **`data_profile_report.json`** — Complete statistical and semantic analysis +- **`data_profile_summary.md`** — Readable summary table with insights -### Advanced +### Advanced Usage ```bash -# Profile multiple files with custom tags -> ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +# Profile multiple files with custom labels +./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inventory_q1 -# Cost-optimized: analyze only high-null columns -> ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini +# Cost-optimized analysis (only high-null columns) +./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini -# Custom metrics and output path -> ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json +# Custom metrics and output paths +./schema_agent.py data.csv --metrics mean std max --output-json my_report.json # Use LangChain as the inference backend -> ./schema_agent.py data.csv 
--use-langchain +./schema_agent.py data.csv --use-langchain ``` -## Command-Line Reference +## Architecture + +The agent consists of six focused modules working together: + +| Module | Purpose | +|--------|---------| +| `schema_agent_models.py` | Type-safe Pydantic schemas for column profiles and dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | +| `schema_agent_stats.py` | Numeric summaries, data quality metrics, and categorical distributions | +| `schema_agent_llm.py` | LLM integration for semantic analysis and hypothesis generation | +| `schema_agent_report.py` | Report generation in JSON and Markdown formats | +| `schema_agent.py` | Pipeline orchestration and command-line interface | + +For detailed examples of individual module usage, see `schema_agent.example`. For end-to-end pipeline examples, see `schema_agent.API`. + +## Command-Line Options | Argument | Default | Description | |----------|---------|-------------| -| `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Labels for each CSV (count must match `csv_paths`) | -| `--model` | `gpt-4o` | OpenAI model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Column selection strategy: `all`, `semantic`, or `nulls` | -| `--metrics` | Subset | Numeric summary stats: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | `false` | Use LangChain instead of the default inference client | -| `--output-json` | `data_profile_report.json` | Output path for the JSON report | -| `--output-md` | `data_profile_summary.md` | Output path for the Markdown summary | +| `csv_paths` | Required | One or more CSV file paths to analyze | +| `--tags` | File stems | Custom labels for each CSV (must match number of files) | +| `--model` | `gpt-4o` | OpenAI model to use (`gpt-4o`, `gpt-4o-mini`, etc.) 
| +| `--llm-scope` | `all` | Strategy for column selection: `all`, `semantic`, or `nulls` | +| `--metrics` | Subset | Statistics to compute: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | `false` | Use LangChain instead of default inference client | +| `--output-json` | `data_profile_report.json` | Path for JSON report output | +| `--output-md` | `data_profile_summary.md` | Path for Markdown summary output | -## LLM Scoping +## Cost Optimization with LLM Scoping -Control which columns are sent to the LLM to manage cost and latency: +The `--llm-scope` parameter controls which columns are sent to the LLM, helping you balance analysis depth with costs: -| Scope | Behavior | -|-------|----------| -| `all` | Profiles every column — most comprehensive, highest cost | -| `semantic` | Profiles non-numeric columns only | -| `nulls` | Profiles only columns with >5% null values — most cost-efficient | +| Scope | What Gets Analyzed | Cost Level | Best For | +|-------|-------------------|-----------|----------| +| `all` | Every column | High | Complete dataset understanding | +| `semantic` | Non-numeric columns only | Medium | Text and categorical analysis | +| `nulls` | Columns with >5% null values | Low | Data quality issues only | ## Python API -### Full pipeline +### Run the full pipeline programmatically ```python import research.agentic_data_science.schema_agent.schema_agent as agent @@ -111,9 +103,9 @@ tag_to_df, stats = agent.run_pipeline( ) ``` -### Individual modules +### Use individual modules independently -Each module can be imported independently for exploratory use or testing: +Each module can be imported and used separately for custom workflows: ```python import research.agentic_data_science.schema_agent.schema_agent_loader as loader @@ -122,33 +114,49 @@ import research.agentic_data_science.schema_agent.schema_agent_llm as llm import research.agentic_data_science.schema_agent.schema_agent_report as report ``` -## Output Reference +## 
Output Details ### `data_profile_report.json` -Structured report containing per-column profiles, statistical summaries, and LLM-generated insights. +A structured JSON report containing: +- Per-column statistical profiles +- Data quality metrics +- LLM-generated semantic insights +- Column role classifications ### `data_profile_summary.md` -Formatted Markdown table with columns: **Column · Meaning · Role · Quality · Hypotheses** +A formatted Markdown table with columns: +- **Column** — Column name +- **Meaning** — Inferred semantic description +- **Role** — Classified role (ID, Feature, Target, Timestamp) +- **Quality** — Data quality assessment +- **Hypotheses** — Generated business insights ## Troubleshooting -**API key not set** +### API key not configured +Set your OpenAI API key: ```bash -> export OPENAI_API_KEY=sk-... +export OPENAI_API_KEY=sk-... ``` -**Validation or parsing errors** - -Reduce the number of columns sent to the LLM: +### Validation or parsing errors on large datasets +Reduce the number of columns analyzed by the LLM: ```bash -> ./schema_agent.py data.csv --llm-scope nulls -> ./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini +./schema_agent.py data.csv --llm-scope nulls +./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini ``` -**No datetime columns detected** +### No datetime columns detected + +This is normal behavior — the agent automatically skips temporal detection when no date-like columns are present in the dataset. + +## Next Steps -Expected behavior — datetime detection is skipped automatically when no temporal columns are present in the dataset. 
\ No newline at end of file +- Check out example notebooks for detailed workflows +- Integrate into your data science pipelines +- Extend with custom metrics or export formats +- Review individual module documentation for advanced use cases \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.ipynb b/research/agentic_data_science/schema_agent/schema_agent.API.ipynb similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_API.ipynb rename to research/agentic_data_science/schema_agent/schema_agent.API.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.py b/research/agentic_data_science/schema_agent/schema_agent.API.py similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_API.py rename to research/agentic_data_science/schema_agent/schema_agent.API.py diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent.example.ipynb similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_example.ipynb rename to research/agentic_data_science/schema_agent/schema_agent.example.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.py b/research/agentic_data_science/schema_agent/schema_agent.example.py similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_example.py rename to research/agentic_data_science/schema_agent/schema_agent.example.py diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py old mode 100644 new mode 100755 index 8dec89e21..de1777da7 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ Data Profiler Agent — modular implementation. 
diff --git a/website/docs/blog/posts/draft.Schema_agent.md b/website/docs/blog/posts/draft.Schema_agent.md new file mode 100644 index 000000000..ac59d6e9f --- /dev/null +++ b/website/docs/blog/posts/draft.Schema_agent.md @@ -0,0 +1,58 @@ +--- +title: "Data Profiler Agent in 30 Minutes" +authors: + - Your Name +date: 2026-04-10 +description: +categories: + - AI Research + - Data Science +--- + +TL;DR: Learn how to automatically profile CSV datasets with statistical summaries and LLM-powered semantic analysis in 30 minutes. Generate column-level insights, detect temporal patterns, and discover data quality issues. + + + +## Tutorial in 30 Seconds + +The Data Profiler Agent is an automated system that combines classical statistical analysis with LLM-powered semantic understanding to comprehensively profile CSV datasets. + +Key capabilities: + +- **Automatic temporal detection**: Identifies and converts date/datetime columns across multiple formats +- **Statistical profiling**: Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM semantic analysis**: Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and testable hypotheses +- **Smart cost control**: Selectively analyze columns to manage API costs without sacrificing insights +- **Flexible output**: Machine-readable JSON reports and human-friendly Markdown summaries + +This tutorial's goal is to show you in 30 minutes: + +- How the modular architecture enables both quick profiling and extensibility +- How to profile datasets and interpret results in multiple formats +- How to optimize costs while maintaining analysis quality +- How to integrate profiling into existing data pipelines + +## Official References + +- [Data Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) +- [README](../../../../research/agentic_data_science/schema_agent/README.md) + +## Tutorial Content + +This tutorial includes all code, notebooks, and documentation 
in +[research/agentic_data_science/schema_agent](../../../../research/agentic_data_science/schema_agent) + +- [`README.md`](../../../../research/agentic_data_science/schema_agent/README.md): Installation, usage, and configuration guide +- Six modular Python files: + - `schema_agent_models.py`: Type-safe schemas for insights and profiles + - `schema_agent_loader.py`: CSV loading and type inference + - `schema_agent_stats.py`: Statistical computation and quality metrics + - `schema_agent_llm.py`: LLM integration and semantic analysis + - `schema_agent_report.py`: Report generation and export + - `schema_agent.py`: Pipeline orchestration and CLI +- [`schema_agent.example`](../../../../research/agentic_data_science/schema_agent/schema_agent.example.ipynb): Individual module usage examples +- [`schema_agent.API`](../../../../research/agentic_data_science/schema_agent/schema_agent.API.ipynb): End-to-end pipeline workflows and patterns +- Example notebooks demonstrating real-world use cases: + - Basic profiling and interpretation + - Cost-optimized multi-file analysis + - Extracting and validating business hypotheses \ No newline at end of file From 6102e29eba764bfd718b68e624609e012d1a2e8f Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Mon, 20 Apr 2026 08:39:19 -0400 Subject: [PATCH 14/14] Update blog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- website/docs/blog/posts/draft.Schema_agent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/blog/posts/draft.Schema_agent.md b/website/docs/blog/posts/draft.Schema_agent.md index ac59d6e9f..f74928d50 100644 --- a/website/docs/blog/posts/draft.Schema_agent.md +++ b/website/docs/blog/posts/draft.Schema_agent.md @@ -34,7 +34,7 @@ This tutorial's goal is to show you in 30 minutes: ## Official References -- [Data Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) +- [Data 
Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) - [README](../../../../research/agentic_data_science/schema_agent/README.md) ## Tutorial Content