From 69fc103b0b9e7ee9b7d591efed8b897c6209dc12 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Mon, 30 Mar 2026 12:48:50 -0400 Subject: [PATCH 01/14] Add README and utils.py --- .../schema_agent/schema_agent_utils.py | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 research/agentic_data_science/schema_agent/schema_agent_utils.py diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py new file mode 100644 index 000000000..10856c555 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -0,0 +1,115 @@ +import helpers.hpandas_conversion as hpandas_conversion +import helpers.hpandas_stats as hpanstat +import helpers.hpandas_io as hpanio +import helpers.hlogging as hloggin + +import pandas as pd +import typing + + +_LOG = hloggin.getLogger(__name__) + +def load_employee_data(csv_path: str) -> pd.DataFrame: + """ + Load employee data from CSV. Raises FileNotFoundError if the file does not exist. + """ + try: + df = hpanio.read_csv_to_df(csv_path) + except FileNotFoundError: + _LOG.error("CSV not found at '%s'.", csv_path) + raise + return df + +def compute_llm_agent_stats( + tag_to_df: typing.Dict[str, pd.DataFrame], + categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, +) -> typing.Dict[str, typing.Any]: + """ + Computes a comprehensive statistical profile of dataframes for LLM context. + Produces: temporal boundaries, data quality (zeros/nans/infs), categorical + distributions, and a numeric summary — all formatted for LLM prompt injection. + """ + dataframe_stats: typing.Dict[str, typing.Any] = {} + + # 1. 
Temporal boundaries + try: + duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) + dataframe_stats["temporal_boundaries"] = duration_stats + print("\n=== Temporal Boundaries ===") + print(duration_stats.to_string()) + except Exception as e: + _LOG.warning("Skipping duration stats: %s", e) + dataframe_stats["temporal_boundaries"] = None + + # 2. Data quality profiling (zeros / nans / infs) + dataframe_stats["quality_reports"] = {} + for tag, df in tag_to_df.items(): + # Only numeric columns — report_zero_nan_inf_stats uses np.isnan/isinf + numeric_df = df.select_dtypes(include="number") + if numeric_df.empty: + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) + continue + df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) + try: + quality = hpanstat.report_zero_nan_inf_stats( + df_stamped, + zero_threshold=1e-9, + verbose=True, + as_txt=True, # plain text — avoids IPython display entirely + ) + dataframe_stats["quality_reports"][tag] = quality + print(f"\n=== Quality Report: {tag} ===") + print(quality.to_string()) + except Exception as e: + _LOG.warning("Quality report failed for '%s': %s", tag, e) + + # 3. Categorical distributions + dataframe_stats["categorical_distributions"] = {} + if categorical_cols_map: + for tag, cols in categorical_cols_map.items(): + if tag not in tag_to_df: + continue + dataframe_stats["categorical_distributions"][tag] = {} + for col in cols: + if col in tag_to_df[tag].columns: + dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) + dataframe_stats["categorical_distributions"][tag][col] = dist + print(f"\n=== Distribution: {tag} / {col} ===") + print(dist.to_string()) + + # 4. 
Numeric summary (mean / std / min / max / median) + dataframe_stats["numeric_summary"] = {} + for tag, df in tag_to_df.items(): + numeric_df = df.select_dtypes(include="number") + if not numeric_df.empty: + summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary.rename(columns={"50%": "median"}, inplace=True) + dataframe_stats["numeric_summary"][tag] = summary + print(f"\n=== Numeric Summary: {tag} ===") + print(summary.to_string()) + + return dataframe_stats + +def main(): + df = load_employee_data("global_ecommerce_forecasting.csv") + + # Dynamically convert datetime-like columns and set best index + + df_typed = hpandas_conversion.convert_df(df) + # df = convert_flexible_datetime(df) + + print(df_typed.dtypes) + # Select categorical columns excluding datetime + categorical_cols = df_typed.select_dtypes(include=["object", "category"]).columns.tolist() + + stats = compute_llm_agent_stats( + {"ecommerce_data": df_typed}, + categorical_cols_map={"ecommerce_data": categorical_cols}, + ) + + + print(df_typed.head()) + return df_typed, stats + +if __name__ == "__main__": + main() \ No newline at end of file From 46507006cc82b8cb223d5796d4e8171cd2a70d74 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 31 Mar 2026 12:26:19 -0400 Subject: [PATCH 02/14] Add LLM logic and update requirements.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent_utils.py | 213 ++++++++++++++++-- 1 file changed, 199 insertions(+), 14 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index 10856c555..e1bfc2f4a 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,13 +1,45 @@ +import logging +import sys +import os +import json +import typing 
+import pandas as pd +from openai import OpenAI +from dotenv import load_dotenv + +# LangChain Imports +from langchain_openai import ChatOpenAI +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import JsonOutputParser +from pydantic import BaseModel, Field +# Internal helper imports import helpers.hpandas_conversion as hpandas_conversion import helpers.hpandas_stats as hpanstat import helpers.hpandas_io as hpanio import helpers.hlogging as hloggin +import helpers.hllm_cli as hllmcli -import pandas as pd -import typing - +load_dotenv() +api_key = os.environ.get("OPENAI_API_KEY") +if not api_key: + print("Error: OPENAI_API_KEY not found.") + sys.exit(1) +client = OpenAI(api_key=api_key) _LOG = hloggin.getLogger(__name__) +_LOG.setLevel(logging.DEBUG) + +console_handler = logging.StreamHandler(sys.stdout) + +hloggin.set_v2_formatter( + ch=console_handler, + root_logger=_LOG, + force_no_warning=False, + force_print_format=False, + force_verbose_format=True, + report_memory_usage=True, + report_cpu_usage=True +) def load_employee_data(csv_path: str) -> pd.DataFrame: """ @@ -55,7 +87,7 @@ def compute_llm_agent_stats( df_stamped, zero_threshold=1e-9, verbose=True, - as_txt=True, # plain text — avoids IPython display entirely + as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality print(f"\n=== Quality Report: {tag} ===") @@ -90,25 +122,178 @@ def compute_llm_agent_stats( return dataframe_stats -def main(): - df = load_employee_data("global_ecommerce_forecasting.csv") +def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: + """Serializes stats into a prompt block with instructions for hypothesis generation.""" + prompt_segments = [ + "You are a Senior Data Scientist and Domain Expert.", + "Analyze the provided dataset statistics and generate a profile for each column.", + "For each column, provide 2-3 testable hypotheses. 
For example, if the column is 'Discount', " + "a hypothesis might be: 'Higher discount rates correlate with higher sales volume but lower profit margins.'", + "\n--- DATASET STATISTICS ---" + ] + + if "numeric_summary" in stats: + for tag, summary in stats["numeric_summary"].items(): + prompt_segments.append(f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}") + + if "categorical_distributions" in stats: + for tag, cols in stats["categorical_distributions"].items(): + for col_name, dist in cols.items(): + prompt_segments.append(f"\nDistribution for [{col_name}]:\n{dist.to_string()}") + + return "\n".join(prompt_segments) - # Dynamically convert datetime-like columns and set best index +# --- Structured Output Schema --- +class ColumnInsight(BaseModel): + semantic_meaning: str = Field(description="Brief description of what the data represents") + role: str = Field(description="One of [ID, Feature, Target, Timestamp]") + data_quality_notes: str = Field(description="Any concerns based on the stats (e.g. high nulls, outliers)") + hypotheses: typing.List[str] = Field( + description="A list of testable hypotheses about this column's relationship to the business outcome or target variable." + ) - df_typed = hpandas_conversion.convert_df(df) - # df = convert_flexible_datetime(df) +class DatasetInsights(BaseModel): + columns: typing.Dict[str, ColumnInsight] + +def get_llm_semantic_insights_langchain(prompt_text: str, model: str = "gpt-4o") -> typing.Dict[str, typing.Any]: + """ + Uses LangChain to process metadata and return structured insights. + """ + _LOG.info("Querying LLM via LangChain (%s)...", model) + + # 1. Initialize the Model + llm = ChatOpenAI(model=model, temperature=0) + + # 2. Set up the Parser and Prompt + parser = JsonOutputParser(pydantic_object=DatasetInsights) + + prompt = ChatPromptTemplate.from_messages([ + ("system", "You are a Senior Data Scientist. 
Answer in JSON format.\n{format_instructions}"), + ("user", "{metadata_stats}") + ]).partial(format_instructions=parser.get_format_instructions()) + + # 3. Create the Chain + chain = prompt | llm | parser + + # 4. Invoke + try: + insights = chain.invoke({"metadata_stats": prompt_text}) + return insights + except Exception as e: + _LOG.error("LangChain invocation failed: %s", e) + return {"error": str(e)} + +def merge_and_export_results( + stats: typing.Dict[str, typing.Any], + insights: typing.Dict[str, typing.Any], + output_path: str = "data_profile_report.json" +): + """ + Merges technical pandas stats with LangChain-generated semantic insights. - print(df_typed.dtypes) - # Select categorical columns excluding datetime - categorical_cols = df_typed.select_dtypes(include=["object", "category"]).columns.tolist() + :param stats: The dictionary returned by compute_llm_agent_stats (contains DataFrames) + :param insights: The dictionary returned by the LangChain invocation + :param output_path: Path to save the final JSON report + """ + _LOG.info("Merging technical stats with LLM insights...") + + # 1. Prepare the final structure + # We convert DataFrames to dicts/JSON-serializable formats within the 'stats' object + serializable_stats = {} + for key, value in stats.items(): + if isinstance(value, pd.DataFrame): + serializable_stats[key] = value.to_dict(orient="index") + elif isinstance(value, dict): + # Handle nested dictionaries that might contain DataFrames (like quality_reports) + inner_dict = {} + for k, v in value.items(): + inner_dict[k] = v.to_dict(orient="index") if isinstance(v, pd.DataFrame) else v + serializable_stats[key] = inner_dict + else: + serializable_stats[key] = value + + # 2. Combine into one master object + final_report = { + "report_metadata": { + "version": "1.0", + "agent": "LangChain-Data-Profiler" + }, + "technical_stats": serializable_stats, + "semantic_insights": insights + } + + # 3. 
Export to JSON + try: + with open(output_path, "w") as f: + json.dump(final_report, f, indent=4, default=str) + _LOG.info("Successfully exported merged profile to: %s", output_path) + except Exception as e: + _LOG.error("Failed to export results: %s", e) + +def generate_hypotheses_via_cli( + stats: typing.Dict[str, typing.Any], + model: str = "gpt-4o" +) -> typing.Dict[str, typing.Any]: + """ + Generates semantic insights and hypotheses using the underlying + logic of llm_cli (hllmcli). + """ + _LOG.info("Generating hypotheses via hllmcli logic...") + + # 1. Prepare the Schema + # We use Pydantic's schema to force the LLM into the correct JSON structure + schema_json = DatasetInsights.model_json_schema() + # 2. Build the Prompts + user_prompt = build_llm_prompt(stats) + + system_prompt = ( + "You are a Senior Data Scientist. Analyze the following data statistics.\n" + "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" + f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" + ) + + # 3. Call the library function used by llm_cli + try: + # apply_llm returns a Tuple[str, float] (response_text, cost) + response_text, cost = hllmcli.apply_llm( + input_str=user_prompt, + system_prompt=system_prompt, + model=model, + use_llm_executable=False # Use the Python library for better error handling + ) + + _LOG.info("LLM Call successful. Cost: $%.6f", cost) + + # 4. Parse the result + cleaned_response = response_text.strip().removeprefix("```json").removesuffix("```").strip() + parsed_data = json.loads(cleaned_response) + + return parsed_data + + except Exception as e: + _LOG.error("hllmcli call failed: %s", e) + return {"error": str(e)} + +# Update main to use the new CLI-based function if desired +def main(): + # 1. Load & Process Data + df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") + df_typed = hpandas_conversion.convert_df(df) + + # 2. 
Compute Deterministic Stats + cat_cols = df_typed.select_dtypes(include=["object", "category", "string"]).columns.tolist() stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, - categorical_cols_map={"ecommerce_data": categorical_cols}, + categorical_cols_map={"ecommerce_data": cat_cols}, ) + # 3. Call LLM via our new CLI-based helper + semantic_insights = generate_hypotheses_via_cli(stats) - print(df_typed.head()) + # 4. Export + merge_and_export_results(stats, semantic_insights) + return df_typed, stats if __name__ == "__main__": From be1defd543b5e00df1af3b7873758ce1bafa6119 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 10:49:12 -0400 Subject: [PATCH 03/14] Lint and Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 141 ++------ .../schema_agent/schema_agent_utils.py | 307 ++++++++++-------- 2 files changed, 194 insertions(+), 254 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index c3d1c3a13..777d42283 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,133 +2,52 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. 
-## Features

+## Setup and Usage

-- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats
-- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions
-- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses
-- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs
-- **Multi-Format Output:** JSON reports and Markdown summaries
-
-## Setup
-
-Go into the schema folder:
-```bash
-cd research/agentic_data_science/schema_agent
-```
-
-Install the requirements:
-```bash
-pip install -r requirements.txt
-```
-
-Set the `OPENAI_API_KEY` in your environment:
+To navigate to the agent directory:
 ```bash
-export OPENAI_API_KEY=sk-...
+cd research/agentic_data_science/schema_agent
 ```
+Set up the `OPENAI_API_KEY` in a `.env` file in this directory before running

-## Module Structure
-
-The agent is split into six focused modules:
-
-| Module | Responsibility |
-|--------|---------------|
-| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights |
-| `schema_agent_loader.py` | CSV loading, type inference, datetime detection |
-| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions |
-| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing |
-| `schema_agent_report.py` | Column profiles, JSON and Markdown export |
-| `schema_agent.py` | Pipeline orchestration and CLI entry point |
-
-## Usage
-
-### Basic
-
-```bash
-python schema_agent.py data.csv
-```
+## Current Files

-Outputs:
-- `data_profile_report.json` — Machine-readable report
-- `data_profile_summary.md` — Human-readable summary
+- **`requirements.txt`** – Lists the Python dependencies required to run the agent
+- **`schema_agent_utils.py`** – Contains functions for parsing data, computing column statistics, and 
preparing summaries for LLM-based analysis +- **`global_ecommerce_forecasting.csv`** – The dataset used for testing -### Advanced -```bash -# Multiple files with tags -python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 - -# Cost-optimized: only high-null columns -python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini - -# Custom metrics and output -python schema_agent.py data.csv --metrics mean std max --output-json my_report.json - -# LangChain backend -python schema_agent.py data.csv --use-langchain -``` - -## Command-Line Arguments - -| Argument | Default | Description | -|----------|---------|-------------| -| `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Tags for each CSV (must match count) | -| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | -| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | False | Use LangChain instead of hllmcli | -| `--output-json` | `data_profile_report.json` | JSON report path | -| `--output-md` | `data_profile_summary.md` | Markdown summary path | - -## LLM Scoping - -- **`all`** — Every column (highest cost, comprehensive) -- **`semantic`** — Non-numeric columns only -- **`nulls`** — Columns with >5% null values (cost-optimized) +## Setup -## Python API +### 1. Load CSV -### Full pipeline +- Read into a `pandas.DataFrame` +- Ensure the DataFrame is non-empty -```python -import schema_agent as radsasag -tag_to_df, stats = radsasag.run_pipeline( - csv_paths=["data.csv"], - model="gpt-4o-mini", - llm_scope="semantic" -) -``` +### 2. 
Compute Column Stats -### Individual modules +- Identify column types: numeric, categorical, datetime +- Compute per-column statistics: + - **Numeric**: min, max, mean, median + - **Categorical**: unique count, top values + - **Datetime**: ranges, durations +- Capture null percentages and sample values -Each module can be imported independently for exploratory use or testing: +### 3. Build LLM Prompt -```python -import schema_agent_loader as radsasal -import schema_agent_stats as radsasas -import schema_agent_llm as radsasal -import schema_agent_report as radsasar -``` +- Serialize per-column stats with optional user context +- Designed for efficient LLM input (summaries only, not full data) -## Output +### 4. LLM Analysis -### data_profile_report.json -Structured report with column profiles, technical stats, and LLM insights. +- Generate hypotheses about each column's meaning +- Suggest semantic roles (identifier, timestamp, category, etc.) +- Highlight data quality concerns -### data_profile_summary.md -Formatted table summary: Column | Meaning | Role | Quality | Hypotheses +### 5. Merge Results -## Troubleshooting - -**API Key Error:** -```bash -export OPENAI_API_KEY=sk-... -``` +- Combine pandas statistics and LLM output by column name -**Validation Errors:** -- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns -- Try `--model gpt-4o-mini` +### 6. Export -**Datetime Detection:** -Skipped automatically if no temporal columns detected. 
\ No newline at end of file +- JSON output for downstream automation or agents diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index e1bfc2f4a..262f42160 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,101 +1,140 @@ +""" +Import as: + +import research.agentic_data_science.schema_agent.schema_agent_utils as radsasau +""" + +import json import logging +import os import sys -import os -import json import typing + +import dotenv +import langchain_core.output_parsers as lcop +import langchain_core.prompts as lcpr +import langchain_openai as lco +import openai import pandas as pd -from openai import OpenAI -from dotenv import load_dotenv - -# LangChain Imports -from langchain_openai import ChatOpenAI -from langchain_core.prompts import ChatPromptTemplate -from langchain_core.output_parsers import JsonOutputParser -from pydantic import BaseModel, Field -# Internal helper imports -import helpers.hpandas_conversion as hpandas_conversion -import helpers.hpandas_stats as hpanstat -import helpers.hpandas_io as hpanio -import helpers.hlogging as hloggin +import pydantic + import helpers.hllm_cli as hllmcli +import helpers.hlogging as hloggin +import helpers.hpandas_conversion as hpanconv +import helpers.hpandas_io as hpanio +import helpers.hpandas_stats as hpanstat -load_dotenv() +# --- Configuration & Logging --- +dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") if not api_key: print("Error: OPENAI_API_KEY not found.") sys.exit(1) -client = OpenAI(api_key=api_key) +client = openai.OpenAI(api_key=api_key) _LOG = hloggin.getLogger(__name__) -_LOG.setLevel(logging.DEBUG) +_LOG.setLevel(logging.DEBUG) console_handler = logging.StreamHandler(sys.stdout) - -hloggin.set_v2_formatter( +hloggin.set_v2_formatter( ch=console_handler, root_logger=_LOG, 
force_no_warning=False, - force_print_format=False, - force_verbose_format=True, + force_print_format=False, + force_verbose_format=True, report_memory_usage=True, - report_cpu_usage=True + report_cpu_usage=True, ) + +# ############################################################################# +# ColumnInsight +# ############################################################################# + + +# --- Schemas --- +class ColumnInsight(pydantic.BaseModel): + semantic_meaning: str = pydantic.Field( + description="Brief description of what the data represents" + ) + role: str = pydantic.Field( + description="One of [ID, Feature, Target, Timestamp]" + ) + data_quality_notes: str = pydantic.Field( + description="Any concerns based on the stats (e.g. high nulls, outliers)" + ) + hypotheses: typing.List[str] = pydantic.Field( + description="List of testable hypotheses regarding the column's relationship " + "to business outcomes." + ) + + +# ############################################################################# +# DatasetInsights +# ############################################################################# + + +class DatasetInsights(pydantic.BaseModel): + columns: typing.Dict[str, ColumnInsight] + + +# --- Core Logic --- def load_employee_data(csv_path: str) -> pd.DataFrame: """ - Load employee data from CSV. Raises FileNotFoundError if the file does not exist. + Load employee data from CSV with error handling for missing files. 
""" try: - df = hpanio.read_csv_to_df(csv_path) + return hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise - return df + def compute_llm_agent_stats( tag_to_df: typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, + categorical_cols_map: typing.Optional[ + typing.Dict[str, typing.List[str]] + ] = None, ) -> typing.Dict[str, typing.Any]: """ - Computes a comprehensive statistical profile of dataframes for LLM context. - Produces: temporal boundaries, data quality (zeros/nans/infs), categorical - distributions, and a numeric summary — all formatted for LLM prompt injection. + Compute a statistical profile including temporal boundaries, data quality, + categorical distributions, and numeric summaries for LLM injection. """ dataframe_stats: typing.Dict[str, typing.Any] = {} - # 1. Temporal boundaries + # 1. Temporal Analysis try: duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) dataframe_stats["temporal_boundaries"] = duration_stats - print("\n=== Temporal Boundaries ===") - print(duration_stats.to_string()) - except Exception as e: + print("\n=== Temporal Boundaries ===\n", duration_stats.to_string()) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Skipping duration stats: %s", e) dataframe_stats["temporal_boundaries"] = None - # 2. Data quality profiling (zeros / nans / infs) + # 2. 
Data Quality Profiling dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): - # Only numeric columns — report_zero_nan_inf_stats uses np.isnan/isinf numeric_df = df.select_dtypes(include="number") if numeric_df.empty: - _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) + _LOG.warning( + "No numeric columns in '%s'; skipping quality report", tag + ) continue + df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) try: quality = hpanstat.report_zero_nan_inf_stats( df_stamped, zero_threshold=1e-9, verbose=True, - as_txt=True, + as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===") - print(quality.to_string()) - except Exception as e: + print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Quality report failed for '%s': %s", tag, e) - # 3. Categorical distributions + # 3. Categorical Distributions dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): @@ -106,195 +145,177 @@ def compute_llm_agent_stats( if col in tag_to_df[tag].columns: dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) dataframe_stats["categorical_distributions"][tag][col] = dist - print(f"\n=== Distribution: {tag} / {col} ===") - print(dist.to_string()) + print( + f"\n=== Distribution: {tag} / {col} ===\n", + dist.to_string(), + ) - # 4. Numeric summary (mean / std / min / max / median) + # 4. 
Numeric Summary dataframe_stats["numeric_summary"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary = numeric_df.describe().T[ + ["mean", "std", "min", "50%", "max"] + ] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===") - print(summary.to_string()) + print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) return dataframe_stats + def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: - """Serializes stats into a prompt block with instructions for hypothesis generation.""" + """ + Serialize statistical data into a structured prompt for hypothesis + generation. + """ prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses. For example, if the column is 'Discount', " - "a hypothesis might be: 'Higher discount rates correlate with higher sales volume but lower profit margins.'", - "\n--- DATASET STATISTICS ---" + "For each column, provide 2-3 testable hypotheses. 
" + "Example: 'Higher discount rates correlate with higher volume but lower margins.'", + "\n--- DATASET STATISTICS ---", ] - if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): - prompt_segments.append(f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}") - + prompt_segments.append( + f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" + ) if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): - prompt_segments.append(f"\nDistribution for [{col_name}]:\n{dist.to_string()}") - + prompt_segments.append( + f"\nDistribution for [{col_name}]:\n{dist.to_string()}" + ) return "\n".join(prompt_segments) -# --- Structured Output Schema --- -class ColumnInsight(BaseModel): - semantic_meaning: str = Field(description="Brief description of what the data represents") - role: str = Field(description="One of [ID, Feature, Target, Timestamp]") - data_quality_notes: str = Field(description="Any concerns based on the stats (e.g. high nulls, outliers)") - hypotheses: typing.List[str] = Field( - description="A list of testable hypotheses about this column's relationship to the business outcome or target variable." - ) - -class DatasetInsights(BaseModel): - columns: typing.Dict[str, ColumnInsight] -def get_llm_semantic_insights_langchain(prompt_text: str, model: str = "gpt-4o") -> typing.Dict[str, typing.Any]: +def get_llm_semantic_insights_langchain( + prompt_text: str, model: str = "gpt-4o" +) -> typing.Dict[str, typing.Any]: """ - Uses LangChain to process metadata and return structured insights. + Process dataset metadata via LangChain to extract structured semantic + insights. """ _LOG.info("Querying LLM via LangChain (%s)...", model) - - # 1. Initialize the Model - llm = ChatOpenAI(model=model, temperature=0) - - # 2. 
Set up the Parser and Prompt - parser = JsonOutputParser(pydantic_object=DatasetInsights) - - prompt = ChatPromptTemplate.from_messages([ - ("system", "You are a Senior Data Scientist. Answer in JSON format.\n{format_instructions}"), - ("user", "{metadata_stats}") - ]).partial(format_instructions=parser.get_format_instructions()) - - # 3. Create the Chain + llm = lco.ChatOpenAI(model=model, temperature=0) + parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) + prompt = lcpr.ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a Senior Data Scientist. Answer in JSON format.\n" + "{format_instructions}", + ), + ("user", "{metadata_stats}"), + ] + ).partial(format_instructions=parser.get_format_instructions()) chain = prompt | llm | parser - - # 4. Invoke try: - insights = chain.invoke({"metadata_stats": prompt_text}) - return insights - except Exception as e: + result = chain.invoke({"metadata_stats": prompt_text}) + return typing.cast(typing.Dict[str, typing.Any], result) + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("LangChain invocation failed: %s", e) return {"error": str(e)} + def merge_and_export_results( - stats: typing.Dict[str, typing.Any], + stats: typing.Dict[str, typing.Any], insights: typing.Dict[str, typing.Any], - output_path: str = "data_profile_report.json" -): + output_path: str = "data_profile_report.json", +) -> None: """ - Merges technical pandas stats with LangChain-generated semantic insights. - - :param stats: The dictionary returned by compute_llm_agent_stats (contains DataFrames) - :param insights: The dictionary returned by the LangChain invocation - :param output_path: Path to save the final JSON report + Merge pandas statistics with LLM insights and export to a JSON report. """ _LOG.info("Merging technical stats with LLM insights...") - # 1. 
Prepare the final structure - # We convert DataFrames to dicts/JSON-serializable formats within the 'stats' object serializable_stats = {} for key, value in stats.items(): if isinstance(value, pd.DataFrame): serializable_stats[key] = value.to_dict(orient="index") elif isinstance(value, dict): - # Handle nested dictionaries that might contain DataFrames (like quality_reports) inner_dict = {} for k, v in value.items(): - inner_dict[k] = v.to_dict(orient="index") if isinstance(v, pd.DataFrame) else v + inner_dict[k] = ( + v.to_dict(orient="index") + if isinstance(v, pd.DataFrame) + else v + ) serializable_stats[key] = inner_dict else: serializable_stats[key] = value - # 2. Combine into one master object final_report = { - "report_metadata": { - "version": "1.0", - "agent": "LangChain-Data-Profiler" - }, + "report_metadata": {"version": "1.0", "agent": "LangChain-Data-Profiler"}, "technical_stats": serializable_stats, - "semantic_insights": insights + "semantic_insights": insights, } - # 3. Export to JSON try: - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(final_report, f, indent=4, default=str) _LOG.info("Successfully exported merged profile to: %s", output_path) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("Failed to export results: %s", e) + def generate_hypotheses_via_cli( - stats: typing.Dict[str, typing.Any], - model: str = "gpt-4o" + stats: typing.Dict[str, typing.Any], model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Generates semantic insights and hypotheses using the underlying - logic of llm_cli (hllmcli). + Generate insights and hypotheses using internal hllmcli logic. """ _LOG.info("Generating hypotheses via hllmcli logic...") - # 1. Prepare the Schema - # We use Pydantic's schema to force the LLM into the correct JSON structure schema_json = DatasetInsights.model_json_schema() - - # 2. 
Build the Prompts user_prompt = build_llm_prompt(stats) - system_prompt = ( "You are a Senior Data Scientist. Analyze the following data statistics.\n" "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" + f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" ) - # 3. Call the library function used by llm_cli try: - # apply_llm returns a Tuple[str, float] (response_text, cost) response_text, cost = hllmcli.apply_llm( input_str=user_prompt, system_prompt=system_prompt, model=model, - use_llm_executable=False # Use the Python library for better error handling + use_llm_executable=False, ) - + _LOG.info("LLM Call successful. Cost: $%.6f", cost) + cleaned_response = ( + response_text.strip() + .removeprefix("```json") + .removesuffix("```") + .strip() + ) + parsed = json.loads(cleaned_response) + return typing.cast(typing.Dict[str, typing.Any], parsed) - # 4. Parse the result - cleaned_response = response_text.strip().removeprefix("```json").removesuffix("```").strip() - parsed_data = json.loads(cleaned_response) - - return parsed_data - - except Exception as e: + except Exception as e: _LOG.error("hllmcli call failed: %s", e) return {"error": str(e)} -# Update main to use the new CLI-based function if desired -def main(): - # 1. Load & Process Data - df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") - df_typed = hpandas_conversion.convert_df(df) - # 2. Compute Deterministic Stats - cat_cols = df_typed.select_dtypes(include=["object", "category", "string"]).columns.tolist() +def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Execute entry point for the data profiling pipeline. 
+ """ + df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") + df_typed = hpanconv.convert_df(df) + cat_cols = df_typed.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) - - # 3. Call LLM via our new CLI-based helper semantic_insights = generate_hypotheses_via_cli(stats) - - # 4. Export merge_and_export_results(stats, semantic_insights) - return df_typed, stats + if __name__ == "__main__": - main() \ No newline at end of file + main() From 090352830f7cf139f8665ce74e6701595d2e18e3 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 13:09:12 -0400 Subject: [PATCH 04/14] Add datetime convertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent_utils.py | 185 +++++++++++++++--- 1 file changed, 162 insertions(+), 23 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index 262f42160..a99e7d8b0 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -11,12 +11,12 @@ import typing import dotenv -import langchain_core.output_parsers as lcop -import langchain_core.prompts as lcpr -import langchain_openai as lco -import openai +import langchain_core.output_parsers as lcop +import langchain_core.prompts as lcpr +import langchain_openai as lco +import openai import pandas as pd -import pydantic +import pydantic import helpers.hllm_cli as hllmcli import helpers.hlogging as hloggin @@ -36,7 +36,7 @@ _LOG.setLevel(logging.DEBUG) console_handler = logging.StreamHandler(sys.stdout) -hloggin.set_v2_formatter( +hloggin.set_v2_formatter( ch=console_handler, root_logger=_LOG, force_no_warning=False, @@ 
-48,11 +48,10 @@ # ############################################################################# -# ColumnInsight +# ColumnInsight Schema # ############################################################################# -# --- Schemas --- class ColumnInsight(pydantic.BaseModel): semantic_meaning: str = pydantic.Field( description="Brief description of what the data represents" @@ -70,7 +69,7 @@ class ColumnInsight(pydantic.BaseModel): # ############################################################################# -# DatasetInsights +# DatasetInsights Schema # ############################################################################# @@ -79,6 +78,8 @@ class DatasetInsights(pydantic.BaseModel): # --- Core Logic --- + + def load_employee_data(csv_path: str) -> pd.DataFrame: """ Load employee data from CSV with error handling for missing files. @@ -155,9 +156,7 @@ def compute_llm_agent_stats( for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[ - ["mean", "std", "min", "50%", "max"] - ] + summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) @@ -167,8 +166,7 @@ def compute_llm_agent_stats( def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: """ - Serialize statistical data into a structured prompt for hypothesis - generation. + Serialize statistical data into a structured string prompt for LLM consumption. 
""" prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", @@ -177,17 +175,28 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] + + # Append datetime column metadata if available + if "datetime_columns" in stats and stats["datetime_columns"]: + prompt_segments.append( + f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" + ) + + # Append numeric summaries if available if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): prompt_segments.append( f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" ) + + # Append categorical distributions if available if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) + return "\n".join(prompt_segments) @@ -195,12 +204,13 @@ def get_llm_semantic_insights_langchain( prompt_text: str, model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic - insights. + Process dataset metadata via LangChain to extract structured semantic insights. + Uses LangChain's JsonOutputParser alongside the Pydantic schema. 
""" _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) + prompt = lcpr.ChatPromptTemplate.from_messages( [ ( @@ -211,6 +221,7 @@ def get_llm_semantic_insights_langchain( ("user", "{metadata_stats}"), ] ).partial(format_instructions=parser.get_format_instructions()) + chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) @@ -227,21 +238,32 @@ def merge_and_export_results( ) -> None: """ Merge pandas statistics with LLM insights and export to a JSON report. + Converts DataFrame objects into dictionaries to ensure JSON serialization. """ _LOG.info("Merging technical stats with LLM insights...") serializable_stats = {} + for key, value in stats.items(): if isinstance(value, pd.DataFrame): serializable_stats[key] = value.to_dict(orient="index") elif isinstance(value, dict): inner_dict = {} for k, v in value.items(): - inner_dict[k] = ( - v.to_dict(orient="index") - if isinstance(v, pd.DataFrame) - else v - ) + if isinstance(v, pd.DataFrame): + inner_dict[k] = v.to_dict(orient="index") + elif isinstance(v, dict): + # Handle nested dicts (e.g. categorical_distributions[tag][col]) + inner_inner = {} + for kk, vv in v.items(): + inner_inner[kk] = ( + vv.to_dict(orient="index") + if isinstance(vv, pd.DataFrame) + else vv + ) + inner_dict[k] = inner_inner + else: + inner_dict[k] = v serializable_stats[key] = inner_dict else: serializable_stats[key] = value @@ -285,6 +307,7 @@ def generate_hypotheses_via_cli( ) _LOG.info("LLM Call successful. 
Cost: $%.6f", cost) + cleaned_response = ( response_text.strip() .removeprefix("```json") @@ -294,28 +317,144 @@ def generate_hypotheses_via_cli( parsed = json.loads(cleaned_response) return typing.cast(typing.Dict[str, typing.Any], parsed) - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("hllmcli call failed: %s", e) return {"error": str(e)} +def infer_and_convert_datetime_columns( + df: pd.DataFrame, + sample_size: int = 100, + threshold: float = 0.8, +) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Detect and convert date/datetime columns in a DataFrame. + Uses sampling to improve performance when checking format compliance. + + Returns: + - Updated DataFrame with converted columns + - Metadata dict with inference details per column + """ + from datetime import datetime + + COMMON_FORMATS = [ + "%Y-%m-%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%Y/%m/%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + "%d-%m-%Y %H:%M:%S", + "%m/%d/%Y %H:%M:%S", + ] + + metadata: typing.Dict[str, typing.Any] = {} + df_out = df.copy() + + for col in df.columns: + if not pd.api.types.is_object_dtype( + df[col] + ) and not pd.api.types.is_string_dtype(df[col]): + continue + + series = df[col].dropna().astype(str) + + if series.empty: + continue + + sample = series.head(sample_size) + + best_format = None + best_score = 0.0 + + for fmt in COMMON_FORMATS: + success = 0 + for val in sample: + try: + datetime.strptime(val, fmt) + success += 1 + except Exception: + continue + + score = success / len(sample) + + if score > best_score: + best_score = score + best_format = fmt + + if best_score >= threshold: + parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") + used_format = best_format + else: + parsed = pd.to_datetime(df[col], errors="coerce") + used_format = None + + confidence = parsed.notna().mean() + + if confidence < threshold: + continue + + has_time = (parsed.dt.time != 
pd.Timestamp("00:00:00").time()).any() + col_type = "datetime" if has_time else "date" + + df_out[col] = parsed + + metadata[col] = { + "semantic_type": "temporal", + "granularity": col_type, + "format": used_format, + "confidence": float(confidence), + } + + _LOG.info( + "Column '%s' detected as %s (format=%s, confidence=%.2f)", + col, + col_type, + used_format, + confidence, + ) + + return df_out, metadata + + def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ Execute entry point for the data profiling pipeline. + Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. """ df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") df_typed = hpanconv.convert_df(df) + + # Process temporal inference + df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) + + # Identify categorical columns to calculate their distributions cat_cols = df_typed.select_dtypes( include=["object", "category", "string"] ).columns.tolist() + + # Compute base statistics stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) + + # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM + stats["datetime_columns"] = datetime_meta + + print(df_typed.dtypes) + print(datetime_meta) + + # Send stats to LLM to generate testable hypotheses semantic_insights = generate_hypotheses_via_cli(stats) + + # Save the combined numerical stats and semantic insights to disk merge_and_export_results(stats, semantic_insights) + return df_typed, stats if __name__ == "__main__": - main() + main() \ No newline at end of file From 7bd7d9611eae0993f3b4de0b80e7e8b6cf84f03c Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 1 Apr 2026 13:14:17 -0400 Subject: [PATCH 05/14] Update datetime function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- 
.../schema_agent/schema_agent_utils.py | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index a99e7d8b0..a235e5e13 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,9 +1,10 @@ """ Import as: -import research.agentic_data_science.schema_agent.schema_agent_utils as radsasau +import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau """ +import datetime import json import logging import os @@ -48,7 +49,7 @@ # ############################################################################# -# ColumnInsight Schema +# ColumnInsight # ############################################################################# @@ -69,7 +70,7 @@ class ColumnInsight(pydantic.BaseModel): # ############################################################################# -# DatasetInsights Schema +# DatasetInsights # ############################################################################# @@ -156,7 +157,9 @@ def compute_llm_agent_stats( for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if not numeric_df.empty: - summary = numeric_df.describe().T[["mean", "std", "min", "50%", "max"]] + summary = numeric_df.describe().T[ + ["mean", "std", "min", "50%", "max"] + ] summary.rename(columns={"50%": "median"}, inplace=True) dataframe_stats["numeric_summary"][tag] = summary print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) @@ -166,7 +169,8 @@ def compute_llm_agent_stats( def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: """ - Serialize statistical data into a structured string prompt for LLM consumption. + Serialize statistical data into a structured string prompt for LLM + consumption. 
""" prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", @@ -175,20 +179,17 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] - # Append datetime column metadata if available if "datetime_columns" in stats and stats["datetime_columns"]: prompt_segments.append( f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" ) - # Append numeric summaries if available if "numeric_summary" in stats: for tag, summary in stats["numeric_summary"].items(): prompt_segments.append( f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" ) - # Append categorical distributions if available if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): @@ -196,7 +197,6 @@ def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) - return "\n".join(prompt_segments) @@ -204,7 +204,9 @@ def get_llm_semantic_insights_langchain( prompt_text: str, model: str = "gpt-4o" ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic insights. + Process dataset metadata via LangChain to extract structured semantic + insights. + Uses LangChain's JsonOutputParser alongside the Pydantic schema. """ _LOG.info("Querying LLM via LangChain (%s)...", model) @@ -238,7 +240,9 @@ def merge_and_export_results( ) -> None: """ Merge pandas statistics with LLM insights and export to a JSON report. - Converts DataFrame objects into dictionaries to ensure JSON serialization. + + Converts DataFrame objects into dictionaries to ensure JSON + serialization. 
""" _LOG.info("Merging technical stats with LLM insights...") @@ -328,14 +332,13 @@ def infer_and_convert_datetime_columns( threshold: float = 0.8, ) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ - Detect and convert date/datetime columns in a DataFrame. - Uses sampling to improve performance when checking format compliance. + Detect and convert date/datetime columns in a DataFrame. Uses sampling to + improve performance when checking format compliance. Returns: - Updated DataFrame with converted columns - Metadata dict with inference details per column """ - from datetime import datetime COMMON_FORMATS = [ "%Y-%m-%d", @@ -373,7 +376,7 @@ def infer_and_convert_datetime_columns( success = 0 for val in sample: try: - datetime.strptime(val, fmt) + datetime.datetime.strptime(val, fmt) success += 1 except Exception: continue @@ -422,39 +425,32 @@ def infer_and_convert_datetime_columns( def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: """ Execute entry point for the data profiling pipeline. + Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. 
""" df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") df_typed = hpanconv.convert_df(df) - # Process temporal inference df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) - # Identify categorical columns to calculate their distributions cat_cols = df_typed.select_dtypes( include=["object", "category", "string"] ).columns.tolist() - # Compute base statistics stats = compute_llm_agent_stats( {"ecommerce_data": df_typed}, categorical_cols_map={"ecommerce_data": cat_cols}, ) - # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM stats["datetime_columns"] = datetime_meta - print(df_typed.dtypes) print(datetime_meta) - # Send stats to LLM to generate testable hypotheses semantic_insights = generate_hypotheses_via_cli(stats) - # Save the combined numerical stats and semantic insights to disk merge_and_export_results(stats, semantic_insights) - return df_typed, stats if __name__ == "__main__": - main() \ No newline at end of file + main() From a2759f1f9190a0920ba8d0661ee16a5a87c856ed Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Thu, 2 Apr 2026 12:43:51 -0400 Subject: [PATCH 06/14] Update readme.md and schema_agents script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 111 +- .../schema_agent/schema_agent_utils.py | 1024 +++++++++++++---- 2 files changed, 876 insertions(+), 259 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index 777d42283..99ff8eafc 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,52 +2,103 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. 
-## Setup and Usage +## Features -To navigate to the repository: +- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats +- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses +- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs +- **Multi-Format Output:** JSON reports and Markdown summaries + +## Setup + +Go into the schema folder: +```bash +> cd research/agentic_data_science/schema_agent +``` + +Install the requirements with the command: ```bash -cd research/agentic_data_science/schema_agent/schema_agent_utils.py +> pip install -r requirements.txt ``` -Setup the OpenAI key in your environment before running in a .env file +Set the OPENAI_API_KEY in the .env file: +```bash +> export OPENAI_API_KEY=sk-... +``` +## Usage -## Current Files +### Basic -- **`requirements.txt`** – Lists the Python dependencies required to run the agent -- **`schema_agent_utils.py`** – Contains functions for parsing data, computing column statistics, and preparing summaries for LLM-based analysis -- **`global_ecommerce_forecasting.csv`** – The dataset used for testing +```bash +python schema_agent_utils.py data.csv +``` +Outputs: +- `data_profile_report.json` — Machine-readable report +- `data_profile_summary.md` — Human-readable summary -## Setup +### Advanced -### 1. Load CSV +```bash +# Multiple files with tags +python schema_agent_utils.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 -- Read into a `pandas.DataFrame` -- Ensure the DataFrame is non-empty +# Cost-optimized: only high-null columns +python schema_agent_utils.py data.csv --llm-scope nulls --model gpt-4o-mini -### 2. 
Compute Column Stats +# Custom metrics and output +python schema_agent_utils.py data.csv --metrics mean std max --output-json my_report.json +``` + +## Command-Line Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `csv_paths` | Required | One or more CSV file paths | +| `--tags` | File stems | Tags for each CSV (must match count) | +| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | +| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | +| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | False | Use LangChain instead of hllmcli | +| `--output-json` | `data_profile_report.json` | JSON report path | +| `--output-md` | `data_profile_summary.md` | Markdown summary path | -- Identify column types: numeric, categorical, datetime -- Compute per-column statistics: - - **Numeric**: min, max, mean, median - - **Categorical**: unique count, top values - - **Datetime**: ranges, durations -- Capture null percentages and sample values +## LLM Scoping -### 3. Build LLM Prompt +- **`all`** — Every column (highest cost, comprehensive) +- **`semantic`** — Non-numeric columns only +- **`nulls`** — Columns with >5% null values (cost-optimized) -- Serialize per-column stats with optional user context -- Designed for efficient LLM input (summaries only, not full data) +## Python API -### 4. LLM Analysis +```python +import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau + +tag_to_df, stats = radssasau.run_pipeline( + csv_paths=["data.csv"], + model="gpt-4o-mini", + llm_scope="semantic" +) +``` -- Generate hypotheses about each column's meaning -- Suggest semantic roles (identifier, timestamp, category, etc.) -- Highlight data quality concerns +## Output -### 5. Merge Results +### data_profile_report.json +Structured report with column profiles, technical stats, and LLM insights. 
-- Combine pandas statistics and LLM output by column name +### data_profile_summary.md +Formatted table summary: Column | Meaning | Role | Quality | Hypotheses + +## Troubleshooting + +**API Key Error:** +```bash +export OPENAI_API_KEY=sk-... +``` -### 6. Export +**Validation Errors:** +- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns +- Try `--model gpt-4o-mini` -- JSON output for downstream automation or agents +**Datetime Detection:** +Skipped automatically if no temporal columns detected. diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py index a235e5e13..ba9f36b46 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ b/research/agentic_data_science/schema_agent/schema_agent_utils.py @@ -1,9 +1,17 @@ """ +Data Profiler Agent — single-file implementation. + Import as: + import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau -import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau +CLI usage: + python schema_agent_utils.py data.csv + python schema_agent_utils.py data.csv --model gpt-4o-mini --llm-scope nulls + python schema_agent_utils.py data.csv --metrics mean std min max --output-json out.json + python schema_agent_utils.py data.csv data2.csv --tags sales inventory """ +import argparse import datetime import json import logging @@ -25,11 +33,15 @@ import helpers.hpandas_io as hpanio import helpers.hpandas_stats as hpanstat -# --- Configuration & Logging --- + +# ============================================================================= +# Configuration & Logging +# ============================================================================= + dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") if not api_key: - print("Error: OPENAI_API_KEY not found.") + print("Error: OPENAI_API_KEY not found in environment.") sys.exit(1) client = 
openai.OpenAI(api_key=api_key) @@ -47,10 +59,16 @@ report_cpu_usage=True, ) +# Allowed metric names for numeric summaries. +VALID_METRICS: typing.List[str] = ["mean", "std", "min", "25%", "50%", "75%", "max"] + +# Default metric subset shown in reports. +DEFAULT_METRICS: typing.List[str] = ["mean", "std", "min", "50%", "max"] + -# ############################################################################# -# ColumnInsight -# ############################################################################# +# ============================================================================= +# Pydantic schemas +# ============================================================================= class ColumnInsight(pydantic.BaseModel): @@ -69,42 +87,189 @@ class ColumnInsight(pydantic.BaseModel): ) -# ############################################################################# -# DatasetInsights -# ############################################################################# - - class DatasetInsights(pydantic.BaseModel): columns: typing.Dict[str, ColumnInsight] -# --- Core Logic --- +# ============================================================================= +# Data loading +# ============================================================================= -def load_employee_data(csv_path: str) -> pd.DataFrame: +def load_csv(csv_path: str) -> pd.DataFrame: """ - Load employee data from CSV with error handling for missing files. + Load a CSV into a DataFrame with clear error handling. + + Parameters + ---------- + csv_path : str + Path to the CSV file. 
+ + Returns + ------- + pd.DataFrame """ try: - return hpanio.read_csv_to_df(csv_path) + df = hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise + if df.empty: + raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") + _LOG.info("Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns)) + return df + + +# keep legacy name for backwards compatibility +load_employee_data = load_csv + + +# ============================================================================= +# Datetime inference +# ============================================================================= + + +def infer_and_convert_datetime_columns( + df: pd.DataFrame, + sample_size: int = 100, + threshold: float = 0.8, +) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + """ + Detect and convert date/datetime columns in a DataFrame. + + Uses sampling for performance. Returns the updated DataFrame and a + metadata dict with inference details per column. + + Parameters + ---------- + df : pd.DataFrame + sample_size : int + Number of rows to sample when testing format compliance. + threshold : float + Minimum fraction of parsed values required to accept a column as temporal. + + Returns + ------- + (pd.DataFrame, dict) + Updated DataFrame with converted columns + metadata per column. 
+ """ + COMMON_FORMATS = [ + "%Y-%m-%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%Y/%m/%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M", + "%d-%m-%Y %H:%M:%S", + "%m/%d/%Y %H:%M:%S", + ] + + metadata: typing.Dict[str, typing.Any] = {} + df_out = df.copy() + + for col in df.columns: + if not ( + pd.api.types.is_object_dtype(df[col]) + or pd.api.types.is_string_dtype(df[col]) + ): + continue + + series = df[col].dropna().astype(str) + if series.empty: + continue + + sample = series.head(sample_size) + best_format: typing.Optional[str] = None + best_score = 0.0 + + for fmt in COMMON_FORMATS: + success = sum( + 1 + for val in sample + if _try_strptime(val, fmt) + ) + score = success / len(sample) + if score > best_score: + best_score = score + best_format = fmt + + if best_score >= threshold: + parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") + used_format = best_format + else: + parsed = pd.to_datetime(df[col], errors="coerce") + used_format = None + + confidence = float(parsed.notna().mean()) + if confidence < threshold: + continue + + has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() + col_type = "datetime" if has_time else "date" + df_out[col] = parsed + + metadata[col] = { + "semantic_type": "temporal", + "granularity": col_type, + "format": used_format, + "confidence": confidence, + } + _LOG.info( + "Column '%s' detected as %s (format=%s, confidence=%.2f)", + col, + col_type, + used_format, + confidence, + ) + + return df_out, metadata + + +def _try_strptime(val: str, fmt: str) -> bool: + """Return True if val parses under fmt, False otherwise.""" + try: + datetime.datetime.strptime(val, fmt) + return True + except Exception: # pylint: disable=broad-exception-caught + return False + + +# ============================================================================= +# Stats computation +# ============================================================================= def compute_llm_agent_stats( tag_to_df: 
typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[ - typing.Dict[str, typing.List[str]] - ] = None, + categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, + metrics: typing.Optional[typing.List[str]] = None, ) -> typing.Dict[str, typing.Any]: """ Compute a statistical profile including temporal boundaries, data quality, categorical distributions, and numeric summaries for LLM injection. + + Parameters + ---------- + tag_to_df : dict + Mapping of dataset tag → DataFrame. Supports multiple datasets. + categorical_cols_map : dict, optional + Mapping of tag → list of categorical column names to profile. + metrics : list of str, optional + Subset of numeric metrics to include. Must be from VALID_METRICS. + Defaults to DEFAULT_METRICS. + + Returns + ------- + dict + Keys: temporal_boundaries, quality_reports, categorical_distributions, + numeric_summary. """ + metrics = _resolve_metrics(metrics) dataframe_stats: typing.Dict[str, typing.Any] = {} - # 1. Temporal Analysis + # 1. Temporal boundaries try: duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) dataframe_stats["temporal_boundaries"] = duration_stats @@ -113,16 +278,13 @@ def compute_llm_agent_stats( _LOG.warning("Skipping duration stats: %s", e) dataframe_stats["temporal_boundaries"] = None - # 2. Data Quality Profiling + # 2. Data quality dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") if numeric_df.empty: - _LOG.warning( - "No numeric columns in '%s'; skipping quality report", tag - ) + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) try: quality = hpanstat.report_zero_nan_inf_stats( @@ -136,78 +298,258 @@ def compute_llm_agent_stats( except Exception as e: # pylint: disable=broad-exception-caught _LOG.warning("Quality report failed for '%s': %s", tag, e) - # 3. 
Categorical Distributions + # 3. Categorical distributions dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): if tag not in tag_to_df: + _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) continue dataframe_stats["categorical_distributions"][tag] = {} for col in cols: - if col in tag_to_df[tag].columns: - dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) - dataframe_stats["categorical_distributions"][tag][col] = dist - print( - f"\n=== Distribution: {tag} / {col} ===\n", - dist.to_string(), - ) + if col not in tag_to_df[tag].columns: + _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) + continue + dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) + dataframe_stats["categorical_distributions"][tag][col] = dist + print(f"\n=== Distribution: {tag} / {col} ===\n", dist.to_string()) - # 4. Numeric Summary + # 4. Numeric summary (customisable metric subset) dataframe_stats["numeric_summary"] = {} for tag, df in tag_to_df.items(): numeric_df = df.select_dtypes(include="number") - if not numeric_df.empty: - summary = numeric_df.describe().T[ - ["mean", "std", "min", "50%", "max"] - ] - summary.rename(columns={"50%": "median"}, inplace=True) - dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) + if numeric_df.empty: + continue + full_summary = numeric_df.describe().T + available = [m for m in metrics if m in full_summary.columns] + if not available: + _LOG.warning("None of the requested metrics %s are available.", metrics) + summary = full_summary[available].copy() + if "50%" in summary.columns: + summary = summary.rename(columns={"50%": "median"}) + dataframe_stats["numeric_summary"][tag] = summary + print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) return dataframe_stats -def build_llm_prompt(stats: typing.Dict[str, typing.Any]) -> str: +def _resolve_metrics(metrics: 
typing.Optional[typing.List[str]]) -> typing.List[str]: """ - Serialize statistical data into a structured string prompt for LLM - consumption. + Validate and return the metric list, falling back to DEFAULT_METRICS. + """ + if metrics is None: + return DEFAULT_METRICS + invalid = [m for m in metrics if m not in VALID_METRICS] + if invalid: + _LOG.warning( + "Unknown metrics %s will be ignored. Valid options: %s", + invalid, + VALID_METRICS, + ) + resolved = [m for m in metrics if m in VALID_METRICS] + return resolved if resolved else DEFAULT_METRICS + + +# ============================================================================= +# LLM scope filtering +# ============================================================================= + + +def _select_columns_for_llm( + df: pd.DataFrame, + scope: str, + null_threshold: float = 0.05, +) -> typing.List[str]: + """ + Return the list of column names that should be sent to the LLM. + + Parameters + ---------- + df : pd.DataFrame + scope : str + "all" — every column + "semantic" — non-numeric columns only (object / category / string) + "nulls" — columns with null fraction above null_threshold + null_threshold : float + Fraction of nulls required for "nulls" scope. Default 5 %. 
+ + Returns + ------- + list of str + """ + if scope == "all": + return list(df.columns) + + if scope == "semantic": + cols = df.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() + _LOG.info("LLM scope='semantic': %d columns selected.", len(cols)) + return cols + + if scope == "nulls": + cols = [ + col + for col in df.columns + if df[col].isnull().mean() > null_threshold + ] + _LOG.info( + "LLM scope='nulls' (threshold=%.0f%%): %d columns selected.", + null_threshold * 100, + len(cols), + ) + return cols + + _LOG.warning("Unknown LLM scope '%s'; falling back to 'all'.", scope) + return list(df.columns) + + +# ============================================================================= +# Prompt building +# ============================================================================= + + +def build_llm_prompt( + stats: typing.Dict[str, typing.Any], + columns_to_include: typing.Optional[typing.List[str]] = None, +) -> str: + """ + Serialize statistical data into a structured string prompt for LLM consumption. + + Parameters + ---------- + stats : dict + Output of compute_llm_agent_stats(). + columns_to_include : list of str, optional + Subset of column names to include in the prompt. None = all. + + Returns + ------- + str """ prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses. 
" + "For each column, provide 2-3 testable hypotheses.", "Example: 'Higher discount rates correlate with higher volume but lower margins.'", "\n--- DATASET STATISTICS ---", ] - # Append datetime column metadata if available + if "datetime_columns" in stats and stats["datetime_columns"]: prompt_segments.append( - f"\nDetected Datetime Columns:\n{json.dumps(stats['datetime_columns'], indent=2)}" + f"\nDetected Datetime Columns:\n" + f"{json.dumps(stats['datetime_columns'], indent=2)}" ) - # Append numeric summaries if available + if "numeric_summary" in stats: - for tag, summary in stats["numeric_summary"].items(): + for tag, summary_df in stats["numeric_summary"].items(): + if columns_to_include is not None: + summary_df = summary_df[ + summary_df.index.isin(columns_to_include) + ] prompt_segments.append( - f"\nDataset [{tag}] Numeric Summary:\n{summary.to_string()}" + f"\nDataset [{tag}] Numeric Summary:\n{summary_df.to_string()}" ) - # Append categorical distributions if available + if "categorical_distributions" in stats: for tag, cols in stats["categorical_distributions"].items(): for col_name, dist in cols.items(): + if columns_to_include is not None and col_name not in columns_to_include: + continue prompt_segments.append( f"\nDistribution for [{col_name}]:\n{dist.to_string()}" ) + return "\n".join(prompt_segments) +# ============================================================================= +# LLM calls +# ============================================================================= + + +def generate_hypotheses_via_cli( + stats: typing.Dict[str, typing.Any], + model: str = "gpt-4o", + columns_to_include: typing.Optional[typing.List[str]] = None, +) -> typing.Dict[str, typing.Any]: + """ + Generate insights and hypotheses using internal hllmcli logic. + + Parses and Pydantic-validates the LLM response against DatasetInsights. 
+ + Parameters + ---------- + stats : dict + model : str + columns_to_include : list of str, optional + If provided, only these columns are sent to the LLM (cost control). + + Returns + ------- + dict — DatasetInsights-shaped dict, or {"error": ...} on failure. + """ + _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) + + schema_json = DatasetInsights.model_json_schema() + user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) + system_prompt = ( + "You are a Senior Data Scientist. Analyze the following data statistics.\n" + "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" + f"Return the output strictly in JSON matching this schema:\n" + f"{json.dumps(schema_json)}" + ) + + try: + response_text, cost = hllmcli.apply_llm( + input_str=user_prompt, + system_prompt=system_prompt, + model=model, + use_llm_executable=False, + ) + _LOG.info("LLM call successful. Estimated cost: $%.6f", cost) + + cleaned = ( + response_text.strip() + .removeprefix("```json") + .removesuffix("```") + .strip() + ) + raw = json.loads(cleaned) + + # Pydantic validation — raises ValidationError on schema mismatch. + validated = DatasetInsights.model_validate(raw) + return validated.model_dump() + + except pydantic.ValidationError as e: + _LOG.error("LLM output failed Pydantic validation: %s", e) + return {"error": str(e)} + except json.JSONDecodeError as e: + _LOG.error("LLM returned invalid JSON: %s", e) + return {"error": f"JSON parse error: {e}"} + except Exception as e: # pylint: disable=broad-exception-caught + _LOG.error("hllmcli call failed: %s", e) + return {"error": str(e)} + + def get_llm_semantic_insights_langchain( - prompt_text: str, model: str = "gpt-4o" + prompt_text: str, + model: str = "gpt-4o", ) -> typing.Dict[str, typing.Any]: """ - Process dataset metadata via LangChain to extract structured semantic - insights. + Process dataset metadata via LangChain to extract structured semantic insights. 
+ + Uses JsonOutputParser alongside the Pydantic schema. Validates output. - Uses LangChain's JsonOutputParser alongside the Pydantic schema. + Parameters + ---------- + prompt_text : str + Serialized stats from build_llm_prompt(). + model : str + + Returns + ------- + dict """ _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) @@ -227,230 +569,454 @@ def get_llm_semantic_insights_langchain( chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) - return typing.cast(typing.Dict[str, typing.Any], result) + # Validate against Pydantic schema. + validated = DatasetInsights.model_validate(result) + return validated.model_dump() + except pydantic.ValidationError as e: + _LOG.error("LangChain output failed Pydantic validation: %s", e) + return {"error": str(e)} except Exception as e: # pylint: disable=broad-exception-caught _LOG.error("LangChain invocation failed: %s", e) return {"error": str(e)} +# ============================================================================= +# Column profiles +# ============================================================================= + + +def build_column_profiles( + df: pd.DataFrame, + stats: typing.Dict[str, typing.Any], + insights: typing.Dict[str, typing.Any], +) -> typing.List[typing.Dict[str, typing.Any]]: + """ + Convert stat-centric structure into per-column profiles. + + Merges numeric stats, categorical distributions, datetime metadata, + and LLM semantic insights keyed on column name. + + Parameters + ---------- + df : pd.DataFrame + stats : dict + insights : dict — output of generate_hypotheses_via_cli() + + Returns + ------- + list of dict, one entry per column. 
+ """ + profiles: typing.List[typing.Dict[str, typing.Any]] = [] + + numeric_summary = stats.get("numeric_summary", {}) + categorical_stats = stats.get("categorical_distributions", {}) + datetime_meta = stats.get("datetime_columns", {}) + + for col in df.columns: + profile: typing.Dict[str, typing.Any] = { + "column": col, + "dtype": str(df[col].dtype), + "null_pct": float(df[col].isnull().mean()), + "unique_count": int(df[col].nunique()), + "sample_values": df[col].dropna().head(3).tolist(), + } + + # Numeric stats + for _, summary_df in numeric_summary.items(): + if col in summary_df.index: + col_stats = summary_df.loc[col] + for metric in col_stats.index: + profile[metric] = col_stats[metric] + + # Categorical top values + for _, cols in categorical_stats.items(): + if col in cols: + dist = cols[col] + try: + profile["top_values"] = ( + dist.head(3).to_dict() + if hasattr(dist, "head") + else dict(list(dist.items())[:3]) + ) + except Exception: # pylint: disable=broad-exception-caught + pass + + # Datetime metadata + if col in datetime_meta: + profile["temporal"] = datetime_meta[col] + + # LLM insights + if "columns" in insights and col in insights["columns"]: + insight = insights["columns"][col] + if hasattr(insight, "dict"): + insight = insight.dict() + profile.update( + { + "semantic_meaning": insight.get("semantic_meaning"), + "role": insight.get("role"), + "data_quality_notes": insight.get("data_quality_notes"), + "hypotheses": insight.get("hypotheses", []), + } + ) + + profiles.append(profile) + + return profiles + + +# ============================================================================= +# Export helpers +# ============================================================================= + + def merge_and_export_results( stats: typing.Dict[str, typing.Any], insights: typing.Dict[str, typing.Any], + column_profiles: typing.List[typing.Dict[str, typing.Any]], output_path: str = "data_profile_report.json", ) -> None: """ - Merge pandas statistics with 
LLM insights and export to a JSON report. - - Converts DataFrame objects into dictionaries to ensure JSON - serialization. + Merge stats + insights + column_profiles and export to JSON. + + Parameters + ---------- + stats : dict + insights : dict + column_profiles : list of dict + output_path : str """ - _LOG.info("Merging technical stats with LLM insights...") - - serializable_stats = {} - - for key, value in stats.items(): - if isinstance(value, pd.DataFrame): - serializable_stats[key] = value.to_dict(orient="index") - elif isinstance(value, dict): - inner_dict = {} - for k, v in value.items(): - if isinstance(v, pd.DataFrame): - inner_dict[k] = v.to_dict(orient="index") - elif isinstance(v, dict): - # Handle nested dicts (e.g. categorical_distributions[tag][col]) - inner_inner = {} - for kk, vv in v.items(): - inner_inner[kk] = ( - vv.to_dict(orient="index") - if isinstance(vv, pd.DataFrame) - else vv - ) - inner_dict[k] = inner_inner - else: - inner_dict[k] = v - serializable_stats[key] = inner_dict - else: - serializable_stats[key] = value + _LOG.info("Merging results...") + serializable_stats = _make_serializable(stats) final_report = { - "report_metadata": {"version": "1.0", "agent": "LangChain-Data-Profiler"}, + "report_metadata": { + "version": "1.2", + "agent": "Data-Profiler-Agent", + "generated_at": datetime.datetime.utcnow().isoformat() + "Z", + }, + "column_profiles": column_profiles, "technical_stats": serializable_stats, "semantic_insights": insights, } - try: - with open(output_path, "w", encoding="utf-8") as f: - json.dump(final_report, f, indent=4, default=str) - _LOG.info("Successfully exported merged profile to: %s", output_path) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("Failed to export results: %s", e) + with open(output_path, "w", encoding="utf-8") as f: + json.dump(final_report, f, indent=4, default=str) + _LOG.info("Exported JSON report to '%s'.", output_path) -def generate_hypotheses_via_cli( - stats: 
typing.Dict[str, typing.Any], model: str = "gpt-4o" -) -> typing.Dict[str, typing.Any]: + +def _make_serializable(obj: typing.Any) -> typing.Any: """ - Generate insights and hypotheses using internal hllmcli logic. + Recursively convert DataFrames and nested dicts to JSON-safe structures. """ - _LOG.info("Generating hypotheses via hllmcli logic...") - - schema_json = DatasetInsights.model_json_schema() - user_prompt = build_llm_prompt(stats) - system_prompt = ( - "You are a Senior Data Scientist. Analyze the following data statistics.\n" - "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema: {json.dumps(schema_json)}" - ) - - try: - response_text, cost = hllmcli.apply_llm( - input_str=user_prompt, - system_prompt=system_prompt, - model=model, - use_llm_executable=False, - ) - - _LOG.info("LLM Call successful. Cost: $%.6f", cost) - - cleaned_response = ( - response_text.strip() - .removeprefix("```json") - .removesuffix("```") - .strip() - ) - parsed = json.loads(cleaned_response) - return typing.cast(typing.Dict[str, typing.Any], parsed) - - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("hllmcli call failed: %s", e) - return {"error": str(e)} - + if isinstance(obj, pd.DataFrame): + return obj.to_dict(orient="index") + if isinstance(obj, dict): + return {k: _make_serializable(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_make_serializable(v) for v in obj] + return obj + + +def export_markdown_from_profiles( + column_profiles: typing.List[typing.Dict[str, typing.Any]], + numeric_stats: typing.Optional[typing.Dict[str, pd.DataFrame]] = None, + output_path: str = "data_profile_summary.md", +) -> None: + """ + Generate a readable Markdown report from column profiles and numeric stats. 
-def infer_and_convert_datetime_columns( - df: pd.DataFrame, - sample_size: int = 100, - threshold: float = 0.8, -) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: + Parameters + ---------- + column_profiles : list of dict + numeric_stats : dict of str → DataFrame, optional + output_path : str """ - Detect and convert date/datetime columns in a DataFrame. Uses sampling to - improve performance when checking format compliance. - Returns: - - Updated DataFrame with converted columns - - Metadata dict with inference details per column + def _clean(val: typing.Any) -> str: + if val is None: + return "" + return str(val).replace("|", "\\|").replace("\n", " ") + + def _fmt(val: typing.Any) -> str: + if isinstance(val, int): + return str(val) + if isinstance(val, float): + return f"{val:,.2f}" if abs(val) >= 1 else f"{val:.4f}" + return str(val) + + lines = ["# Data Profile Summary\n"] + + # Column profiles table + lines.append("## Column Profiles\n") + lines.append("| Column | Meaning | Role | Quality | Hypotheses |") + lines.append("|--------|---------|------|---------|------------|") + + for p in column_profiles: + hypotheses = p.get("hypotheses", []) + if isinstance(hypotheses, list) and hypotheses: + hyp_str = "
".join( + f"{i+1}. {_clean(h)}" for i, h in enumerate(hypotheses[:3]) + ) + else: + hyp_str = _clean(hypotheses) or "N/A" + + row = [ + _clean(p.get("column")), + _clean(p.get("semantic_meaning")), + _clean(p.get("role")), + _clean(p.get("data_quality_notes")), + hyp_str, + ] + lines.append("| " + " | ".join(row) + " |") + + # Numeric stats table + if numeric_stats: + lines.append("\n## Numeric Column Statistics\n") + for tag, df in numeric_stats.items(): + lines.append(f"### {tag}\n") + lines.append("| Column | Metric | Value |") + lines.append("|--------|--------|-------|") + for col_name in df.index: + for metric in df.columns: + val = df.loc[col_name, metric] + lines.append(f"| {col_name} | {metric} | {_fmt(val)} |") + lines.append("") + + with open(output_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines) + "\n") + + _LOG.info("Exported Markdown report to '%s'.", output_path) + + +# ============================================================================= +# Pipeline +# ============================================================================= + + +def run_pipeline( + csv_paths: typing.List[str], + tags: typing.Optional[typing.List[str]] = None, + model: str = "gpt-4o", + metrics: typing.Optional[typing.List[str]] = None, + llm_scope: str = "all", + output_json: str = "data_profile_report.json", + output_md: str = "data_profile_summary.md", + use_langchain: bool = False, +) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: """ + Execute the full data profiling pipeline over one or more CSV files. + + Parameters + ---------- + csv_paths : list of str + One or more CSV file paths to profile. + tags : list of str, optional + Human-readable tag for each CSV. Defaults to filename stems. + model : str + LLM model name passed to OpenAI / hllmcli. + metrics : list of str, optional + Numeric metrics to include. Defaults to DEFAULT_METRICS. 
+ llm_scope : str + "all", "semantic", or "nulls" — controls which columns are LLM-profiled. + output_json : str + Path for the merged JSON report. + output_md : str + Path for the Markdown summary. + use_langchain : bool + Use LangChain chain instead of hllmcli for LLM calls. + + Returns + ------- + (dict of tag → df, stats dict) + """ + if tags is None: + tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] + if len(tags) != len(csv_paths): + raise ValueError( + f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." + ) - metadata: typing.Dict[str, typing.Any] = {} - df_out = df.copy() + # --- Load & type-coerce --- + tag_to_df: typing.Dict[str, pd.DataFrame] = {} + for path, tag in zip(csv_paths, tags): + df = load_csv(path) + df = hpanconv.convert_df(df) + df, datetime_meta_partial = infer_and_convert_datetime_columns(df) + tag_to_df[tag] = df + + # Merge datetime metadata across all DataFrames (using the last loaded tag + # as the primary df for single-dataset runs; full merge for multi). 
+ _, datetime_meta = infer_and_convert_datetime_columns( + pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + ) - for col in df.columns: - if not pd.api.types.is_object_dtype( - df[col] - ) and not pd.api.types.is_string_dtype(df[col]): - continue + # --- Categorical column map --- + cat_cols_map: typing.Dict[str, typing.List[str]] = { + tag: df.select_dtypes( + include=["object", "category", "string"] + ).columns.tolist() + for tag, df in tag_to_df.items() + } - series = df[col].dropna().astype(str) + # --- Compute stats --- + stats = compute_llm_agent_stats( + tag_to_df, + categorical_cols_map=cat_cols_map, + metrics=metrics, + ) + stats["datetime_columns"] = datetime_meta - if series.empty: - continue + # --- LLM scope --- + # Use the concatenated DataFrame to decide which columns to send. + combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + columns_for_llm = _select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( + "LLM will profile %d / %d columns (scope=%s).", + len(columns_for_llm), + len(combined_df.columns), + llm_scope, + ) - sample = series.head(sample_size) + # --- LLM call --- + if use_langchain: + prompt_text = build_llm_prompt(stats, columns_to_include=columns_for_llm) + semantic_insights = get_llm_semantic_insights_langchain( + prompt_text, model=model + ) + else: + semantic_insights = generate_hypotheses_via_cli( + stats, + model=model, + columns_to_include=columns_for_llm, + ) - best_format = None - best_score = 0.0 + # --- Build column profiles (use first / primary df for column ordering) --- + primary_df = list(tag_to_df.values())[0] + column_profiles = build_column_profiles( + df=primary_df, + stats=stats, + insights=semantic_insights, + ) - for fmt in COMMON_FORMATS: - success = 0 - for val in sample: - try: - datetime.datetime.strptime(val, fmt) - success += 1 - except Exception: - continue + # --- Export --- + merge_and_export_results( + stats=stats, + insights=semantic_insights, + 
column_profiles=column_profiles, + output_path=output_json, + ) + export_markdown_from_profiles( + column_profiles, + numeric_stats=stats.get("numeric_summary", {}), + output_path=output_md, + ) - score = success / len(sample) + return tag_to_df, stats - if score > best_score: - best_score = score - best_format = fmt - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None +# ============================================================================= +# CLI +# ============================================================================= - confidence = parsed.notna().mean() - if confidence < threshold: - continue +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="schema_agent_utils", + description="Data Profiler Agent — statistical + LLM column profiling", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" + # --- Inputs --- + parser.add_argument( + "csv_paths", + nargs="+", + metavar="CSV", + help="One or more CSV file paths to profile.", + ) + parser.add_argument( + "--tags", + nargs="+", + metavar="TAG", + help="Human-readable tag for each CSV (must match number of csv_paths).", + ) - df_out[col] = parsed + # --- LLM options --- + parser.add_argument( + "--model", + default="gpt-4o", + help="LLM model for semantic analysis.", + ) + parser.add_argument( + "--llm-scope", + choices=["all", "semantic", "nulls"], + default="all", + dest="llm_scope", + help=( + "Which columns to send to the LLM. " + "'all'=every column, 'semantic'=non-numeric only, " + "'nulls'=high-null columns only (saves cost)." 
+ ), + ) + parser.add_argument( + "--use-langchain", + action="store_true", + dest="use_langchain", + help="Use LangChain pipeline instead of hllmcli for LLM calls.", + ) - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": float(confidence), - } + # --- Stat options --- + parser.add_argument( + "--metrics", + nargs="+", + choices=VALID_METRICS, + default=None, + metavar="METRIC", + help=( + f"Numeric metrics to include in the summary. " + f"Valid: {', '.join(VALID_METRICS)}. " + f"Default: {', '.join(DEFAULT_METRICS)}." + ), + ) - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) + # --- Output options --- + parser.add_argument( + "--output-json", + default="data_profile_report.json", + dest="output_json", + metavar="PATH", + help="Output path for the merged JSON report.", + ) + parser.add_argument( + "--output-md", + default="data_profile_summary.md", + dest="output_md", + metavar="PATH", + help="Output path for the Markdown summary.", + ) - return df_out, metadata + return parser -def main() -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: +def main() -> None: """ - Execute entry point for the data profiling pipeline. - - Flow: Load Data -> Clean Types -> Infer Dates -> Compute Stats -> Request LLM Insights -> Export. + CLI entry point. Parses arguments and delegates to run_pipeline(). 
""" - df = hpanio.read_csv_to_df("global_ecommerce_forecasting.csv") - df_typed = hpanconv.convert_df(df) - # Process temporal inference - df_typed, datetime_meta = infer_and_convert_datetime_columns(df_typed) - # Identify categorical columns to calculate their distributions - cat_cols = df_typed.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - # Compute base statistics - stats = compute_llm_agent_stats( - {"ecommerce_data": df_typed}, - categorical_cols_map={"ecommerce_data": cat_cols}, + parser = _build_arg_parser() + args = parser.parse_args() + + run_pipeline( + csv_paths=args.csv_paths, + tags=args.tags, + model=args.model, + metrics=args.metrics, + llm_scope=args.llm_scope, + output_json=args.output_json, + output_md=args.output_md, + use_langchain=args.use_langchain, ) - # FIX: Inject datetime metadata into stats so it's written to JSON and fed to LLM - stats["datetime_columns"] = datetime_meta - print(df_typed.dtypes) - print(datetime_meta) - # Send stats to LLM to generate testable hypotheses - semantic_insights = generate_hypotheses_via_cli(stats) - # Save the combined numerical stats and semantic insights to disk - merge_and_export_results(stats, semantic_insights) - return df_typed, stats if __name__ == "__main__": - main() + main() \ No newline at end of file From 52aaa71457339b35674b6c4b243872187cdd90fb Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Fri, 3 Apr 2026 12:22:44 -0400 Subject: [PATCH 07/14] Update README.md, modularize and lint code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 60 +- .../schema_agent/schema_agent_utils.py | 1022 ----------------- 2 files changed, 45 insertions(+), 1037 deletions(-) delete mode 100644 research/agentic_data_science/schema_agent/schema_agent_utils.py diff --git a/research/agentic_data_science/schema_agent/README.md 
b/research/agentic_data_science/schema_agent/README.md index 99ff8eafc..c8d1b5119 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -12,25 +12,40 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datase ## Setup -Go into the schema folder: -```bash -> cd research/agentic_data_science/schema_agent +Go into the schema folder: +```bash +cd research/agentic_data_science/schema_agent ``` -Install the requirements with the command: +Install the requirements: ```bash -> pip install -r requirements.txt +pip install -r requirements.txt ``` -Set the OPENAI_API_KEY in the .env file: -```bash -> export OPENAI_API_KEY=sk-... + +Set the `OPENAI_API_KEY` in your environment: +```bash +export OPENAI_API_KEY=sk-... ``` + +## Module Structure + +The agent is split into six focused modules: + +| Module | Responsibility | +|--------|---------------| +| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, datetime detection | +| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions | +| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing | +| `schema_agent_report.py` | Column profiles, JSON and Markdown export | +| `schema_agent.py` | Pipeline orchestration and CLI entry point | + ## Usage ### Basic ```bash -python schema_agent_utils.py data.csv +python schema_agent.py data.csv ``` Outputs: @@ -41,13 +56,16 @@ Outputs: ```bash # Multiple files with tags -python schema_agent_utils.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 # Cost-optimized: only high-null columns -python schema_agent_utils.py data.csv --llm-scope nulls --model gpt-4o-mini +python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini # Custom metrics and output 
-python schema_agent_utils.py data.csv --metrics mean std max --output-json my_report.json +python schema_agent.py data.csv --metrics mean std max --output-json my_report.json + +# LangChain backend +python schema_agent.py data.csv --use-langchain ``` ## Command-Line Arguments @@ -71,16 +89,28 @@ python schema_agent_utils.py data.csv --metrics mean std max --output-json my_re ## Python API -```python -import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau +### Full pipeline -tag_to_df, stats = radssasau.run_pipeline( +```python +import schema_agent as radsasag +tag_to_df, stats = radsasag.run_pipeline( csv_paths=["data.csv"], model="gpt-4o-mini", llm_scope="semantic" ) ``` +### Individual modules + +Each module can be imported independently for exploratory use or testing: + +```python +import schema_agent_loader as radsasal +import schema_agent_stats as radsasas +import schema_agent_llm as radsasal +import schema_agent_report as radsasar +``` + ## Output ### data_profile_report.json diff --git a/research/agentic_data_science/schema_agent/schema_agent_utils.py b/research/agentic_data_science/schema_agent/schema_agent_utils.py deleted file mode 100644 index ba9f36b46..000000000 --- a/research/agentic_data_science/schema_agent/schema_agent_utils.py +++ /dev/null @@ -1,1022 +0,0 @@ -""" -Data Profiler Agent — single-file implementation. 
- -Import as: - import research.agentic_data_science.schema_agent.schema_agent_utils as radssasau - -CLI usage: - python schema_agent_utils.py data.csv - python schema_agent_utils.py data.csv --model gpt-4o-mini --llm-scope nulls - python schema_agent_utils.py data.csv --metrics mean std min max --output-json out.json - python schema_agent_utils.py data.csv data2.csv --tags sales inventory -""" - -import argparse -import datetime -import json -import logging -import os -import sys -import typing - -import dotenv -import langchain_core.output_parsers as lcop -import langchain_core.prompts as lcpr -import langchain_openai as lco -import openai -import pandas as pd -import pydantic - -import helpers.hllm_cli as hllmcli -import helpers.hlogging as hloggin -import helpers.hpandas_conversion as hpanconv -import helpers.hpandas_io as hpanio -import helpers.hpandas_stats as hpanstat - - -# ============================================================================= -# Configuration & Logging -# ============================================================================= - -dotenv.load_dotenv() -api_key = os.environ.get("OPENAI_API_KEY") -if not api_key: - print("Error: OPENAI_API_KEY not found in environment.") - sys.exit(1) - -client = openai.OpenAI(api_key=api_key) -_LOG = hloggin.getLogger(__name__) -_LOG.setLevel(logging.DEBUG) - -console_handler = logging.StreamHandler(sys.stdout) -hloggin.set_v2_formatter( - ch=console_handler, - root_logger=_LOG, - force_no_warning=False, - force_print_format=False, - force_verbose_format=True, - report_memory_usage=True, - report_cpu_usage=True, -) - -# Allowed metric names for numeric summaries. -VALID_METRICS: typing.List[str] = ["mean", "std", "min", "25%", "50%", "75%", "max"] - -# Default metric subset shown in reports. 
-DEFAULT_METRICS: typing.List[str] = ["mean", "std", "min", "50%", "max"] - - -# ============================================================================= -# Pydantic schemas -# ============================================================================= - - -class ColumnInsight(pydantic.BaseModel): - semantic_meaning: str = pydantic.Field( - description="Brief description of what the data represents" - ) - role: str = pydantic.Field( - description="One of [ID, Feature, Target, Timestamp]" - ) - data_quality_notes: str = pydantic.Field( - description="Any concerns based on the stats (e.g. high nulls, outliers)" - ) - hypotheses: typing.List[str] = pydantic.Field( - description="List of testable hypotheses regarding the column's relationship " - "to business outcomes." - ) - - -class DatasetInsights(pydantic.BaseModel): - columns: typing.Dict[str, ColumnInsight] - - -# ============================================================================= -# Data loading -# ============================================================================= - - -def load_csv(csv_path: str) -> pd.DataFrame: - """ - Load a CSV into a DataFrame with clear error handling. - - Parameters - ---------- - csv_path : str - Path to the CSV file. 
- - Returns - ------- - pd.DataFrame - """ - try: - df = hpanio.read_csv_to_df(csv_path) - except FileNotFoundError: - _LOG.error("CSV not found at '%s'.", csv_path) - raise - if df.empty: - raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") - _LOG.info("Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns)) - return df - - -# keep legacy name for backwards compatibility -load_employee_data = load_csv - - -# ============================================================================= -# Datetime inference -# ============================================================================= - - -def infer_and_convert_datetime_columns( - df: pd.DataFrame, - sample_size: int = 100, - threshold: float = 0.8, -) -> typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: - """ - Detect and convert date/datetime columns in a DataFrame. - - Uses sampling for performance. Returns the updated DataFrame and a - metadata dict with inference details per column. - - Parameters - ---------- - df : pd.DataFrame - sample_size : int - Number of rows to sample when testing format compliance. - threshold : float - Minimum fraction of parsed values required to accept a column as temporal. - - Returns - ------- - (pd.DataFrame, dict) - Updated DataFrame with converted columns + metadata per column. 
- """ - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] - - metadata: typing.Dict[str, typing.Any] = {} - df_out = df.copy() - - for col in df.columns: - if not ( - pd.api.types.is_object_dtype(df[col]) - or pd.api.types.is_string_dtype(df[col]) - ): - continue - - series = df[col].dropna().astype(str) - if series.empty: - continue - - sample = series.head(sample_size) - best_format: typing.Optional[str] = None - best_score = 0.0 - - for fmt in COMMON_FORMATS: - success = sum( - 1 - for val in sample - if _try_strptime(val, fmt) - ) - score = success / len(sample) - if score > best_score: - best_score = score - best_format = fmt - - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None - - confidence = float(parsed.notna().mean()) - if confidence < threshold: - continue - - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" - df_out[col] = parsed - - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": confidence, - } - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) - - return df_out, metadata - - -def _try_strptime(val: str, fmt: str) -> bool: - """Return True if val parses under fmt, False otherwise.""" - try: - datetime.datetime.strptime(val, fmt) - return True - except Exception: # pylint: disable=broad-exception-caught - return False - - -# ============================================================================= -# Stats computation -# ============================================================================= - - -def compute_llm_agent_stats( - 
tag_to_df: typing.Dict[str, pd.DataFrame], - categorical_cols_map: typing.Optional[typing.Dict[str, typing.List[str]]] = None, - metrics: typing.Optional[typing.List[str]] = None, -) -> typing.Dict[str, typing.Any]: - """ - Compute a statistical profile including temporal boundaries, data quality, - categorical distributions, and numeric summaries for LLM injection. - - Parameters - ---------- - tag_to_df : dict - Mapping of dataset tag → DataFrame. Supports multiple datasets. - categorical_cols_map : dict, optional - Mapping of tag → list of categorical column names to profile. - metrics : list of str, optional - Subset of numeric metrics to include. Must be from VALID_METRICS. - Defaults to DEFAULT_METRICS. - - Returns - ------- - dict - Keys: temporal_boundaries, quality_reports, categorical_distributions, - numeric_summary. - """ - metrics = _resolve_metrics(metrics) - dataframe_stats: typing.Dict[str, typing.Any] = {} - - # 1. Temporal boundaries - try: - duration_stats, _ = hpanstat.compute_duration_df(tag_to_df) - dataframe_stats["temporal_boundaries"] = duration_stats - print("\n=== Temporal Boundaries ===\n", duration_stats.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.warning("Skipping duration stats: %s", e) - dataframe_stats["temporal_boundaries"] = None - - # 2. 
Data quality - dataframe_stats["quality_reports"] = {} - for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") - if numeric_df.empty: - _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) - continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) - try: - quality = hpanstat.report_zero_nan_inf_stats( - df_stamped, - zero_threshold=1e-9, - verbose=True, - as_txt=True, - ) - dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.warning("Quality report failed for '%s': %s", tag, e) - - # 3. Categorical distributions - dataframe_stats["categorical_distributions"] = {} - if categorical_cols_map: - for tag, cols in categorical_cols_map.items(): - if tag not in tag_to_df: - _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) - continue - dataframe_stats["categorical_distributions"][tag] = {} - for col in cols: - if col not in tag_to_df[tag].columns: - _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) - continue - dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) - dataframe_stats["categorical_distributions"][tag][col] = dist - print(f"\n=== Distribution: {tag} / {col} ===\n", dist.to_string()) - - # 4. 
Numeric summary (customisable metric subset) - dataframe_stats["numeric_summary"] = {} - for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") - if numeric_df.empty: - continue - full_summary = numeric_df.describe().T - available = [m for m in metrics if m in full_summary.columns] - if not available: - _LOG.warning("None of the requested metrics %s are available.", metrics) - summary = full_summary[available].copy() - if "50%" in summary.columns: - summary = summary.rename(columns={"50%": "median"}) - dataframe_stats["numeric_summary"][tag] = summary - print(f"\n=== Numeric Summary: {tag} ===\n", summary.to_string()) - - return dataframe_stats - - -def _resolve_metrics(metrics: typing.Optional[typing.List[str]]) -> typing.List[str]: - """ - Validate and return the metric list, falling back to DEFAULT_METRICS. - """ - if metrics is None: - return DEFAULT_METRICS - invalid = [m for m in metrics if m not in VALID_METRICS] - if invalid: - _LOG.warning( - "Unknown metrics %s will be ignored. Valid options: %s", - invalid, - VALID_METRICS, - ) - resolved = [m for m in metrics if m in VALID_METRICS] - return resolved if resolved else DEFAULT_METRICS - - -# ============================================================================= -# LLM scope filtering -# ============================================================================= - - -def _select_columns_for_llm( - df: pd.DataFrame, - scope: str, - null_threshold: float = 0.05, -) -> typing.List[str]: - """ - Return the list of column names that should be sent to the LLM. - - Parameters - ---------- - df : pd.DataFrame - scope : str - "all" — every column - "semantic" — non-numeric columns only (object / category / string) - "nulls" — columns with null fraction above null_threshold - null_threshold : float - Fraction of nulls required for "nulls" scope. Default 5 %. 
- - Returns - ------- - list of str - """ - if scope == "all": - return list(df.columns) - - if scope == "semantic": - cols = df.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - _LOG.info("LLM scope='semantic': %d columns selected.", len(cols)) - return cols - - if scope == "nulls": - cols = [ - col - for col in df.columns - if df[col].isnull().mean() > null_threshold - ] - _LOG.info( - "LLM scope='nulls' (threshold=%.0f%%): %d columns selected.", - null_threshold * 100, - len(cols), - ) - return cols - - _LOG.warning("Unknown LLM scope '%s'; falling back to 'all'.", scope) - return list(df.columns) - - -# ============================================================================= -# Prompt building -# ============================================================================= - - -def build_llm_prompt( - stats: typing.Dict[str, typing.Any], - columns_to_include: typing.Optional[typing.List[str]] = None, -) -> str: - """ - Serialize statistical data into a structured string prompt for LLM consumption. - - Parameters - ---------- - stats : dict - Output of compute_llm_agent_stats(). - columns_to_include : list of str, optional - Subset of column names to include in the prompt. None = all. 
- - Returns - ------- - str - """ - prompt_segments = [ - "You are a Senior Data Scientist and Domain Expert.", - "Analyze the provided dataset statistics and generate a profile for each column.", - "For each column, provide 2-3 testable hypotheses.", - "Example: 'Higher discount rates correlate with higher volume but lower margins.'", - "\n--- DATASET STATISTICS ---", - ] - - if "datetime_columns" in stats and stats["datetime_columns"]: - prompt_segments.append( - f"\nDetected Datetime Columns:\n" - f"{json.dumps(stats['datetime_columns'], indent=2)}" - ) - - if "numeric_summary" in stats: - for tag, summary_df in stats["numeric_summary"].items(): - if columns_to_include is not None: - summary_df = summary_df[ - summary_df.index.isin(columns_to_include) - ] - prompt_segments.append( - f"\nDataset [{tag}] Numeric Summary:\n{summary_df.to_string()}" - ) - - if "categorical_distributions" in stats: - for tag, cols in stats["categorical_distributions"].items(): - for col_name, dist in cols.items(): - if columns_to_include is not None and col_name not in columns_to_include: - continue - prompt_segments.append( - f"\nDistribution for [{col_name}]:\n{dist.to_string()}" - ) - - return "\n".join(prompt_segments) - - -# ============================================================================= -# LLM calls -# ============================================================================= - - -def generate_hypotheses_via_cli( - stats: typing.Dict[str, typing.Any], - model: str = "gpt-4o", - columns_to_include: typing.Optional[typing.List[str]] = None, -) -> typing.Dict[str, typing.Any]: - """ - Generate insights and hypotheses using internal hllmcli logic. - - Parses and Pydantic-validates the LLM response against DatasetInsights. - - Parameters - ---------- - stats : dict - model : str - columns_to_include : list of str, optional - If provided, only these columns are sent to the LLM (cost control). 
- - Returns - ------- - dict — DatasetInsights-shaped dict, or {"error": ...} on failure. - """ - _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) - - schema_json = DatasetInsights.model_json_schema() - user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) - system_prompt = ( - "You are a Senior Data Scientist. Analyze the following data statistics.\n" - "Generate a set of 2-3 predictive or causal hypotheses for EVERY column.\n" - f"Return the output strictly in JSON matching this schema:\n" - f"{json.dumps(schema_json)}" - ) - - try: - response_text, cost = hllmcli.apply_llm( - input_str=user_prompt, - system_prompt=system_prompt, - model=model, - use_llm_executable=False, - ) - _LOG.info("LLM call successful. Estimated cost: $%.6f", cost) - - cleaned = ( - response_text.strip() - .removeprefix("```json") - .removesuffix("```") - .strip() - ) - raw = json.loads(cleaned) - - # Pydantic validation — raises ValidationError on schema mismatch. - validated = DatasetInsights.model_validate(raw) - return validated.model_dump() - - except pydantic.ValidationError as e: - _LOG.error("LLM output failed Pydantic validation: %s", e) - return {"error": str(e)} - except json.JSONDecodeError as e: - _LOG.error("LLM returned invalid JSON: %s", e) - return {"error": f"JSON parse error: {e}"} - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("hllmcli call failed: %s", e) - return {"error": str(e)} - - -def get_llm_semantic_insights_langchain( - prompt_text: str, - model: str = "gpt-4o", -) -> typing.Dict[str, typing.Any]: - """ - Process dataset metadata via LangChain to extract structured semantic insights. - - Uses JsonOutputParser alongside the Pydantic schema. Validates output. - - Parameters - ---------- - prompt_text : str - Serialized stats from build_llm_prompt(). 
- model : str - - Returns - ------- - dict - """ - _LOG.info("Querying LLM via LangChain (%s)...", model) - llm = lco.ChatOpenAI(model=model, temperature=0) - parser = lcop.JsonOutputParser(pydantic_object=DatasetInsights) - - prompt = lcpr.ChatPromptTemplate.from_messages( - [ - ( - "system", - "You are a Senior Data Scientist. Answer in JSON format.\n" - "{format_instructions}", - ), - ("user", "{metadata_stats}"), - ] - ).partial(format_instructions=parser.get_format_instructions()) - - chain = prompt | llm | parser - try: - result = chain.invoke({"metadata_stats": prompt_text}) - # Validate against Pydantic schema. - validated = DatasetInsights.model_validate(result) - return validated.model_dump() - except pydantic.ValidationError as e: - _LOG.error("LangChain output failed Pydantic validation: %s", e) - return {"error": str(e)} - except Exception as e: # pylint: disable=broad-exception-caught - _LOG.error("LangChain invocation failed: %s", e) - return {"error": str(e)} - - -# ============================================================================= -# Column profiles -# ============================================================================= - - -def build_column_profiles( - df: pd.DataFrame, - stats: typing.Dict[str, typing.Any], - insights: typing.Dict[str, typing.Any], -) -> typing.List[typing.Dict[str, typing.Any]]: - """ - Convert stat-centric structure into per-column profiles. - - Merges numeric stats, categorical distributions, datetime metadata, - and LLM semantic insights keyed on column name. - - Parameters - ---------- - df : pd.DataFrame - stats : dict - insights : dict — output of generate_hypotheses_via_cli() - - Returns - ------- - list of dict, one entry per column. 
- """ - profiles: typing.List[typing.Dict[str, typing.Any]] = [] - - numeric_summary = stats.get("numeric_summary", {}) - categorical_stats = stats.get("categorical_distributions", {}) - datetime_meta = stats.get("datetime_columns", {}) - - for col in df.columns: - profile: typing.Dict[str, typing.Any] = { - "column": col, - "dtype": str(df[col].dtype), - "null_pct": float(df[col].isnull().mean()), - "unique_count": int(df[col].nunique()), - "sample_values": df[col].dropna().head(3).tolist(), - } - - # Numeric stats - for _, summary_df in numeric_summary.items(): - if col in summary_df.index: - col_stats = summary_df.loc[col] - for metric in col_stats.index: - profile[metric] = col_stats[metric] - - # Categorical top values - for _, cols in categorical_stats.items(): - if col in cols: - dist = cols[col] - try: - profile["top_values"] = ( - dist.head(3).to_dict() - if hasattr(dist, "head") - else dict(list(dist.items())[:3]) - ) - except Exception: # pylint: disable=broad-exception-caught - pass - - # Datetime metadata - if col in datetime_meta: - profile["temporal"] = datetime_meta[col] - - # LLM insights - if "columns" in insights and col in insights["columns"]: - insight = insights["columns"][col] - if hasattr(insight, "dict"): - insight = insight.dict() - profile.update( - { - "semantic_meaning": insight.get("semantic_meaning"), - "role": insight.get("role"), - "data_quality_notes": insight.get("data_quality_notes"), - "hypotheses": insight.get("hypotheses", []), - } - ) - - profiles.append(profile) - - return profiles - - -# ============================================================================= -# Export helpers -# ============================================================================= - - -def merge_and_export_results( - stats: typing.Dict[str, typing.Any], - insights: typing.Dict[str, typing.Any], - column_profiles: typing.List[typing.Dict[str, typing.Any]], - output_path: str = "data_profile_report.json", -) -> None: - """ - Merge stats + 
insights + column_profiles and export to JSON. - - Parameters - ---------- - stats : dict - insights : dict - column_profiles : list of dict - output_path : str - """ - _LOG.info("Merging results...") - serializable_stats = _make_serializable(stats) - - final_report = { - "report_metadata": { - "version": "1.2", - "agent": "Data-Profiler-Agent", - "generated_at": datetime.datetime.utcnow().isoformat() + "Z", - }, - "column_profiles": column_profiles, - "technical_stats": serializable_stats, - "semantic_insights": insights, - } - - with open(output_path, "w", encoding="utf-8") as f: - json.dump(final_report, f, indent=4, default=str) - - _LOG.info("Exported JSON report to '%s'.", output_path) - - -def _make_serializable(obj: typing.Any) -> typing.Any: - """ - Recursively convert DataFrames and nested dicts to JSON-safe structures. - """ - if isinstance(obj, pd.DataFrame): - return obj.to_dict(orient="index") - if isinstance(obj, dict): - return {k: _make_serializable(v) for k, v in obj.items()} - if isinstance(obj, list): - return [_make_serializable(v) for v in obj] - return obj - - -def export_markdown_from_profiles( - column_profiles: typing.List[typing.Dict[str, typing.Any]], - numeric_stats: typing.Optional[typing.Dict[str, pd.DataFrame]] = None, - output_path: str = "data_profile_summary.md", -) -> None: - """ - Generate a readable Markdown report from column profiles and numeric stats. 
- - Parameters - ---------- - column_profiles : list of dict - numeric_stats : dict of str → DataFrame, optional - output_path : str - """ - - def _clean(val: typing.Any) -> str: - if val is None: - return "" - return str(val).replace("|", "\\|").replace("\n", " ") - - def _fmt(val: typing.Any) -> str: - if isinstance(val, int): - return str(val) - if isinstance(val, float): - return f"{val:,.2f}" if abs(val) >= 1 else f"{val:.4f}" - return str(val) - - lines = ["# Data Profile Summary\n"] - - # Column profiles table - lines.append("## Column Profiles\n") - lines.append("| Column | Meaning | Role | Quality | Hypotheses |") - lines.append("|--------|---------|------|---------|------------|") - - for p in column_profiles: - hypotheses = p.get("hypotheses", []) - if isinstance(hypotheses, list) and hypotheses: - hyp_str = "
".join( - f"{i+1}. {_clean(h)}" for i, h in enumerate(hypotheses[:3]) - ) - else: - hyp_str = _clean(hypotheses) or "N/A" - - row = [ - _clean(p.get("column")), - _clean(p.get("semantic_meaning")), - _clean(p.get("role")), - _clean(p.get("data_quality_notes")), - hyp_str, - ] - lines.append("| " + " | ".join(row) + " |") - - # Numeric stats table - if numeric_stats: - lines.append("\n## Numeric Column Statistics\n") - for tag, df in numeric_stats.items(): - lines.append(f"### {tag}\n") - lines.append("| Column | Metric | Value |") - lines.append("|--------|--------|-------|") - for col_name in df.index: - for metric in df.columns: - val = df.loc[col_name, metric] - lines.append(f"| {col_name} | {metric} | {_fmt(val)} |") - lines.append("") - - with open(output_path, "w", encoding="utf-8") as f: - f.write("\n".join(lines) + "\n") - - _LOG.info("Exported Markdown report to '%s'.", output_path) - - -# ============================================================================= -# Pipeline -# ============================================================================= - - -def run_pipeline( - csv_paths: typing.List[str], - tags: typing.Optional[typing.List[str]] = None, - model: str = "gpt-4o", - metrics: typing.Optional[typing.List[str]] = None, - llm_scope: str = "all", - output_json: str = "data_profile_report.json", - output_md: str = "data_profile_summary.md", - use_langchain: bool = False, -) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: - """ - Execute the full data profiling pipeline over one or more CSV files. - - Parameters - ---------- - csv_paths : list of str - One or more CSV file paths to profile. - tags : list of str, optional - Human-readable tag for each CSV. Defaults to filename stems. - model : str - LLM model name passed to OpenAI / hllmcli. - metrics : list of str, optional - Numeric metrics to include. Defaults to DEFAULT_METRICS. 
- llm_scope : str - "all", "semantic", or "nulls" — controls which columns are LLM-profiled. - output_json : str - Path for the merged JSON report. - output_md : str - Path for the Markdown summary. - use_langchain : bool - Use LangChain chain instead of hllmcli for LLM calls. - - Returns - ------- - (dict of tag → df, stats dict) - """ - if tags is None: - tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - - if len(tags) != len(csv_paths): - raise ValueError( - f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." - ) - - # --- Load & type-coerce --- - tag_to_df: typing.Dict[str, pd.DataFrame] = {} - for path, tag in zip(csv_paths, tags): - df = load_csv(path) - df = hpanconv.convert_df(df) - df, datetime_meta_partial = infer_and_convert_datetime_columns(df) - tag_to_df[tag] = df - - # Merge datetime metadata across all DataFrames (using the last loaded tag - # as the primary df for single-dataset runs; full merge for multi). - _, datetime_meta = infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) - - # --- Categorical column map --- - cat_cols_map: typing.Dict[str, typing.List[str]] = { - tag: df.select_dtypes( - include=["object", "category", "string"] - ).columns.tolist() - for tag, df in tag_to_df.items() - } - - # --- Compute stats --- - stats = compute_llm_agent_stats( - tag_to_df, - categorical_cols_map=cat_cols_map, - metrics=metrics, - ) - stats["datetime_columns"] = datetime_meta - - # --- LLM scope --- - # Use the concatenated DataFrame to decide which columns to send. 
- combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - columns_for_llm = _select_columns_for_llm(combined_df, scope=llm_scope) - _LOG.info( - "LLM will profile %d / %d columns (scope=%s).", - len(columns_for_llm), - len(combined_df.columns), - llm_scope, - ) - - # --- LLM call --- - if use_langchain: - prompt_text = build_llm_prompt(stats, columns_to_include=columns_for_llm) - semantic_insights = get_llm_semantic_insights_langchain( - prompt_text, model=model - ) - else: - semantic_insights = generate_hypotheses_via_cli( - stats, - model=model, - columns_to_include=columns_for_llm, - ) - - # --- Build column profiles (use first / primary df for column ordering) --- - primary_df = list(tag_to_df.values())[0] - column_profiles = build_column_profiles( - df=primary_df, - stats=stats, - insights=semantic_insights, - ) - - # --- Export --- - merge_and_export_results( - stats=stats, - insights=semantic_insights, - column_profiles=column_profiles, - output_path=output_json, - ) - export_markdown_from_profiles( - column_profiles, - numeric_stats=stats.get("numeric_summary", {}), - output_path=output_md, - ) - - return tag_to_df, stats - - -# ============================================================================= -# CLI -# ============================================================================= - - -def _build_arg_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prog="schema_agent_utils", - description="Data Profiler Agent — statistical + LLM column profiling", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # --- Inputs --- - parser.add_argument( - "csv_paths", - nargs="+", - metavar="CSV", - help="One or more CSV file paths to profile.", - ) - parser.add_argument( - "--tags", - nargs="+", - metavar="TAG", - help="Human-readable tag for each CSV (must match number of csv_paths).", - ) - - # --- LLM options --- - parser.add_argument( - "--model", - default="gpt-4o", - help="LLM model for 
semantic analysis.", - ) - parser.add_argument( - "--llm-scope", - choices=["all", "semantic", "nulls"], - default="all", - dest="llm_scope", - help=( - "Which columns to send to the LLM. " - "'all'=every column, 'semantic'=non-numeric only, " - "'nulls'=high-null columns only (saves cost)." - ), - ) - parser.add_argument( - "--use-langchain", - action="store_true", - dest="use_langchain", - help="Use LangChain pipeline instead of hllmcli for LLM calls.", - ) - - # --- Stat options --- - parser.add_argument( - "--metrics", - nargs="+", - choices=VALID_METRICS, - default=None, - metavar="METRIC", - help=( - f"Numeric metrics to include in the summary. " - f"Valid: {', '.join(VALID_METRICS)}. " - f"Default: {', '.join(DEFAULT_METRICS)}." - ), - ) - - # --- Output options --- - parser.add_argument( - "--output-json", - default="data_profile_report.json", - dest="output_json", - metavar="PATH", - help="Output path for the merged JSON report.", - ) - parser.add_argument( - "--output-md", - default="data_profile_summary.md", - dest="output_md", - metavar="PATH", - help="Output path for the Markdown summary.", - ) - - return parser - - -def main() -> None: - """ - CLI entry point. Parses arguments and delegates to run_pipeline(). 
- """ - parser = _build_arg_parser() - args = parser.parse_args() - - run_pipeline( - csv_paths=args.csv_paths, - tags=args.tags, - model=args.model, - metrics=args.metrics, - llm_scope=args.llm_scope, - output_json=args.output_json, - output_md=args.output_md, - use_langchain=args.use_langchain, - ) - - -if __name__ == "__main__": - main() \ No newline at end of file From ad5364f1043918dc645445feb2d5e180158d1600 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 7 Apr 2026 12:14:15 -0400 Subject: [PATCH 08/14] Add schema_agent_api.ipnb, docker, dassert and update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/Dockerfile | 30 + .../schema_agent/README.md | 23 +- .../schema_agent/data_profile_report.json | 608 ++++++++++++++++++ .../schema_agent/data_profile_summary.md | 102 +++ .../schema_agent/docker_bash.sh | 34 + .../schema_agent/docker_build.sh | 40 ++ .../schema_agent/docker_build.version.log | 166 +++++ .../schema_agent/docker_clean.sh | 26 + .../schema_agent/docker_cmd.sh | 41 ++ .../schema_agent/docker_exec.sh | 25 + .../schema_agent/docker_jupyter.sh | 37 ++ .../schema_agent/docker_name.sh | 12 + .../schema_agent/docker_push.sh | 25 + .../schema_agent/requirements.txt | 4 +- .../schema_agent/run_jupyter.sh | 36 ++ .../schema_agent/schema_agent.py | 60 +- .../schema_agent/schema_agent_hllmcli.py | 91 ++- .../schema_agent/schema_agent_loader.py | 61 +- .../schema_agent/schema_agent_report.py | 16 +- .../schema_agent/schema_agent_stats.py | 21 +- .../schema_agent/scmea_agent_example.ipynb | 371 +++++++++++ .../schema_agent/utils.sh | 504 +++++++++++++++ .../schema_agent/version.sh | 28 + 23 files changed, 2225 insertions(+), 136 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/Dockerfile create mode 100644 research/agentic_data_science/schema_agent/data_profile_report.json create mode 100644 
research/agentic_data_science/schema_agent/data_profile_summary.md create mode 100755 research/agentic_data_science/schema_agent/docker_bash.sh create mode 100755 research/agentic_data_science/schema_agent/docker_build.sh create mode 100644 research/agentic_data_science/schema_agent/docker_build.version.log create mode 100755 research/agentic_data_science/schema_agent/docker_clean.sh create mode 100755 research/agentic_data_science/schema_agent/docker_cmd.sh create mode 100755 research/agentic_data_science/schema_agent/docker_exec.sh create mode 100755 research/agentic_data_science/schema_agent/docker_jupyter.sh create mode 100644 research/agentic_data_science/schema_agent/docker_name.sh create mode 100755 research/agentic_data_science/schema_agent/docker_push.sh create mode 100755 research/agentic_data_science/schema_agent/run_jupyter.sh create mode 100644 research/agentic_data_science/schema_agent/scmea_agent_example.ipynb create mode 100644 research/agentic_data_science/schema_agent/utils.sh create mode 100755 research/agentic_data_science/schema_agent/version.sh diff --git a/research/agentic_data_science/schema_agent/Dockerfile b/research/agentic_data_science/schema_agent/Dockerfile new file mode 100644 index 000000000..a7e060b4a --- /dev/null +++ b/research/agentic_data_science/schema_agent/Dockerfile @@ -0,0 +1,30 @@ +FROM python:3.12-slim + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH="/opt/venv/bin:$PATH" + +# This allows 'import helpers' to work if helpers is inside /git_root/helpers_root +ENV PYTHONPATH="/git_root/research/agentic_data_science/schema_agent:/git_root/helpers_root:${PYTHONPATH:-}" + +RUN apt-get update && apt-get install -y \ + ca-certificates build-essential curl sudo gnupg git vim \ + libgl1 libglib2.0-0 libgomp1 \ + && rm -rf /var/lib/apt/lists/* + +RUN curl -Ls https://astral.sh/uv/install.sh | sh +ENV PATH="/root/.local/bin:$PATH" + +RUN uv venv /opt/venv + +# Requirements installation +COPY requirements.txt /install/requirements.txt 
+RUN uv pip install --python /opt/venv/bin/python --no-cache -r /install/requirements.txt jupyterlab + +# Create the skeleton directory structure +WORKDIR /git_root + +# Address reviewer feedback: We assume schema_agent.py is in the context +# We will chmod it inside the container during build or via the mount script +EXPOSE 8888 + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index c8d1b5119..020f63e53 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -14,19 +14,22 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datase Go into the schema folder: ```bash -cd research/agentic_data_science/schema_agent +> cd research/agentic_data_science/schema_agent ``` Install the requirements: ```bash -pip install -r requirements.txt +> pip install -r requirements.txt ``` Set the `OPENAI_API_KEY` in your environment: ```bash -export OPENAI_API_KEY=sk-... +> export OPENAI_API_KEY=sk-... 
+``` +Make the script executable +```bash +> chmod +x schema_agent.py ``` - ## Module Structure The agent is split into six focused modules: @@ -45,7 +48,7 @@ The agent is split into six focused modules: ### Basic ```bash -python schema_agent.py data.csv +> ./schema_agent.py data.csv ``` Outputs: @@ -56,16 +59,16 @@ Outputs: ```bash # Multiple files with tags -python schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +> ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 # Cost-optimized: only high-null columns -python schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini +> ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini # Custom metrics and output -python schema_agent.py data.csv --metrics mean std max --output-json my_report.json +> ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json # LangChain backend -python schema_agent.py data.csv --use-langchain +> ./schema_agent.py data.csv --use-langchain ``` ## Command-Line Arguments @@ -123,7 +126,7 @@ Formatted table summary: Column | Meaning | Role | Quality | Hypotheses **API Key Error:** ```bash -export OPENAI_API_KEY=sk-... +> export OPENAI_API_KEY=sk-... 
``` **Validation Errors:** diff --git a/research/agentic_data_science/schema_agent/data_profile_report.json b/research/agentic_data_science/schema_agent/data_profile_report.json new file mode 100644 index 000000000..df7adbef6 --- /dev/null +++ b/research/agentic_data_science/schema_agent/data_profile_report.json @@ -0,0 +1,608 @@ +{ + "report_metadata": { + "version": "1.2", + "agent": "Data-Profiler-Agent", + "generated_at": "2026-04-07T16:06:18.296448Z" + }, + "column_profiles": [ + { + "column": "order_datetime", + "dtype": "datetime64[us]", + "null_pct": 0.0, + "unique_count": 14903, + "sample_values": [ + "2009-12-01 07:45:00", + "2009-12-01 07:45:00", + "2009-12-01 09:06:00" + ] + }, + { + "column": "year", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 2, + "sample_values": [ + 2009, + 2009, + 2009 + ], + "mean": 2009.92928, + "std": 0.2563578334933183, + "min": 2009.0, + "median": 2010.0, + "max": 2010.0 + }, + { + "column": "month", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 12, + "sample_values": [ + 12, + 12, + 12 + ], + "mean": 7.37759, + "std": 3.456656661667856, + "min": 1.0, + "median": 8.0, + "max": 12.0 + }, + { + "column": "week_of_year", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 52, + "sample_values": [ + 49, + 49, + 49 + ], + "mean": 29.91514, + "std": 15.003268635903897, + "min": 1.0, + "median": 33.0, + "max": 52.0 + }, + { + "column": "day_of_week", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 7, + "sample_values": [ + 1, + 1, + 1 + ], + "mean": 2.58328, + "std": 1.9231592308007859, + "min": 0.0, + "median": 2.0, + "max": 6.0 + }, + { + "column": "order_hour", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 14, + "sample_values": [ + 7, + 7, + 9 + ], + "mean": 12.68047, + "std": 2.35158794833593, + "min": 7.0, + "median": 13.0, + "max": 20.0 + }, + { + "column": "is_weekend", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 2, + "sample_values": [ + 0, + 0, + 0 + ], + "mean": 
0.15396, + "std": 0.36091220674314933, + "min": 0.0, + "median": 0.0, + "max": 1.0 + }, + { + "column": "country", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 34, + "sample_values": [ + "United Kingdom", + "United Kingdom", + "United Kingdom" + ], + "top_values": { + "count": { + "United Kingdom": 64417, + "Ireland": 8507, + "Germany": 7654 + }, + "pct [%]": { + "United Kingdom": 64.417, + "Ireland": 8.507000000000001, + "Germany": 7.654 + } + }, + "semantic_meaning": "Represents the country where the transaction originated.", + "role": "Feature", + "data_quality_notes": "Data is well-distributed across several countries with a predominance in the United Kingdom.", + "hypotheses": [ + "Transactions from the United Kingdom have a higher total value than other countries.", + "Countries with a lower transaction count like Sweden have a higher average transaction value.", + "Country-specific marketing strategies positively impact sales volume." + ] + }, + { + "column": "country_code", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 34, + "sample_values": [ + "GBR", + "GBR", + "GBR" + ], + "top_values": { + "count": { + "GBR": 64417, + "IRL": 8507, + "DEU": 7654 + }, + "pct [%]": { + "GBR": 64.417, + "IRL": 8.507000000000001, + "DEU": 7.654 + } + }, + "semantic_meaning": "3-letter code representing the country of each transaction.", + "role": "Feature", + "data_quality_notes": "Consistent with country, providing coded labels for countries.", + "hypotheses": [ + "Country codes correlate strongly with country-specific purchasing patterns.", + "The use of certain country codes predicts higher shipping costs.", + "Country codes are better predictors for regional discounts than country names." 
+ ] + }, + { + "column": "product_id", + "dtype": "str", + "null_pct": 0.0, + "unique_count": 3623, + "sample_values": [ + "21523", + "79323W", + "82582" + ], + "top_values": { + "count": { + "POST": 731, + "85123A": 615, + "21212": 438 + }, + "pct [%]": { + "POST": 0.731, + "85123A": 0.615, + "21212": 0.438 + } + }, + "semantic_meaning": "Unique identifier for each product sold.", + "role": "Feature", + "data_quality_notes": "Varied distribution across products indicates a potential for high product diversity.", + "hypotheses": [ + "Products with higher sale counts like 'POST' have a higher discount rate applied.", + "Products with lower counts have a higher average profit margin.", + "Rarely sold products are linked with specific promotional campaigns." + ] + }, + { + "column": "customer_id", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 4012, + "sample_values": [ + 13085, + 13085, + 13078 + ], + "mean": 14768.12664, + "std": 1799.1647503828826, + "min": 12346.0, + "median": 14646.0, + "max": 18287.0 + }, + { + "column": "unit_price_gbp", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 300, + "sample_values": [ + 5.95, + 6.75, + 2.1 + ], + "mean": 3.88915772, + "std": 59.75020429686513, + "min": 0.001, + "median": 1.95, + "max": 10953.5 + }, + { + "column": "quantity_sold", + "dtype": "int64", + "null_pct": 0.0, + "unique_count": 232, + "sample_values": [ + 10, + 12, + 12 + ], + "mean": 18.65779, + "std": 159.34650236322747, + "min": 1.0, + "median": 6.0, + "max": 19152.0 + }, + { + "column": "sales_amount_gbp", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 1541, + "sample_values": [ + 59.5, + 81.0, + 25.200000000000003 + ], + "mean": 26.948917120000004, + "std": 92.39021385230444, + "min": 0.001, + "median": 14.98, + "max": 10953.5 + }, + { + "column": "population_total", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 62276270.0, + 62276270.0, + 62276270.0 + ], + "mean": 54098116.95651, + 
"std": 26644482.35245398, + "min": 318041.0, + "median": 62766365.0, + "max": 309378227.0 + }, + { + "column": "gdp_current_usd", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 2412840006231.5, + 2412840006231.5, + 2412840006231.5 + ], + "mean": 2161192799869.4167, + "std": 1115049256125.8184, + "min": 9035824366.00804, + "median": 2485482596184.709, + "max": 15048971000000.0 + }, + { + "column": "gdp_growth_pct", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + -17.633975690892566, + -17.633975690892566, + -17.633975690892566 + ], + "mean": 0.46262588609441824, + "std": 6.134116051369821, + "min": -19.62987001225588, + "median": 3.010667502428644, + "max": 32.50404703691798 + }, + { + "column": "inflation_consumer_pct", + "dtype": "float64", + "null_pct": 0.0, + "unique_count": 55, + "sample_values": [ + 1.89709031895291, + 1.89709031895291, + 1.89709031895291 + ], + "mean": 1.1042501219771699, + "std": 1.6555131180385045, + "min": -15.1829798865339, + "median": 1.58908069179591, + "max": 16.5278863640702 + } + ], + "technical_stats": { + "temporal_boundaries": null, + "quality_reports": {}, + "categorical_distributions": { + "ecommerce_data": { + "country": { + "United Kingdom": { + "count": 64417, + "pct [%]": 64.417 + }, + "Ireland": { + "count": 8507, + "pct [%]": 8.507000000000001 + }, + "Germany": { + "count": 7654, + "pct [%]": 7.654 + }, + "France": { + "count": 5470, + "pct [%]": 5.47 + }, + "Netherlands": { + "count": 2729, + "pct [%]": 2.7289999999999996 + }, + "Spain": { + "count": 1235, + "pct [%]": 1.2349999999999999 + }, + "Switzerland": { + "count": 1170, + "pct [%]": 1.17 + }, + "Belgium": { + "count": 1037, + "pct [%]": 1.0370000000000001 + }, + "Portugal": { + "count": 984, + "pct [%]": 0.984 + }, + "Sweden": { + "count": 868, + "pct [%]": 0.868 + } + }, + "country_code": { + "GBR": { + "count": 64417, + "pct [%]": 64.417 + }, + "IRL": { + "count": 8507, + "pct [%]": 
8.507000000000001 + }, + "DEU": { + "count": 7654, + "pct [%]": 7.654 + }, + "FRA": { + "count": 5470, + "pct [%]": 5.47 + }, + "NLD": { + "count": 2729, + "pct [%]": 2.7289999999999996 + }, + "ESP": { + "count": 1235, + "pct [%]": 1.2349999999999999 + }, + "CHE": { + "count": 1170, + "pct [%]": 1.17 + }, + "BEL": { + "count": 1037, + "pct [%]": 1.0370000000000001 + }, + "PRT": { + "count": 984, + "pct [%]": 0.984 + }, + "SWE": { + "count": 868, + "pct [%]": 0.868 + } + }, + "product_id": { + "POST": { + "count": 731, + "pct [%]": 0.731 + }, + "85123A": { + "count": 615, + "pct [%]": 0.615 + }, + "21212": { + "count": 438, + "pct [%]": 0.438 + }, + "22423": { + "count": 437, + "pct [%]": 0.437 + }, + "85099B": { + "count": 391, + "pct [%]": 0.391 + }, + "20725": { + "count": 334, + "pct [%]": 0.334 + }, + "84991": { + "count": 298, + "pct [%]": 0.298 + }, + "20914": { + "count": 295, + "pct [%]": 0.295 + }, + "21232": { + "count": 295, + "pct [%]": 0.295 + }, + "84879": { + "count": 285, + "pct [%]": 0.28500000000000003 + } + } + } + }, + "numeric_summary": { + "ecommerce_data": { + "year": { + "mean": 2009.92928, + "std": 0.2563578334933183, + "min": 2009.0, + "median": 2010.0, + "max": 2010.0 + }, + "month": { + "mean": 7.37759, + "std": 3.456656661667856, + "min": 1.0, + "median": 8.0, + "max": 12.0 + }, + "week_of_year": { + "mean": 29.91514, + "std": 15.003268635903897, + "min": 1.0, + "median": 33.0, + "max": 52.0 + }, + "day_of_week": { + "mean": 2.58328, + "std": 1.9231592308007859, + "min": 0.0, + "median": 2.0, + "max": 6.0 + }, + "order_hour": { + "mean": 12.68047, + "std": 2.35158794833593, + "min": 7.0, + "median": 13.0, + "max": 20.0 + }, + "is_weekend": { + "mean": 0.15396, + "std": 0.36091220674314933, + "min": 0.0, + "median": 0.0, + "max": 1.0 + }, + "customer_id": { + "mean": 14768.12664, + "std": 1799.1647503828826, + "min": 12346.0, + "median": 14646.0, + "max": 18287.0 + }, + "unit_price_gbp": { + "mean": 3.88915772, + "std": 
59.75020429686513, + "min": 0.001, + "median": 1.95, + "max": 10953.5 + }, + "quantity_sold": { + "mean": 18.65779, + "std": 159.34650236322747, + "min": 1.0, + "median": 6.0, + "max": 19152.0 + }, + "sales_amount_gbp": { + "mean": 26.948917120000004, + "std": 92.39021385230444, + "min": 0.001, + "median": 14.98, + "max": 10953.5 + }, + "population_total": { + "mean": 54098116.95651, + "std": 26644482.35245398, + "min": 318041.0, + "median": 62766365.0, + "max": 309378227.0 + }, + "gdp_current_usd": { + "mean": 2161192799869.4167, + "std": 1115049256125.8184, + "min": 9035824366.00804, + "median": 2485482596184.709, + "max": 15048971000000.0 + }, + "gdp_growth_pct": { + "mean": 0.46262588609441824, + "std": 6.134116051369821, + "min": -19.62987001225588, + "median": 3.010667502428644, + "max": 32.50404703691798 + }, + "inflation_consumer_pct": { + "mean": 1.1042501219771699, + "std": 1.6555131180385045, + "min": -15.1829798865339, + "median": 1.58908069179591, + "max": 16.5278863640702 + } + } + }, + "datetime_columns": {} + }, + "semantic_insights": { + "columns": { + "country": { + "semantic_meaning": "Represents the country where the transaction originated.", + "role": "Feature", + "data_quality_notes": "Data is well-distributed across several countries with a predominance in the United Kingdom.", + "hypotheses": [ + "Transactions from the United Kingdom have a higher total value than other countries.", + "Countries with a lower transaction count like Sweden have a higher average transaction value.", + "Country-specific marketing strategies positively impact sales volume." 
+ ] + }, + "country_code": { + "semantic_meaning": "3-letter code representing the country of each transaction.", + "role": "Feature", + "data_quality_notes": "Consistent with country, providing coded labels for countries.", + "hypotheses": [ + "Country codes correlate strongly with country-specific purchasing patterns.", + "The use of certain country codes predicts higher shipping costs.", + "Country codes are better predictors for regional discounts than country names." + ] + }, + "product_id": { + "semantic_meaning": "Unique identifier for each product sold.", + "role": "Feature", + "data_quality_notes": "Varied distribution across products indicates a potential for high product diversity.", + "hypotheses": [ + "Products with higher sale counts like 'POST' have a higher discount rate applied.", + "Products with lower counts have a higher average profit margin.", + "Rarely sold products are linked with specific promotional campaigns." + ] + } + } + } +} \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/data_profile_summary.md b/research/agentic_data_science/schema_agent/data_profile_summary.md new file mode 100644 index 000000000..5ba7c62fe --- /dev/null +++ b/research/agentic_data_science/schema_agent/data_profile_summary.md @@ -0,0 +1,102 @@ +# Data Profile Summary + +## Column Profiles + +| Column | Meaning | Role | Quality | Hypotheses | +|--------|---------|------|---------|------------| +| order_datetime | | | | [] | +| year | | | | [] | +| month | | | | [] | +| week_of_year | | | | [] | +| day_of_week | | | | [] | +| order_hour | | | | [] | +| is_weekend | | | | [] | +| country | Represents the country where the transaction originated. | Feature | Data is well-distributed across several countries with a predominance in the United Kingdom. | 1. Transactions from the United Kingdom have a higher total value than other countries.
2. Countries with a lower transaction count like Sweden have a higher average transaction value.
3. Country-specific marketing strategies positively impact sales volume. | +| country_code | 3-letter code representing the country of each transaction. | Feature | Consistent with country, providing coded labels for countries. | 1. Country codes correlate strongly with country-specific purchasing patterns.
2. The use of certain country codes predicts higher shipping costs.
3. Country codes are better predictors for regional discounts than country names. | +| product_id | Unique identifier for each product sold. | Feature | Varied distribution across products indicates a potential for high product diversity. | 1. Products with higher sale counts like 'POST' have a higher discount rate applied.
2. Products with lower counts have a higher average profit margin.
3. Rarely sold products are linked with specific promotional campaigns. | +| customer_id | | | | [] | +| unit_price_gbp | | | | [] | +| quantity_sold | | | | [] | +| sales_amount_gbp | | | | [] | +| population_total | | | | [] | +| gdp_current_usd | | | | [] | +| gdp_growth_pct | | | | [] | +| inflation_consumer_pct | | | | [] | + +## Numeric Column Statistics + +### ecommerce_data + +| Column | Metric | Value | +|--------|--------|-------| +| year | mean | 2,009.93 | +| year | std | 0.2564 | +| year | min | 2,009.00 | +| year | median | 2,010.00 | +| year | max | 2,010.00 | +| month | mean | 7.38 | +| month | std | 3.46 | +| month | min | 1.00 | +| month | median | 8.00 | +| month | max | 12.00 | +| week_of_year | mean | 29.92 | +| week_of_year | std | 15.00 | +| week_of_year | min | 1.00 | +| week_of_year | median | 33.00 | +| week_of_year | max | 52.00 | +| day_of_week | mean | 2.58 | +| day_of_week | std | 1.92 | +| day_of_week | min | 0.0000 | +| day_of_week | median | 2.00 | +| day_of_week | max | 6.00 | +| order_hour | mean | 12.68 | +| order_hour | std | 2.35 | +| order_hour | min | 7.00 | +| order_hour | median | 13.00 | +| order_hour | max | 20.00 | +| is_weekend | mean | 0.1540 | +| is_weekend | std | 0.3609 | +| is_weekend | min | 0.0000 | +| is_weekend | median | 0.0000 | +| is_weekend | max | 1.00 | +| customer_id | mean | 14,768.13 | +| customer_id | std | 1,799.16 | +| customer_id | min | 12,346.00 | +| customer_id | median | 14,646.00 | +| customer_id | max | 18,287.00 | +| unit_price_gbp | mean | 3.89 | +| unit_price_gbp | std | 59.75 | +| unit_price_gbp | min | 0.0010 | +| unit_price_gbp | median | 1.95 | +| unit_price_gbp | max | 10,953.50 | +| quantity_sold | mean | 18.66 | +| quantity_sold | std | 159.35 | +| quantity_sold | min | 1.00 | +| quantity_sold | median | 6.00 | +| quantity_sold | max | 19,152.00 | +| sales_amount_gbp | mean | 26.95 | +| sales_amount_gbp | std | 92.39 | +| sales_amount_gbp | min | 0.0010 | +| sales_amount_gbp | 
median | 14.98 | +| sales_amount_gbp | max | 10,953.50 | +| population_total | mean | 54,098,116.96 | +| population_total | std | 26,644,482.35 | +| population_total | min | 318,041.00 | +| population_total | median | 62,766,365.00 | +| population_total | max | 309,378,227.00 | +| gdp_current_usd | mean | 2,161,192,799,869.42 | +| gdp_current_usd | std | 1,115,049,256,125.82 | +| gdp_current_usd | min | 9,035,824,366.01 | +| gdp_current_usd | median | 2,485,482,596,184.71 | +| gdp_current_usd | max | 15,048,971,000,000.00 | +| gdp_growth_pct | mean | 0.4626 | +| gdp_growth_pct | std | 6.13 | +| gdp_growth_pct | min | -19.63 | +| gdp_growth_pct | median | 3.01 | +| gdp_growth_pct | max | 32.50 | +| inflation_consumer_pct | mean | 1.10 | +| inflation_consumer_pct | std | 1.66 | +| inflation_consumer_pct | min | -15.18 | +| inflation_consumer_pct | median | 1.59 | +| inflation_consumer_pct | max | 16.53 | + diff --git a/research/agentic_data_science/schema_agent/docker_bash.sh b/research/agentic_data_science/schema_agent/docker_bash.sh new file mode 100755 index 000000000..0025e81f4 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_bash.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# """ +# This script launches a Docker container with an interactive bash shell for +# development. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions from the project template. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List the available Docker images matching the expected image name. +run "docker image ls $FULL_IMAGE_NAME" + +# Configure and run the Docker container with interactive bash shell. 
+# - Container is removed automatically on exit (--rm) +# - Interactive mode with TTY allocation (-ti) +# - Port forwarding for Jupyter or other services +# - Git root mounted to /git_root inside container +CONTAINER_NAME=${IMAGE_NAME}_bash +PORT= +DOCKER_CMD=$(get_docker_bash_command) +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME" diff --git a/research/agentic_data_science/schema_agent/docker_build.sh b/research/agentic_data_science/schema_agent/docker_build.sh new file mode 100755 index 000000000..5b0957a99 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_build.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# """ +# Build a Docker container image for the project. +# +# This script sets up the build environment with error handling and command +# tracing, loads Docker configuration from docker_name.sh, and builds the +# Docker image using the build_container_image utility function. It supports +# both single-architecture and multi-architecture builds via the +# DOCKER_BUILD_MULTI_ARCH environment variable. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args are passed to the build. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Load Docker configuration variables (REPO_NAME, IMAGE_NAME, FULL_IMAGE_NAME). +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure Docker build settings. +# Enable BuildKit for improved build performance and features. +export DOCKER_BUILDKIT=1 +#export DOCKER_BUILDKIT=0 + +# Configure single-architecture build (set to 1 for multi-arch build). 
+#export DOCKER_BUILD_MULTI_ARCH=1 +export DOCKER_BUILD_MULTI_ARCH=0 + +# Build the container image. +# Pass extra arguments (e.g., --no-cache) via command line after -v. +build_container_image "$@" diff --git a/research/agentic_data_science/schema_agent/docker_build.version.log b/research/agentic_data_science/schema_agent/docker_build.version.log new file mode 100644 index 000000000..d60536643 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_build.version.log @@ -0,0 +1,166 @@ +# Python3 +Python 3.12.13 +# pip3 +pip 26.0.1 from /opt/venv/lib/python3.12/site-packages/pip (python 3.12) +# jupyter +Selected Jupyter core packages... +IPython : 9.12.0 +ipykernel : 7.2.0 +ipywidgets : not installed +jupyter_client : 8.8.0 +jupyter_core : 5.9.1 +jupyter_server : 2.17.0 +jupyterlab : 4.5.6 +nbclient : 0.10.4 +nbconvert : 7.17.0 +nbformat : 5.10.4 +notebook : not installed +qtconsole : not installed +traitlets : 5.14.3 +# Python packages +Package Version +------------------------- ------------ +aiohappyeyeballs 2.6.1 +aiohttp 3.13.5 +aiosignal 1.4.0 +annotated-types 0.7.0 +anthropic 0.89.0 +anyio 4.13.0 +argon2-cffi 25.1.0 +argon2-cffi-bindings 25.1.0 +arrow 1.4.0 +asttokens 3.0.1 +async-lru 2.3.0 +attrs 26.1.0 +babel 2.18.0 +beautifulsoup4 4.14.3 +bleach 6.3.0 +certifi 2026.2.25 +cffi 2.0.0 +charset-normalizer 3.4.7 +click 8.3.2 +click-default-group 1.2.4 +comm 0.2.3 +condense-json 0.1.3 +debugpy 1.8.20 +decorator 5.2.1 +defusedxml 0.7.1 +distro 1.9.0 +docstring_parser 0.17.0 +dotenv 0.9.9 +executing 2.2.1 +fastjsonschema 2.21.2 +fqdn 1.5.1 +frozenlist 1.8.0 +h11 0.16.0 +httpcore 1.0.9 +httpx 0.28.1 +idna 3.11 +ipykernel 7.2.0 +ipython 9.12.0 +ipython_pygments_lexers 1.1.1 +isoduration 20.11.0 +jedi 0.19.2 +Jinja2 3.1.6 +jiter 0.13.0 +json5 0.14.0 +jsonpatch 1.33 +jsonpointer 3.1.1 +jsonschema 4.26.0 +jsonschema-specifications 2025.9.1 +jupyter_client 8.8.0 +jupyter_core 5.9.1 +jupyter-events 0.12.0 +jupyter-lsp 2.3.1 +jupyter_server 2.17.0 
+jupyter_server_terminals 0.5.4 +jupyterlab 4.5.6 +jupyterlab_pygments 0.3.0 +jupyterlab_server 2.28.0 +langchain-core 1.2.27 +langchain-openai 1.1.12 +langgraph 1.1.6 +langgraph-checkpoint 4.0.1 +langgraph-prebuilt 1.0.9 +langgraph-sdk 0.3.12 +langsmith 0.7.26 +lark 1.3.1 +llm 0.30 +MarkupSafe 3.0.3 +matplotlib-inline 0.2.1 +mistune 3.2.0 +multidict 6.7.1 +nbclient 0.10.4 +nbconvert 7.17.0 +nbformat 5.10.4 +nest-asyncio 1.6.0 +notebook_shim 0.2.4 +numpy 2.4.4 +openai 2.30.0 +orjson 3.11.8 +ormsgpack 1.12.2 +packaging 26.0 +pandas 3.0.2 +pandocfilters 1.5.1 +parso 0.8.6 +pexpect 4.9.0 +pip 26.0.1 +platformdirs 4.9.4 +pluggy 1.6.0 +prometheus_client 0.24.1 +prompt_toolkit 3.0.52 +propcache 0.4.1 +psutil 7.2.2 +ptyprocess 0.7.0 +pure_eval 0.2.3 +puremagic 2.1.1 +pycparser 3.0 +pydantic 2.12.5 +pydantic_core 2.41.5 +Pygments 2.20.0 +python-dateutil 2.9.0.post0 +python-dotenv 1.2.2 +python-json-logger 4.1.0 +python-ulid 3.1.0 +pytz 2026.1.post1 +PyYAML 6.0.3 +pyzmq 27.1.0 +referencing 0.37.0 +regex 2026.4.4 +requests 2.33.1 +requests-toolbelt 1.0.0 +rfc3339-validator 0.1.4 +rfc3986-validator 0.1.1 +rfc3987-syntax 1.1.0 +rpds-py 0.30.0 +Send2Trash 2.1.0 +setuptools 82.0.1 +six 1.17.0 +sniffio 1.3.1 +soupsieve 2.8.3 +sqlite-fts4 1.0.3 +sqlite-migrate 0.1b0 +sqlite-utils 3.39 +stack-data 0.6.3 +tabulate 0.10.0 +tenacity 9.1.4 +terminado 0.18.1 +tiktoken 0.12.0 +tinycss2 1.4.0 +tokencost 0.1.26 +tornado 6.5.5 +tqdm 4.67.3 +traitlets 5.14.3 +typing_extensions 4.15.0 +typing-inspection 0.4.2 +tzdata 2026.1 +uri-template 1.3.0 +urllib3 2.6.3 +uuid_utils 0.14.1 +wcwidth 0.6.0 +webcolors 25.10.0 +webencodings 0.5.1 +websocket-client 1.9.0 +xxhash 3.6.0 +yarl 1.23.0 +zstandard 0.25.0 diff --git a/research/agentic_data_science/schema_agent/docker_clean.sh b/research/agentic_data_science/schema_agent/docker_clean.sh new file mode 100755 index 000000000..7e40839ae --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_clean.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# """ +# 
Remove Docker container image for the project. +# +# This script cleans up Docker images by removing the container image +# matching the project configuration. Useful for freeing disk space or +# ensuring a fresh build. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Remove the container image. +remove_container_image diff --git a/research/agentic_data_science/schema_agent/docker_cmd.sh b/research/agentic_data_science/schema_agent/docker_cmd.sh new file mode 100755 index 000000000..906d7a77b --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_cmd.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# """ +# Execute a command in a Docker container. +# +# This script runs a specified command inside a new Docker container instance. +# The container is removed automatically after the command completes. The +# git root is mounted to /git_root inside the container. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +# Shift processed option flags so remaining args form the command. +parse_default_args "$@" +shift $((OPTIND-1)) + +# Capture the command to execute from remaining arguments. +CMD="$@" +echo "Executing: '$CMD'" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images matching the expected image name. 
+run "docker image ls $FULL_IMAGE_NAME" +#(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Configure and run the Docker container with the specified command. +CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_cmd_command) +PORT="" +DOCKER_RUN_OPTS="" +DOCKER_CMD_OPTS=$(get_docker_bash_options $CONTAINER_NAME $PORT $DOCKER_RUN_OPTS) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME bash -c '$CMD'" diff --git a/research/agentic_data_science/schema_agent/docker_exec.sh b/research/agentic_data_science/schema_agent/docker_exec.sh new file mode 100755 index 000000000..24f8e401a --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_exec.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Execute a bash shell in a running Docker container. +# +# This script connects to an already running Docker container and opens an +# interactive bash session for debugging or inspection purposes. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Execute bash shell in the running container. +exec_container diff --git a/research/agentic_data_science/schema_agent/docker_jupyter.sh b/research/agentic_data_science/schema_agent/docker_jupyter.sh new file mode 100755 index 000000000..6c7d09b13 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_jupyter.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# """ +# Execute Jupyter Lab in a Docker container. +# +# This script launches a Docker container running Jupyter Lab with +# configurable port, directory mounting, and vim bindings. It passes +# command-line options to the run_jupyter.sh script inside the container. 
+# +# Usage: +# > docker_jupyter.sh [options] +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse command-line options and set Jupyter configuration variables. +parse_docker_jupyter_args "$@" + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# List available Docker images and inspect architecture. +run "docker image ls $FULL_IMAGE_NAME" +(docker manifest inspect $FULL_IMAGE_NAME | grep arch) || true + +# Run the Docker container with Jupyter Lab. +CMD=$(get_run_jupyter_cmd "${BASH_SOURCE[0]}" "$OLD_CMD_OPTS") +CONTAINER_NAME=$IMAGE_NAME +DOCKER_CMD=$(get_docker_jupyter_command) +DOCKER_CMD_OPTS=$(get_docker_jupyter_options $CONTAINER_NAME $JUPYTER_HOST_PORT $JUPYTER_USE_VIM) +run "$DOCKER_CMD $DOCKER_CMD_OPTS $FULL_IMAGE_NAME $CMD" diff --git a/research/agentic_data_science/schema_agent/docker_name.sh b/research/agentic_data_science/schema_agent/docker_name.sh new file mode 100644 index 000000000..1d6f8a55c --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_name.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# """ +# Docker image naming configuration. +# +# This file defines the repository name, image name, and full image name +# variables used by all docker_*.sh scripts in the project template. +# """ + +REPO_NAME=gpsaggese +# The file should be all lower case. +IMAGE_NAME=umd_schema_agent +FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME diff --git a/research/agentic_data_science/schema_agent/docker_push.sh b/research/agentic_data_science/schema_agent/docker_push.sh new file mode 100755 index 000000000..27d752dd9 --- /dev/null +++ b/research/agentic_data_science/schema_agent/docker_push.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# """ +# Push Docker container image to Docker Hub or registry. 
+# +# This script authenticates with the Docker registry using credentials from +# ~/.docker/passwd.$REPO_NAME.txt and pushes the locally built container +# image to the remote repository. +# """ + +# Exit immediately if any command exits with a non-zero status. +set -e + +# Import the utility functions. +GIT_ROOT=$(git rev-parse --show-toplevel) +source $GIT_ROOT/class_project/project_template/utils.sh + +# Parse default args (-h, -v) and enable set -x if -v is passed. +parse_default_args "$@" + +# Load Docker image naming configuration. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source $SCRIPT_DIR/docker_name.sh + +# Push the container image to the registry. +push_container_image diff --git a/research/agentic_data_science/schema_agent/requirements.txt b/research/agentic_data_science/schema_agent/requirements.txt index 0ce56331c..ed4078da6 100644 --- a/research/agentic_data_science/schema_agent/requirements.txt +++ b/research/agentic_data_science/schema_agent/requirements.txt @@ -3,4 +3,6 @@ langchain_core langchain_openai langgraph llm -tokencost \ No newline at end of file +tokencost +pytz +dotenv \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/run_jupyter.sh b/research/agentic_data_science/schema_agent/run_jupyter.sh new file mode 100755 index 000000000..342a73f79 --- /dev/null +++ b/research/agentic_data_science/schema_agent/run_jupyter.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# """ +# Launch Jupyter Lab server. +# +# This script starts Jupyter Lab on port 8888 with the following configuration: +# - No browser auto-launch (useful for Docker containers) +# - Accessible from any IP address (0.0.0.0) +# - Root user allowed (required for Docker environments) +# - No authentication token or password (for development convenience) +# - Vim keybindings can be enabled via JUPYTER_USE_VIM environment variable +# """ + +# Exit immediately if any command exits with a non-zero status. 
+set -e + +# Print each command to stdout before executing it. +#set -x + +# Import the utility functions from /git_root. +GIT_ROOT=/git_root +source $GIT_ROOT/class_project/project_template/utils.sh + +# Load Docker configuration variables for this script. +get_docker_vars_script ${BASH_SOURCE[0]} +source $DOCKER_NAME +print_docker_vars + +# Configure vim keybindings and notifications. +configure_jupyter_vim_keybindings +configure_jupyter_notifications + +# Initialize Jupyter Lab command with base configuration. +JUPYTER_ARGS=$(get_jupyter_args) + +# Start Jupyter Lab with development-friendly settings. +run "jupyter lab $JUPYTER_ARGS" diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index 14491b6f6..e5f995b56 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -19,14 +19,16 @@ import os import sys import typing +import pytz import dotenv import pandas as pd -import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah +import schema_agent_hllmcli as radsasah import schema_agent_loader as radsasal import schema_agent_report as radsasar import schema_agent_stats as radsasas +import helpers.hdbg as hdbg import helpers.hlogging as hloggin # ============================================================================= @@ -35,13 +37,14 @@ dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") -if not api_key: - print("Error: OPENAI_API_KEY not found in environment.") - sys.exit(1) + +# Use dassert to ensure the API key exists +hdbg.dassert(api_key, "OPENAI_API_KEY not found in environment.") _LOG = hloggin.getLogger(__name__) _LOG.setLevel(logging.DEBUG) +# Ensure sys is imported for the handler console_handler = logging.StreamHandler(sys.stdout) hloggin.set_v2_formatter( ch=console_handler, @@ -71,46 +74,23 @@ def run_pipeline( ) -> typing.Tuple[typing.Dict[str, 
pd.DataFrame], typing.Dict[str, typing.Any]]: """ Execute the full data profiling pipeline over one or more CSV files. - - Parameters - ---------- - csv_paths : list of str - One or more CSV file paths to profile. - tags : list of str, optional - Human-readable tag for each CSV. Defaults to filename stems. - model : str - LLM model name passed to OpenAI / hllmcli. - metrics : list of str, optional - Numeric metrics to include. Defaults to DEFAULT_METRICS. - llm_scope : str - "all", "semantic", or "nulls" — controls which columns are LLM-profiled. - output_json : str - Path for the merged JSON report. - output_md : str - Path for the Markdown summary. - use_langchain : bool - Use LangChain chain instead of hllmcli for LLM calls. - - Returns - ------- - (dict of tag → df, stats dict) """ if tags is None: tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - if len(tags) != len(csv_paths): - raise ValueError( - f"Length of tags ({len(tags)}) must match csv_paths ({len(csv_paths)})." - ) + # Use dassert_eq to check that the number of tags matches files + hdbg.dassert_eq( + len(tags), + len(csv_paths), + msg="Number of tags must match number of CSV paths" + ) # --- Load & type-coerce --- tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - # Merge datetime metadata across all DataFrames (using the last loaded tag - # as the primary df for single-dataset runs; full merge for multi). - _, datetime_meta = radsasal.infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) + # Merge datetime metadata + combined_for_dt = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + _, datetime_meta = radsasal.infer_and_convert_datetime_columns(combined_for_dt) # --- Compute stats --- stats = radsasas.compute_llm_agent_stats( @@ -121,9 +101,9 @@ def run_pipeline( stats["datetime_columns"] = datetime_meta # --- LLM scope --- - # Use the concatenated DataFrame to decide which columns to send. 
combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -146,8 +126,11 @@ def run_pipeline( columns_to_include=columns_for_llm, ) - # --- Build column profiles (use first / primary df for column ordering) --- + # --- Build column profiles --- + # Ensure tag_to_df is not empty before accessing + hdbg.dassert(tag_to_df, "No dataframes were loaded.") primary_df = list(tag_to_df.values())[0] + column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -169,7 +152,6 @@ def run_pipeline( return tag_to_df, stats - # ============================================================================= # CLI # ============================================================================= diff --git a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py index d2194684a..37b4c8f0e 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py +++ b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py @@ -10,9 +10,11 @@ import langchain_core.output_parsers as lcop import langchain_core.prompts as lcpr import langchain_openai as lco +import pandas as pd import pydantic import schema_agent_models as radsasam +import helpers.hdbg as hdbg import helpers.hllm_cli as hllmcli import helpers.hlogging as hloggin @@ -27,20 +29,19 @@ def _select_columns_for_llm( """ Return the list of column names that should be sent to the LLM. - Parameters - ---------- - df : pd.DataFrame - scope : str - "all" — every column - "semantic" — non-numeric columns only (object / category / string) - "nulls" — columns with null fraction above null_threshold - null_threshold : float - Fraction of nulls required for "nulls" scope. Default 5 %. 
- - Returns - ------- - list of str + :param df: Input dataframe. + :type df: pd.DataFrame + :param scope: "all" — every column, "semantic" — non-numeric columns + only, "nulls" — columns with high nulls. + :type scope: str + :param null_threshold: Fraction of nulls required for "nulls" scope. + Default 0.05. + :type null_threshold: float + :return: List of valid columns to process. + :rtype: typing.List[str] """ + hdbg.dassert_isinstance(df, pd.DataFrame) + if scope == "all": return list(df.columns) @@ -74,17 +75,16 @@ def build_llm_prompt( Serialize statistical data into a structured string prompt for LLM consumption. - Parameters - ---------- - stats : dict - Output of compute_llm_agent_stats(). - columns_to_include : list of str, optional - Subset of column names to include in the prompt. None = all. - - Returns - ------- - str + :param stats: Output of compute_llm_agent_stats(). + :type stats: typing.Dict[str, typing.Any] + :param columns_to_include: Subset of column names to include in the + prompt. None = all. + :type columns_to_include: typing.Optional[typing.List[str]] + :return: Formatted string prompt. + :rtype: str """ + hdbg.dassert_isinstance(stats, dict) + prompt_segments = [ "You are a Senior Data Scientist and Domain Expert.", "Analyze the provided dataset statistics and generate a profile for each column.", @@ -132,20 +132,20 @@ def generate_hypotheses_via_cli( Parses and Pydantic-validates the LLM response against DatasetInsights. - Parameters - ---------- - stats : dict - model : str - columns_to_include : list of str, optional - If provided, only these columns are sent to the LLM (cost control). - - Returns - ------- - dict — DatasetInsights-shaped dict, or {"error": ...} on failure. + :param stats: Computed dataset statistics. + :type stats: typing.Dict[str, typing.Any] + :param model: The target LLM model. + :type model: str + :param columns_to_include: Subset of column names to include. 
+ :type columns_to_include: typing.Optional[typing.List[str]] + :return: DatasetInsights-shaped dict, or {"error": ...} on failure. + :rtype: typing.Dict[str, typing.Any] """ + hdbg.dassert_isinstance(stats, dict) + _LOG.info("Generating hypotheses via hllmcli (model=%s)...", model) - schema_json = radsasam.atasetInsights.model_json_schema() + schema_json = radsasam.DatasetInsights.model_json_schema() user_prompt = build_llm_prompt(stats, columns_to_include=columns_to_include) system_prompt = ( "You are a Senior Data Scientist. Analyze the following data statistics.\n" @@ -171,7 +171,7 @@ def generate_hypotheses_via_cli( ) raw = json.loads(cleaned) - # Pydantic validation — raises ValidationError on schema mismatch. + # Pydantic validation validated = radsasam.DatasetInsights.model_validate(raw) return validated.model_dump() @@ -194,18 +194,18 @@ def get_llm_semantic_insights_langchain( Process dataset metadata via LangChain to extract structured semantic insights. - Uses JsonOutputParser alongside the Pydantic schema. Validates output. - - Parameters - ---------- - prompt_text : str - Serialized stats from build_llm_prompt(). - model : str + Uses JsonOutputParser alongside the Pydantic schema. Validates + output. - Returns - ------- - dict + :param prompt_text: Serialized stats from build_llm_prompt(). + :type prompt_text: str + :param model: The target LLM model. + :type model: str + :return: Validated insights dictionary. + :rtype: typing.Dict[str, typing.Any] """ + hdbg.dassert_isinstance(prompt_text, str) + _LOG.info("Querying LLM via LangChain (%s)...", model) llm = lco.ChatOpenAI(model=model, temperature=0) parser = lcop.JsonOutputParser(pydantic_object=radsasam.DatasetInsights) @@ -224,7 +224,6 @@ def get_llm_semantic_insights_langchain( chain = prompt | llm | parser try: result = chain.invoke({"metadata_stats": prompt_text}) - # Validate against Pydantic schema. 
validated = radsasam.DatasetInsights.model_validate(result) return validated.model_dump() except pydantic.ValidationError as e: diff --git a/research/agentic_data_science/schema_agent/schema_agent_loader.py b/research/agentic_data_science/schema_agent/schema_agent_loader.py index 02fae3514..d8f649547 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_loader.py +++ b/research/agentic_data_science/schema_agent/schema_agent_loader.py @@ -13,6 +13,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin import helpers.hpandas_conversion as hpanconv import helpers.hpandas_io as hpanio @@ -24,22 +25,20 @@ def load_csv(csv_path: str) -> pd.DataFrame: """ Load a CSV into a DataFrame with clear error handling. - Parameters - ---------- - csv_path : str - Path to the CSV file. - - Returns - ------- - pd.DataFrame + :param csv_path: Path to the CSV file. + :type csv_path: str + :return: Loaded dataframe. + :rtype: pd.DataFrame """ + hdbg.dassert_isinstance(csv_path, str) try: df = hpanio.read_csv_to_df(csv_path) except FileNotFoundError: _LOG.error("CSV not found at '%s'.", csv_path) raise - if df.empty: - raise ValueError(f"CSV at '{csv_path}' loaded as an empty DataFrame.") + + hdbg.dassert_lt(0, len(df), "CSV at '%s' loaded as an empty DataFrame.", csv_path) + _LOG.info( "Loaded '%s': %d rows × %d columns.", csv_path, len(df), len(df.columns) ) @@ -61,19 +60,17 @@ def infer_and_convert_datetime_columns( Uses sampling for performance. Returns the updated DataFrame and a metadata dict with inference details per column. - Parameters - ---------- - df : pd.DataFrame - sample_size : int - Number of rows to sample when testing format compliance. - threshold : float - Minimum fraction of parsed values required to accept a column as temporal. - - Returns - ------- - (pd.DataFrame, dict) - Updated DataFrame with converted columns + metadata per column. + :param df: Input DataFrame. 
+ :type df: pd.DataFrame + :param sample_size: Number of rows to sample when testing format compliance. + :type sample_size: int + :param threshold: Minimum fraction of parsed values required to accept a column as temporal. + :type threshold: float + :return: Updated DataFrame with converted columns + metadata per column. + :rtype: typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]] """ + hdbg.dassert_isinstance(df, pd.DataFrame) + COMMON_FORMATS = [ "%Y-%m-%d", "%d-%m-%Y", @@ -166,16 +163,16 @@ def prepare_dataframes( Applies type coercion, datetime inference, and categorical detection. - Parameters - ---------- - csv_paths : list of str - tags : list of str, optional - Human-readable tags; defaults to filename stems. - - Returns - ------- - (dict of tag → df, dict of tag → categorical_columns) + :param csv_paths: List of CSV file paths. + :type csv_paths: typing.List[str] + :param tags: Human-readable tags; defaults to filename stems. + :type tags: typing.Optional[typing.List[str]] + :return: A tuple containing a dict mapping tags to DataFrames, and a dict mapping tags to categorical columns. 
+ :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]]] """ + hdbg.dassert_isinstance(csv_paths, list) + hdbg.dassert_lt(0, len(csv_paths)) + tag_to_df: typing.Dict[str, pd.DataFrame] = {} cat_cols_map: typing.Dict[str, typing.List[str]] = {} @@ -189,4 +186,4 @@ def prepare_dataframes( include=["object", "category", "string"] ).columns.tolist() - return tag_to_df, cat_cols_map + return tag_to_df, cat_cols_map \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_report.py b/research/agentic_data_science/schema_agent/schema_agent_report.py index 2c377322e..e46a34b07 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_report.py +++ b/research/agentic_data_science/schema_agent/schema_agent_report.py @@ -14,6 +14,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin _LOG = hloggin.getLogger(__name__) @@ -42,6 +43,9 @@ def build_column_profiles( """ profiles: typing.List[typing.Dict[str, typing.Any]] = [] + hdbg.dassert_isinstance(df, pd.DataFrame) + hdbg.dassert_isinstance(stats, dict) + hdbg.dassert_isinstance(insights, dict) numeric_summary = stats.get("numeric_summary", {}) categorical_stats = stats.get("categorical_distributions", {}) datetime_meta = stats.get("datetime_columns", {}) @@ -115,6 +119,11 @@ def merge_and_export_results( output_path : str """ _LOG.info("Merging results...") + hdbg.dassert_isinstance(stats, dict) + hdbg.dassert_isinstance(insights, dict) + hdbg.dassert_isinstance(column_profiles, list) + hdbg.dassert_isinstance(output_path, str) + hdbg.dassert(output_path, "output_path must be a non-empty string.") serializable_stats = _make_serializable(stats) final_report = { @@ -167,6 +176,11 @@ def _clean(val: typing.Any) -> str: return "" return str(val).replace("|", "\\|").replace("\n", " ") + hdbg.dassert_isinstance(column_profiles, list) + hdbg.dassert_lt(0, len(column_profiles), "column_profiles must be non-empty.") 
+ hdbg.dassert_isinstance(output_path, str) + hdbg.dassert(output_path, "output_path must be a non-empty string.") + def _fmt(val: typing.Any) -> str: if isinstance(val, int): return str(val) @@ -215,4 +229,4 @@ def _fmt(val: typing.Any) -> str: with open(output_path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n") - _LOG.info("Exported Markdown report to '%s'.", output_path) + _LOG.info("Exported Markdown report to '%s'.", output_path) \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_stats.py b/research/agentic_data_science/schema_agent/schema_agent_stats.py index 213c3d25f..24ab40857 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_stats.py +++ b/research/agentic_data_science/schema_agent/schema_agent_stats.py @@ -13,6 +13,7 @@ import pandas as pd +import helpers.hdbg as hdbg import helpers.hlogging as hloggin import helpers.hpandas_stats as hpanstat @@ -61,6 +62,8 @@ def compute_llm_agent_stats( numeric_summary. """ metrics = _resolve_metrics(metrics) + hdbg.dassert_isinstance(tag_to_df, dict) + hdbg.dassert_lt(0, len(tag_to_df), "tag_to_df must be non-empty.") dataframe_stats: typing.Dict[str, typing.Any] = {} # 1. 
Temporal boundaries @@ -98,14 +101,18 @@ def compute_llm_agent_stats( dataframe_stats["categorical_distributions"] = {} if categorical_cols_map: for tag, cols in categorical_cols_map.items(): - if tag not in tag_to_df: - _LOG.warning("Tag '%s' not found in tag_to_df; skipping.", tag) - continue + hdbg.dassert_in( + tag, tag_to_df, "Tag '%s' not found in tag_to_df.", tag + ) dataframe_stats["categorical_distributions"][tag] = {} for col in cols: - if col not in tag_to_df[tag].columns: - _LOG.warning("Column '%s' not in '%s'; skipping.", col, tag) - continue + hdbg.dassert_in( + col, + tag_to_df[tag].columns, + "Column '%s' not found in dataset '%s'.", + col, + tag, + ) dist = hpanstat.get_value_counts_stats_df(tag_to_df[tag], col) dataframe_stats["categorical_distributions"][tag][col] = dist print( @@ -149,4 +156,4 @@ def _resolve_metrics( VALID_METRICS, ) resolved = [m for m in metrics if m in VALID_METRICS] - return resolved if resolved else DEFAULT_METRICS + return resolved if resolved else DEFAULT_METRICS \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb new file mode 100644 index 000000000..6104a4038 --- /dev/null +++ b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb @@ -0,0 +1,371 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "49bffb6b-9c87-4d5a-a32a-a2382c2b700c", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import argparse\n", + "import logging\n", + "import os\n", + "import typing\n", + "\n", + "import dotenv\n", + "import pandas as pd\n", + "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", + "import schema_agent_loader as radsasal\n", + "import schema_agent_report as radsasar\n", + "import schema_agent_stats as radsasas\n", + "\n", + "import helpers.hdbg as hdbg\n", + "import 
helpers.hlogging as hloggin\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n", + "WARNING: Running in Jupyter\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", + "Skipping duration stats: 'int' object has no attribute 'tzinfo'\n", + "Quality report failed for 'ecommerce_data': 'RangeIndex' object has no attribute 'date'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct end_download_timestamp\n", + "0 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", + "1 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "99998 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", + "99999 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", + "\n", + "=== Distribution: ecommerce_data / country ===\n", + " count pct [%]\n", + "country \n", + "United Kingdom 64417 64.417\n", + "Ireland 8507 8.507\n", + "Germany 7654 7.654\n", + "France 5470 5.470\n", + "Netherlands 2729 2.729\n", + "Spain 1235 1.235\n", + "Switzerland 1170 1.170\n", + "Belgium 1037 1.037\n", + "Portugal 984 0.984\n", + "Sweden 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / country_code ===\n", + " count pct [%]\n", + "country_code \n", + "GBR 64417 64.417\n", + "IRL 8507 8.507\n", + "DEU 7654 7.654\n", + "FRA 5470 5.470\n", + "NLD 2729 2.729\n", + "ESP 1235 1.235\n", + "CHE 1170 1.170\n", + "BEL 1037 1.037\n", + "PRT 984 0.984\n", + "SWE 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / product_id ===\n", + " count pct [%]\n", + "product_id \n", + "POST 731 0.731\n", + "85123A 615 0.615\n", + "21212 438 0.438\n", + "22423 437 0.437\n", + "85099B 391 0.391\n", + "20725 334 0.334\n", + "84991 298 0.298\n", + "20914 295 0.295\n", + "21232 295 0.295\n", + "84879 285 0.285\n", + "\n", + "=== Numeric Summary: ecommerce_data ===\n", + " mean std min median max\n", + "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", + "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", + "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", + "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", + "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", + "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", + "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", + "unit_price_gbp 
3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", + "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", + "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", + "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", + "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", + "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", + "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", + "12:06:12 rss=0.234GB vms=1.655GB mem_pct=2% cpu=100% - \u001b[36mINFO \u001b[0m Task-43 schema_agent.py run_pipeline:107 LLM will profile 3 / 18 columns (scope=semantic).\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
02009-12-01 07:45:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
12009-12-01 07:45:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
22009-12-01 09:06:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
32009-12-01 09:06:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
42009-12-01 09:06:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", + "
" + ], + "text/plain": [ + " order_datetime year month week_of_year day_of_week order_hour \\\n", + "0 2009-12-01 07:45:00 2009 12 49 1 7 \n", + "1 2009-12-01 07:45:00 2009 12 49 1 7 \n", + "2 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "3 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "4 2009-12-01 09:06:00 2009 12 49 1 9 \n", + "\n", + " is_weekend country country_code product_id customer_id \\\n", + "0 0 United Kingdom GBR 21523 13085 \n", + "1 0 United Kingdom GBR 79323W 13085 \n", + "2 0 United Kingdom GBR 82582 13078 \n", + "3 0 United Kingdom GBR 22111 13078 \n", + "4 0 United Kingdom GBR 21756 13078 \n", + "\n", + " unit_price_gbp quantity_sold sales_amount_gbp population_total \\\n", + "0 5.95 10 59.50 62276270.0 \n", + "1 6.75 12 81.00 62276270.0 \n", + "2 2.10 12 25.20 62276270.0 \n", + "3 4.25 24 102.00 62276270.0 \n", + "4 5.95 3 17.85 62276270.0 \n", + "\n", + " gdp_current_usd gdp_growth_pct inflation_consumer_pct \n", + "0 2.412840e+12 -17.633976 1.89709 \n", + "1 2.412840e+12 -17.633976 1.89709 \n", + "2 2.412840e+12 -17.633976 1.89709 \n", + "3 2.412840e+12 -17.633976 1.89709 \n", + "4 2.412840e+12 -17.633976 1.89709 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Make sure this is at the top of your notebook\n", + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import schema_agent as radsasag\n", + "\n", + "# Now run the pipeline\n", + "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", + "tags = [\"ecommerce_data\"]\n", + "\n", + "tag_to_df, stats = radsasag.run_pipeline(\n", + " csv_paths=csv_files,\n", + " tags=tags,\n", + " model=\"gpt-4o\",\n", + " llm_scope=\"semantic\"\n", + ")\n", + "\n", + "display(tag_to_df[\"ecommerce_data\"].head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36f51050-4ead-49dc-9fec-cd430f24de6f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + 
"language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/utils.sh b/research/agentic_data_science/schema_agent/utils.sh new file mode 100644 index 000000000..67426f5d5 --- /dev/null +++ b/research/agentic_data_science/schema_agent/utils.sh @@ -0,0 +1,504 @@ +#!/bin/bash +# """ +# Utility functions for Docker container management. +# """ + + +# ############################################################################# +# General utilities +# ############################################################################# + + +run() { + # """ + # Execute a command with echo output. + # + # :param cmd: Command string to execute + # :return: Exit status of the executed command + # """ + cmd="$*" + echo "> $cmd" + eval "$cmd" +} + + +enable_verbose_mode() { + # """ + # Enable shell command tracing (set -x) when VERBOSE is set to 1. + # + # Reads the VERBOSE variable set by parse_docker_jupyter_args. + # Call this after parsing args to activate tracing for the rest of the script. + # """ + if [[ $VERBOSE == 1 ]]; then + set -x + fi +} + + +# ############################################################################# +# Argument parsing +# ############################################################################# + + +_print_default_help() { + # """ + # Print usage information and available default options for docker scripts. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Options:" + echo " -h Print this help message and exit" + echo " -v Enable verbose output (set -x)" +} + + +parse_default_args() { + # """ + # Parse default command-line arguments for docker scripts. 
+ # + # Sets VERBOSE variable in the caller's scope and enables set -x when -v + # is passed. Prints help and exits when -h is passed. + # Updates OPTIND so the caller can shift away processed arguments. + # + # :param @: command-line arguments forwarded from the calling script + # """ + VERBOSE=0 + while getopts "hv" flag; do + case "${flag}" in + h) _print_default_help; exit 0;; + v) VERBOSE=1;; + *) _print_default_help; exit 1;; + esac + done + enable_verbose_mode +} + + +_print_docker_jupyter_help() { + # """ + # Print usage information and available options for docker_jupyter.sh. + # """ + echo "Usage: $(basename $0) [options]" + echo "" + echo "Launch Jupyter Lab inside a Docker container." + echo "" + echo "Options:" + echo " -h Print this help message and exit" + echo " -p PORT Host port to forward to Jupyter Lab (default: 8888)" + echo " -u Enable vim keybindings in Jupyter Lab" + echo " -v Enable verbose output (set -x)" +} + + +parse_docker_jupyter_args() { + # """ + # Parse command-line arguments for docker_jupyter.sh. + # + # Sets JUPYTER_HOST_PORT, JUPYTER_USE_VIM, TARGET_DIR, VERBOSE, and + # OLD_CMD_OPTS in the caller's scope. Enables set -x when -v is passed. + # Prints help and exits when -h is passed. + # + # :param @: command-line arguments forwarded from the calling script + # """ + # Set defaults. + JUPYTER_HOST_PORT=8888 + JUPYTER_USE_VIM=0 + VERBOSE=0 + # Save original args to pass through to run_jupyter.sh. + OLD_CMD_OPTS="$*" + # Parse options. + while getopts "hp:uv" flag; do + case "${flag}" in + h) _print_docker_jupyter_help; exit 0;; + p) JUPYTER_HOST_PORT=${OPTARG};; # Port for Jupyter Lab. + u) JUPYTER_USE_VIM=1;; # Enable vim bindings. + v) VERBOSE=1;; # Enable verbose output. + *) _print_docker_jupyter_help; exit 1;; + esac + done + # Enable command tracing if verbose mode is requested. 
+    enable_verbose_mode
+}
+
+
+# #############################################################################
+# Docker image management
+# #############################################################################
+
+
+get_docker_vars_script() {
+    # """
+    # Load Docker variables from docker_name.sh script.
+    #
+    # :param script_path: Path to the script to determine the Docker configuration directory
+    # :return: Sources REPO_NAME, IMAGE_NAME, and FULL_IMAGE_NAME variables
+    # """
+    local script_path=$1
+    # Find the name of the container.
+    SCRIPT_DIR=$(dirname $script_path)
+    DOCKER_NAME="$SCRIPT_DIR/docker_name.sh"
+    if [[ ! -e $DOCKER_NAME ]]; then
+        echo "Can't find $DOCKER_NAME"
+        exit -1
+    fi;
+    source $DOCKER_NAME
+}
+
+
+print_docker_vars() {
+    # """
+    # Print current Docker variables to stdout.
+    # """
+    echo "REPO_NAME=$REPO_NAME"
+    echo "IMAGE_NAME=$IMAGE_NAME"
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+}
+
+
+build_container_image() {
+    # """
+    # Build a Docker container image.
+    #
+    # Supports both single-architecture and multi-architecture builds.
+    # Creates temporary build directory, copies files, and builds the image.
+    #
+    # :param @: Additional options to pass to docker build/buildx build
+    # """
+    echo "# ${FUNCNAME[0]} ..."
+    FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME
+    echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME"
+    # Prepare build area.
+    #tar -czh . | docker build $OPTS -t $IMAGE_NAME -
+    DIR="../tmp.build"
+    if [[ -d $DIR ]]; then
+        rm -rf $DIR
+    fi;
+    cp -Lr . $DIR || true
+    # Build container.
+    echo "DOCKER_BUILDKIT=$DOCKER_BUILDKIT"
+    echo "DOCKER_BUILD_MULTI_ARCH=$DOCKER_BUILD_MULTI_ARCH"
+    if [[ $DOCKER_BUILD_MULTI_ARCH != 1 ]]; then
+        # Build for a single architecture.
+        echo "Building for current architecture..."
+        OPTS="--progress plain $@"
+        (cd $DIR; docker build $OPTS -t $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]})
+    else
+        # Build for multiple architectures.
+        echo "Building for multiple architectures..."
+ OPTS="$@" + export DOCKER_CLI_EXPERIMENTAL=enabled + # Create a new builder. + #docker buildx rm --all-inactive --force + #docker buildx create --name mybuilder + #docker buildx use mybuilder + # Use the default builder. + docker buildx use multiarch + docker buildx inspect --bootstrap + # Note that one needs to push to the repo since otherwise it is not + # possible to keep multiple. + (cd $DIR; docker buildx build --push --platform linux/arm64,linux/amd64 $OPTS --tag $FULL_IMAGE_NAME . 2>&1 | tee ../docker_build.log; exit ${PIPESTATUS[0]}) + # Report the status. + docker buildx imagetools inspect $FULL_IMAGE_NAME + fi; + # Report build version. + if [ -f docker_build.version.log ]; then + rm docker_build.version.log + fi + (cd $DIR; docker run --rm -it -v $(pwd):/data $FULL_IMAGE_NAME bash -c "/data/version.sh") 2>&1 | tee docker_build.version.log + # + docker image ls $REPO_NAME/$IMAGE_NAME + rm -rf $DIR + echo "*****************************" + echo "SUCCESS" + echo "*****************************" +} + + +remove_container_image() { + # """ + # Remove Docker container image(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker image ls | grep $FULL_IMAGE_NAME + docker image ls | grep $FULL_IMAGE_NAME | awk '{print $1}' | xargs -n 1 -t docker image rm -f + docker image ls + echo "${FUNCNAME[0]} ... done" +} + + +push_container_image() { + # """ + # Push Docker container image to registry. + # + # Authenticates using credentials from ~/.docker/passwd.$REPO_NAME.txt. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker login --username $REPO_NAME --password-stdin <~/.docker/passwd.$REPO_NAME.txt + docker images $FULL_IMAGE_NAME + docker push $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... 
done" +} + + +pull_container_image() { + # """ + # Pull Docker container image from registry. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker pull $FULL_IMAGE_NAME + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker container management +# ############################################################################# + + +kill_container() { + # """ + # Kill and remove Docker container(s) matching the current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + if [[ ! -z $CONTAINER_ID ]]; then + docker container rm -f $CONTAINER_ID + docker container ls + fi; + echo "${FUNCNAME[0]} ... done" +} + + +exec_container() { + # """ + # Execute bash shell in running Docker container. + # + # Opens an interactive bash session in the first container matching the + # current configuration. + # """ + echo "# ${FUNCNAME[0]} ..." + FULL_IMAGE_NAME=$REPO_NAME/$IMAGE_NAME + echo "FULL_IMAGE_NAME=$FULL_IMAGE_NAME" + docker container ls + # + CONTAINER_ID=$(docker container ls -a | grep $FULL_IMAGE_NAME | awk '{print $1}') + echo "CONTAINER_ID=$CONTAINER_ID" + docker exec -it $CONTAINER_ID bash + echo "${FUNCNAME[0]} ... done" +} + + +# ############################################################################# +# Docker common options +# ############################################################################# + + +get_docker_common_options() { + # """ + # Return docker run options common to all container types. + # + # Includes volume mount for the git root, plus environment variables for + # PYTHONPATH and host OS name. 
+ # + # :return: docker run options string with volume mounts and env vars + # """ + echo "-v $GIT_ROOT:/git_root \ + -e PYTHONPATH=/git_root:/git_root/helpers_root:/git_root/msml610/tutorials \ + -e CSFY_GIT_ROOT_PATH=/git_root \ + -e CSFY_HOST_OS_NAME=$(uname -s) \ + -e CSFY_HOST_NAME=$(uname -n)" +} + + +# ############################################################################# +# Docker bash +# ############################################################################# + + +get_docker_bash_command() { + # """ + # Return the base docker run command for an interactive bash shell. + # + # :return: docker run command string with --rm and -ti flags + # """ + if [ -t 0 ]; then + echo "docker run --rm -ti" + else + echo "docker run --rm -i" + fi +} + + +get_docker_bash_options() { + # """ + # Return docker run options for a Docker container. + # + # :param container_name: Name for the Docker container + # :param port: Port number to forward (optional, skipped if empty) + # :param extra_opts: Additional docker run options (optional) + # :return: docker run options string with name, volume mounts, and env vars + # """ + local container_name=$1 + local port=$2 + local extra_opts=$3 + local port_opt="" + if [[ -n $port ]]; then + port_opt="-p $port:$port" + fi + echo "--name $container_name \ + $port_opt \ + $extra_opts \ + $(get_docker_common_options)" +} + + +# ############################################################################# +# Docker cmd +# ############################################################################# + + +get_docker_cmd_command() { + # """ + # Return the base docker run command for executing a non-interactive command. 
+ # + # :return: docker run command string with --rm and -i flags + # """ + echo "docker run --rm -i" +} + + +# ############################################################################# +# Docker Jupyter +# ############################################################################# + + +get_docker_jupyter_command() { + # """ + # Return the base docker run command for running Jupyter Lab interactively. + # + # :return: docker run command string with --rm and -ti flags + # """ + echo "docker run --rm -ti" +} + + +get_docker_jupyter_options() { + # """ + # Return docker run options for a Jupyter Lab container. + # + # :param container_name: Name for the Docker container + # :param host_port: Host port to forward to container port 8888 + # :param jupyter_use_vim: 0 or 1 to enable vim bindings + # :return: docker run options string + # """ + local container_name=$1 + local host_port=$2 + local jupyter_use_vim=$3 + # Run as the current user when user is saggese. + if [[ "$(whoami)" == "saggese" ]]; then + echo "Overwriting jupyter_use_vim since user='saggese'" + jupyter_use_vim=1 + fi + echo "--name $container_name \ + -p $host_port:8888 \ + $(get_docker_common_options) \ + -e JUPYTER_USE_VIM=$jupyter_use_vim" +} + + +configure_jupyter_vim_keybindings() { + # """ + # Configure JupyterLab vim keybindings based on JUPYTER_USE_VIM env var. + # + # Reads JUPYTER_USE_VIM; if 1, verifies jupyterlab_vim is installed and + # writes enabled settings; otherwise writes disabled settings. + # """ + mkdir -p ~/.jupyter/lab/user-settings/@axlair/jupyterlab_vim + if [[ $JUPYTER_USE_VIM == 1 ]]; then + # Check that jupyterlab_vim is installed before trying to enable it. + if ! pip show jupyterlab_vim > /dev/null 2>&1; then + echo "ERROR: jupyterlab_vim is not installed but vim bindings were requested." + echo "Install it with: pip install jupyterlab_vim" + exit 1 + fi + echo "Enabling vim." 
+        cat >~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings <<'EOF'
+{
+  "enabled": true,
+  "enabledInEditors": true,
+  "extraKeybindings": []
+}
+EOF
+    else
+        echo "Disabling vim."
+        cat >~/.jupyter/lab/user-settings/\@axlair/jupyterlab_vim/plugin.jupyterlab-settings <<'EOF'
+{
+  "enabled": false,
+  "enabledInEditors": false,
+  "extraKeybindings": []
+}
+EOF
+    fi;
+}
+
+
+configure_jupyter_notifications() {
+    # """
+    # Disable JupyterLab news fetching and update checks.
+    # """
+    mkdir -p ~/.jupyter/lab/user-settings/@jupyterlab/apputils-extension
+    cat >~/.jupyter/lab/user-settings/\@jupyterlab/apputils-extension/notification.jupyterlab-settings <<'EOF'
+{
+  // Notifications
+  // @jupyterlab/apputils-extension:notification
+  // Notifications settings.
+
+  // Fetch official Jupyter news
+  // Whether to fetch news from the Jupyter news feed. If Always (`true`), it will make a request to a website.
+  "fetchNews": "false",
+  "checkForUpdates": false
+}
+EOF
+}
+
+
+get_jupyter_args() {
+    # """
+    # Print the standard Jupyter Lab command-line arguments.
+    #
+    # :return: space-separated Jupyter Lab args for port 8888 with no browser,
+    #   allow root, and no authentication
+    # """
+    echo "--port=8888 --no-browser --ip=0.0.0.0 --allow-root --ServerApp.token='' --ServerApp.password=''"
+}
+
+
+get_run_jupyter_cmd() {
+    # """
+    # Return the command to run run_jupyter.sh inside a container.
+    #
+    # Computes the script's path relative to GIT_ROOT and builds the
+    # corresponding /git_root/... path used inside the container.
+ # + # :param script_path: path of the calling script (pass ${BASH_SOURCE[0]}) + # :param cmd_opts: options to forward to run_jupyter.sh + # :return: full command string to run run_jupyter.sh + # """ + local script_path=$1 + local cmd_opts=$2 + local script_dir + script_dir=$(cd "$(dirname "$script_path")" && pwd) + local rel_dir="${script_dir#${GIT_ROOT}/}" + echo "/git_root/${rel_dir}/run_jupyter.sh $cmd_opts" +} diff --git a/research/agentic_data_science/schema_agent/version.sh b/research/agentic_data_science/schema_agent/version.sh new file mode 100755 index 000000000..c46ed254c --- /dev/null +++ b/research/agentic_data_science/schema_agent/version.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# """ +# Display versions of installed tools and packages. +# +# This script prints version information for Python, pip, Jupyter, and all +# installed Python packages. Used for debugging and documentation purposes +# to verify the Docker container environment setup. +# """ + +# Display Python 3 version. +echo "# Python3" +python3 --version + +# Display pip version. +echo "# pip3" +pip3 --version + +# Display Jupyter version. +echo "# jupyter" +jupyter --version + +# List all installed Python packages and their versions. +echo "# Python packages" +pip3 list + +# Template for adding additional tool versions. 
+# echo "# mongo" +# mongod --version From 211344925bd59eb404b77254e6bb066670d8a676 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Tue, 7 Apr 2026 12:55:57 -0400 Subject: [PATCH 09/14] Update schema_agent.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent.py | 58 ++++++++++++------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index e5f995b56..356f8f157 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -4,10 +4,10 @@ Main pipeline and CLI orchestration for end-to-end data profiling. Usage: - python schema_agent.py data.csv - python schema_agent.py data.csv --model gpt-4o-mini --llm-scope nulls - python schema_agent.py data.csv --metrics mean std min max --output-json out.json - python schema_agent.py data.csv data2.csv --tags sales inventory + ./schema_agent.py data.csv + ./schema_agent.py data.csv --model gpt-4o-mini --llm-scope nulls + ./schema_agent.py data.csv --metrics mean std min max --output-json out.json + ./schema_agent.py data.csv data2.csv --tags sales inventory Import as: @@ -19,11 +19,10 @@ import os import sys import typing -import pytz import dotenv import pandas as pd -import schema_agent_hllmcli as radsasah +import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah import schema_agent_loader as radsasal import schema_agent_report as radsasar import schema_agent_stats as radsasas @@ -38,13 +37,11 @@ dotenv.load_dotenv() api_key = os.environ.get("OPENAI_API_KEY") -# Use dassert to ensure the API key exists hdbg.dassert(api_key, "OPENAI_API_KEY not found in environment.") _LOG = hloggin.getLogger(__name__) _LOG.setLevel(logging.DEBUG) -# Ensure sys is imported for the handler 
console_handler = logging.StreamHandler(sys.stdout) hloggin.set_v2_formatter( ch=console_handler, @@ -74,23 +71,47 @@ def run_pipeline( ) -> typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]]: """ Execute the full data profiling pipeline over one or more CSV files. + + :param csv_paths: One or more CSV file paths to profile. + :type csv_paths: typing.List[str] + :param tags: Human-readable tag for each CSV. Defaults to filename stems. + :type tags: typing.Optional[typing.List[str]] + :param model: LLM model name passed to OpenAI / hllmcli. + :type model: str + :param metrics: Numeric metrics to include. Defaults to DEFAULT_METRICS. + :type metrics: typing.Optional[typing.List[str]] + :param llm_scope: "all", "semantic", or "nulls" — controls which columns are LLM-profiled. + :type llm_scope: str + :param output_json: Path for the merged JSON report. + :type output_json: str + :param output_md: Path for the Markdown summary. + :type output_md: str + :param use_langchain: Use LangChain chain instead of hllmcli for LLM calls. + :type use_langchain: bool + :return: A tuple containing a dict of tag -> df mappings, and a stats dict. 
+ :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.Any]] """ + hdbg.dassert_isinstance(csv_paths, list) + hdbg.dassert_lt(0, len(csv_paths), "csv_paths must not be empty.") + if tags is None: tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] - # Use dassert_eq to check that the number of tags matches files hdbg.dassert_eq( len(tags), len(csv_paths), - msg="Number of tags must match number of CSV paths" + "Length of tags (%d) must match csv_paths (%d).", + len(tags), + len(csv_paths) ) # --- Load & type-coerce --- tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - # Merge datetime metadata - combined_for_dt = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - _, datetime_meta = radsasal.infer_and_convert_datetime_columns(combined_for_dt) + # Merge datetime metadata across all DataFrames + _, datetime_meta = radsasal.infer_and_convert_datetime_columns( + pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + ) # --- Compute stats --- stats = radsasas.compute_llm_agent_stats( @@ -103,7 +124,6 @@ def run_pipeline( # --- LLM scope --- combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) - _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -127,10 +147,7 @@ def run_pipeline( ) # --- Build column profiles --- - # Ensure tag_to_df is not empty before accessing - hdbg.dassert(tag_to_df, "No dataframes were loaded.") primary_df = list(tag_to_df.values())[0] - column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -152,6 +169,7 @@ def run_pipeline( return tag_to_df, stats + # ============================================================================= # CLI # ============================================================================= @@ -232,9 +250,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: def main() -> 
None: """ - CLI entry point. - - Parses arguments and delegates to run_pipeline(). + CLI entry point. Parses arguments and delegates to run_pipeline(). """ parser = _build_arg_parser() args = parser.parse_args() @@ -251,4 +267,4 @@ def main() -> None: if __name__ == "__main__": - main() + main() \ No newline at end of file From cd350f32cceaac568aecb9f3eab917abcacf6ef7 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 8 Apr 2026 12:20:07 -0400 Subject: [PATCH 10/14] Add API.ipynb, example.ipynb files and update datetime function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/schema_agent.py | 29 +- .../schema_agent/schema_agent_API.ipynb | 394 ++++++++++++++++ .../schema_agent/schema_agent_example.ipynb | 435 ++++++++++++++++++ .../schema_agent/schema_agent_loader.py | 147 +++--- .../schema_agent/schema_agent_stats.py | 17 +- .../schema_agent/scmea_agent_example.ipynb | 371 --------------- 6 files changed, 911 insertions(+), 482 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/schema_agent_API.ipynb create mode 100644 research/agentic_data_science/schema_agent/schema_agent_example.ipynb delete mode 100644 research/agentic_data_science/schema_agent/scmea_agent_example.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py index 356f8f157..8dec89e21 100644 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -23,9 +23,9 @@ import dotenv import pandas as pd import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah -import schema_agent_loader as radsasal -import schema_agent_report as radsasar -import schema_agent_stats as radsasas +import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal +import 
research.agentic_data_science.schema_agent.schema_agent_report as radsasar +import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas import helpers.hdbg as hdbg import helpers.hlogging as hloggin @@ -106,24 +106,28 @@ def run_pipeline( ) # --- Load & type-coerce --- - tag_to_df, cat_cols_map = radsasal.prepare_dataframes(csv_paths, tags) - - # Merge datetime metadata across all DataFrames - _, datetime_meta = radsasal.infer_and_convert_datetime_columns( - pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) - ) + # UPDATED: We now capture datetime_meta during loading to ensure timezone + # consistency and avoid re-inference warnings. + tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags) # --- Compute stats --- + # The stats module now handles DatetimeIndex and filters out timestamp columns + # from math operations to prevent 'abs()' errors. stats = radsasas.compute_llm_agent_stats( tag_to_df, categorical_cols_map=cat_cols_map, metrics=metrics, ) + + # Inject captured datetime metadata into the stats object for the LLM. stats["datetime_columns"] = datetime_meta # --- LLM scope --- - combined_df = pd.concat(list(tag_to_df.values()), axis=0, ignore_index=True) + # Combine dataframes for column selection logic. + # Note: We preserve the DatetimeIndex by not using ignore_index=True. + combined_df = pd.concat(list(tag_to_df.values()), axis=0) columns_for_llm = radsasah._select_columns_for_llm(combined_df, scope=llm_scope) + _LOG.info( "LLM will profile %d / %d columns (scope=%s).", len(columns_for_llm), @@ -147,7 +151,8 @@ def run_pipeline( ) # --- Build column profiles --- - primary_df = list(tag_to_df.values())[0] + # We use the primary dataframe (first tag) as the template for the profile. 
+ primary_df = tag_to_df[tags[0]] column_profiles = radsasar.build_column_profiles( df=primary_df, stats=stats, @@ -161,6 +166,7 @@ def run_pipeline( column_profiles=column_profiles, output_path=output_json, ) + radsasar.export_markdown_from_profiles( column_profiles, numeric_stats=stats.get("numeric_summary", {}), @@ -169,7 +175,6 @@ def run_pipeline( return tag_to_df, stats - # ============================================================================= # CLI # ============================================================================= diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.ipynb b/research/agentic_data_science/schema_agent/schema_agent_API.ipynb new file mode 100644 index 000000000..4845fc9e1 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_API.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8881f77e-d668-4210-b5c5-06fad5f80608", + "metadata": {}, + "source": [ + "# API usage Notebook \n", + "- This notebook shows the implementation of each function from the respective libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3d4af97d-2052-4791-8b80-f9fa973b8233", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import dotenv\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "# Load environment variables (ensure OPENAI_API_KEY is set in your .env)\n", + "dotenv.load_dotenv()\n", + "\n", + "# Import the schema agent modules\n", + "import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal\n", + "import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas\n", + "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", + "import research.agentic_data_science.schema_agent.schema_agent_report as radsasar" + ] + }, + { + "cell_type": "markdown", + "id": "20ab6884-fddf-4205-8bb7-eab2704f6f1d", + "metadata": {}, + "source": [ + "## 1. Create dummy Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1ada4cd7-10bb-45c5-be2e-7cce4a5b4de1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created dummy dataset at: dummy_employees.csv\n" + ] + } + ], + "source": [ + "# 1. 
Create a dummy dataset\n", + "np.random.seed(42)\n", + "num_rows = 100\n", + "\n", + "dummy_data = pd.DataFrame({\n", + " \"employee_id\": range(1000, 1000 + num_rows),\n", + " \"department\": np.random.choice([\"Engineering\", \"Sales\", \"HR\", \"Marketing\"], num_rows),\n", + " \"salary\": np.random.normal(85000, 20000, num_rows),\n", + " \"satisfaction_score\": np.random.uniform(1.0, 5.0, num_rows),\n", + " \"hire_date\": pd.date_range(start=\"2018-01-01\", periods=num_rows, freq=\"W\").astype(str),\n", + " \"notes\": [\"Good performance\"] * 50 + [None] * 50 # 50% nulls\n", + "})\n", + "\n", + "# Inject some missing values into salary\n", + "dummy_data.loc[10:20, \"salary\"] = np.nan\n", + "\n", + "# Save to CSV\n", + "csv_path = \"dummy_employees.csv\"\n", + "dummy_data.to_csv(csv_path, index=False)\n", + "print(f\"Created dummy dataset at: {csv_path}\")\n", + "dummy_data.head()\n", + "\n", + "csv_paths = [csv_path]\n", + "tags = [\"dummy_employees\"]" + ] + }, + { + "cell_type": "markdown", + "id": "25a81ada-67c5-4a5e-a22e-453f2b222b06", + "metadata": {}, + "source": [ + "## 2. 
Load and Infer datatypes from the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e79d8059-c49d-438e-ad61-7a4c160370d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--- Loaded DataFrames ---\n", + "\n", + "DatetimeIndex: 100 entries, 2018-01-07 00:00:00+00:00 to 2019-12-01 00:00:00+00:00\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 employee_id 100 non-null int64 \n", + " 1 department 100 non-null str \n", + " 2 salary 89 non-null float64 \n", + " 3 satisfaction_score 100 non-null float64 \n", + " 4 hire_date 100 non-null datetime64[us, UTC]\n", + " 5 notes 50 non-null str \n", + "dtypes: datetime64[us, UTC](1), float64(2), int64(1), str(2)\n", + "memory usage: 5.5 KB\n", + "None\n", + "\n", + "--- Datetime Inference Metadata ---\n", + "{'hire_date': {'semantic_type': 'temporal', 'granularity': 'date', 'format': 'inferred', 'confidence': 1.0}}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" + ] + } + ], + "source": [ + "# 1. Load and prepare DataFrames - now receiving 3 variables\n", + "tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags)\n", + "\n", + "print(\"--- Loaded DataFrames ---\")\n", + "# The index will now show as a DatetimeIndex instead of a RangeIndex\n", + "print(tag_to_df[\"dummy_employees\"].info())\n", + "\n", + "# 2. 
Combine DataFrames while preserving the index\n", + "# We do NOT use ignore_index=True here because we want to keep the DatetimeIndex \n", + "# we just created in the loader.\n", + "updated_df = pd.concat(list(tag_to_df.values()), axis=0)\n", + "\n", + "print(\"\\n--- Datetime Inference Metadata ---\")\n", + "# This will now correctly show your temporal column info\n", + "print(datetime_meta)" + ] + }, + { + "cell_type": "markdown", + "id": "dc6c4466-d602-4c82-a2f6-b7ff7ada8e19", + "metadata": {}, + "source": [ + "## 3. Statistical Profiling" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1368b932-6299-43c1-a70c-c8e9489eb2b2", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Temporal Boundaries ===\n", + " min_index max_index min_valid_index max_valid_index\n", + "dummy_employees 2018-01-07 00:00:00+00:00 2019-12-01 00:00:00+00:00 2018-01-07 00:00:00+00:00 2018-12-16 00:00:00+00:00\n", + " employee_id salary satisfaction_score\n", + "2018-01-07 00:00:00+00:00 1000 99769.3316 4.268889\n", + "2018-01-14 00:00:00+00:00 1001 88427.365624 3.220803\n", + "... ... ... 
...\n", + "2019-11-24 00:00:00+00:00 1098 101270.344347 3.313121\n", + "2019-12-01 00:00:00+00:00 1099 60382.713671 1.143769\n", + " num_rows num_zeros zeros [%] num_nans nans [%] num_infs infs [%] num_valid valid [%]\n", + "employee_id 100 0 0.0 0 0.0 0 0.0 100 100.0\n", + "salary 100 0 0.0 11 11.0 0 0.0 89 89.0\n", + "satisfaction_score 100 0 0.0 0 0.0 0 0.0 100 100.0\n", + "\n", + "=== Quality Report: dummy_employees ===\n", + " num_rows num_zeros zeros [%] num_nans nans [%] \\\n", + "employee_id 100 0 0.0 0 0.0 \n", + "salary 100 0 0.0 11 11.0 \n", + "satisfaction_score 100 0 0.0 0 0.0 \n", + "\n", + " num_infs infs [%] num_valid valid [%] \n", + "employee_id 0 0.0 100 100.0 \n", + "salary 0 0.0 89 89.0 \n", + "satisfaction_score 0 0.0 100 100.0 \n", + "\n", + "=== Distribution: dummy_employees / department ===\n", + " count pct [%]\n", + "department \n", + "Marketing 30 30.0\n", + "Sales 26 26.0\n", + "HR 24 24.0\n", + "Engineering 20 20.0\n", + "\n", + "=== Distribution: dummy_employees / notes ===\n", + " count pct [%]\n", + "notes \n", + "Good performance 50 50.0\n", + "\n", + "=== Numeric Summary: dummy_employees ===\n", + " mean std min max\n", + "employee_id 1049.500000 29.011492 1000.000000 1099.000000\n", + "salary 83981.174276 19304.098590 32605.097918 134264.842250\n", + "satisfaction_score 3.197062 1.163419 1.020246 4.960215\n", + "\n", + "--- Stats Computation Complete ---\n", + "Calculated stats for tags: ['dummy_employees']\n" + ] + } + ], + "source": [ + "# We pass the metadata we just generated into the stats function\n", + "stats = radsasas.compute_llm_agent_stats(\n", + " tag_to_df=tag_to_df,\n", + " categorical_cols_map=cat_cols_map,\n", + " metrics=[\"mean\", \"std\", \"min\", \"max\"]\n", + ")\n", + "\n", + "# Manually ensure the datetime_columns key is populated for the LLM\n", + "stats[\"datetime_columns\"] = datetime_meta\n", + "\n", + "print(\"\\n--- Stats Computation Complete ---\")\n", + "print(f\"Calculated stats for tags: 
{list(stats['numeric_summary'].keys())}\")" + ] + }, + { + "cell_type": "markdown", + "id": "41cf298b-2e84-4d82-8073-79f7f0c07277", + "metadata": {}, + "source": [ + "## 4. Call LLM for column type inferencing" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "db84d7ee-715d-464a-94e9-fd05261e36a4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cache hit for apply_llm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected columns for LLM: ['employee_id', 'department', 'salary', 'satisfaction_score', 'hire_date', 'notes']\n", + "\n", + "--- LLM Prompt Snippet ---\n", + "You are a Senior Data Scientist and Domain Expert.\n", + "Analyze the provided dataset statistics and generate a profile for each column.\n", + "For each column, provide 2-3 testable hypotheses.\n", + "Example: 'Higher discount rates correlate with higher volume but lower margins.'\n", + "\n", + "--- DATASET STATISTICS ---\n", + "\n", + "Detected Datetime Columns:\n", + "{\n", + " \"hire_date\": {\n", + " \"semantic_type\": \"temporal\",\n", + " \"granularity\": \"date\",\n", + " \"format\": \"inferred\",\n", + " \"confidence\": 1.0\n", + " }\n", + "}\n", + "\n", + "Dataset [dummy_employees] Numeric Summary:\n", + " \n", + "...\n", + "\n", + "--- LLM Insights Retrieved Successfully ---\n" + ] + } + ], + "source": [ + "# 1. Select columns (e.g., let's just send everything)\n", + "columns_for_llm = radsasah._select_columns_for_llm(updated_df, scope=\"all\")\n", + "print(f\"Selected columns for LLM: {columns_for_llm}\\n\")\n", + "\n", + "# 2. Build the exact prompt string that goes to the LLM\n", + "prompt_text = radsasah.build_llm_prompt(stats, columns_to_include=columns_for_llm)\n", + "print(\"--- LLM Prompt Snippet ---\")\n", + "print(prompt_text[:500] + \"\\n...\\n\")\n", + "\n", + "# 3. 
Call the LLM to generate hypotheses (using gpt-4o as default)\n", + "# If you don't have an API key configured, you can mock this response by creating a static dict.\n", + "try:\n", + " semantic_insights = radsasah.generate_hypotheses_via_cli(\n", + " stats=stats,\n", + " model=\"gpt-4o\",\n", + " columns_to_include=columns_for_llm\n", + " )\n", + " print(\"--- LLM Insights Retrieved Successfully ---\")\n", + "except Exception as e:\n", + " print(f\"LLM call failed (Check API key): {e}\")\n", + " semantic_insights = {\"columns\": {}} # Fallback empty dict" + ] + }, + { + "cell_type": "markdown", + "id": "0b497a08-1e1f-47b3-8f7d-8a7038488c4d", + "metadata": {}, + "source": [ + "## 5. Export to JSON and Markdown" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3cd0c87-5951-4da3-b2ca-22abecebe626", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Pipeline complete! Check your directory for:\n", + "1. dummy_profile_report.json\n", + "2. dummy_profile_summary.md\n" + ] + } + ], + "source": [ + "# 1. Build structured column profiles\n", + "primary_df = list(tag_to_df.values())[0]\n", + "column_profiles = radsasar.build_column_profiles(\n", + " df=primary_df,\n", + " stats=stats,\n", + " insights=semantic_insights\n", + ")\n", + "\n", + "# 2. Export to JSON\n", + "json_out = \"dummy_profile_report.json\"\n", + "radsasar.merge_and_export_results(\n", + " stats=stats,\n", + " insights=semantic_insights,\n", + " column_profiles=column_profiles,\n", + " output_path=json_out\n", + ")\n", + "\n", + "# 3. Export to Markdown\n", + "md_out = \"dummy_profile_summary.md\"\n", + "radsasar.export_markdown_from_profiles(\n", + " column_profiles=column_profiles,\n", + " numeric_stats=stats.get(\"numeric_summary\", {}),\n", + " output_path=md_out\n", + ")\n", + "\n", + "print(f\"\\nPipeline complete! Check your directory for:\")\n", + "print(f\"1. {json_out}\")\n", + "print(f\"2. 
{md_out}\")\n", + "\n", + "# Clean up dummy CSV if desired\n", + "# os.remove(csv_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb new file mode 100644 index 000000000..44b96491c --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b6e62e00-6cb3-45ef-8b7d-3a8ce84eb825", + "metadata": {}, + "source": [ + "# Schema Parser example \n", + "- This implementation in the notebook utilizes a suite of pre-existing functions to parse a single Excel (or CSV) file, automatically inferring data types and capturing temporal metadata for downstream analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Temporal Boundaries ===\n", + " min_index max_index min_valid_index max_valid_index\n", + "ecommerce_data 2009-12-01 07:45:00+00:00 2010-12-09 20:01:00+00:00 2009-12-01 07:45:00+00:00 2010-12-09 20:01:00+00:00\n", + " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct\n", + "2009-12-01 07:45:00+00:00 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709\n", + "2009-12-01 07:45:00+00:00 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "2010-12-09 20:01:00+00:00 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081\n", + "2010-12-09 20:01:00+00:00 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081\n", + " num_rows num_zeros zeros [%] num_nans nans [%] num_infs infs [%] num_valid valid [%]\n", + "year 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "month 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "... ... ... ... ... ... ... ... ... ...\n", + "gdp_growth_pct 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "inflation_consumer_pct 100000 0 0.0 0 0.0 0 0.0 100000 100.0\n", + "\n", + "=== Quality Report: ecommerce_data ===\n", + " num_rows num_zeros zeros [%] num_nans nans [%] \\\n", + "year 100000 0 0.0 0 0.0 \n", + "month 100000 0 0.0 0 0.0 \n", + "week_of_year 100000 0 0.0 0 0.0 \n", + "day_of_week 100000 16298 16.3 0 0.0 \n", + "order_hour 100000 0 0.0 0 0.0 \n", + "is_weekend 100000 84604 84.6 0 0.0 \n", + "customer_id 100000 0 0.0 0 0.0 \n", + "unit_price_gbp 100000 0 0.0 0 0.0 \n", + "quantity_sold 100000 0 0.0 0 0.0 \n", + "sales_amount_gbp 100000 0 0.0 0 0.0 \n", + "population_total 100000 0 0.0 0 0.0 \n", + "gdp_current_usd 100000 0 0.0 0 0.0 \n", + "gdp_growth_pct 100000 0 0.0 0 0.0 \n", + "inflation_consumer_pct 100000 0 0.0 0 0.0 \n", + "\n", + " num_infs infs [%] num_valid valid [%] \n", + "year 0 0.0 100000 100.0 \n", + "month 0 0.0 100000 100.0 \n", + "week_of_year 0 0.0 100000 100.0 \n", + "day_of_week 0 0.0 83702 83.7 \n", + "order_hour 0 0.0 100000 100.0 \n", + "is_weekend 0 0.0 15396 15.4 \n", + "customer_id 0 0.0 100000 100.0 \n", + "unit_price_gbp 0 0.0 100000 100.0 \n", + "quantity_sold 0 0.0 100000 100.0 \n", + "sales_amount_gbp 0 0.0 100000 100.0 \n", + "population_total 0 0.0 100000 100.0 \n", + "gdp_current_usd 0 0.0 100000 100.0 \n", + "gdp_growth_pct 0 0.0 100000 100.0 \n", + "inflation_consumer_pct 0 0.0 100000 100.0 \n", + "\n", + "=== Distribution: ecommerce_data / country ===\n", + " count pct [%]\n", 
+ "country \n", + "United Kingdom 64417 64.417\n", + "Ireland 8507 8.507\n", + "Germany 7654 7.654\n", + "France 5470 5.470\n", + "Netherlands 2729 2.729\n", + "Spain 1235 1.235\n", + "Switzerland 1170 1.170\n", + "Belgium 1037 1.037\n", + "Portugal 984 0.984\n", + "Sweden 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / country_code ===\n", + " count pct [%]\n", + "country_code \n", + "GBR 64417 64.417\n", + "IRL 8507 8.507\n", + "DEU 7654 7.654\n", + "FRA 5470 5.470\n", + "NLD 2729 2.729\n", + "ESP 1235 1.235\n", + "CHE 1170 1.170\n", + "BEL 1037 1.037\n", + "PRT 984 0.984\n", + "SWE 868 0.868\n", + "\n", + "=== Distribution: ecommerce_data / product_id ===\n", + " count pct [%]\n", + "product_id \n", + "POST 731 0.731\n", + "85123A 615 0.615\n", + "21212 438 0.438\n", + "22423 437 0.437\n", + "85099B 391 0.391\n", + "20725 334 0.334\n", + "84991 298 0.298\n", + "20914 295 0.295\n", + "21232 295 0.295\n", + "84879 285 0.285\n", + "\n", + "=== Numeric Summary: ecommerce_data ===\n", + " mean std min median max\n", + "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", + "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", + "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", + "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", + "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", + "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", + "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", + "unit_price_gbp 3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", + "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", + "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", + "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", + "gdp_current_usd 2.161193e+12 
1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", + "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", + "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", + "12:11:45 rss=0.267GB vms=1.690GB mem_pct=2% cpu=0% - \u001b[36mINFO \u001b[0m Task-193 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Cache hit for apply_llm\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
order_datetime
2009-12-01 07:45:00+00:002009-12-01 07:45:00+00:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
2009-12-01 07:45:00+00:002009-12-01 07:45:00+00:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
2009-12-01 09:06:00+00:002009-12-01 09:06:00+00:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", + "
" + ], + "text/plain": [ + " order_datetime year month \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 2009-12-01 07:45:00+00:00 2009 12 \n", + "2009-12-01 07:45:00+00:00 2009-12-01 07:45:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "2009-12-01 09:06:00+00:00 2009-12-01 09:06:00+00:00 2009 12 \n", + "\n", + " week_of_year day_of_week order_hour is_weekend \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 49 1 7 0 \n", + "2009-12-01 07:45:00+00:00 49 1 7 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "2009-12-01 09:06:00+00:00 49 1 9 0 \n", + "\n", + " country country_code product_id \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 United Kingdom GBR 21523 \n", + "2009-12-01 07:45:00+00:00 United Kingdom GBR 79323W \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 82582 \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 22111 \n", + "2009-12-01 09:06:00+00:00 United Kingdom GBR 21756 \n", + "\n", + " customer_id unit_price_gbp quantity_sold \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 13085 5.95 10 \n", + "2009-12-01 07:45:00+00:00 13085 6.75 12 \n", + "2009-12-01 09:06:00+00:00 13078 2.10 12 \n", + "2009-12-01 09:06:00+00:00 13078 4.25 24 \n", + "2009-12-01 09:06:00+00:00 13078 5.95 3 \n", + "\n", + " sales_amount_gbp population_total \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 59.50 62276270.0 \n", + "2009-12-01 07:45:00+00:00 81.00 62276270.0 \n", + "2009-12-01 09:06:00+00:00 25.20 62276270.0 \n", + "2009-12-01 09:06:00+00:00 102.00 62276270.0 \n", + "2009-12-01 09:06:00+00:00 17.85 62276270.0 \n", + "\n", + " gdp_current_usd gdp_growth_pct \\\n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 07:45:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", 
+ "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", + "2009-12-01 09:06:00+00:00 2.412840e+12 -17.633976 \n", + "\n", + " inflation_consumer_pct \n", + "order_datetime \n", + "2009-12-01 07:45:00+00:00 1.89709 \n", + "2009-12-01 07:45:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 \n", + "2009-12-01 09:06:00+00:00 1.89709 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import research.agentic_data_science.schema_agent.schema_agent as radsasag\n", + "\n", + "# Now run the pipeline\n", + "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", + "tags = [\"ecommerce_data\"]\n", + "\n", + "tag_to_df, stats = radsasag.run_pipeline(\n", + " csv_paths=csv_files,\n", + " tags=tags,\n", + " model=\"gpt-4o\",\n", + " llm_scope=\"semantic\"\n", + ")\n", + "\n", + "display(tag_to_df[\"ecommerce_data\"].head())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/research/agentic_data_science/schema_agent/schema_agent_loader.py b/research/agentic_data_science/schema_agent/schema_agent_loader.py index d8f649547..84a421b11 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_loader.py +++ b/research/agentic_data_science/schema_agent/schema_agent_loader.py @@ -45,99 +45,55 @@ def load_csv(csv_path: str) -> pd.DataFrame: return df -# keep legacy name for backwards compatibility -load_employee_data = load_csv - - def infer_and_convert_datetime_columns( df: pd.DataFrame, sample_size: int = 100, threshold: float = 0.8, ) -> 
typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]]: - """ - Detect and convert date/datetime columns in a DataFrame. - - Uses sampling for performance. Returns the updated DataFrame and a - metadata dict with inference details per column. - - :param df: Input DataFrame. - :type df: pd.DataFrame - :param sample_size: Number of rows to sample when testing format compliance. - :type sample_size: int - :param threshold: Minimum fraction of parsed values required to accept a column as temporal. - :type threshold: float - :return: Updated DataFrame with converted columns + metadata per column. - :rtype: typing.Tuple[pd.DataFrame, typing.Dict[str, typing.Any]] - """ - hdbg.dassert_isinstance(df, pd.DataFrame) - - COMMON_FORMATS = [ - "%Y-%m-%d", - "%d-%m-%Y", - "%m-%d-%Y", - "%Y/%m/%d", - "%d/%m/%Y", - "%m/%d/%Y", - "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M", - "%d-%m-%Y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", - ] - metadata: typing.Dict[str, typing.Any] = {} df_out = df.copy() for col in df.columns: - if not ( - pd.api.types.is_object_dtype(df[col]) - or pd.api.types.is_string_dtype(df[col]) - ): + # 1. If it's already datetime, just ensure UTC awareness + if pd.api.types.is_datetime64_any_dtype(df[col]): + df_out[col] = pd.to_datetime(df[col], utc=True) + metadata[col] = { + "semantic_type": "temporal", + "granularity": "datetime", + "format": "pre-converted", + "confidence": 1.0, + } continue - series = df[col].dropna().astype(str) - if series.empty: + # 2. 
Only attempt conversion on strings/objects + if not (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])): continue - sample = series.head(sample_size) - best_format: typing.Optional[str] = None - best_score = 0.0 - - for fmt in COMMON_FORMATS: - success = sum(1 for val in sample if _try_strptime(val, fmt)) - score = success / len(sample) - if score > best_score: - best_score = score - best_format = fmt - - if best_score >= threshold: - parsed = pd.to_datetime(df[col], format=best_format, errors="coerce") - used_format = best_format - else: - parsed = pd.to_datetime(df[col], errors="coerce") - used_format = None - - confidence = float(parsed.notna().mean()) - if confidence < threshold: + # Try to parse + try: + # We use errors="coerce" so non-dates become NaT + parsed = pd.to_datetime(df[col], errors="coerce", utc=True) + + valid_count = parsed.notna().sum() + if valid_count == 0: + continue + + confidence = float(valid_count / len(df[col])) + + # Only convert if it meets our confidence threshold + if confidence >= threshold: + df_out[col] = parsed + has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() + metadata[col] = { + "semantic_type": "temporal", + "granularity": "datetime" if has_time else "date", + "format": "inferred", + "confidence": confidence, + } + _LOG.info("Converted column '%s' to datetime", col) + except Exception: continue - has_time = (parsed.dt.time != pd.Timestamp("00:00:00").time()).any() - col_type = "datetime" if has_time else "date" - df_out[col] = parsed - - metadata[col] = { - "semantic_type": "temporal", - "granularity": col_type, - "format": used_format, - "confidence": confidence, - } - _LOG.info( - "Column '%s' detected as %s (format=%s, confidence=%.2f)", - col, - col_type, - used_format, - confidence, - ) - return df_out, metadata @@ -152,38 +108,47 @@ def _try_strptime(val: str, fmt: str) -> bool: return False + def prepare_dataframes( csv_paths: typing.List[str], tags: 
typing.Optional[typing.List[str]] = None, ) -> typing.Tuple[ - typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]] + typing.Dict[str, pd.DataFrame], + typing.Dict[str, typing.List[str]], + typing.Dict[str, typing.Any] # Added return type for metadata ]: """ Load and prepare all CSV files in one pass. - - Applies type coercion, datetime inference, and categorical detection. - - :param csv_paths: List of CSV file paths. - :type csv_paths: typing.List[str] - :param tags: Human-readable tags; defaults to filename stems. - :type tags: typing.Optional[typing.List[str]] - :return: A tuple containing a dict mapping tags to DataFrames, and a dict mapping tags to categorical columns. - :rtype: typing.Tuple[typing.Dict[str, pd.DataFrame], typing.Dict[str, typing.List[str]]] """ hdbg.dassert_isinstance(csv_paths, list) - hdbg.dassert_lt(0, len(csv_paths)) + if tags is None: + import os + tags = [os.path.splitext(os.path.basename(p))[0] for p in csv_paths] tag_to_df: typing.Dict[str, pd.DataFrame] = {} cat_cols_map: typing.Dict[str, typing.List[str]] = {} + combined_dt_meta: typing.Dict[str, typing.Any] = {} # Store metadata here for path, tag in zip(csv_paths, tags): + # 1. Load and perform initial type conversion df = load_csv(path) df = hpanconv.convert_df(df) - df, _ = infer_and_convert_datetime_columns(df) + + # 2. Perform datetime inference and CAPTURE metadata + df, dt_meta = infer_and_convert_datetime_columns(df) + combined_dt_meta.update(dt_meta) # Merge metadata + + # 3. FIX: Automatically promote the first detected temporal column to + # the Index for Quality and Duration reports. + temporal_cols = [c for c, m in dt_meta.items() if m.get("semantic_type") == "temporal"] + if temporal_cols: + df = df.set_index(temporal_cols[0], drop=False) + tag_to_df[tag] = df + # 4. 
Identify categorical/string columns cat_cols_map[tag] = df.select_dtypes( include=["object", "category", "string"] ).columns.tolist() - return tag_to_df, cat_cols_map \ No newline at end of file + return tag_to_df, cat_cols_map, combined_dt_meta \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_stats.py b/research/agentic_data_science/schema_agent/schema_agent_stats.py index 24ab40857..5c170fe17 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_stats.py +++ b/research/agentic_data_science/schema_agent/schema_agent_stats.py @@ -78,23 +78,24 @@ def compute_llm_agent_stats( # 2. Data quality dataframe_stats["quality_reports"] = {} for tag, df in tag_to_df.items(): - numeric_df = df.select_dtypes(include="number") + # Select ONLY actual numeric columns for the quality math + numeric_df = df.select_dtypes(include=["int64", "float64"]) + if numeric_df.empty: - _LOG.warning( - "No numeric columns in '%s'; skipping quality report", tag - ) + _LOG.warning("No numeric columns in '%s'; skipping quality report", tag) continue - df_stamped = hpanstat.add_end_download_timestamp(numeric_df.copy()) + try: + # Pass ONLY the numeric dataframe here quality = hpanstat.report_zero_nan_inf_stats( - df_stamped, + numeric_df, zero_threshold=1e-9, verbose=True, as_txt=True, ) dataframe_stats["quality_reports"][tag] = quality - print(f"\n=== Quality Report: {tag} ===\n", quality.to_string()) - except Exception as e: # pylint: disable=broad-exception-caught + print(f"\n=== Quality Report: {tag} ===\n", quality) + except Exception as e: _LOG.warning("Quality report failed for '%s': %s", tag, e) # 3. 
Categorical distributions diff --git a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb b/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb deleted file mode 100644 index 6104a4038..000000000 --- a/research/agentic_data_science/schema_agent/scmea_agent_example.ipynb +++ /dev/null @@ -1,371 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "49bffb6b-9c87-4d5a-a32a-a2382c2b700c", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import argparse\n", - "import logging\n", - "import os\n", - "import typing\n", - "\n", - "import dotenv\n", - "import pandas as pd\n", - "import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah\n", - "import schema_agent_loader as radsasal\n", - "import schema_agent_report as radsasar\n", - "import schema_agent_stats as radsasas\n", - "\n", - "import helpers.hdbg as hdbg\n", - "import helpers.hlogging as hloggin\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n", - "WARNING: Running in Jupyter\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:116: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", - " parsed = pd.to_datetime(df[col], errors=\"coerce\")\n", - "Skipping duration stats: 'int' object has no attribute 'tzinfo'\n", - "Quality report failed for 'ecommerce_data': 'RangeIndex' object has no attribute 'date'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " year month week_of_year day_of_week order_hour is_weekend customer_id unit_price_gbp quantity_sold sales_amount_gbp population_total gdp_current_usd gdp_growth_pct inflation_consumer_pct end_download_timestamp\n", - "0 2009 12 49 1 7 0 13085 5.95 10 59.5 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", - "1 2009 12 49 1 7 0 13085 6.75 12 81.0 62276270.0 2412840006231.5 -17.633976 1.89709 2026-04-07 16:06:12.386207+00:00\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "99998 2010 12 49 3 20 0 17530 1.95 4 7.8 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", - "99999 2010 12 49 3 20 0 17530 1.25 4 5.0 62766365.0 2485482596184.708984 3.010668 1.589081 2026-04-07 16:06:12.386207+00:00\n", - "\n", - "=== Distribution: ecommerce_data / country ===\n", - " count pct [%]\n", - "country \n", - "United Kingdom 64417 64.417\n", - "Ireland 8507 8.507\n", - "Germany 7654 7.654\n", - "France 5470 5.470\n", - "Netherlands 2729 2.729\n", - "Spain 1235 1.235\n", - "Switzerland 1170 1.170\n", - "Belgium 1037 1.037\n", - "Portugal 984 0.984\n", - "Sweden 868 0.868\n", - "\n", - "=== Distribution: ecommerce_data / country_code ===\n", - " count pct [%]\n", - "country_code \n", - "GBR 64417 64.417\n", - "IRL 8507 8.507\n", - "DEU 7654 7.654\n", - "FRA 5470 5.470\n", - "NLD 2729 2.729\n", - "ESP 1235 1.235\n", - "CHE 1170 1.170\n", - "BEL 1037 1.037\n", - "PRT 984 0.984\n", - "SWE 868 0.868\n", - "\n", - "=== Distribution: ecommerce_data / product_id ===\n", - " count pct [%]\n", - "product_id \n", - "POST 731 
0.731\n", - "85123A 615 0.615\n", - "21212 438 0.438\n", - "22423 437 0.437\n", - "85099B 391 0.391\n", - "20725 334 0.334\n", - "84991 298 0.298\n", - "20914 295 0.295\n", - "21232 295 0.295\n", - "84879 285 0.285\n", - "\n", - "=== Numeric Summary: ecommerce_data ===\n", - " mean std min median max\n", - "year 2.009929e+03 2.563578e-01 2.009000e+03 2.010000e+03 2.010000e+03\n", - "month 7.377590e+00 3.456657e+00 1.000000e+00 8.000000e+00 1.200000e+01\n", - "week_of_year 2.991514e+01 1.500327e+01 1.000000e+00 3.300000e+01 5.200000e+01\n", - "day_of_week 2.583280e+00 1.923159e+00 0.000000e+00 2.000000e+00 6.000000e+00\n", - "order_hour 1.268047e+01 2.351588e+00 7.000000e+00 1.300000e+01 2.000000e+01\n", - "is_weekend 1.539600e-01 3.609122e-01 0.000000e+00 0.000000e+00 1.000000e+00\n", - "customer_id 1.476813e+04 1.799165e+03 1.234600e+04 1.464600e+04 1.828700e+04\n", - "unit_price_gbp 3.889158e+00 5.975020e+01 1.000000e-03 1.950000e+00 1.095350e+04\n", - "quantity_sold 1.865779e+01 1.593465e+02 1.000000e+00 6.000000e+00 1.915200e+04\n", - "sales_amount_gbp 2.694892e+01 9.239021e+01 1.000000e-03 1.498000e+01 1.095350e+04\n", - "population_total 5.409812e+07 2.664448e+07 3.180410e+05 6.276636e+07 3.093782e+08\n", - "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", - "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", - "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", - "12:06:12 rss=0.234GB vms=1.655GB mem_pct=2% cpu=100% - \u001b[36mINFO \u001b[0m Task-43 schema_agent.py run_pipeline:107 LLM will profile 3 / 18 columns (scope=semantic).\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
order_datetimeyearmonthweek_of_yearday_of_weekorder_houris_weekendcountrycountry_codeproduct_idcustomer_idunit_price_gbpquantity_soldsales_amount_gbppopulation_totalgdp_current_usdgdp_growth_pctinflation_consumer_pct
02009-12-01 07:45:0020091249170United KingdomGBR21523130855.951059.5062276270.02.412840e+12-17.6339761.89709
12009-12-01 07:45:0020091249170United KingdomGBR79323W130856.751281.0062276270.02.412840e+12-17.6339761.89709
22009-12-01 09:06:0020091249190United KingdomGBR82582130782.101225.2062276270.02.412840e+12-17.6339761.89709
32009-12-01 09:06:0020091249190United KingdomGBR22111130784.2524102.0062276270.02.412840e+12-17.6339761.89709
42009-12-01 09:06:0020091249190United KingdomGBR21756130785.95317.8562276270.02.412840e+12-17.6339761.89709
\n", - "
" - ], - "text/plain": [ - " order_datetime year month week_of_year day_of_week order_hour \\\n", - "0 2009-12-01 07:45:00 2009 12 49 1 7 \n", - "1 2009-12-01 07:45:00 2009 12 49 1 7 \n", - "2 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "3 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "4 2009-12-01 09:06:00 2009 12 49 1 9 \n", - "\n", - " is_weekend country country_code product_id customer_id \\\n", - "0 0 United Kingdom GBR 21523 13085 \n", - "1 0 United Kingdom GBR 79323W 13085 \n", - "2 0 United Kingdom GBR 82582 13078 \n", - "3 0 United Kingdom GBR 22111 13078 \n", - "4 0 United Kingdom GBR 21756 13078 \n", - "\n", - " unit_price_gbp quantity_sold sales_amount_gbp population_total \\\n", - "0 5.95 10 59.50 62276270.0 \n", - "1 6.75 12 81.00 62276270.0 \n", - "2 2.10 12 25.20 62276270.0 \n", - "3 4.25 24 102.00 62276270.0 \n", - "4 5.95 3 17.85 62276270.0 \n", - "\n", - " gdp_current_usd gdp_growth_pct inflation_consumer_pct \n", - "0 2.412840e+12 -17.633976 1.89709 \n", - "1 2.412840e+12 -17.633976 1.89709 \n", - "2 2.412840e+12 -17.633976 1.89709 \n", - "3 2.412840e+12 -17.633976 1.89709 \n", - "4 2.412840e+12 -17.633976 1.89709 " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Make sure this is at the top of your notebook\n", - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import schema_agent as radsasag\n", - "\n", - "# Now run the pipeline\n", - "csv_files = [\"global_ecommerce_forecasting.csv\"]\n", - "tags = [\"ecommerce_data\"]\n", - "\n", - "tag_to_df, stats = radsasag.run_pipeline(\n", - " csv_paths=csv_files,\n", - " tags=tags,\n", - " model=\"gpt-4o\",\n", - " llm_scope=\"semantic\"\n", - ")\n", - "\n", - "display(tag_to_df[\"ecommerce_data\"].head())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "36f51050-4ead-49dc-9fec-cd430f24de6f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - 
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 49659579a9c3308637225dda7af8763799176f82 Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Wed, 8 Apr 2026 12:29:40 -0400 Subject: [PATCH 11/14] update README.md --- .../schema_agent/README.md | 123 ++++++++++-------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index 020f63e53..c8c397510 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -1,46 +1,48 @@ # Data Profiler Agent -Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic meaning, data quality assessment, and testable business hypotheses. +Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic classification, data quality assessment, and testable business hypotheses. 
## Features -- **Temporal Detection:** Auto-detects and converts date/datetime columns across multiple formats -- **Statistical Profiling:** Computes numeric summaries, data quality metrics, and categorical distributions -- **LLM Semantic Analysis:** Generates column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses -- **Cost Optimization:** Filter columns before LLM analysis to control token usage and API costs -- **Multi-Format Output:** JSON reports and Markdown summaries +- **Temporal detection** — Auto-detects and converts date/datetime columns across multiple formats +- **Statistical profiling** — Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses +- **Cost optimization** — Filter columns prior to LLM analysis to control token usage and API costs +- **Multi-format output** — JSON reports and Markdown summaries ## Setup -Go into the schema folder: +Navigate to the project directory: ```bash > cd research/agentic_data_science/schema_agent ``` -Install the requirements: +Install dependencies: ```bash > pip install -r requirements.txt ``` -Set the `OPENAI_API_KEY` in your environment: +Set your API key: ```bash > export OPENAI_API_KEY=sk-... 
``` -Make the script executable -```bash + +Make the entry point executable: +```bash > chmod +x schema_agent.py ``` + ## Module Structure -The agent is split into six focused modules: +The agent is organized into six focused modules: | Module | Responsibility | -|--------|---------------| -| `schema_agent_models.py` | Pydantic schemas for type-safe column/dataset insights | -| `schema_agent_loader.py` | CSV loading, type inference, datetime detection | -| `schema_agent_stats.py` | Numeric summaries, quality reports, categorical distributions | -| `schema_agent_llm.py` | Prompt building, OpenAI/LangChain calls, structured output parsing | -| `schema_agent_report.py` | Column profiles, JSON and Markdown export | +|--------|----------------| +| `schema_agent_models.py` | Pydantic schemas for type-safe column and dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | +| `schema_agent_stats.py` | Numeric summaries, quality reports, and categorical distributions | +| `schema_agent_llm.py` | Prompt construction, OpenAI/LangChain calls, and structured output parsing | +| `schema_agent_report.py` | Column profiles, JSON export, and Markdown export | | `schema_agent.py` | Pipeline orchestration and CLI entry point | ## Usage @@ -51,52 +53,58 @@ The agent is split into six focused modules: > ./schema_agent.py data.csv ``` -Outputs: -- `data_profile_report.json` — Machine-readable report -- `data_profile_summary.md` — Human-readable summary +Produces two output files: + +- `data_profile_report.json` — Machine-readable column profiles and statistics +- `data_profile_summary.md` — Human-readable summary table ### Advanced ```bash -# Multiple files with tags +# Profile multiple files with custom tags > ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 -# Cost-optimized: only high-null columns +# Cost-optimized: analyze only high-null columns > ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini -# 
Custom metrics and output +# Custom metrics and output path > ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json -# LangChain backend +# Use LangChain as the inference backend > ./schema_agent.py data.csv --use-langchain ``` -## Command-Line Arguments +## Command-Line Reference | Argument | Default | Description | |----------|---------|-------------| | `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Tags for each CSV (must match count) | -| `--model` | `gpt-4o` | LLM model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Which columns to profile: `all`, `semantic`, `nulls` | -| `--metrics` | Subset | Numeric metrics: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | False | Use LangChain instead of hllmcli | -| `--output-json` | `data_profile_report.json` | JSON report path | -| `--output-md` | `data_profile_summary.md` | Markdown summary path | +| `--tags` | File stems | Labels for each CSV (count must match `csv_paths`) | +| `--model` | `gpt-4o` | OpenAI model (`gpt-4o`, `gpt-4o-mini`, etc.) 
| +| `--llm-scope` | `all` | Column selection strategy: `all`, `semantic`, or `nulls` | +| `--metrics` | Subset | Numeric summary stats: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | `false` | Use LangChain instead of the default inference client | +| `--output-json` | `data_profile_report.json` | Output path for the JSON report | +| `--output-md` | `data_profile_summary.md` | Output path for the Markdown summary | ## LLM Scoping -- **`all`** — Every column (highest cost, comprehensive) -- **`semantic`** — Non-numeric columns only -- **`nulls`** — Columns with >5% null values (cost-optimized) +Control which columns are sent to the LLM to manage cost and latency: + +| Scope | Behavior | +|-------|----------| +| `all` | Profiles every column — most comprehensive, highest cost | +| `semantic` | Profiles non-numeric columns only | +| `nulls` | Profiles only columns with >5% null values — most cost-efficient | ## Python API ### Full pipeline ```python -import schema_agent as radsasag -tag_to_df, stats = radsasag.run_pipeline( +import schema_agent as agent + +tag_to_df, stats = agent.run_pipeline( csv_paths=["data.csv"], model="gpt-4o-mini", llm_scope="semantic" @@ -108,30 +116,39 @@ tag_to_df, stats = radsasag.run_pipeline( Each module can be imported independently for exploratory use or testing: ```python -import schema_agent_loader as radsasal -import schema_agent_stats as radsasas -import schema_agent_llm as radsasal -import schema_agent_report as radsasar +import schema_agent_loader as loader +import schema_agent_stats as stats +import schema_agent_llm as llm +import schema_agent_report as report ``` -## Output +## Output Reference + +### `data_profile_report.json` -### data_profile_report.json -Structured report with column profiles, technical stats, and LLM insights. +Structured report containing per-column profiles, statistical summaries, and LLM-generated insights. 
-### data_profile_summary.md -Formatted table summary: Column | Meaning | Role | Quality | Hypotheses +### `data_profile_summary.md` + +Formatted Markdown table with columns: **Column · Meaning · Role · Quality · Hypotheses** ## Troubleshooting -**API Key Error:** +**API key not set** + ```bash > export OPENAI_API_KEY=sk-... ``` -**Validation Errors:** -- Use `--llm-scope nulls` or `--llm-scope semantic` to reduce columns -- Try `--model gpt-4o-mini` +**Validation or parsing errors** + +Reduce the number of columns sent to the LLM: + +```bash +> ./schema_agent.py data.csv --llm-scope nulls +> ./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini +``` + +**No datetime columns detected** -**Datetime Detection:** -Skipped automatically if no temporal columns detected. +Expected behavior — datetime detection is skipped automatically when no temporal columns are present in the dataset. \ No newline at end of file From be0094b169e4422feee16c74e15afcf4be6528db Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Thu, 9 Apr 2026 10:32:12 -0400 Subject: [PATCH 12/14] Use uv and lint *.ipynb files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 10 +- .../schema_agent/requirements.in | 18 ++ .../schema_agent/requirements.txt | 234 +++++++++++++++++- .../schema_agent/schema_agent_API.py | 163 ++++++++++++ .../schema_agent/schema_agent_example.ipynb | 13 +- .../schema_agent/schema_agent_example.py | 35 +++ .../schema_agent/schema_agent_hllmcli.py | 2 +- 7 files changed, 454 insertions(+), 21 deletions(-) create mode 100644 research/agentic_data_science/schema_agent/requirements.in create mode 100644 research/agentic_data_science/schema_agent/schema_agent_API.py create mode 100644 research/agentic_data_science/schema_agent/schema_agent_example.py diff --git a/research/agentic_data_science/schema_agent/README.md 
b/research/agentic_data_science/schema_agent/README.md index c8c397510..dc7a08292 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -102,7 +102,7 @@ Control which columns are sent to the LLM to manage cost and latency: ### Full pipeline ```python -import schema_agent as agent +import research.agentic_data_science.schema_agent.schema_agent as agent tag_to_df, stats = agent.run_pipeline( csv_paths=["data.csv"], @@ -116,10 +116,10 @@ tag_to_df, stats = agent.run_pipeline( Each module can be imported independently for exploratory use or testing: ```python -import schema_agent_loader as loader -import schema_agent_stats as stats -import schema_agent_llm as llm -import schema_agent_report as report +import research.agentic_data_science.schema_agent.schema_agent_loader as loader +import research.agentic_data_science.schema_agent.schema_agent_stats as stats +import research.agentic_data_science.schema_agent.schema_agent_llm as llm +import research.agentic_data_science.schema_agent.schema_agent_report as report ``` ## Output Reference diff --git a/research/agentic_data_science/schema_agent/requirements.in b/research/agentic_data_science/schema_agent/requirements.in new file mode 100644 index 000000000..08dbfa79b --- /dev/null +++ b/research/agentic_data_science/schema_agent/requirements.in @@ -0,0 +1,18 @@ +pandas==3.0.2 +numpy==2.4.4 + +langchain-core==1.2.27 +langchain-openai==1.1.12 + +langgraph==1.1.6 +langgraph-checkpoint==4.0.1 +langgraph-prebuilt==1.0.9 +langgraph-sdk==0.3.12 + +llm==0.30 +tokencost==0.1.26 + +pytz==2026.1.post1 +python-dotenv==1.2.2 + +setuptools>=65.0.0 \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/requirements.txt b/research/agentic_data_science/schema_agent/requirements.txt index ed4078da6..3d14b9e25 100644 --- a/research/agentic_data_science/schema_agent/requirements.txt +++ 
b/research/agentic_data_science/schema_agent/requirements.txt @@ -1,8 +1,226 @@ -pandas -langchain_core -langchain_openai -langgraph -llm -tokencost -pytz -dotenv \ No newline at end of file +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile requirements.in +# +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.13.5 + # via tokencost +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anthropic==0.92.0 + # via tokencost +anyio==4.13.0 + # via + # anthropic + # httpx + # openai +attrs==26.1.0 + # via aiohttp +certifi==2026.2.25 + # via + # httpcore + # httpx + # requests +charset-normalizer==3.4.7 + # via requests +click==8.3.2 + # via + # click-default-group + # llm + # sqlite-utils +click-default-group==1.2.4 + # via + # llm + # sqlite-utils +condense-json==0.1.3 + # via llm +distro==1.9.0 + # via + # anthropic + # openai +docstring-parser==0.17.0 + # via anthropic +frozenlist==1.8.0 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via httpcore +httpcore==1.0.9 + # via httpx +httpx==0.28.1 + # via + # anthropic + # langgraph-sdk + # langsmith + # openai +idna==3.11 + # via + # anyio + # httpx + # requests + # yarl +jiter==0.13.0 + # via + # anthropic + # openai +jsonpatch==1.33 + # via langchain-core +jsonpointer==3.1.1 + # via jsonpatch +langchain-core==1.2.27 + # via + # -r requirements.in + # langchain-openai + # langgraph + # langgraph-checkpoint + # langgraph-prebuilt +langchain-openai==1.1.12 + # via -r requirements.in +langgraph==1.1.6 + # via -r requirements.in +langgraph-checkpoint==4.0.1 + # via + # -r requirements.in + # langgraph + # langgraph-prebuilt +langgraph-prebuilt==1.0.9 + # via + # -r requirements.in + # langgraph +langgraph-sdk==0.3.12 + # via + # -r requirements.in + # langgraph +langsmith==0.7.29 + # via langchain-core +llm==0.30 + # via -r requirements.in +multidict==6.7.1 + # via + # aiohttp + # yarl +numpy==2.4.4 + # via + # -r requirements.in + # 
pandas +openai==2.31.0 + # via + # langchain-openai + # llm +orjson==3.11.8 + # via + # langgraph-sdk + # langsmith +ormsgpack==1.12.2 + # via langgraph-checkpoint +packaging==26.0 + # via + # langchain-core + # langsmith +pandas==3.0.2 + # via -r requirements.in +pluggy==1.6.0 + # via + # llm + # sqlite-utils +propcache==0.4.1 + # via + # aiohttp + # yarl +puremagic==2.2.0 + # via llm +pydantic==2.12.5 + # via + # anthropic + # langchain-core + # langgraph + # langsmith + # llm + # openai +pydantic-core==2.41.5 + # via pydantic +python-dateutil==2.9.0.post0 + # via + # pandas + # sqlite-utils +python-dotenv==1.2.2 + # via -r requirements.in +python-ulid==3.1.0 + # via llm +pytz==2026.1.post1 + # via -r requirements.in +pyyaml==6.0.3 + # via + # langchain-core + # llm +regex==2026.4.4 + # via tiktoken +requests==2.33.1 + # via + # langsmith + # requests-toolbelt + # tiktoken +requests-toolbelt==1.0.0 + # via langsmith +six==1.17.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anthropic + # openai +sqlite-fts4==1.0.3 + # via sqlite-utils +sqlite-migrate==0.1b0 + # via llm +sqlite-utils==3.39 + # via + # llm + # sqlite-migrate +tabulate==0.10.0 + # via sqlite-utils +tenacity==9.1.4 + # via langchain-core +tiktoken==0.12.0 + # via + # langchain-openai + # tokencost +tokencost==0.1.26 + # via -r requirements.in +tqdm==4.67.3 + # via openai +typing-extensions==4.15.0 + # via + # aiosignal + # anthropic + # anyio + # langchain-core + # openai + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via pydantic +urllib3==2.6.3 + # via requests +uuid-utils==0.14.1 + # via + # langchain-core + # langsmith +xxhash==3.6.0 + # via + # langgraph + # langsmith +yarl==1.23.0 + # via aiohttp +zstandard==0.25.0 + # via langsmith + +# The following packages are considered to be unsafe in a requirements file: +# pip +# setuptools diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.py 
b/research/agentic_data_science/schema_agent/schema_agent_API.py new file mode 100644 index 000000000..939295380 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_API.py @@ -0,0 +1,163 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # API usage Notebook +# - This notebook shows the implementation of each function from the respective libraries. + +# %% +# %load_ext autoreload +# %autoreload 2 + +import dotenv +import os +import pandas as pd +import numpy as np + +# Load environment variables (ensure OPENAI_API_KEY is set in your .env) +dotenv.load_dotenv() + +# Import the schema agent modules +import research.agentic_data_science.schema_agent.schema_agent_loader as radsasal +import research.agentic_data_science.schema_agent.schema_agent_stats as radsasas +import research.agentic_data_science.schema_agent.schema_agent_hllmcli as radsasah +import research.agentic_data_science.schema_agent.schema_agent_report as radsasar + +# %% [markdown] +# ## 1. Create dummy Dataset + +# %% +# 1. 
Create a dummy dataset +np.random.seed(42) +num_rows = 100 + +dummy_data = pd.DataFrame({ + "employee_id": range(1000, 1000 + num_rows), + "department": np.random.choice(["Engineering", "Sales", "HR", "Marketing"], num_rows), + "salary": np.random.normal(85000, 20000, num_rows), + "satisfaction_score": np.random.uniform(1.0, 5.0, num_rows), + "hire_date": pd.date_range(start="2018-01-01", periods=num_rows, freq="W").astype(str), + "notes": ["Good performance"] * 50 + [None] * 50 # 50% nulls +}) + +# Inject some missing values into salary +dummy_data.loc[10:20, "salary"] = np.nan + +# Save to CSV +csv_path = "dummy_employees.csv" +dummy_data.to_csv(csv_path, index=False) +print(f"Created dummy dataset at: {csv_path}") +dummy_data.head() + +csv_paths = [csv_path] +tags = ["dummy_employees"] + +# %% [markdown] +# ## 2. Load and Infer datatypes from the columns + +# %% +# 1. Load and prepare DataFrames - now receiving 3 variables +tag_to_df, cat_cols_map, datetime_meta = radsasal.prepare_dataframes(csv_paths, tags) + +print("--- Loaded DataFrames ---") +# The index will now show as a DatetimeIndex instead of a RangeIndex +print(tag_to_df["dummy_employees"].info()) + +# 2. Combine DataFrames while preserving the index +# We do NOT use ignore_index=True here because we want to keep the DatetimeIndex +# we just created in the loader. +updated_df = pd.concat(list(tag_to_df.values()), axis=0) + +print("\n--- Datetime Inference Metadata ---") +# This will now correctly show your temporal column info +print(datetime_meta) + +# %% [markdown] +# ## 3. 
Statistical Profiling + +# %% +# We pass the metadata we just generated into the stats function +stats = radsasas.compute_llm_agent_stats( + tag_to_df=tag_to_df, + categorical_cols_map=cat_cols_map, + metrics=["mean", "std", "min", "max"] +) + +# Manually ensure the datetime_columns key is populated for the LLM +stats["datetime_columns"] = datetime_meta + +print("\n--- Stats Computation Complete ---") +print(f"Calculated stats for tags: {list(stats['numeric_summary'].keys())}") + +# %% [markdown] +# ## 4. Call LLM for column type inferencing + +# %% +# 1. Select columns (e.g., let's just send everything) +columns_for_llm = radsasah._select_columns_for_llm(updated_df, scope="all") +print(f"Selected columns for LLM: {columns_for_llm}\n") + +# 2. Build the exact prompt string that goes to the LLM +prompt_text = radsasah.build_llm_prompt(stats, columns_to_include=columns_for_llm) +print("--- LLM Prompt Snippet ---") +print(prompt_text[:500] + "\n...\n") + +# 3. Call the LLM to generate hypotheses (using gpt-4o as default) +# If you don't have an API key configured, you can mock this response by creating a static dict. +try: + semantic_insights = radsasah.generate_hypotheses_via_cli( + stats=stats, + model="gpt-4o", + columns_to_include=columns_for_llm + ) + print("--- LLM Insights Retrieved Successfully ---") +except Exception as e: + print(f"LLM call failed (Check API key): {e}") + semantic_insights = {"columns": {}} # Fallback empty dict + +# %% [markdown] +# ## 5. Export to JSON and Markdown + +# %% +# 1. Build structured column profiles +primary_df = list(tag_to_df.values())[0] +column_profiles = radsasar.build_column_profiles( + df=primary_df, + stats=stats, + insights=semantic_insights +) + +# 2. Export to JSON +json_out = "dummy_profile_report.json" +radsasar.merge_and_export_results( + stats=stats, + insights=semantic_insights, + column_profiles=column_profiles, + output_path=json_out +) + +# 3. 
Export to Markdown +md_out = "dummy_profile_summary.md" +radsasar.export_markdown_from_profiles( + column_profiles=column_profiles, + numeric_stats=stats.get("numeric_summary", {}), + output_path=md_out +) + +print(f"\nPipeline complete! Check your directory for:") +print(f"1. {json_out}") +print(f"2. {md_out}") + +# Clean up dummy CSV if desired +# os.remove(csv_path) diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb index 44b96491c..0355550c0 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb +++ b/research/agentic_data_science/schema_agent/schema_agent_example.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 1, "id": "3770d3bd-200f-4b7a-bb10-1fe76f26c4d7", "metadata": {}, "outputs": [ @@ -19,19 +19,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" + "WARNING: Running in Jupyter\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. 
To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n", - "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:79: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", + "/git_root/research/agentic_data_science/schema_agent/schema_agent_loader.py:75: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.\n", " parsed = pd.to_datetime(df[col], errors=\"coerce\", utc=True)\n" ] }, @@ -147,7 +146,7 @@ "gdp_current_usd 2.161193e+12 1.115049e+12 9.035824e+09 2.485483e+12 1.504897e+13\n", "gdp_growth_pct 4.626259e-01 6.134116e+00 -1.962987e+01 3.010668e+00 3.250405e+01\n", "inflation_consumer_pct 1.104250e+00 1.655513e+00 -1.518298e+01 1.589081e+00 1.652789e+01\n", - "12:11:45 rss=0.267GB vms=1.690GB mem_pct=2% cpu=0% - \u001b[36mINFO \u001b[0m Task-193 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" + "10:24:44 rss=0.222GB vms=1.643GB mem_pct=1% cpu=100% - \u001b[36mINFO \u001b[0m Task-20 schema_agent.py run_pipeline:131 LLM will profile 3 / 18 columns (scope=semantic).\n" ] }, { diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.py b/research/agentic_data_science/schema_agent/schema_agent_example.py new file mode 100644 index 000000000..1c9f4a455 --- /dev/null +++ b/research/agentic_data_science/schema_agent/schema_agent_example.py @@ -0,0 +1,35 @@ +# --- +# jupyter: +# 
jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Schema Parser example +# - This implementation in the notebook utilizes a suite of pre-existing functions to parse a single Excel (or CSV) file, automatically inferring data types and capturing temporal metadata for downstream analysis. + +# %% +# %load_ext autoreload +# %autoreload 2 +import research.agentic_data_science.schema_agent.schema_agent as radsasag + +# Now run the pipeline +csv_files = ["global_ecommerce_forecasting.csv"] +tags = ["ecommerce_data"] + +tag_to_df, stats = radsasag.run_pipeline( + csv_paths=csv_files, + tags=tags, + model="gpt-4o", + llm_scope="semantic" +) + +display(tag_to_df["ecommerce_data"].head()) diff --git a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py index 37b4c8f0e..740dc6ce1 100644 --- a/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py +++ b/research/agentic_data_science/schema_agent/schema_agent_hllmcli.py @@ -12,7 +12,7 @@ import langchain_openai as lco import pandas as pd import pydantic -import schema_agent_models as radsasam +import research.agentic_data_science.schema_agent.schema_agent_models as radsasam import helpers.hdbg as hdbg import helpers.hllm_cli as hllmcli From 15d6cc5ae87f247deb0ecd143fc78c69f1d202bb Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Fri, 10 Apr 2026 11:41:09 -0400 Subject: [PATCH 13/14] Update README.md, Blog and lint notebooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- .../schema_agent/README.md | 168 +++++++++--------- ...agent_API.ipynb => schema_agent.API.ipynb} | 0 ...chema_agent_API.py => schema_agent.API.py} | 0 ...ample.ipynb => 
schema_agent.example.ipynb} | 0 ...ent_example.py => schema_agent.example.py} | 0 .../schema_agent/schema_agent.py | 1 + website/docs/blog/posts/draft.Schema_agent.md | 58 ++++++ 7 files changed, 147 insertions(+), 80 deletions(-) rename research/agentic_data_science/schema_agent/{schema_agent_API.ipynb => schema_agent.API.ipynb} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_API.py => schema_agent.API.py} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_example.ipynb => schema_agent.example.ipynb} (100%) rename research/agentic_data_science/schema_agent/{schema_agent_example.py => schema_agent.example.py} (100%) mode change 100644 => 100755 research/agentic_data_science/schema_agent/schema_agent.py create mode 100644 website/docs/blog/posts/draft.Schema_agent.md diff --git a/research/agentic_data_science/schema_agent/README.md b/research/agentic_data_science/schema_agent/README.md index dc7a08292..7c33925bd 100644 --- a/research/agentic_data_science/schema_agent/README.md +++ b/research/agentic_data_science/schema_agent/README.md @@ -2,104 +2,96 @@ Automated statistical profiling and LLM-powered semantic analysis for CSV datasets. Generates column-level insights including semantic classification, data quality assessment, and testable business hypotheses. 
-## Features +## Key Features -- **Temporal detection** — Auto-detects and converts date/datetime columns across multiple formats +- **Automatic temporal detection** — Identifies and converts date/datetime columns across multiple formats - **Statistical profiling** — Computes numeric summaries, data quality metrics, and categorical distributions -- **LLM semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and hypotheses -- **Cost optimization** — Filter columns prior to LLM analysis to control token usage and API costs -- **Multi-format output** — JSON reports and Markdown summaries +- **LLM-powered semantic analysis** — Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and generates testable business hypotheses +- **Smart cost control** — Selectively analyze columns to optimize API usage and reduce costs +- **Flexible output formats** — Generate machine-readable JSON reports and human-friendly Markdown summaries -## Setup +## Quick Start -Navigate to the project directory: -```bash -> cd research/agentic_data_science/schema_agent -``` - -Install dependencies: -```bash -> pip install -r requirements.txt -``` +### Installation -Set your API key: -```bash -> export OPENAI_API_KEY=sk-... -``` +Navigate to the project directory and install dependencies: -Make the entry point executable: ```bash -> chmod +x schema_agent.py +cd research/agentic_data_science/schema_agent +pip install -r requirements.txt +export OPENAI_API_KEY=sk-... 
+chmod +x schema_agent.py ``` -## Module Structure - -The agent is organized into six focused modules: - -| Module | Responsibility | -|--------|----------------| -| `schema_agent_models.py` | Pydantic schemas for type-safe column and dataset insights | -| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | -| `schema_agent_stats.py` | Numeric summaries, quality reports, and categorical distributions | -| `schema_agent_llm.py` | Prompt construction, OpenAI/LangChain calls, and structured output parsing | -| `schema_agent_report.py` | Column profiles, JSON export, and Markdown export | -| `schema_agent.py` | Pipeline orchestration and CLI entry point | - -## Usage +### Basic Usage -### Basic +Profile a single CSV file: ```bash -> ./schema_agent.py data.csv +./schema_agent.py data.csv ``` -Produces two output files: - -- `data_profile_report.json` — Machine-readable column profiles and statistics -- `data_profile_summary.md` — Human-readable summary table +This generates two output files: +- **`data_profile_report.json`** — Complete statistical and semantic analysis +- **`data_profile_summary.md`** — Readable summary table with insights -### Advanced +### Advanced Usage ```bash -# Profile multiple files with custom tags -> ./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inv_q1 +# Profile multiple files with custom labels +./schema_agent.py dataset1.csv dataset2.csv --tags sales_2024 inventory_q1 -# Cost-optimized: analyze only high-null columns -> ./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini +# Cost-optimized analysis (only high-null columns) +./schema_agent.py data.csv --llm-scope nulls --model gpt-4o-mini -# Custom metrics and output path -> ./schema_agent.py data.csv --metrics mean std max --output-json my_report.json +# Custom metrics and output paths +./schema_agent.py data.csv --metrics mean std max --output-json my_report.json # Use LangChain as the inference backend -> ./schema_agent.py data.csv 
--use-langchain +./schema_agent.py data.csv --use-langchain ``` -## Command-Line Reference +## Architecture + +The agent consists of six focused modules working together: + +| Module | Purpose | +|--------|---------| +| `schema_agent_models.py` | Type-safe Pydantic schemas for column profiles and dataset insights | +| `schema_agent_loader.py` | CSV loading, type inference, and datetime detection | +| `schema_agent_stats.py` | Numeric summaries, data quality metrics, and categorical distributions | +| `schema_agent_llm.py` | LLM integration for semantic analysis and hypothesis generation | +| `schema_agent_report.py` | Report generation in JSON and Markdown formats | +| `schema_agent.py` | Pipeline orchestration and command-line interface | + +For detailed examples of individual module usage, see `schema_agent.example`. For end-to-end pipeline examples, see `schema_agent.API`. + +## Command-Line Options | Argument | Default | Description | |----------|---------|-------------| -| `csv_paths` | Required | One or more CSV file paths | -| `--tags` | File stems | Labels for each CSV (count must match `csv_paths`) | -| `--model` | `gpt-4o` | OpenAI model (`gpt-4o`, `gpt-4o-mini`, etc.) | -| `--llm-scope` | `all` | Column selection strategy: `all`, `semantic`, or `nulls` | -| `--metrics` | Subset | Numeric summary stats: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | -| `--use-langchain` | `false` | Use LangChain instead of the default inference client | -| `--output-json` | `data_profile_report.json` | Output path for the JSON report | -| `--output-md` | `data_profile_summary.md` | Output path for the Markdown summary | +| `csv_paths` | Required | One or more CSV file paths to analyze | +| `--tags` | File stems | Custom labels for each CSV (must match number of files) | +| `--model` | `gpt-4o` | OpenAI model to use (`gpt-4o`, `gpt-4o-mini`, etc.) 
| +| `--llm-scope` | `all` | Strategy for column selection: `all`, `semantic`, or `nulls` | +| `--metrics` | Subset | Statistics to compute: `mean`, `std`, `min`, `25%`, `50%`, `75%`, `max` | +| `--use-langchain` | `false` | Use LangChain instead of default inference client | +| `--output-json` | `data_profile_report.json` | Path for JSON report output | +| `--output-md` | `data_profile_summary.md` | Path for Markdown summary output | -## LLM Scoping +## Cost Optimization with LLM Scoping -Control which columns are sent to the LLM to manage cost and latency: +The `--llm-scope` parameter controls which columns are sent to the LLM, helping you balance analysis depth with costs: -| Scope | Behavior | -|-------|----------| -| `all` | Profiles every column — most comprehensive, highest cost | -| `semantic` | Profiles non-numeric columns only | -| `nulls` | Profiles only columns with >5% null values — most cost-efficient | +| Scope | What Gets Analyzed | Cost Level | Best For | +|-------|-------------------|-----------|----------| +| `all` | Every column | High | Complete dataset understanding | +| `semantic` | Non-numeric columns only | Medium | Text and categorical analysis | +| `nulls` | Columns with >5% null values | Low | Data quality issues only | ## Python API -### Full pipeline +### Run the full pipeline programmatically ```python import research.agentic_data_science.schema_agent.schema_agent as agent @@ -111,9 +103,9 @@ tag_to_df, stats = agent.run_pipeline( ) ``` -### Individual modules +### Use individual modules independently -Each module can be imported independently for exploratory use or testing: +Each module can be imported and used separately for custom workflows: ```python import research.agentic_data_science.schema_agent.schema_agent_loader as loader @@ -122,33 +114,49 @@ import research.agentic_data_science.schema_agent.schema_agent_llm as llm import research.agentic_data_science.schema_agent.schema_agent_report as report ``` -## Output Reference +## 
Output Details ### `data_profile_report.json` -Structured report containing per-column profiles, statistical summaries, and LLM-generated insights. +A structured JSON report containing: +- Per-column statistical profiles +- Data quality metrics +- LLM-generated semantic insights +- Column role classifications ### `data_profile_summary.md` -Formatted Markdown table with columns: **Column · Meaning · Role · Quality · Hypotheses** +A formatted Markdown table with columns: +- **Column** — Column name +- **Meaning** — Inferred semantic description +- **Role** — Classified role (ID, Feature, Target, Timestamp) +- **Quality** — Data quality assessment +- **Hypotheses** — Generated business insights ## Troubleshooting -**API key not set** +### API key not configured +Set your OpenAI API key: ```bash -> export OPENAI_API_KEY=sk-... +export OPENAI_API_KEY=sk-... ``` -**Validation or parsing errors** - -Reduce the number of columns sent to the LLM: +### Validation or parsing errors on large datasets +Reduce the number of columns analyzed by the LLM: ```bash -> ./schema_agent.py data.csv --llm-scope nulls -> ./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini +./schema_agent.py data.csv --llm-scope nulls +./schema_agent.py data.csv --llm-scope semantic --model gpt-4o-mini ``` -**No datetime columns detected** +### No datetime columns detected + +This is normal behavior — the agent automatically skips temporal detection when no date-like columns are present in the dataset. + +## Next Steps -Expected behavior — datetime detection is skipped automatically when no temporal columns are present in the dataset. 
\ No newline at end of file +- Check out example notebooks for detailed workflows +- Integrate into your data science pipelines +- Extend with custom metrics or export formats +- Review individual module documentation for advanced use cases \ No newline at end of file diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.ipynb b/research/agentic_data_science/schema_agent/schema_agent.API.ipynb similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_API.ipynb rename to research/agentic_data_science/schema_agent/schema_agent.API.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent_API.py b/research/agentic_data_science/schema_agent/schema_agent.API.py similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_API.py rename to research/agentic_data_science/schema_agent/schema_agent.API.py diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.ipynb b/research/agentic_data_science/schema_agent/schema_agent.example.ipynb similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_example.ipynb rename to research/agentic_data_science/schema_agent/schema_agent.example.ipynb diff --git a/research/agentic_data_science/schema_agent/schema_agent_example.py b/research/agentic_data_science/schema_agent/schema_agent.example.py similarity index 100% rename from research/agentic_data_science/schema_agent/schema_agent_example.py rename to research/agentic_data_science/schema_agent/schema_agent.example.py diff --git a/research/agentic_data_science/schema_agent/schema_agent.py b/research/agentic_data_science/schema_agent/schema_agent.py old mode 100644 new mode 100755 index 8dec89e21..de1777da7 --- a/research/agentic_data_science/schema_agent/schema_agent.py +++ b/research/agentic_data_science/schema_agent/schema_agent.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ Data Profiler Agent — modular implementation. 
diff --git a/website/docs/blog/posts/draft.Schema_agent.md b/website/docs/blog/posts/draft.Schema_agent.md new file mode 100644 index 000000000..ac59d6e9f --- /dev/null +++ b/website/docs/blog/posts/draft.Schema_agent.md @@ -0,0 +1,58 @@ +--- +title: "Data Profiler Agent in 30 Minutes" +authors: + - Your Name +date: 2026-04-10 +description: +categories: + - AI Research + - Data Science +--- + +TL;DR: Learn how to automatically profile CSV datasets with statistical summaries and LLM-powered semantic analysis in 30 minutes. Generate column-level insights, detect temporal patterns, and discover data quality issues. + + + +## Tutorial in 30 Seconds + +The Data Profiler Agent is an automated system that combines classical statistical analysis with LLM-powered semantic understanding to comprehensively profile CSV datasets. + +Key capabilities: + +- **Automatic temporal detection**: Identifies and converts date/datetime columns across multiple formats +- **Statistical profiling**: Computes numeric summaries, data quality metrics, and categorical distributions +- **LLM semantic analysis**: Infers column roles (ID, Feature, Target, Timestamp), semantic meaning, and testable hypotheses +- **Smart cost control**: Selectively analyze columns to manage API costs without sacrificing insights +- **Flexible output**: Machine-readable JSON reports and human-friendly Markdown summaries + +This tutorial's goal is to show you in 30 minutes: + +- How the modular architecture enables both quick profiling and extensibility +- How to profile datasets and interpret results in multiple formats +- How to optimize costs while maintaining analysis quality +- How to integrate profiling into existing data pipelines + +## Official References + +- [Data Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) +- [README](../../../../research/agentic_data_science/schema_agent/README.md) + +## Tutorial Content + +This tutorial includes all code, notebooks, and documentation 
in +[research/agentic_data_science/schema_agent](../../../../research/agentic_data_science/schema_agent) + +- [`README.md`](../../../../research/agentic_data_science/schema_agent/README.md): Installation, usage, and configuration guide +- Six modular Python files: + - `schema_agent_models.py`: Type-safe schemas for insights and profiles + - `schema_agent_loader.py`: CSV loading and type inference + - `schema_agent_stats.py`: Statistical computation and quality metrics + - `schema_agent_llm.py`: LLM integration and semantic analysis + - `schema_agent_report.py`: Report generation and export + - `schema_agent.py`: Pipeline orchestration and CLI +- [`schema_agent.example`](../../../../research/agentic_data_science/schema_agent/schema_agent.example.ipynb): Individual module usage examples +- [`schema_agent.API`](../../../../research/agentic_data_science/schema_agent/schema_agent.API.ipynb): End-to-end pipeline workflows and patterns +- Example notebooks demonstrating real-world use cases: + - Basic profiling and interpretation + - Cost-optimized multi-file analysis + - Extracting and validating business hypotheses \ No newline at end of file From 6102e29eba764bfd718b68e624609e012d1a2e8f Mon Sep 17 00:00:00 2001 From: Pranav Shashidhara Date: Mon, 20 Apr 2026 08:39:19 -0400 Subject: [PATCH 14/14] Update blog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-commit checks: All checks passed ✅ --- website/docs/blog/posts/draft.Schema_agent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/blog/posts/draft.Schema_agent.md b/website/docs/blog/posts/draft.Schema_agent.md index ac59d6e9f..f74928d50 100644 --- a/website/docs/blog/posts/draft.Schema_agent.md +++ b/website/docs/blog/posts/draft.Schema_agent.md @@ -34,7 +34,7 @@ This tutorial's goal is to show you in 30 minutes: ## Official References -- [Data Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) +- [Data 
Profiler Agent Repository](../../../../research/agentic_data_science/schema_agent) - [README](../../../../research/agentic_data_science/schema_agent/README.md) ## Tutorial Content