From ede86d9af33d27c53e35a5aebc46b64a12baed64 Mon Sep 17 00:00:00 2001 From: erivan Date: Sat, 18 Apr 2026 14:31:52 +0200 Subject: [PATCH 01/13] add assumption checks for SUTVA, ignorability and overlap --- cais/methods/assumption_utils.py | 227 +++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 cais/methods/assumption_utils.py diff --git a/cais/methods/assumption_utils.py b/cais/methods/assumption_utils.py new file mode 100644 index 0000000..e32ffc8 --- /dev/null +++ b/cais/methods/assumption_utils.py @@ -0,0 +1,227 @@ +""" +Reusable assumption checks for causal inference methods. + +Each check returns a standardized dict: + { + "passed": bool | None, # None => inconclusive + "reasoning": str, # human-readable explanation + "details": dict, # raw stats (F, p, SMDs, ...) + } + +These are composed in each estimator's `validate_assumptions` method. +The agent-level `validate_method` simply dispatches to the selected estimator. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional +import logging + +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field + + +# import some assumptions already available for each method + +from cais.methods.utils import ( + calculate_standardized_differences, + check_overlap, +) +from cais.methods.instrumental_variable.diagnostics import ( + calculate_first_stage_f_statistic, +) +from cais.methods.difference_in_differences.diagnostics import ( + validate_parallel_trends, + run_placebo_test, +) +from cais.methods.generalized_propensity_score.diagnostics import ( + assess_gps_balance, +) +from cais.utils.llm_helpers import call_llm_with_json_output + +logger = logging.getLogger(__name__) + +# _____________________________________________________________________________ +# Output helper +# _____________________________________________________________________________ + +def _result( + passed: Optional[bool], + reasoning: str, + details: 
Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + return { + "passed": passed, + "reasoning": reasoning, + "details": details or {}, + } + +# _____________________________________________________________________________ +# LLM-based assumption argumentation (non-statistically-testable assumptions) +# _____________________________________________________________________________ + +class _LLMAssumptionVerdict(BaseModel): + passed: Optional[bool] = Field( + None, + description="True if the assumption is plausibly satisfied given the dataset and " + "domain context, False if there is a clear reason to doubt it, " + "None if there is insufficient information to argue either way.", + ) + reasoning: str = Field( + ..., + description="Concise (2-4 sentences) justification grounded in the dataset " + "description, variable semantics, or domain knowledge.", + ) + + +def _llm_argue_assumption( + assumption_name: str, + assumption_description: str, + dataset_description: Optional[str], + variables_summary: Dict[str, Any], + llm, + extra_context: Optional[str] = None, +) -> Dict[str, Any]: + """Ask the LLM to argue for/against a non-statistically-testable assumption. + + Falls back to passed=None with a clear notice if no LLM is available. + """ + if llm is None: + return _result( + passed=None, + reasoning=( + f"'{assumption_name}' is not statistically testable and no LLM was " + f"provided to reason about it. Must be justified by study design or " + f"domain knowledge." + ), + ) + + prompt = f""" +You are a causal inference expert. Assess whether the following assumption is +plausibly satisfied for the analysis described below. Use the dataset +description and variables to argue concretely. Do not assume facts that are +not present in the description. 
+ +Assumption: {assumption_name} +Definition: {assumption_description} + +Dataset description: +{dataset_description or "(not provided)"} + +Variables involved: +{variables_summary} + +{extra_context or ""} + +Respond ONLY as JSON matching this schema: +{{ + "passed": true | false | null, + "reasoning": "<2-4 sentence justification>" +}} + +Use null for "passed" if the dataset description is insufficient to argue either way. +""".strip() + + try: + raw = call_llm_with_json_output(llm, prompt) + verdict = _LLMAssumptionVerdict(**(raw or {})) + return _result( + passed=verdict.passed, + reasoning=verdict.reasoning, + details={"assumption": assumption_name}, + ) + except Exception as exc: + logger.warning("LLM assumption check failed for '%s': %s", assumption_name, exc) + return _result( + passed=None, + reasoning=f"LLM check failed: {exc}. Assumption must be justified manually.", + ) + + +# _____________________________________________________________________________ +# SUTVA (needed for every method) +# _____________________________________________________________________________ + +def check_sutva( + dataset_description: Optional[str], + variables_summary: Dict[str, Any], + llm=None, +) -> Dict[str, Any]: + """SUTVA: no interference between units, no hidden treatment versions.""" + return _llm_argue_assumption( + assumption_name="SUTVA (Stable Unit Treatment Value Assumption)", + assumption_description=( + "(1) No interference: one unit's treatment does not affect another unit's " + "potential outcomes. (2) No hidden versions of the treatment: the treatment " + "is administered consistently across treated units." + ), + dataset_description=dataset_description, + variables_summary=variables_summary, + llm=llm, + extra_context=( + "Pay attention to: network/spillover effects (e.g., units in shared " + "schools, households, markets), partial compliance, treatment intensity " + "variation." 
+ ), + ) + + +# _____________________________________________________________________________ +# Ignorability / Conditional ignorability (RCT and observational) +# _____________________________________________________________________________ + +def check_rct_balance( + df: pd.DataFrame, + treatment: str, + covariates: List[str], + smd_threshold: float = 0.1, +) -> Dict[str, Any]: + """Partial test of ignorability for RCTs: covariate balance on observables.""" + if not covariates: + return _result( + passed=None, + reasoning="No covariates provided; balance check skipped.", + ) + smds = calculate_standardized_differences(df, treatment, covariates) + imbalanced = {c: v for c, v in smds.items() if pd.notna(v) and abs(v) > smd_threshold} + passed = len(imbalanced) == 0 + return _result( + passed=passed, + reasoning=( + f"Randomization check on {len(covariates)} covariates " + f"(|SMD| < {smd_threshold}). " + f"{'All balanced.' if passed else f'Imbalanced: {list(imbalanced.keys())}.'}" + ), + details={"smds": smds, "threshold": smd_threshold, "imbalanced": imbalanced}, + ) + + +# _____________________________________________________________________________ +# Positivity / overlap (IPW, matching, GPS) +# _____________________________________________________________________________ + +def check_positivity( + df: pd.DataFrame, + treatment: str, + propensity_scores: np.ndarray, + overlap_threshold: float = 0.5, + extreme_ps_bounds: tuple = (0.1, 0.9), # values from Crump et al. 
2009 + max_extreme_pct: float = 0.05, +) -> Dict[str, Any]: + """0 < P(T=1|X) < 1 across the support of X.""" + overlap = check_overlap(df, treatment, propensity_scores, threshold=overlap_threshold) + lo, hi = extreme_ps_bounds + n_extreme = int(((propensity_scores < lo) | (propensity_scores > hi)).sum()) + pct_extreme = n_extreme / len(propensity_scores) if len(propensity_scores) else 0.0 + passed = overlap["sufficient_overlap"] and pct_extreme < max_extreme_pct + return _result( + passed=passed, + reasoning=( + f"Overlap proportion: {overlap['overlap_proportion']:.3f} " + f"(threshold {overlap_threshold}). " + f"{n_extreme} obs ({pct_extreme:.1%}) outside [{lo}, {hi}]. " + f"{'OK.' if passed else 'Consider trimming or restricting to common support.'}" + ), + details={**overlap, "n_extreme_ps": n_extreme, "pct_extreme_ps": pct_extreme}, + ) From 4e6e01b36b2de07d6a86cb4a54681b606cb20994 Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 20 Apr 2026 12:30:18 +0200 Subject: [PATCH 02/13] add pre/post modeling assumption checks --- cais/methods/post_model_assumption_utils.py | 70 ++++++++++ ...utils.py => pre_model_assumption_utils.py} | 132 +++++++++++++++++- 2 files changed, 198 insertions(+), 4 deletions(-) create mode 100644 cais/methods/post_model_assumption_utils.py rename cais/methods/{assumption_utils.py => pre_model_assumption_utils.py} (62%) diff --git a/cais/methods/post_model_assumption_utils.py b/cais/methods/post_model_assumption_utils.py new file mode 100644 index 0000000..3571c56 --- /dev/null +++ b/cais/methods/post_model_assumption_utils.py @@ -0,0 +1,70 @@ +""" +Reusable assumption checks for causal inference methods. + +Each check returns a standardized dict: + { + "passed": bool | None, # None => inconclusive + "reasoning": str, # human-readable explanation + "details": dict, # raw stats (F, p, SMDs, ...) + } + +These are composed in each estimator's `validate_assumptions` method. 
+The agent-level `validate_method` simply dispatches to the selected estimator. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional +import logging + +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field + + +# import some assumptions already available for each method + +from cais.methods.instrumental_variable.diagnostics import ( + run_overidentification_test, +) +from cais.utils.llm_helpers import call_llm_with_json_output + +logger = logging.getLogger(__name__) + +# _____________________________________________________________________________ +# Output helper +# _____________________________________________________________________________ + +def _result( + passed: Optional[bool], + reasoning: str, + details: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + return { + "passed": passed, + "reasoning": reasoning, + "details": details or {}, + } + + +def check_iv_overidentification( + sm_results, df, treatment, outcome, instruments, covariates, +) -> Dict[str, Any]: + """Sargan-Hansen test: are the instruments valid (uncorrelated with errors)?""" + stat, p, status = run_overidentification_test( + sm_results, df, treatment, outcome, instruments, covariates, + ) + if stat is None: + return _result( + passed=None, + reasoning=status or "Over-identification test could not be computed.", + ) + passed = p > 0.05 # non-rejet = instruments valides + return _result( + passed=passed, + reasoning=( + f"Sargan-Hansen test: statistic={stat:.2f}, p={p:.4f}. " + f"{'Instruments appear valid.' 
if passed else 'Instruments may be invalid — correlated with errors.'}" + ), + details={"statistic": stat, "p_value": p, "status": status}, + ) \ No newline at end of file diff --git a/cais/methods/assumption_utils.py b/cais/methods/pre_model_assumption_utils.py similarity index 62% rename from cais/methods/assumption_utils.py rename to cais/methods/pre_model_assumption_utils.py index e32ffc8..7c0f61f 100644 --- a/cais/methods/assumption_utils.py +++ b/cais/methods/pre_model_assumption_utils.py @@ -28,13 +28,14 @@ calculate_standardized_differences, check_overlap, ) -from cais.methods.instrumental_variable.diagnostics import ( - calculate_first_stage_f_statistic, -) + +from cais.methods.instrumental_variable.diagnostics import calculate_first_stage_f_statistic + from cais.methods.difference_in_differences.diagnostics import ( validate_parallel_trends, run_placebo_test, ) + from cais.methods.generalized_propensity_score.diagnostics import ( assess_gps_balance, ) @@ -171,7 +172,7 @@ def check_sutva( # Ignorability / Conditional ignorability (RCT and observational) # _____________________________________________________________________________ -def check_rct_balance( +def check_cond_ignorability( df: pd.DataFrame, treatment: str, covariates: List[str], @@ -225,3 +226,126 @@ def check_positivity( ), details={**overlap, "n_extreme_ps": n_extreme, "pct_extreme_ps": pct_extreme}, ) + + +# _____________________________________________________________________________ +# IV-specific checks +# _____________________________________________________________________________ + +def check_iv_relevance( + df: pd.DataFrame, + treatment: str, + instruments: List[str], + covariates: List[str], + f_threshold: float = 10.0, +) -> Dict[str, Any]: + """First-stage F-test for instrument strength.""" + f, p = calculate_first_stage_f_statistic(df, treatment, instruments, covariates) + if f is None: + return _result( + passed=None, + reasoning="First-stage F-statistic could not be computed.", 
+ ) + passed = f >= f_threshold + return _result( + passed=passed, + reasoning=( + f"First-stage F = {f:.2f} (threshold {f_threshold}). " + f"{'Strong instrument.' if passed else 'Weak instrument warning.'}" + ), + details={"f_statistic": f, "p_value": p, "threshold": f_threshold}, + ) + +def check_iv_exclusion(dataset_description, variables_summary, llm=None): + return _llm_argue_assumption( + "Exclusion restriction", + "The instrument Z affects the outcome Y only through the treatment T, with no direct effect.", + dataset_description, variables_summary, llm, + ) + +def check_iv_exogeneity(dataset_description, variables_summary, llm=None): + return _llm_argue_assumption( + "Instrument exogeneity (independence)", + "Z is as good as randomly assigned with respect to unobserved confounders of T and Y.", + dataset_description, variables_summary, llm, + ) + +def check_iv_monotonicity(dataset_description, variables_summary, llm=None): + return _llm_argue_assumption( + "Monotonicity (LATE)", + "There are no defiers: the instrument never moves any unit in the opposite direction " + "of its average effect on treatment uptake.", + dataset_description, variables_summary, llm, + ) + + +# _____________________________________________________________________________ +# DiD-specific checks +# _____________________________________________________________________________ + +def check_parallel_trends( + df: pd.DataFrame, time_var: str, outcome: str, + group_indicator_col: str, treatment_period_start, + **kwargs, +) -> Dict[str, Any]: + """Parallel trends: treatment and control groups had similar outcome trends pre-treatment.""" + res = validate_parallel_trends( + df, time_var, outcome, group_indicator_col, treatment_period_start, **kwargs + ) + return _result( + passed=res.get("valid"), + reasoning=res.get("details", ""), + details={"p_value": res.get("p_value"), "error": res.get("error")}, + ) + + +def check_no_anticipation( + df, time_var, group_var, outcome, 
treated_unit_indicator, + covariates, treatment_period_start, placebo_period_start, +) -> Dict[str, Any]: + """No anticipation: treatment has no effect before implementation.""" + res = run_placebo_test( + df, time_var, group_var, outcome, treated_unit_indicator, + covariates, treatment_period_start, placebo_period_start, + ) + return _result( + passed=res.get("passed"), + reasoning=res.get("details", ""), + details={k: v for k, v in res.items() if k != "details"}, + ) + + +def check_baseline_outcome_balance( + df: pd.DataFrame, treatment: str, outcome: str, + time_var: str, treatment_period_start, + smd_threshold: float = 0.1, +) -> Dict[str, Any]: + """Intervention unrelated to outcome at baseline: comparable pre-treatment outcome levels.""" + pre = df[df[time_var] < treatment_period_start] + if pre.empty: + return _result( + passed=None, + reasoning="No pre-treatment data available.", + ) + smd = calculate_standardized_differences(pre, treatment, [outcome]).get(outcome, np.nan) + if pd.isna(smd): + return _result( + passed=None, + reasoning="Could not compute SMD on baseline outcome (missing data or no variance).", + ) + passed = abs(smd) <= smd_threshold + return _result( + passed=passed, + reasoning=f"Baseline outcome SMD = {smd:.3f} (threshold {smd_threshold}).", + details={"smd_pre_outcome": smd, "threshold": smd_threshold}, + ) + + +def check_stable_group_composition(dataset_description, variables_summary, llm=None): + """Stable group composition: no differential attrition or selective entry/exit due to treatment.""" + return _llm_argue_assumption( + "Stable group composition", + "Unit composition of treatment and control groups does not change as a result " + "of treatment (no differential attrition or selective entry/exit).", + dataset_description, variables_summary, llm, + ) \ No newline at end of file From 9387c8e8b8cb4a34909a180b097201c0714d2f1d Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 20 Apr 2026 12:31:01 +0200 Subject: [PATCH 03/13] add 
notebook with short use cases for pre/post modeling assumption checks --- examples/assumption_utils_usecases.ipynb | 1295 ++++++++++++++++++++++ 1 file changed, 1295 insertions(+) create mode 100644 examples/assumption_utils_usecases.ipynb diff --git a/examples/assumption_utils_usecases.ipynb b/examples/assumption_utils_usecases.ipynb new file mode 100644 index 0000000..5656aa9 --- /dev/null +++ b/examples/assumption_utils_usecases.ipynb @@ -0,0 +1,1295 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "# Assumption checking short use cases", + "id": "8d065cc33f6936ee" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Pre-modeling assumptions", + "id": "38c5e374ee1335eb" + }, + { + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2026-04-20T10:27:52.373168900Z", + "start_time": "2026-04-20T10:27:52.275824400Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "dataset_path = \"../data/all_data/abortion_bf15.csv\"\n", + "df = pd.read_csv(dataset_path)\n", + "df.head()" + ], + "id": "e42bb8b76ff7855f", + "outputs": [ + { + "data": { + "text/plain": [ + " fip age race year sex totcase totpop rate totrate id ... \\\n", + "0 1.0 15.0 2.0 1985.0 2 5683.0 106187 6527.5 5351.9 14.0 ... \n", + "1 1.0 15.0 2.0 1986.0 2 5344.0 106831 6351.2 5002.3 14.0 ... \n", + "2 1.0 15.0 2.0 1987.0 2 4983.0 106496 5759.1 4679.0 14.0 ... \n", + "3 1.0 15.0 2.0 1988.0 2 5276.0 105238 6139.6 5013.4 14.0 ... \n", + "4 1.0 15.0 2.0 1989.0 2 5692.0 102956 5951.5 5528.6 14.0 ... \n", + "\n", + " female lnr t younger fa pi wm15 wf15 bm15 bf15 \n", + "0 1.0 8.783779 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", + "1 1.0 8.756399 2.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", + "2 1.0 8.658537 3.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", + "3 1.0 8.722515 4.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", + "4 1.0 8.691399 5.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", + "\n", + "[5 rows x 39 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fipageraceyearsextotcasetotpopratetotrateid...femalelnrtyoungerfapiwm15wf15bm15bf15
01.015.02.01985.025683.01061876527.55351.914.0...1.08.7837791.01.01.00.00.00.00.01.0
11.015.02.01986.025344.01068316351.25002.314.0...1.08.7563992.01.01.00.00.00.00.01.0
21.015.02.01987.024983.01064965759.14679.014.0...1.08.6585373.01.01.01.00.00.00.01.0
31.015.02.01988.025276.01052386139.65013.414.0...1.08.7225154.01.01.01.00.00.00.01.0
41.015.02.01989.025692.01029565951.55528.614.0...1.08.6913995.01.01.01.00.00.00.01.0
\n", + "

5 rows × 39 columns

\n", + "
" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:52.445524800Z", + "start_time": "2026-04-20T10:27:52.390725800Z" + } + }, + "cell_type": "code", + "source": "from cais.methods.pre_model_assumption_utils import check_cond_ignorability", + "id": "90501ac7f9f951b", + "outputs": [], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:52.495178900Z", + "start_time": "2026-04-20T10:27:52.447534100Z" + } + }, + "cell_type": "code", + "source": [ + "covariates = ['crack', 'alcohol', 'income', 'ur', 'poverty', 'black', 'perc1519']\n", + "result_cond_ignorability = check_cond_ignorability(df, 'repeal', covariates)\n", + "print(result_cond_ignorability)" + ], + "id": "5d310a4aa2dca6f7", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': \"Randomization check on 7 covariates (|SMD| < 0.1). 
Imbalanced: ['crack', 'alcohol', 'income', 'ur', 'poverty', 'black', 'perc1519'].\", 'details': {'smds': {'crack': np.float64(0.1901831312076417), 'alcohol': np.float64(0.3090489010094973), 'income': np.float64(0.5807936072744352), 'ur': np.float64(0.4436246174561076), 'poverty': np.float64(-0.2878371568806569), 'black': np.float64(-0.630179818114365), 'perc1519': np.float64(-0.5894732981847037)}, 'threshold': 0.1, 'imbalanced': {'crack': np.float64(0.1901831312076417), 'alcohol': np.float64(0.3090489010094973), 'income': np.float64(0.5807936072744352), 'ur': np.float64(0.4436246174561076), 'poverty': np.float64(-0.2878371568806569), 'black': np.float64(-0.630179818114365), 'perc1519': np.float64(-0.5894732981847037)}}}\n" + ] + } + ], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:52.570233300Z", + "start_time": "2026-04-20T10:27:52.519778400Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import check_positivity\n", + "from cais.methods.propensity_score.base import estimate_propensity_scores" + ], + "id": "ee059538218d39ee", + "outputs": [], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:52.637020400Z", + "start_time": "2026-04-20T10:27:52.574298200Z" + } + }, + "cell_type": "code", + "source": [ + "ps = estimate_propensity_scores(df, 'repeal', covariates)\n", + "\n", + "overlap_result = check_positivity(df, 'repeal', ps)\n", + "print(overlap_result)" + ], + "id": "d39d90ba7ebcac34", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': 'Overlap proportion: 0.777 (threshold 0.5). 472 obs (64.0%) outside [0.1, 0.9]. 
Consider trimming or restricting to common support.', 'details': {'treated_range': (0.01873818828478758, 0.766475780981912), 'control_range': (0.01, 0.6066358199069207), 'overlap_range': (0.01873818828478758, 0.6066358199069207), 'overlap_proportion': 0.7771532762873611, 'sufficient_overlap': np.True_, 'n_extreme_ps': 472, 'pct_extreme_ps': 0.6404341926729986}}\n" + ] + } + ], + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Fix an OPENAI_API_KEY in environment before running cells below", + "id": "ac989fb331517781" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:53.228295300Z", + "start_time": "2026-04-20T10:27:52.639386400Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.config import get_llm_client\n", + "llm = get_llm_client()" + ], + "id": "98e0711e996cb3f5", + "outputs": [], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:55.947713600Z", + "start_time": "2026-04-20T10:27:53.246537200Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import check_sutva\n", + "\n", + "dataset_description = (\n", + " \"Panel data of 51 US states from 1985-2000 (Donohue & Levitt). \"\n", + " \"Treatment 'repeal' is binary: whether the state legalized abortion \"\n", + " \"before Roe v. Wade in 1973. Outcome 'rate' is the crime rate per 100k. \"\n", + " \"The hypothesis is that legalized abortion reduced unwanted births, \"\n", + " \"which later reduced crime. 
Subgroup: black females age 15.\"\n", + ")\n", + "\n", + "variables_info = {\n", + " 'treatment': 'repeal',\n", + " 'outcome': 'rate',\n", + " 'covariates': covariates,\n", + " 'panel_id': 'fip (state FIPS code)',\n", + " 'time': 'year (1985-2000)',\n", + "}\n", + "\n", + "result_sutva = check_sutva(dataset_description, variables_info, llm=llm)\n", + "print(result_sutva)" + ], + "id": "7409ecd3503dc2e5", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': 'The assumption of no interference is likely violated due to potential spillover effects among states. For instance, the legalization of abortion in one state could influence neighboring states through migration patterns, shared economic conditions, or social norms, thereby affecting their crime rates. Additionally, the treatment is binary and may not account for variations in how states implemented the repeal, suggesting the presence of hidden versions of the treatment.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:55.961967200Z", + "start_time": "2026-04-20T10:27:55.950140100Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import (\n", + " check_iv_relevance,\n", + " check_iv_exclusion,\n", + " check_iv_exogeneity,\n", + " check_iv_monotonicity,\n", + ")" + ], + "id": "b170f0d603110f8b", + "outputs": [], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:55.974645700Z", + "start_time": "2026-04-20T10:27:55.964090400Z" + } + }, + "cell_type": "code", + "source": "import pandas as pd", + "id": "5a6ebd0b2770d0bc", + "outputs": [], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:55.993221500Z", + "start_time": "2026-04-20T10:27:55.975973Z" + } + }, + "cell_type": 
"code", + "source": [ + "dataset_iv_path = \"../data/all_data/card_geographic.csv\"\n", + "df_iv = pd.read_csv(dataset_iv_path)" + ], + "id": "43f5cdd305531a14", + "outputs": [], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:56.008393100Z", + "start_time": "2026-04-20T10:27:55.995467300Z" + } + }, + "cell_type": "code", + "source": [ + "df_iv = df_iv.drop(columns=['Unnamed: 0'], errors='ignore')\n", + "df_iv = df_iv.dropna()" + ], + "id": "64af04994333ad51", + "outputs": [], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:56.054986Z", + "start_time": "2026-04-20T10:27:56.010656800Z" + } + }, + "cell_type": "code", + "source": "df_iv.head()", + "id": "a8bd12b301c3c00", + "outputs": [ + { + "data": { + "text/plain": [ + " nearc4 educ black smsa south married exper lwage\n", + "0 0 7 1 1 0 1.0 16 6.306275\n", + "1 0 12 0 1 0 1.0 9 6.175867\n", + "2 0 12 0 1 0 1.0 16 6.580639\n", + "3 1 11 0 1 0 1.0 10 5.521461\n", + "4 1 12 0 1 0 1.0 16 6.591674" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nearc4educblacksmsasouthmarriedexperlwage
0071101.0166.306275
10120101.096.175867
20120101.0166.580639
31110101.0105.521461
41120101.0166.591674
\n", + "
" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 14 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:56.118001600Z", + "start_time": "2026-04-20T10:27:56.058386300Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_iv_relevance : Is nearc4 a strong instrument for educ ? ---\n", + "result_iv_relevance = check_iv_relevance(\n", + " df=df_iv,\n", + " treatment='educ',\n", + " instruments=['nearc4'],\n", + " covariates=['black', 'smsa', 'south', 'married', 'exper'],\n", + ")\n", + "print(result_iv_relevance)" + ], + "id": "14f14485f71acd42", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': True, 'reasoning': 'First-stage F = 15.77 (threshold 10.0). Strong instrument.', 'details': {'f_statistic': 15.766661138654587, 'p_value': 7.333887270678314e-05, 'threshold': 10.0}}\n" + ] + } + ], + "execution_count": 15 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:57.668451700Z", + "start_time": "2026-04-20T10:27:56.122437900Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_iv_exclusion ---\n", + "card_description = (\n", + " \"Card (1995) dataset. 3010 men from the NLS Young Men Cohort. \"\n", + " \"Treatment is 'educ' (years of education). Instrument is 'nearc4' \"\n", + " \"(grew up near a 4-year college). Outcome is 'lwage' (log wage). 
\"\n", + " \"The exclusion restriction argument is that college proximity affects \"\n", + " \"education but not wages directly — though this is debated, since \"\n", + " \"proximity may correlate with local labor market conditions.\"\n", + ")\n", + "\n", + "card_variables = {\n", + " 'treatment': 'educ',\n", + " 'outcome': 'lwage',\n", + " 'instrument': 'nearc4',\n", + " 'covariates': ['black', 'smsa', 'south', 'married', 'exper'],\n", + "}\n", + "\n", + "result_iv_exclusion = check_iv_exclusion(card_description, card_variables, llm=llm)\n", + "print(\"check_iv_exclusion :\", result_iv_exclusion)" + ], + "id": "5ed937d2c5f0cf91", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_iv_exclusion : {'passed': False, 'reasoning': 'The exclusion restriction is likely violated because growing up near a 4-year college (nearc4) may influence local labor market conditions, which could directly affect wages (lwage) independent of education (educ). This correlation suggests that the instrument may have a direct effect on the outcome, undermining the validity of the exclusion restriction.', 'details': {'assumption': 'Exclusion restriction'}}\n" + ] + } + ], + "execution_count": 16 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:27:59.501261300Z", + "start_time": "2026-04-20T10:27:57.674640900Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_iv_exogeneity ---\n", + "result_iv_exogeneity = check_iv_exogeneity(card_description, card_variables, llm=llm)\n", + "print(\"check_iv_exogeneity :\", result_iv_exogeneity)" + ], + "id": "311b08f100915949", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_iv_exogeneity : {'passed': False, 'reasoning': \"The assumption of instrument exogeneity is likely violated because 'nearc4' (proximity to a 4-year college) may correlate with local labor market conditions, which can affect wages ('lwage') directly. 
This suggests that 'nearc4' is not as good as randomly assigned with respect to unobserved confounders affecting both education and wages.\", 'details': {'assumption': 'Instrument exogeneity (independence)'}}\n" + ] + } + ], + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:01.122831700Z", + "start_time": "2026-04-20T10:27:59.504289100Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_iv_monotonicity ---\n", + "result_iv_monotonicity = check_iv_monotonicity(card_description, card_variables, llm=llm)\n", + "print(\"check_iv_monotonicity :\", result_iv_monotonicity)" + ], + "id": "10b716ad1fb8c649", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_iv_monotonicity : {'passed': None, 'reasoning': \"The dataset description does not provide sufficient information about the relationship between the instrument 'nearc4' and the treatment 'educ' to assess the presence of defiers. Without knowing how individuals who grew up near a 4-year college respond to the instrument in terms of their education, we cannot determine if there are units that would decrease their education despite being near a college.\", 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + ] + } + ], + "execution_count": 18 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "In the description furnished before, there wasn't enough description for the LLM to judge whether IV monocity was valid. That's why it returns `'None'`.\n", + "\n", + "We enrich the description now:" + ], + "id": "e8261a5687e3da4e" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:01.136060100Z", + "start_time": "2026-04-20T10:28:01.124507600Z" + } + }, + "cell_type": "code", + "source": [ + "card_description = (\n", + " \"Card (1995) dataset. 3010 men from the NLS Young Men Cohort. \"\n", + " \"Treatment is 'educ' (years of education). 
Instrument is 'nearc4' \"\n", + " \"(grew up near a 4-year college). Outcome is 'lwage' (log wage). \"\n", + " \"The instrument works through reduced cost of attending college: \"\n", + " \"individuals near a college face lower transportation and housing costs, \"\n", + " \"making them more likely to attend. It is implausible that proximity \"\n", + " \"to a college would cause someone to get LESS education — the effect \"\n", + " \"should go in one direction only (more proximity → more education or no change).\"\n", + ")" + ], + "id": "2f246249bee3bdc7", + "outputs": [], + "execution_count": 19 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.358785900Z", + "start_time": "2026-04-20T10:28:01.139115900Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_iv_monotonicity ---\n", + "result_iv_monotonicity = check_iv_monotonicity(card_description, card_variables, llm=llm)\n", + "print(\"check_iv_monotonicity :\", result_iv_monotonicity)" + ], + "id": "59abd1ed0672b4de", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_iv_monotonicity : {'passed': True, 'reasoning': \"The assumption of monotonicity is plausibly satisfied because the instrument 'nearc4' is expected to increase the likelihood of attending college due to reduced costs associated with proximity to a college. 
It is highly unlikely that being near a college would lead to a decrease in education, as the natural effect of proximity is to either increase education or have no effect, thus supporting the absence of defiers.\", 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + ] + } + ], + "execution_count": 20 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.418716700Z", + "start_time": "2026-04-20T10:28:03.376072700Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import (\n", + " check_parallel_trends,\n", + " check_no_anticipation,\n", + " check_baseline_outcome_balance,\n", + " check_stable_group_composition,\n", + ")" + ], + "id": "d6268e8f3f29807b", + "outputs": [], + "execution_count": 21 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.486582800Z", + "start_time": "2026-04-20T10:28:03.420717500Z" + } + }, + "cell_type": "code", + "source": [ + "df_did = pd.read_csv('../data/all_data/castle.csv')\n", + "df_did['ever_treated'] = df_did.groupby('sid')['post'].transform('max').astype(int)\n", + "\n", + "# 2006 is the cutoff year for the tests\n", + "treatment_year = 2006\n", + "\n", + "df_did.head()" + ], + "id": "78b9c69726df3a0a", + "outputs": [ + { + "data": { + "text/plain": [ + " state year sid cdl pre2_cdl caselaw anywhere assumption civil \\\n", + "0 Alabama 2000 1 0.0 0.0 0.0 0 0 0 \n", + "1 Alabama 2001 1 0.0 0.0 0.0 0 0 0 \n", + "2 Alabama 2002 1 0.0 0.0 0.0 0 0 0 \n", + "3 Alabama 2003 1 0.0 0.0 0.0 0 0 0 \n", + "4 Alabama 2004 1 0.0 1.0 0.0 0 0 0 \n", + "\n", + " homicide_c ... _Iyear_2003 _Iyear_2004 _Iyear_2005 _Iyear_2006 \\\n", + "0 329 ... 0 0 0 0 \n", + "1 379 ... 0 0 0 0 \n", + "2 303 ... 0 0 0 0 \n", + "3 299 ... 1 0 0 0 \n", + "4 254 ... 
0 1 0 0 \n", + "\n", + " _Iyear_2007 _Iyear_2008 _Iyear_2009 _Iyear_2010 popwt ever_treated \n", + "0 0 0 0 0 4499293.0 1 \n", + "1 0 0 0 0 4499293.0 1 \n", + "2 0 0 0 0 4499293.0 1 \n", + "3 0 0 0 0 4499293.0 1 \n", + "4 0 0 0 0 4499293.0 1 \n", + "\n", + "[5 rows x 186 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stateyearsidcdlpre2_cdlcaselawanywhereassumptioncivilhomicide_c..._Iyear_2003_Iyear_2004_Iyear_2005_Iyear_2006_Iyear_2007_Iyear_2008_Iyear_2009_Iyear_2010popwtever_treated
0Alabama200010.00.00.0000329...000000004499293.01
1Alabama200110.00.00.0000379...000000004499293.01
2Alabama200210.00.00.0000303...000000004499293.01
3Alabama200310.00.00.0000299...100000004499293.01
4Alabama200410.01.00.0000254...010000004499293.01
\n", + "

5 rows × 186 columns

\n", + "
" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 22 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.566545700Z", + "start_time": "2026-04-20T10:28:03.489714600Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_parallel_trends ---\n", + "result_parallel_trends = check_parallel_trends(\n", + " df=df_did,\n", + " time_var='year',\n", + " outcome='l_homicide',\n", + " group_indicator_col='ever_treated',\n", + " treatment_period_start=treatment_year,\n", + ")\n", + "print(result_parallel_trends)" + ], + "id": "c736cb484ee757c2", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': np.True_, 'reasoning': 'Simple linear trend test: p-value for group-trend interaction: 0.8264. Parallel trends: True.', 'details': {'p_value': np.float64(0.8263558702861501), 'error': None}}\n" + ] + } + ], + "execution_count": 23 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.802998500Z", + "start_time": "2026-04-20T10:28:03.569965Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_no_anticipation ---\n", + "covariates = ['l_police', 'l_income', 'l_prisoner', 'unemployrt', 'poverty']\n", + "\n", + "result_no_anticipation = check_no_anticipation(\n", + " df=df_did,\n", + " time_var='year',\n", + " group_var='sid',\n", + " outcome='l_homicide',\n", + " treated_unit_indicator='ever_treated',\n", + " covariates=covariates,\n", + " treatment_period_start=treatment_year,\n", + " placebo_period_start=2003,\n", + ")\n", + "print(result_no_anticipation)" + ], + "id": "92374b195412d2fb", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': True, 'reasoning': 'Placebo treatment effect estimated at 0.0976 (p=0.1673). 
Test passed: True.', 'details': {'passed': True, 'effect_estimate': 0.09764229987805968, 'p_value': 0.16731301316128822, 'error': None}}\n" + ] + } + ], + "execution_count": 24 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:03.880077200Z", + "start_time": "2026-04-20T10:28:03.826052400Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_baseline_outcome_balance ---\n", + "result_baseline_outcome_balance = check_baseline_outcome_balance(\n", + " df=df_did,\n", + " treatment='ever_treated',\n", + " outcome='l_homicide',\n", + " time_var='year',\n", + " treatment_period_start=treatment_year,\n", + ")\n", + "print(result_baseline_outcome_balance)" + ], + "id": "c0c993726771aaff", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': np.False_, 'reasoning': 'Baseline outcome SMD = 0.675 (threshold 0.1).', 'details': {'smd_pre_outcome': np.float64(0.6753481392211624), 'threshold': 0.1}}\n" + ] + } + ], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:06.140551Z", + "start_time": "2026-04-20T10:28:03.890887Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_stable_group_composition (LLM-reasoned) ---\n", + "castle_description = (\n", + " \"Castle Doctrine dataset. Panel of 50 US states from 2000-2010. \"\n", + " \"Treatment is adoption of Castle Doctrine / Stand Your Ground laws, \"\n", + " \"which expanded the right to use lethal force in self-defense. \"\n", + " \"States adopted the law at different times (staggered treatment). \"\n", + " \"Outcome is log homicide rate. 
Unit of observation is state-year.\"\n", + ")\n", + "\n", + "castle_variables = {\n", + " 'treatment': 'ever_treated',\n", + " 'outcome': 'l_homicide',\n", + " 'panel_id': 'sid',\n", + " 'time': 'year (2000-2010)',\n", + " 'covariates': covariates,\n", + "}\n", + "\n", + "result_stable_group_composition = check_stable_group_composition(castle_description, castle_variables, llm=llm)\n", + "print(result_stable_group_composition)" + ], + "id": "b7377b38b2cca006", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': None, 'reasoning': 'The dataset description does not provide information on whether there was differential attrition or selective entry/exit of states in relation to the treatment (adoption of Castle Doctrine laws). Without data on how states may have changed their composition over time or how the treatment affected state characteristics, it is not possible to assess the plausibility of the stable group composition assumption.', 'details': {'assumption': 'Stable group composition'}}\n" + ] + } + ], + "execution_count": 26 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:07.928286700Z", + "start_time": "2026-04-20T10:28:06.145549900Z" + } + }, + "cell_type": "code", + "source": [ + "castle_description = (\n", + " \"Castle Doctrine dataset. Panel of 50 US states from 2000-2010. \"\n", + " \"Treatment is adoption of Castle Doctrine / Stand Your Ground laws. \"\n", + " \"Outcome is log homicide rate. Unit of observation is state-year. \"\n", + " \"All 50 states are observed for all 11 years — the panel is balanced \"\n", + " \"with no missing state-year observations. States do not enter or exit \"\n", + " \"the dataset. 
However, population migration between states could change \"\n", + " \"the demographic composition of treatment and control states over time.\"\n", + ")\n", + "\n", + "result_stable_group_composition2 = check_stable_group_composition(castle_description, castle_variables, llm=llm)\n", + "print(result_stable_group_composition2)" + ], + "id": "11ad061f34b1c261", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': 'While the dataset is balanced and does not have missing observations, the potential for population migration between states introduces the possibility of changes in the demographic composition of treatment and control groups over time. This could lead to differential attrition or selective entry/exit, violating the stable group composition assumption.', 'details': {'assumption': 'Stable group composition'}}\n" + ] + } + ], + "execution_count": 27 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:09.910292800Z", + "start_time": "2026-04-20T10:28:07.933859600Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_sutva (LLM-reasoned) ---\n", + "result_sutva_did = check_sutva(castle_description, castle_variables, llm=llm)\n", + "print(result_sutva_did)" + ], + "id": "9f38e4360f24ad53", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': 'The assumption of no interference is likely violated due to potential population migration between states, which could lead to spillover effects where the treatment in one state affects the outcomes in another state. Additionally, the treatment (adoption of Castle Doctrine laws) may not be uniformly applied across states, suggesting the possibility of hidden versions of the treatment. 
Therefore, SUTVA is not plausibly satisfied.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 28 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Post-modeling assumptions", + "id": "4c16b7a6b8ae03ef" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:09.937251500Z", + "start_time": "2026-04-20T10:28:09.914351300Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.post_model_assumption_utils import (\n", + " check_iv_overidentification\n", + ")" + ], + "id": "dd005b46b39be3", + "outputs": [], + "execution_count": 29 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:09.981476500Z", + "start_time": "2026-04-20T10:28:09.939842500Z" + } + }, + "cell_type": "code", + "source": [ + "result_iv_overidentification = check_iv_overidentification(\n", + " sm_results=None,\n", + " df=df_iv,\n", + " treatment='educ',\n", + " outcome='lwage',\n", + " instruments=['nearc4'],\n", + " covariates=['black', 'smsa', 'south', 'married', 'exper'],\n", + ")\n", + "print(result_iv_overidentification)" + ], + "id": "1ce0cd27ab0a2bb8", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': None, 'reasoning': 'Test not applicable (Need more instruments than endogenous regressors)', 'details': {}}\n" + ] + } + ], + "execution_count": 30 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-20T10:28:10.058553300Z", + "start_time": "2026-04-20T10:28:09.986093800Z" + } + }, + "cell_type": "code", + "source": [ + "from statsmodels.sandbox.regression.gmm import IV2SLS\n", + "import statsmodels.api as sm\n", + "\n", + "# using 'nearc4' and 'smsa' as instruments (smsa isn't a real instrument, just used for testing)\n", + "instruments = ['nearc4', 'smsa']\n", + "covariates_iv = ['black', 'south', 'married', 'exper']\n", + "\n", + "# IV estimation via 2SLS\n", + "endog = df_iv['lwage']\n", + "exog = 
sm.add_constant(df_iv[['educ'] + covariates_iv])\n", + "instrument_matrix = sm.add_constant(df_iv[instruments + covariates_iv])\n", + "\n", + "iv_model = IV2SLS(endog, exog, instrument_matrix).fit()\n", + "\n", + "result_iv_overidentification2 = check_iv_overidentification(\n", + " sm_results=iv_model,\n", + " df=df_iv,\n", + " treatment='educ',\n", + " outcome='lwage',\n", + " instruments=instruments,\n", + " covariates=covariates_iv,\n", + ")\n", + "print(result_iv_overidentification2)" + ], + "id": "7f4a9c4482c7c8be", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': np.False_, 'reasoning': 'Sargan-Hansen test: statistic=8.72, p=0.0031. Instruments may be invalid — correlated with errors.', 'details': {'statistic': np.float64(8.724238008268228), 'p_value': np.float64(0.0031400727800432876), 'status': 'Test successful'}}\n" + ] + } + ], + "execution_count": 31 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From cd4daec5ec2a94fd23e4099c7dd88bf12efa225a Mon Sep 17 00:00:00 2001 From: erivan Date: Wed, 29 Apr 2026 12:38:34 +0200 Subject: [PATCH 04/13] add assumption checks for IVs, DiD and frontdoor adjustment --- cais/methods/pre_model_assumption_utils.py | 109 ++++++++++++++++++++- 1 file changed, 104 insertions(+), 5 deletions(-) diff --git a/cais/methods/pre_model_assumption_utils.py b/cais/methods/pre_model_assumption_utils.py index 7c0f61f..5ff9580 100644 --- a/cais/methods/pre_model_assumption_utils.py +++ b/cais/methods/pre_model_assumption_utils.py @@ -74,7 +74,11 @@ class _LLMAssumptionVerdict(BaseModel): description="Concise (2-4 sentences) 
justification grounded in the dataset " "description, variable semantics, or domain knowledge.", ) - + missing_info: Optional[str] = Field( + None, + description="If passed is null, what additional information would be needed " + "to make a determination. Otherwise null.", + ) def _llm_argue_assumption( assumption_name: str, @@ -113,24 +117,32 @@ def _llm_argue_assumption( Variables involved: {variables_summary} +If you recognize this study from the literature, use your prior knowledge +about its design, mechanisms, and known methodological concerns to inform +your assessment — not just the description provided above. + {extra_context or ""} Respond ONLY as JSON matching this schema: {{ "passed": true | false | null, "reasoning": "<2-4 sentence justification>" + "missing_info": "" }} - + Use null for "passed" if the dataset description is insufficient to argue either way. """.strip() try: raw = call_llm_with_json_output(llm, prompt) verdict = _LLMAssumptionVerdict(**(raw or {})) + details = {"assumption": assumption_name} + if verdict.missing_info: + details["missing_info"] = verdict.missing_info return _result( passed=verdict.passed, reasoning=verdict.reasoning, - details={"assumption": assumption_name}, + details=details, ) except Exception as exc: logger.warning("LLM assumption check failed for '%s': %s", assumption_name, exc) @@ -144,7 +156,8 @@ def _llm_argue_assumption( # SUTVA (needed for every method) # _____________________________________________________________________________ -def check_sutva( + +def check_strict_sutva( dataset_description: Optional[str], variables_summary: Dict[str, Any], llm=None, @@ -167,6 +180,35 @@ def check_sutva( ), ) +def check_permissive_sutva( + dataset_description: Optional[str], + variables_summary: Dict[str, Any], + llm=None, +) -> Dict[str, Any]: + """SUTVA: no interference between units, no hidden treatment versions.""" + return _llm_argue_assumption( + assumption_name="SUTVA (Stable Unit Treatment Value Assumption)", + 
assumption_description=( + "(1) No interference: one unit's treatment does not affect another unit's " + "potential outcomes. (2) No hidden versions of the treatment: the treatment " + "is administered consistently across treated units." + ), + dataset_description=dataset_description, + variables_summary=variables_summary, + llm=llm, + extra_context=( + "Important: SUTVA is an idealized assumption that is technically violated " + "in most real-world settings. Do NOT fail this assumption simply because " + "minor spillovers or small treatment variations are theoretically possible. " + "Only return passed=false if there is a strong, concrete reason grounded in " + "the dataset description — such as explicit network structure, shared " + "environments where interference is the primary mechanism, or clearly " + "documented treatment heterogeneity. If the dataset comes from a published " + "causal study, assume the researchers judged SUTVA to be reasonable unless " + "the description contradicts this." 
+ ), + ) + # _____________________________________________________________________________ # Ignorability / Conditional ignorability (RCT and observational) @@ -348,4 +390,61 @@ def check_stable_group_composition(dataset_description, variables_summary, llm=N "Unit composition of treatment and control groups does not change as a result " "of treatment (no differential attrition or selective entry/exit).", dataset_description, variables_summary, llm, - ) \ No newline at end of file + ) + + +# _____________________________________________________________________________ +# Frontdoor-specific checks (LLM-reasoned) +# _____________________________________________________________________________ + +def check_frontdoor_full_mediation(dataset_description, variables_summary, llm=None): + """Full mediation: M fully captures the effect of T on Y; no direct T→Y path.""" + return _llm_argue_assumption( + "Full mediation", + "The mediator M fully captures the effect of treatment T on outcome Y; " + "there is no direct T→Y path outside of the T→M→Y pathway.", + dataset_description, variables_summary, llm, + ) + +def check_frontdoor_no_TM_confounding(dataset_description, variables_summary, llm=None): + """No unobserved confounding between treatment and mediator.""" + return _llm_argue_assumption( + "No T-M confounding", + "The relationship between the treatment T and the mediator M is unconfounded. 
" + "There are no unobserved variables that affect both T and M.", + dataset_description, variables_summary, llm, + ) + +def check_frontdoor_T_blocks_MY(dataset_description, variables_summary, llm=None): + """Treatment blocks all confounding paths between mediator and outcome.""" + return _llm_argue_assumption( + "T blocks M→Y confounding", + "Conditioning on the treatment T removes all back-door paths between " + "the mediator M and the outcome Y.", + dataset_description, variables_summary, llm, + ) + +def check_frontdoor_positivity( + df: pd.DataFrame, + treatment: str, + mediator: str, + min_count: int = 5, +) -> Dict[str, Any]: + """Frontdoor positivity: P(M=m|X=x) > 0 for all relevant (x, m) combinations.""" + combos = df.groupby([treatment, mediator]).size().reset_index(name='count') + total_combos = df[treatment].nunique() * df[mediator].nunique() + observed_combos = len(combos) + empty = total_combos - observed_combos + sparse = int((combos['count'] < min_count).sum()) + + passed = empty == 0 and sparse == 0 + return _result( + passed=passed, + reasoning=( + f"{observed_combos}/{total_combos} (treatment, mediator) combinations observed. " + f"{empty} empty, {sparse} sparse (< {min_count} obs). " + f"{'Positivity satisfied.' 
if passed else 'Some combinations are empty or near-empty — frontdoor formula may be undefined.'}" + ), + details={"total_combos": total_combos, "observed_combos": observed_combos, + "empty": empty, "sparse": sparse, "min_count": min_count}, + ) From ba0fc36c3c7031346af4bed2caf6642b2e53d037 Mon Sep 17 00:00:00 2001 From: erivan Date: Wed, 29 Apr 2026 12:39:29 +0200 Subject: [PATCH 05/13] add post-modeling assumption checks for IPW, matching and IVs --- cais/methods/post_model_assumption_utils.py | 60 +++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/cais/methods/post_model_assumption_utils.py b/cais/methods/post_model_assumption_utils.py index 3571c56..96532b1 100644 --- a/cais/methods/post_model_assumption_utils.py +++ b/cais/methods/post_model_assumption_utils.py @@ -21,12 +21,11 @@ import pandas as pd from pydantic import BaseModel, Field - # import some assumptions already available for each method - from cais.methods.instrumental_variable.diagnostics import ( run_overidentification_test, ) +from cais.methods.utils import calculate_standardized_differences from cais.utils.llm_helpers import call_llm_with_json_output logger = logging.getLogger(__name__) @@ -46,6 +45,59 @@ def _result( "details": details or {}, } +# _____________________________________________________________________________ +# Balance checks (IPW, matching) +# _____________________________________________________________________________ + +def check_balance_after_weighting( + df: pd.DataFrame, treatment: str, covariates: List[str], + weights: np.ndarray, smd_threshold: float = 0.1, +) -> Dict[str, Any]: + """Weighted SMDs after IPW.""" + treated = df[treatment] == 1 + smds = {} + for c in covariates: + x = df[c].astype(float).values + w = weights + m1 = np.average(x[treated], weights=w[treated]) + m0 = np.average(x[~treated], weights=w[~treated]) + v1 = np.average((x[treated] - m1) ** 2, weights=w[treated]) + v0 = np.average((x[~treated] - m0) ** 2, 
weights=w[~treated]) + denom = np.sqrt((v1 + v0) / 2) + smds[c] = (m1 - m0) / denom if denom > 0 else np.nan + imbalanced = {c: v for c, v in smds.items() if pd.notna(v) and abs(v) > smd_threshold} + passed = len(imbalanced) == 0 + return _result( + passed=passed, + reasoning=( + f"Weighted balance on {len(covariates)} covariates. " + f"{'All balanced after IPW.' if passed else f'Still imbalanced: {list(imbalanced.keys())}.'}" + ), + details={"weighted_smds": smds, "threshold": smd_threshold, "imbalanced": imbalanced}, + ) + + +def check_balance_after_matching( + df_matched: pd.DataFrame, treatment: str, covariates: List[str], + smd_threshold: float = 0.1, +) -> Dict[str, Any]: + """SMDs computed on the matched sample.""" + smds = calculate_standardized_differences(df_matched, treatment, covariates) + imbalanced = {c: v for c, v in smds.items() if pd.notna(v) and abs(v) > smd_threshold} + passed = len(imbalanced) == 0 + return _result( + passed=passed, + reasoning=( + f"Matched sample balance on {len(covariates)} covariates. " + f"{'All balanced after matching.' if passed else f'Still imbalanced: {list(imbalanced.keys())}.'}" + ), + details={"smds": smds, "threshold": smd_threshold, "imbalanced": imbalanced}, + ) + + +# _____________________________________________________________________________ +# IVs +# _____________________________________________________________________________ def check_iv_overidentification( sm_results, df, treatment, outcome, instruments, covariates, @@ -67,4 +119,6 @@ def check_iv_overidentification( f"{'Instruments appear valid.' 
if passed else 'Instruments may be invalid — correlated with errors.'}" ), details={"statistic": stat, "p_value": p, "status": status}, - ) \ No newline at end of file + ) + + From b61e5271690e52302568fce60e10f4bff21ddf52 Mon Sep 17 00:00:00 2001 From: erivan Date: Wed, 29 Apr 2026 12:40:15 +0200 Subject: [PATCH 06/13] add short use cases for assumption checks in pre/post modeling modules --- examples/assumption_utils_usecases.ipynb | 674 +++++++++++++++++++---- 1 file changed, 560 insertions(+), 114 deletions(-) diff --git a/examples/assumption_utils_usecases.ipynb b/examples/assumption_utils_usecases.ipynb index 5656aa9..237cfb9 100644 --- a/examples/assumption_utils_usecases.ipynb +++ b/examples/assumption_utils_usecases.ipynb @@ -16,8 +16,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2026-04-20T10:27:52.373168900Z", - "start_time": "2026-04-20T10:27:52.275824400Z" + "end_time": "2026-04-29T10:06:03.331019300Z", + "start_time": "2026-04-29T10:06:03.210220500Z" } }, "cell_type": "code", @@ -218,31 +218,46 @@ "" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 3 + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Cond-ignorability (IPW, matching)", + "id": "f6f1d812a34d9529" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:52.445524800Z", - "start_time": "2026-04-20T10:27:52.390725800Z" + "end_time": "2026-04-29T10:06:15.266016300Z", + "start_time": "2026-04-29T10:06:03.334015Z" } }, "cell_type": "code", "source": "from cais.methods.pre_model_assumption_utils import check_cond_ignorability", "id": "90501ac7f9f951b", - "outputs": [], - "execution_count": 4 + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\beriv\\PycharmProjects\\causal-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:52.495178900Z", - "start_time": "2026-04-20T10:27:52.447534100Z" + "end_time": "2026-04-29T10:06:15.299181100Z", + "start_time": "2026-04-29T10:06:15.272603900Z" } }, "cell_type": "code", @@ -261,13 +276,13 @@ ] } ], - "execution_count": 5 + "execution_count": 3 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:52.570233300Z", - "start_time": "2026-04-20T10:27:52.519778400Z" + "end_time": "2026-04-29T10:06:15.332700200Z", + "start_time": "2026-04-29T10:06:15.315219500Z" } }, "cell_type": "code", @@ -277,13 +292,13 @@ ], "id": "ee059538218d39ee", "outputs": [], - "execution_count": 6 + "execution_count": 4 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:52.637020400Z", - "start_time": "2026-04-20T10:27:52.574298200Z" + "end_time": "2026-04-29T10:06:15.373305300Z", + "start_time": "2026-04-29T10:06:15.332700200Z" } }, "cell_type": "code", @@ -303,7 +318,7 @@ ] } ], - "execution_count": 7 + "execution_count": 5 }, { "metadata": {}, @@ -314,8 +329,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:53.228295300Z", - "start_time": "2026-04-20T10:27:52.639386400Z" + "end_time": "2026-04-29T10:33:35.269827500Z", + "start_time": "2026-04-29T10:33:34.696889100Z" } }, "cell_type": "code", @@ -325,18 +340,18 @@ ], "id": "98e0711e996cb3f5", "outputs": [], - "execution_count": 8 + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:55.947713600Z", - "start_time": "2026-04-20T10:27:53.246537200Z" + "end_time": "2026-04-29T10:06:18.401978500Z", + "start_time": "2026-04-29T10:06:15.968956600Z" } }, "cell_type": "code", "source": [ - "from cais.methods.pre_model_assumption_utils import check_sutva\n", + "from cais.methods.pre_model_assumption_utils import 
check_permissive_sutva\n", "\n", "dataset_description = (\n", " \"Panel data of 51 US states from 1985-2000 (Donohue & Levitt). \"\n", @@ -354,7 +369,7 @@ " 'time': 'year (1985-2000)',\n", "}\n", "\n", - "result_sutva = check_sutva(dataset_description, variables_info, llm=llm)\n", + "result_sutva = check_permissive_sutva(dataset_description, variables_info, llm=llm)\n", "print(result_sutva)" ], "id": "7409ecd3503dc2e5", @@ -363,17 +378,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': False, 'reasoning': 'The assumption of no interference is likely violated due to potential spillover effects among states. For instance, the legalization of abortion in one state could influence neighboring states through migration patterns, shared economic conditions, or social norms, thereby affecting their crime rates. Additionally, the treatment is binary and may not account for variations in how states implemented the repeal, suggesting the presence of hidden versions of the treatment.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + "{'passed': True, 'reasoning': \"The assumption of SUTVA is plausibly satisfied in this context as the treatment 'repeal' pertains to state-level legalization of abortion, which is unlikely to directly affect the potential outcomes of other states. Additionally, the treatment appears to be uniformly applied within states that legalized abortion, minimizing concerns about hidden versions of the treatment. 
The analysis focuses on a specific subgroup (black females age 15), which further supports the consistency of treatment application within that group.\", 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" ] } ], - "execution_count": 9 + "execution_count": 7 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Instrumental Variables (IVs)", + "id": "12427e3330a86867" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:55.961967200Z", - "start_time": "2026-04-20T10:27:55.950140100Z" + "end_time": "2026-04-29T10:06:18.436988200Z", + "start_time": "2026-04-29T10:06:18.424323Z" } }, "cell_type": "code", @@ -387,26 +408,13 @@ ], "id": "b170f0d603110f8b", "outputs": [], - "execution_count": 10 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2026-04-20T10:27:55.974645700Z", - "start_time": "2026-04-20T10:27:55.964090400Z" - } - }, - "cell_type": "code", - "source": "import pandas as pd", - "id": "5a6ebd0b2770d0bc", - "outputs": [], - "execution_count": 11 + "execution_count": 8 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:55.993221500Z", - "start_time": "2026-04-20T10:27:55.975973Z" + "end_time": "2026-04-29T10:06:18.455694900Z", + "start_time": "2026-04-29T10:06:18.439915100Z" } }, "cell_type": "code", @@ -416,13 +424,13 @@ ], "id": "43f5cdd305531a14", "outputs": [], - "execution_count": 12 + "execution_count": 9 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:56.008393100Z", - "start_time": "2026-04-20T10:27:55.995467300Z" + "end_time": "2026-04-29T10:06:18.472045900Z", + "start_time": "2026-04-29T10:06:18.459007200Z" } }, "cell_type": "code", @@ -432,13 +440,13 @@ ], "id": "64af04994333ad51", "outputs": [], - "execution_count": 13 + "execution_count": 10 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:56.054986Z", - "start_time": "2026-04-20T10:27:56.010656800Z" + "end_time": "2026-04-29T10:06:18.534397800Z", + "start_time": 
"2026-04-29T10:06:18.474064Z" } }, "cell_type": "code", @@ -545,18 +553,18 @@ "" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 14 + "execution_count": 11 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:56.118001600Z", - "start_time": "2026-04-20T10:27:56.058386300Z" + "end_time": "2026-04-29T10:06:18.566474300Z", + "start_time": "2026-04-29T10:06:18.538373800Z" } }, "cell_type": "code", @@ -580,13 +588,13 @@ ] } ], - "execution_count": 15 + "execution_count": 12 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:57.668451700Z", - "start_time": "2026-04-20T10:27:56.122437900Z" + "end_time": "2026-04-29T10:06:20.338112800Z", + "start_time": "2026-04-29T10:06:18.567613800Z" } }, "cell_type": "code", @@ -617,17 +625,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_exclusion : {'passed': False, 'reasoning': 'The exclusion restriction is likely violated because growing up near a 4-year college (nearc4) may influence local labor market conditions, which could directly affect wages (lwage) independent of education (educ). This correlation suggests that the instrument may have a direct effect on the outcome, undermining the validity of the exclusion restriction.', 'details': {'assumption': 'Exclusion restriction'}}\n" + "check_iv_exclusion : {'passed': False, 'reasoning': 'The exclusion restriction is likely violated because growing up near a 4-year college may influence local labor market conditions, which could directly affect wages (lwage) independent of education (educ). 
This suggests that proximity to a college could have a direct effect on wages, thus undermining the validity of the instrument.', 'details': {'assumption': 'Exclusion restriction'}}\n" ] } ], - "execution_count": 16 + "execution_count": 13 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:27:59.501261300Z", - "start_time": "2026-04-20T10:27:57.674640900Z" + "end_time": "2026-04-29T10:06:22.725119800Z", + "start_time": "2026-04-29T10:06:20.355612200Z" } }, "cell_type": "code", @@ -642,17 +650,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_exogeneity : {'passed': False, 'reasoning': \"The assumption of instrument exogeneity is likely violated because 'nearc4' (proximity to a 4-year college) may correlate with local labor market conditions, which can affect wages ('lwage') directly. This suggests that 'nearc4' is not as good as randomly assigned with respect to unobserved confounders affecting both education and wages.\", 'details': {'assumption': 'Instrument exogeneity (independence)'}}\n" + "check_iv_exogeneity : {'passed': False, 'reasoning': \"The assumption of instrument exogeneity is likely violated in this context because 'nearc4' (growing up near a 4-year college) may be correlated with local labor market conditions, which can affect wages independently of education. 
This suggests that 'nearc4' could influence 'lwage' through pathways other than education, undermining the validity of the exclusion restriction.\", 'details': {'assumption': 'Instrument exogeneity (independence)'}}\n" ] } ], - "execution_count": 17 + "execution_count": 14 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:01.122831700Z", - "start_time": "2026-04-20T10:27:59.504289100Z" + "end_time": "2026-04-29T10:06:25.463365800Z", + "start_time": "2026-04-29T10:06:22.742494500Z" } }, "cell_type": "code", @@ -667,11 +675,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_monotonicity : {'passed': None, 'reasoning': \"The dataset description does not provide sufficient information about the relationship between the instrument 'nearc4' and the treatment 'educ' to assess the presence of defiers. Without knowing how individuals who grew up near a 4-year college respond to the instrument in terms of their education, we cannot determine if there are units that would decrease their education despite being near a college.\", 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + "check_iv_monotonicity : {'passed': False, 'reasoning': 'The assumption of monotonicity is likely violated in this context because proximity to a 4-year college may lead some individuals to pursue more education while others may be discouraged or choose not to attend college despite proximity. 
This suggests the presence of defiers, as some individuals could be negatively influenced by the local educational environment, leading to a decrease in education despite being near a college.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" ] } ], - "execution_count": 18 + "execution_count": 15 }, { "metadata": {}, @@ -686,8 +694,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:01.136060100Z", - "start_time": "2026-04-20T10:28:01.124507600Z" + "end_time": "2026-04-29T10:06:25.500243300Z", + "start_time": "2026-04-29T10:06:25.487566300Z" } }, "cell_type": "code", @@ -705,13 +713,13 @@ ], "id": "2f246249bee3bdc7", "outputs": [], - "execution_count": 19 + "execution_count": 16 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:03.358785900Z", - "start_time": "2026-04-20T10:28:01.139115900Z" + "end_time": "2026-04-29T10:06:27.602556200Z", + "start_time": "2026-04-29T10:06:25.502361500Z" } }, "cell_type": "code", @@ -726,17 +734,23 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_monotonicity : {'passed': True, 'reasoning': \"The assumption of monotonicity is plausibly satisfied because the instrument 'nearc4' is expected to increase the likelihood of attending college due to reduced costs associated with proximity to a college. It is highly unlikely that being near a college would lead to a decrease in education, as the natural effect of proximity is to either increase education or have no effect, thus supporting the absence of defiers.\", 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + "check_iv_monotonicity : {'passed': True, 'reasoning': 'The assumption of monotonicity is plausibly satisfied in this context because proximity to a 4-year college is expected to either increase education or have no effect, but not decrease it. 
Given that the mechanism involves reduced costs associated with attending college, it is reasonable to conclude that no individuals would be deflected from pursuing education due to their proximity to a college.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" ] } ], - "execution_count": 20 + "execution_count": 17 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Difference-in-Differences (DiD)", + "id": "d3bb643edd9ef0bf" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:03.418716700Z", - "start_time": "2026-04-20T10:28:03.376072700Z" + "end_time": "2026-04-29T10:06:27.749050400Z", + "start_time": "2026-04-29T10:06:27.678419200Z" } }, "cell_type": "code", @@ -750,13 +764,13 @@ ], "id": "d6268e8f3f29807b", "outputs": [], - "execution_count": 21 + "execution_count": 18 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:03.486582800Z", - "start_time": "2026-04-20T10:28:03.420717500Z" + "end_time": "2026-04-29T10:06:27.924696800Z", + "start_time": "2026-04-29T10:06:27.749050400Z" } }, "cell_type": "code", @@ -966,18 +980,18 @@ "" ] }, - "execution_count": 22, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 22 + "execution_count": 19 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:03.566545700Z", - "start_time": "2026-04-20T10:28:03.489714600Z" + "end_time": "2026-04-29T10:06:28.229146300Z", + "start_time": "2026-04-29T10:06:27.924696800Z" } }, "cell_type": "code", @@ -1002,13 +1016,13 @@ ] } ], - "execution_count": 23 + "execution_count": 20 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:03.802998500Z", - "start_time": "2026-04-20T10:28:03.569965Z" + "end_time": "2026-04-29T10:06:28.800155900Z", + "start_time": "2026-04-29T10:06:28.247110600Z" } }, "cell_type": "code", @@ -1038,13 +1052,13 @@ ] } ], - "execution_count": 24 + "execution_count": 21 }, { "metadata": { "ExecuteTime": { - "end_time": 
"2026-04-20T10:28:03.880077200Z", - "start_time": "2026-04-20T10:28:03.826052400Z" + "end_time": "2026-04-29T10:06:29.035604900Z", + "start_time": "2026-04-29T10:06:28.807552Z" } }, "cell_type": "code", @@ -1069,13 +1083,13 @@ ] } ], - "execution_count": 25 + "execution_count": 22 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:06.140551Z", - "start_time": "2026-04-20T10:28:03.890887Z" + "end_time": "2026-04-29T10:06:31.028959300Z", + "start_time": "2026-04-29T10:06:29.041141500Z" } }, "cell_type": "code", @@ -1106,22 +1120,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': None, 'reasoning': 'The dataset description does not provide information on whether there was differential attrition or selective entry/exit of states in relation to the treatment (adoption of Castle Doctrine laws). Without data on how states may have changed their composition over time or how the treatment affected state characteristics, it is not possible to assess the plausibility of the stable group composition assumption.', 'details': {'assumption': 'Stable group composition'}}\n" + "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the staggered adoption of the Castle Doctrine laws across states. 
This staggered treatment could lead to differential attrition or selective entry/exit, as states that adopt the laws may differ systematically from those that do not, potentially affecting the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" ] } ], - "execution_count": 26 + "execution_count": 23 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:07.928286700Z", - "start_time": "2026-04-20T10:28:06.145549900Z" + "end_time": "2026-04-29T10:06:33.366297200Z", + "start_time": "2026-04-29T10:06:31.030893900Z" } }, "cell_type": "code", "source": [ - "castle_description = (\n", + "castle_description_real = (\n", " \"Castle Doctrine dataset. Panel of 50 US states from 2000-2010. \"\n", " \"Treatment is adoption of Castle Doctrine / Stand Your Ground laws. \"\n", " \"Outcome is log homicide rate. Unit of observation is state-year. \"\n", @@ -1140,23 +1154,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': False, 'reasoning': 'While the dataset is balanced and does not have missing observations, the potential for population migration between states introduces the possibility of changes in the demographic composition of treatment and control groups over time. This could lead to differential attrition or selective entry/exit, violating the stable group composition assumption.', 'details': {'assumption': 'Stable group composition'}}\n" + "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the staggered adoption of the Castle Doctrine laws across states. 
This staggered treatment could lead to differential attrition or selective entry/exit of states based on their homicide rates or other unobserved factors, which may change the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" ] } ], - "execution_count": 27 + "execution_count": 24 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:09.910292800Z", - "start_time": "2026-04-20T10:28:07.933859600Z" + "end_time": "2026-04-29T10:06:34.970163500Z", + "start_time": "2026-04-29T10:06:33.388245500Z" } }, "cell_type": "code", "source": [ "# --- check_sutva (LLM-reasoned) ---\n", - "result_sutva_did = check_sutva(castle_description, castle_variables, llm=llm)\n", + "\n", + "castle_description_real = (\n", + " \"Castle Doctrine dataset. Panel of 50 US states from 2000-2010. \"\n", + ")\n", + "\n", + "result_sutva_did = check_permissive_sutva(castle_description, castle_variables, llm=llm)\n", "print(result_sutva_did)" ], "id": "9f38e4360f24ad53", @@ -1165,11 +1184,214 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': False, 'reasoning': 'The assumption of no interference is likely violated due to potential population migration between states, which could lead to spillover effects where the treatment in one state affects the outcomes in another state. Additionally, the treatment (adoption of Castle Doctrine laws) may not be uniformly applied across states, suggesting the possibility of hidden versions of the treatment. Therefore, SUTVA is not plausibly satisfied.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + "{'passed': True, 'reasoning': 'The Castle Doctrine laws are state-level policies that do not directly affect other states, suggesting that interference between units is minimal. 
Additionally, the treatment (adoption of the law) is clearly defined and applied consistently across states, supporting the assumption of no hidden versions of the treatment.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 25 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Frontdoor adjustment", + "id": "ab2c9b91a19255bd" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:33:24.170673Z", + "start_time": "2026-04-29T10:33:09.808139600Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import (\n", + " check_frontdoor_full_mediation,\n", + " check_frontdoor_no_TM_confounding,\n", + " check_frontdoor_T_blocks_MY,\n", + " check_frontdoor_positivity,\n", + ")" + ], + "id": "5134dec2de5e8b93", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\beriv\\PycharmProjects\\causal-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:33:44.109339400Z", + "start_time": "2026-04-29T10:33:44.071005Z" + } + }, + "cell_type": "code", + "source": [ + "# Description classique : smoking → tar → cancer\n", + "frontdoor_description = (\n", + " \"Observational study examining the effect of smoking (T) on lung cancer (Y). \"\n", + " \"Tar deposits in lungs (M) are used as a mediator. The argument is that \"\n", + " \"smoking affects cancer only through tar accumulation. 
There may be \"\n", + " \"unobserved genetic confounders affecting both smoking behavior and cancer risk.\"\n", + ")\n", + "\n", + "frontdoor_variables = {\n", + " 'treatment': 'smoking',\n", + " 'mediator': 'tar deposits',\n", + " 'outcome': 'lung cancer',\n", + " 'potential_confounders': 'genetic predisposition (unobserved)',\n", + "}" + ], + "id": "c092dd995e2a558e", + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:33:48.050888200Z", + "start_time": "2026-04-29T10:33:45.844099500Z" + } + }, + "cell_type": "code", + "source": [ + "# --- Full mediation : T→Y only through M ? ---\n", + "result = check_frontdoor_full_mediation(frontdoor_description, frontdoor_variables, llm=llm)\n", + "print(\"check_frontdoor_full_mediation :\", result)" + ], + "id": "96a181e99c075055", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_frontdoor_full_mediation : {'passed': False, 'reasoning': 'The assumption of full mediation is not plausibly satisfied because there are unobserved genetic confounders that may influence both smoking behavior and lung cancer risk. 
This suggests that there could be a direct effect of smoking on lung cancer that is not fully captured by the mediator of tar deposits, violating the full mediation assumption.', 'details': {'assumption': 'Full mediation'}}\n" + ] + } + ], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:33:51.528185200Z", + "start_time": "2026-04-29T10:33:49.350818500Z" + } + }, + "cell_type": "code", + "source": [ + "# --- No T-M confounding ---\n", + "result = check_frontdoor_no_TM_confounding(frontdoor_description, frontdoor_variables, llm=llm)\n", + "print(\"check_frontdoor_no_TM_confounding :\", result)" + ], + "id": "a91d5d06260adfca", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_frontdoor_no_TM_confounding : {'passed': False, 'reasoning': 'The assumption of no T-M confounding is not plausibly satisfied because there are unobserved genetic predispositions that could influence both smoking behavior and the accumulation of tar deposits in the lungs. 
These genetic factors could create a confounding relationship between the treatment (smoking) and the mediator (tar deposits), thus violating the assumption.', 'details': {'assumption': 'No T-M confounding'}}\n" + ] + } + ], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:33:54.348785200Z", + "start_time": "2026-04-29T10:33:52.501899600Z" + } + }, + "cell_type": "code", + "source": [ + "# --- T blocks M→Y confounding ---\n", + "result = check_frontdoor_T_blocks_MY(frontdoor_description, frontdoor_variables, llm=llm)\n", + "print(\"check_frontdoor_T_blocks_MY :\", result)" + ], + "id": "408725172bba21f1", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_frontdoor_T_blocks_MY : {'passed': False, 'reasoning': 'The assumption that T blocks M→Y confounding is not plausibly satisfied because there are unobserved genetic confounders that affect both smoking behavior (T) and lung cancer risk (Y). These unobserved confounders can create back-door paths between the mediator (M) and the outcome (Y), violating the assumption.', 'details': {'assumption': 'T blocks M→Y confounding'}}\n" + ] + } + ], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:35:16.476952500Z", + "start_time": "2026-04-29T10:35:16.336653Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.pre_model_assumption_utils import check_frontdoor_positivity\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Simulate frontdoor data\n", + "np.random.seed(42)\n", + "n = 500\n", + "df_fd = pd.DataFrame({\n", + " 'T': np.random.binomial(1, 0.5, n),\n", + " 'M': np.random.binomial(1, 0.6, n),\n", + "})\n", + "\n", + "# Verified\n", + "result = check_frontdoor_positivity(df_fd, 'T', 'M')\n", + "print(\"Normal case :\", result)" + ], + "id": "22880c40f21991d9", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normal case : 
{'passed': True, 'reasoning': '4/4 (treatment, mediator) combinations observed. 0 empty, 0 sparse (< 5 obs). Positivity satisfied.', 'details': {'total_combos': 4, 'observed_combos': 4, 'empty': 0, 'sparse': 0, 'min_count': 5}}\n" ] } ], - "execution_count": 28 + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:35:20.022758900Z", + "start_time": "2026-04-29T10:35:19.920688800Z" + } + }, + "cell_type": "code", + "source": [ + "# Violation: an empty combination\n", + "df_fd_bad = df_fd.copy()\n", + "df_fd_bad = df_fd_bad[~((df_fd_bad['T'] == 1) & (df_fd_bad['M'] == 0))]\n", + "\n", + "result = check_frontdoor_positivity(df_fd_bad, 'T', 'M')\n", + "print(\"Violation case :\", result)" + ], + "id": "d6a254da007df478", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Violation case : {'passed': False, 'reasoning': '3/4 (treatment, mediator) combinations observed. 1 empty, 0 sparse (< 5 obs). Some combinations are empty or near-empty — frontdoor formula may be undefined.', 'details': {'total_combos': 4, 'observed_combos': 3, 'empty': 1, 'sparse': 0, 'min_count': 5}}\n" + ] + } + ], + "execution_count": 9 }, { "metadata": {}, @@ -1180,25 +1402,249 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:09.937251500Z", - "start_time": "2026-04-20T10:28:09.914351300Z" + "end_time": "2026-04-29T10:06:41.952452300Z", + "start_time": "2026-04-29T10:06:41.925732700Z" } }, "cell_type": "code", "source": [ "from cais.methods.post_model_assumption_utils import (\n", - " check_iv_overidentification\n", + " check_iv_overidentification, check_balance_after_matching, check_balance_after_weighting\n", ")" ], "id": "dd005b46b39be3", "outputs": [], - "execution_count": 29 + "execution_count": 31 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Balance checks (IPW, matching)", + "id": "d707342d2932fb6c" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's 
perform matching on the `abortion_bf15.csv` dataset", + "id": "33e3a46a49da647" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:09.981476500Z", - "start_time": "2026-04-20T10:28:09.939842500Z" + "end_time": "2026-04-29T10:22:32.930882500Z", + "start_time": "2026-04-29T10:22:32.680917800Z" + } + }, + "cell_type": "code", + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.neighbors import NearestNeighbors\n", + "import numpy as np\n", + "\n", + "covariates = ['crack', 'alcohol', 'income', 'ur', 'poverty', 'black', 'perc1519']\n", + "df_clean = df[covariates + ['repeal', 'rate']].dropna()\n", + "T = df_clean['repeal'].astype(int)\n", + "\n", + "# Estimate propensity scores\n", + "ps_model = LogisticRegression(max_iter=1000)\n", + "ps_model.fit(df_clean[covariates], T)\n", + "ps = ps_model.predict_proba(df_clean[covariates])[:, 1]\n", + "ps = np.clip(ps, 0.01, 0.99)\n", + "\n", + "print(f\"Observations : {len(df_clean)}\")\n", + "print(f\"Traited : {T.sum()}, Controls : {(1-T).sum()}\")\n", + "\n", + "treated_idx = df_clean[T == 1].index\n", + "control_idx = df_clean[T == 0].index\n", + "\n", + "nn = NearestNeighbors(n_neighbors=1)\n", + "nn.fit(ps[T == 0].reshape(-1, 1))\n", + "distances, indices = nn.kneighbors(ps[T == 1].reshape(-1, 1))\n", + "\n", + "matched_control_idx = control_idx[indices.flatten()]\n", + "df_matched = pd.concat([df_clean.loc[treated_idx], df_clean.loc[matched_control_idx]])" + ], + "id": "82dd7db2f724ce98", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Observations : 737\n", + "Traited : 80, Controls : 657\n" + ] + } + ], + "execution_count": 45 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:18:45.346005300Z", + "start_time": "2026-04-29T10:18:45.251915700Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_balance_after_matching ---\n", + "result = check_balance_after_matching(df_matched, 'repeal', covariates)\n", + 
"print(\"check_balance_after_matching :\", result)" + ], + "id": "7150093c9adfc2e0", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_balance_after_matching : {'passed': False, 'reasoning': \"Matched sample balance on 7 covariates. Still imbalanced: ['crack', 'poverty', 'perc1519'].\", 'details': {'smds': {'crack': np.float64(-0.35194822827546235), 'alcohol': np.float64(0.09820595193655905), 'income': np.float64(-0.08401277897298327), 'ur': np.float64(0.0653372997131283), 'poverty': np.float64(0.20211127320121058), 'black': np.float64(-0.0014753254554432944), 'perc1519': np.float64(0.20136611611785302)}, 'threshold': 0.1, 'imbalanced': {'crack': np.float64(-0.35194822827546235), 'poverty': np.float64(0.20211127320121058), 'perc1519': np.float64(0.20136611611785302)}}}\n" + ] + } + ], + "execution_count": 41 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's perform matching on the `lalonde_data_psid.csv` dataset", + "id": "ad3e0f2e6285181f" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:26:38.848426400Z", + "start_time": "2026-04-29T10:26:38.545776200Z" + } + }, + "cell_type": "code", + "source": [ + "df_lalonde = pd.read_csv('../data/all_data/lalonde_data_psid.csv').dropna()\n", + "covariates_lal = ['age', 'education', 'black', 'hispanic', 'married', 'nodegree', 're74', 're75']\n", + "T_lal = df_lalonde['treat'].astype(int)\n", + "\n", + "ps_lal = LogisticRegression(max_iter=1000).fit(df_lalonde[covariates_lal], T_lal).predict_proba(df_lalonde[covariates_lal])[:, 1]\n", + "ps_lal = np.clip(ps_lal, 0.01, 0.99)\n", + "\n", + "weights_lal = np.where(T_lal == 1, 1 / ps_lal, 1 / (1 - ps_lal))\n", + "\n", + "result = check_balance_after_weighting(df_lalonde, 'treat', covariates_lal, weights_lal)\n", + "print(\"IPW balance :\", result)" + ], + "id": "17021d9285b6412a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IPW balance : {'passed': False, 
'reasoning': \"Weighted balance on 8 covariates. Still imbalanced: ['age'].\", 'details': {'weighted_smds': {'age': np.float64(-0.14624458496355314), 'education': np.float64(0.015037166411965392), 'black': np.float64(0.08334646157682368), 'hispanic': np.float64(-0.03105377837487486), 'married': np.float64(-0.07682990482566496), 'nodegree': np.float64(0.015280318253117237), 're74': np.float64(-0.06313795380968303), 're75': np.float64(-0.004874393257471577)}, 'threshold': 0.1, 'imbalanced': {'age': np.float64(-0.14624458496355314)}}}\n" + ] + } + ], + "execution_count": 46 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's perform IPW on the `abortion_bf15.csv` dataset", + "id": "5ebfbb0c709eee26" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:21:30.566220400Z", + "start_time": "2026-04-29T10:21:30.477705Z" + } + }, + "cell_type": "code", + "source": [ + "from cais.methods.propensity_score.base import estimate_propensity_scores\n", + "\n", + "# Estimate PS\n", + "ps = estimate_propensity_scores(df_clean, 'repeal', covariates)\n", + "ps = np.clip(ps, 0.01, 0.99)\n", + "T = df_clean['repeal'].astype(int)\n", + "\n", + "# Computing IPW weights\n", + "weights = np.where(T == 1, 1 / ps, 1 / (1 - ps))" + ], + "id": "dc0ecd56a3c12ad2", + "outputs": [], + "execution_count": 43 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:21:32.376378700Z", + "start_time": "2026-04-29T10:21:32.239870800Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_balance_after_weighting (IPW) ---\n", + "result = check_balance_after_weighting(df_clean, 'repeal', covariates, weights)\n", + "print(result)" + ], + "id": "caa3863afa22030", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'passed': False, 'reasoning': \"Weighted balance on 7 covariates. 
Still imbalanced: ['crack', 'alcohol', 'ur', 'poverty', 'black', 'perc1519'].\", 'details': {'weighted_smds': {'crack': np.float64(0.34447889210407), 'alcohol': np.float64(0.40374532829144905), 'income': np.float64(0.010660875506205408), 'ur': np.float64(-0.23481605164645944), 'poverty': np.float64(-0.26256027859824893), 'black': np.float64(-0.3988844819819745), 'perc1519': np.float64(-0.3584176988065902)}, 'threshold': 0.1, 'imbalanced': {'crack': np.float64(0.34447889210407), 'alcohol': np.float64(0.40374532829144905), 'ur': np.float64(-0.23481605164645944), 'poverty': np.float64(-0.26256027859824893), 'black': np.float64(-0.3988844819819745), 'perc1519': np.float64(-0.3584176988065902)}}}\n" + ] + } + ], + "execution_count": 44 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's perform IPW on the `lalonde_data_psid.csv` dataset", + "id": "c5d1eebb1e9b53d7" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:28:50.053941300Z", + "start_time": "2026-04-29T10:28:49.903451700Z" + } + }, + "cell_type": "code", + "source": [ + "nn = NearestNeighbors(n_neighbors=1)\n", + "nn.fit(ps_lal[T_lal == 0].reshape(-1, 1))\n", + "distances, indices = nn.kneighbors(ps_lal[T_lal == 1].reshape(-1, 1))\n", + "\n", + "treated_idx = df_lalonde[T_lal == 1].index\n", + "control_idx = df_lalonde[T_lal == 0].index\n", + "matched_control_idx = control_idx[indices.flatten()]\n", + "df_matched_lal = pd.concat([df_lalonde.loc[treated_idx], df_lalonde.loc[matched_control_idx]])\n", + "\n", + "result = check_balance_after_matching(df_matched_lal, 'treat', covariates_lal)\n", + "print(\"Matching balance (LaLonde) :\", result)" + ], + "id": "e5f107380fa76b1d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matching balance (LaLonde) : {'passed': False, 'reasoning': \"Matched sample balance on 8 covariates. 
Still imbalanced: ['age'].\", 'details': {'smds': {'age': np.float64(0.10258472633527428), 'education': np.float64(0.00504611249078997), 'black': np.float64(0.09928459218720592), 'hispanic': np.float64(-0.08443761220875926), 'married': np.float64(0.0), 'nodegree': np.float64(0.02359013327218533), 're74': np.float64(0.043937949151885126), 're75': np.float64(0.05707051827285377)}, 'threshold': 0.1, 'imbalanced': {'age': np.float64(0.10258472633527428)}}}\n" + ] + } + ], + "execution_count": 47 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Instrumental Variables (IVs)", + "id": "32ed8f8ad4ab798b" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-29T10:12:59.044983Z", + "start_time": "2026-04-29T10:12:58.675274200Z" } }, "cell_type": "code", @@ -1223,13 +1669,13 @@ ] } ], - "execution_count": 30 + "execution_count": 34 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-20T10:28:10.058553300Z", - "start_time": "2026-04-20T10:28:09.986093800Z" + "end_time": "2026-04-29T10:12:59.766273400Z", + "start_time": "2026-04-29T10:12:59.067029500Z" } }, "cell_type": "code", @@ -1268,7 +1714,7 @@ ] } ], - "execution_count": 31 + "execution_count": 35 } ], "metadata": { From 355af9ddaae796ac72d22459f16a4316331e11ee Mon Sep 17 00:00:00 2001 From: erivan Date: Wed, 29 Apr 2026 14:47:20 +0200 Subject: [PATCH 07/13] corrected imports --- cais/methods/post_model_assumption_utils.py | 2 -- cais/methods/pre_model_assumption_utils.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/cais/methods/post_model_assumption_utils.py b/cais/methods/post_model_assumption_utils.py index 96532b1..b4d0344 100644 --- a/cais/methods/post_model_assumption_utils.py +++ b/cais/methods/post_model_assumption_utils.py @@ -12,8 +12,6 @@ The agent-level `validate_method` simply dispatches to the selected estimator. 
""" -from __future__ import annotations - from typing import Any, Dict, List, Optional import logging diff --git a/cais/methods/pre_model_assumption_utils.py b/cais/methods/pre_model_assumption_utils.py index 5ff9580..cc70ced 100644 --- a/cais/methods/pre_model_assumption_utils.py +++ b/cais/methods/pre_model_assumption_utils.py @@ -12,8 +12,6 @@ The agent-level `validate_method` simply dispatches to the selected estimator. """ -from __future__ import annotations - from typing import Any, Dict, List, Optional import logging From 16efd8162c269a7fd89e2818ca22debecf176a7b Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 4 May 2026 17:04:25 +0200 Subject: [PATCH 08/13] fix: improve formula quoting using Q() instead of backticks --- cais/methods/difference_in_differences/diagnostics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cais/methods/difference_in_differences/diagnostics.py b/cais/methods/difference_in_differences/diagnostics.py index e22cb31..db98ed8 100644 --- a/cais/methods/difference_in_differences/diagnostics.py +++ b/cais/methods/difference_in_differences/diagnostics.py @@ -307,10 +307,10 @@ def run_placebo_test(df: pd.DataFrame, time_var: str, group_var: str, outcome: s df_placebo[interaction_placebo_col] = df_placebo[treated_unit_indicator] * df_placebo[post_placebo_col] # Construct formula for placebo regression - formula = f"`{outcome}` ~ `{treated_unit_indicator}` + `{post_placebo_col}` + `{interaction_placebo_col}`" + formula = f"Q('{outcome}') ~ Q('{treated_unit_indicator}') + {post_placebo_col} + {interaction_placebo_col}" if covariates: - formula += f" + {' + '.join([f'`{c}`' for c in covariates])}" - formula += f" + C(`{group_var}`) + C(`{time_var}`)" # Include FEs + formula += f" + {' + '.join([f'Q(\"{c}\")' for c in covariates])}" + formula += f" + C(Q('{group_var}')) + C(Q('{time_var}'))" # Include FEs logger.debug(f"Placebo test formula: {formula}") From b2c58d7b64ea8802b976f949c699d6a97a7f892c Mon Sep 17 00:00:00 
2001 From: erivan Date: Mon, 4 May 2026 21:40:38 +0200 Subject: [PATCH 09/13] added gps post-modeling assumption checking functions --- cais/methods/post_model_assumption_utils.py | 60 +++++++++++++++++---- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/cais/methods/post_model_assumption_utils.py b/cais/methods/post_model_assumption_utils.py index b4d0344..b8c6d58 100644 --- a/cais/methods/post_model_assumption_utils.py +++ b/cais/methods/post_model_assumption_utils.py @@ -1,32 +1,30 @@ """ -Reusable assumption checks for causal inference methods. +Post-modeling assumption checks for causal inference methods. + +These checks require outputs from the estimation step (e.g., IPW weights, +matched samples, IV residuals, GPS model residuals) and are run after +the causal effect has been estimated. Each check returns a standardized dict: { "passed": bool | None, # None => inconclusive "reasoning": str, # human-readable explanation - "details": dict, # raw stats (F, p, SMDs, ...) + "details": dict, # raw stats (SMDs, p-values, ...) } - -These are composed in each estimator's `validate_assumptions` method. -The agent-level `validate_method` simply dispatches to the selected estimator. 
""" from typing import Any, Dict, List, Optional -import logging import numpy as np import pandas as pd -from pydantic import BaseModel, Field +from scipy import stats # import some assumptions already available for each method from cais.methods.instrumental_variable.diagnostics import ( run_overidentification_test, ) from cais.methods.utils import calculate_standardized_differences -from cais.utils.llm_helpers import call_llm_with_json_output - -logger = logging.getLogger(__name__) +from cais.methods.generalized_propensity_score.diagnostics import assess_gps_balance # _____________________________________________________________________________ # Output helper @@ -120,3 +118,45 @@ def check_iv_overidentification( ) +# _____________________________________________________________________________ +# GPS (Generalized Propensity Score) +# _____________________________________________________________________________ + +def check_gps_balance( + df_with_gps: pd.DataFrame, treatment_var: str, covariate_vars: List[str], + gps_col_name: str, **kwargs, +) -> Dict[str, Any]: + """Covariate balance after GPS adjustment.""" + res = assess_gps_balance(df_with_gps, treatment_var, covariate_vars, gps_col_name, **kwargs) + cov_balance = res.get("covariate_balance", {}) + unbalanced = [c for c, v in cov_balance.items() if not v.get("balanced", True)] + passed = len(unbalanced) == 0 + return _result( + passed=passed, + reasoning=( + res.get("summary", "GPS balance assessed.") + + (f" Unbalanced: {unbalanced}." 
if unbalanced else "") + ), + details=res, + ) + + +def check_gps_specification(residuals: np.ndarray) -> Dict[str, Any]: + """Residual normality of the GPS model (e.g., Shapiro-Wilk).""" + if len(residuals) < 3: + return _result( + passed=None, + reasoning="Too few residuals for normality test.", + ) + # Shapiro caps at ~5000; subsample if needed + sample = residuals if len(residuals) <= 5000 else np.random.choice(residuals, 5000, replace=False) + stat, p = stats.shapiro(sample) + passed = p > 0.05 + return _result( + passed=passed, + reasoning=( + f"Shapiro-Wilk on GPS residuals: W={stat:.3f}, p={p:.4f}. " + f"{'Residuals consistent with normality.' if passed else 'Departure from normality — reconsider GPS model.'}" + ), + details={"statistic": float(stat), "p_value": float(p)}, + ) \ No newline at end of file From 2057c886ce2e9f9cc63e60eaeada58a745ccb236 Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 4 May 2026 21:41:54 +0200 Subject: [PATCH 10/13] added frontdoor and RDD pre-modeling assumption checking functions --- cais/methods/pre_model_assumption_utils.py | 140 +++++++++++++++++++-- 1 file changed, 130 insertions(+), 10 deletions(-) diff --git a/cais/methods/pre_model_assumption_utils.py b/cais/methods/pre_model_assumption_utils.py index cc70ced..f1ab617 100644 --- a/cais/methods/pre_model_assumption_utils.py +++ b/cais/methods/pre_model_assumption_utils.py @@ -1,5 +1,10 @@ """ -Reusable assumption checks for causal inference methods. +Pre-modeling assumption checks for causal inference methods. + +These checks are run before estimation to verify whether the data and study +design satisfy the assumptions required by the chosen causal method. +Statistical checks use the raw data directly. Non-testable assumptions +are assessed via LLM reasoning based on the dataset description. Each check returns a standardized dict: { @@ -7,9 +12,6 @@ "reasoning": str, # human-readable explanation "details": dict, # raw stats (F, p, SMDs, ...) 
} - -These are composed in each estimator's `validate_assumptions` method. -The agent-level `validate_method` simply dispatches to the selected estimator. """ from typing import Any, Dict, List, Optional @@ -17,11 +19,11 @@ import numpy as np import pandas as pd +from scipy import stats as scipy_stats from pydantic import BaseModel, Field # import some assumptions already available for each method - from cais.methods.utils import ( calculate_standardized_differences, check_overlap, @@ -33,10 +35,6 @@ validate_parallel_trends, run_placebo_test, ) - -from cais.methods.generalized_propensity_score.diagnostics import ( - assess_gps_balance, -) from cais.utils.llm_helpers import call_llm_with_json_output logger = logging.getLogger(__name__) @@ -137,9 +135,14 @@ def _llm_argue_assumption( details = {"assumption": assumption_name} if verdict.missing_info: details["missing_info"] = verdict.missing_info + + disclaimer = ( + " Note: this assessment relies on LLM reasoning and is sensitive to the " + "quality and completeness of the dataset description provided." 
+ ) return _result( passed=verdict.passed, - reasoning=verdict.reasoning, + reasoning=verdict.reasoning + disclaimer, details=details, ) except Exception as exc: @@ -446,3 +449,120 @@ def check_frontdoor_positivity( details={"total_combos": total_combos, "observed_combos": observed_combos, "empty": empty, "sparse": sparse, "min_count": min_count}, ) + + +# _____________________________________________________________________________ +# RDD-specific checks +# _____________________________________________________________________________ + +def check_rdd_no_manipulation( + df: pd.DataFrame, + running_variable: str, + cutoff: float, + n_bins: int = 50, + bandwidth: float = None, +) -> Dict[str, Any]: + """McCrary-style density test: check for bunching at the cutoff.""" + rv = df[running_variable].dropna() + + if bandwidth is None: + bandwidth = (rv.max() - rv.min()) * 0.25 + + near = rv[(rv >= cutoff - bandwidth) & (rv <= cutoff + bandwidth)] + below = near[near < cutoff] + above = near[near >= cutoff] + + if len(below) < 10 or len(above) < 10: + return _result( + passed=None, + reasoning="Too few observations near cutoff for density test.", + ) + + # Compare density just below vs just above using bin counts + n_below = len(below) + n_above = len(above) + + # Binomial test: under no manipulation, ~50% should be on each side + total = n_below + n_above + # Using scipy.stats.binomtest (modern) or binom_test (legacy) + try: + p_value = scipy_stats.binomtest(n_below, total, 0.5).pvalue + except AttributeError: + p_value = scipy_stats.binom_test(n_below, total, 0.5) + + passed = p_value > 0.05 + + # Logic adjustment: Mention that the binomial test is a local density approximation + status_msg = "No evidence of manipulation." if passed else "Significant density discontinuity — possible manipulation." + reasoning = ( + f"Density test around cutoff ({cutoff}): {n_below} below, {n_above} above " + f"(p={p_value:.4f}). 
Note: This binomial test is a local approximation of " + f"density continuity and may be sensitive to the underlying distribution's slope. " + f"{status_msg}" + ) + + return _result( + passed=passed, + reasoning=reasoning, + details={ + "n_below": n_below, "n_above": n_above, + "p_value": p_value, "bandwidth": bandwidth, + "test_type": "binomial_density_approximation" + }, + ) + + +def check_rdd_covariate_continuity( + df: pd.DataFrame, + running_variable: str, + cutoff: float, + covariates: List[str], + bandwidth: float = None, +) -> Dict[str, Any]: + """Check continuity of covariates at the cutoff via t-tests.""" + if not covariates: + return _result(passed=None, reasoning="No covariates provided.") + + if bandwidth is None: + rv_range = df[running_variable].max() - df[running_variable].min() + bandwidth = 0.1 * rv_range + + df_bw = df[(df[running_variable] >= cutoff - bandwidth) & (df[running_variable] <= cutoff + bandwidth)] + below = df_bw[df_bw[running_variable] < cutoff] + above = df_bw[df_bw[running_variable] >= cutoff] + + if len(below) < 5 or len(above) < 5: + return _result(passed=None, reasoning="Too few observations near cutoff.") + + results = {} + discontinuous = [] + for cov in covariates: + if cov not in df_bw.columns: + continue + t_stat, p_val = scipy_stats.ttest_ind( + below[cov].dropna(), above[cov].dropna(), equal_var=False + ) + results[cov] = {"t_stat": float(t_stat), "p_value": float(p_val)} + if p_val < 0.05: + discontinuous.append(cov) + + passed = len(discontinuous) == 0 + return _result( + passed=passed, + reasoning=( + f"Covariate continuity at cutoff ({cutoff}) on {len(results)} covariates. " + f"{'All continuous.' 
if passed else f'Discontinuous: {discontinuous}.'}" + ), + details={"covariate_tests": results, "discontinuous": discontinuous, "bandwidth": bandwidth}, + ) + + +def check_rdd_continuity_potential_outcomes(dataset_description, variables_summary, llm=None): + """Continuity of potential outcomes at the cutoff (local exchangeability).""" + return _llm_argue_assumption( + "Continuity of potential outcomes at the cutoff", + "E[Y(1)|X=c] and E[Y(0)|X=c] are continuous at the cutoff c. " + "In the absence of treatment, individuals just above and just below " + "the threshold would have had, on average, the same outcome.", + dataset_description, variables_summary, llm, + ) \ No newline at end of file From 90ef874efef031a8d9b0ad246a4e850815f11036 Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 4 May 2026 21:42:39 +0200 Subject: [PATCH 11/13] added frontdoor and RDD pre-modeling assumption checking functions --- cais/methods/pre_model_assumption_utils.py | 26 +--------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/cais/methods/pre_model_assumption_utils.py b/cais/methods/pre_model_assumption_utils.py index f1ab617..f1dce52 100644 --- a/cais/methods/pre_model_assumption_utils.py +++ b/cais/methods/pre_model_assumption_utils.py @@ -157,31 +157,7 @@ def _llm_argue_assumption( # SUTVA (needed for every method) # _____________________________________________________________________________ - -def check_strict_sutva( - dataset_description: Optional[str], - variables_summary: Dict[str, Any], - llm=None, -) -> Dict[str, Any]: - """SUTVA: no interference between units, no hidden treatment versions.""" - return _llm_argue_assumption( - assumption_name="SUTVA (Stable Unit Treatment Value Assumption)", - assumption_description=( - "(1) No interference: one unit's treatment does not affect another unit's " - "potential outcomes. (2) No hidden versions of the treatment: the treatment " - "is administered consistently across treated units." 
- ), - dataset_description=dataset_description, - variables_summary=variables_summary, - llm=llm, - extra_context=( - "Pay attention to: network/spillover effects (e.g., units in shared " - "schools, households, markets), partial compliance, treatment intensity " - "variation." - ), - ) - -def check_permissive_sutva( +def check_sutva( dataset_description: Optional[str], variables_summary: Dict[str, Any], llm=None, From f519ea4536256f1104863bfad3b09290d5004832 Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 4 May 2026 21:43:43 +0200 Subject: [PATCH 12/13] completed short use cases for the assumptions implemented in pre/post-modeling modules --- examples/assumption_utils_usecases.ipynb | 848 +++++++++++++---------- 1 file changed, 467 insertions(+), 381 deletions(-) diff --git a/examples/assumption_utils_usecases.ipynb b/examples/assumption_utils_usecases.ipynb index 237cfb9..144be7c 100644 --- a/examples/assumption_utils_usecases.ipynb +++ b/examples/assumption_utils_usecases.ipynb @@ -9,222 +9,44 @@ { "metadata": {}, "cell_type": "markdown", - "source": "## Pre-modeling assumptions", - "id": "38c5e374ee1335eb" + "source": [ + "Some assumptions cannot be tested statistically from the data alone. For these, we use an LLM to reason about their plausibility based on the dataset description and domain knowledge.\n", + "\n", + "Set your `OPENAI_API_KEY` in the environment before running the cells below." 
+ ], + "id": "910624c801cea5de" }, { "metadata": { - "collapsed": true, "ExecuteTime": { - "end_time": "2026-04-29T10:06:03.331019300Z", - "start_time": "2026-04-29T10:06:03.210220500Z" + "end_time": "2026-05-04T15:36:19.112817300Z", + "start_time": "2026-05-04T15:36:05.471762500Z" } }, "cell_type": "code", "source": [ - "import pandas as pd\n", - "\n", - "dataset_path = \"../data/all_data/abortion_bf15.csv\"\n", - "df = pd.read_csv(dataset_path)\n", - "df.head()" + "from cais.config import get_llm_client\n", + "llm = get_llm_client()" ], - "id": "e42bb8b76ff7855f", + "id": "765a14abb89aa78c", "outputs": [ { - "data": { - "text/plain": [ - " fip age race year sex totcase totpop rate totrate id ... \\\n", - "0 1.0 15.0 2.0 1985.0 2 5683.0 106187 6527.5 5351.9 14.0 ... \n", - "1 1.0 15.0 2.0 1986.0 2 5344.0 106831 6351.2 5002.3 14.0 ... \n", - "2 1.0 15.0 2.0 1987.0 2 4983.0 106496 5759.1 4679.0 14.0 ... \n", - "3 1.0 15.0 2.0 1988.0 2 5276.0 105238 6139.6 5013.4 14.0 ... \n", - "4 1.0 15.0 2.0 1989.0 2 5692.0 102956 5951.5 5528.6 14.0 ... \n", - "\n", - " female lnr t younger fa pi wm15 wf15 bm15 bf15 \n", - "0 1.0 8.783779 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", - "1 1.0 8.756399 2.0 1.0 1.0 0.0 0.0 0.0 0.0 1.0 \n", - "2 1.0 8.658537 3.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", - "3 1.0 8.722515 4.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", - "4 1.0 8.691399 5.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 \n", - "\n", - "[5 rows x 39 columns]" - ], - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
fipageraceyearsextotcasetotpopratetotrateid...femalelnrtyoungerfapiwm15wf15bm15bf15
01.015.02.01985.025683.01061876527.55351.914.0...1.08.7837791.01.01.00.00.00.00.01.0
11.015.02.01986.025344.01068316351.25002.314.0...1.08.7563992.01.01.00.00.00.00.01.0
21.015.02.01987.024983.01064965759.14679.014.0...1.08.6585373.01.01.01.00.00.00.01.0
31.015.02.01988.025276.01052386139.65013.414.0...1.08.7225154.01.01.01.00.00.00.01.0
41.015.02.01989.025692.01029565951.55528.614.0...1.08.6913995.01.01.01.00.00.00.01.0
\n", - "

5 rows × 39 columns

\n", - "
" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\beriv\\PycharmProjects\\causal-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] } ], "execution_count": 1 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Pre-modeling assumptions", + "id": "38c5e374ee1335eb" + }, { "metadata": {}, "cell_type": "markdown", @@ -234,34 +56,28 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:15.266016300Z", - "start_time": "2026-04-29T10:06:03.334015Z" + "end_time": "2026-05-04T15:36:28.431363400Z", + "start_time": "2026-05-04T15:36:28.405482200Z" } }, "cell_type": "code", "source": "from cais.methods.pre_model_assumption_utils import check_cond_ignorability", "id": "90501ac7f9f951b", - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\beriv\\PycharmProjects\\causal-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "execution_count": 2 + "outputs": [], + "execution_count": 3 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:15.299181100Z", - "start_time": "2026-04-29T10:06:15.272603900Z" + "end_time": "2026-05-04T15:36:29.649769100Z", + "start_time": "2026-05-04T15:36:29.575569400Z" } }, "cell_type": "code", "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"../data/all_data/abortion_bf15.csv\")\n", "covariates = ['crack', 'alcohol', 'income', 'ur', 'poverty', 'black', 'perc1519']\n", "result_cond_ignorability = check_cond_ignorability(df, 'repeal', covariates)\n", "print(result_cond_ignorability)" @@ -276,13 +92,13 @@ ] } ], - "execution_count": 3 + "execution_count": 4 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:15.332700200Z", - "start_time": "2026-04-29T10:06:15.315219500Z" + "end_time": "2026-05-04T15:05:06.479752900Z", + "start_time": "2026-05-04T15:05:06.466740700Z" } }, "cell_type": "code", @@ -297,8 +113,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:15.373305300Z", - "start_time": "2026-04-29T10:06:15.332700200Z" + "end_time": "2026-05-04T15:05:06.537495100Z", + "start_time": "2026-05-04T15:05:06.479752900Z" } }, "cell_type": "code", @@ -320,33 +136,11 @@ ], "execution_count": 5 }, - { - "metadata": {}, - "cell_type": "markdown", - "source": "### Fix an OPENAI_API_KEY in environment before running cells below", - "id": "ac989fb331517781" - }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:35.269827500Z", - "start_time": "2026-04-29T10:33:34.696889100Z" - } - }, - "cell_type": "code", - "source": [ - "from cais.config import get_llm_client\n", - "llm = get_llm_client()" - ], - "id": "98e0711e996cb3f5", - "outputs": [], - "execution_count": 2 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": 
"2026-04-29T10:06:18.401978500Z", - "start_time": "2026-04-29T10:06:15.968956600Z" + "end_time": "2026-05-04T15:05:10.916241600Z", + "start_time": "2026-05-04T15:05:06.537495100Z" } }, "cell_type": "code", @@ -378,11 +172,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': True, 'reasoning': \"The assumption of SUTVA is plausibly satisfied in this context as the treatment 'repeal' pertains to state-level legalization of abortion, which is unlikely to directly affect the potential outcomes of other states. Additionally, the treatment appears to be uniformly applied within states that legalized abortion, minimizing concerns about hidden versions of the treatment. The analysis focuses on a specific subgroup (black females age 15), which further supports the consistency of treatment application within that group.\", 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + "{'passed': True, 'reasoning': 'The assumption of SUTVA is plausibly satisfied in this context because the treatment (abortion legalization) is a state-level policy that does not directly interfere with the potential outcomes of individuals in other states. Additionally, the treatment is consistently defined as whether a state legalized abortion before Roe v. Wade, which minimizes the likelihood of hidden versions of the treatment affecting the outcome. 
The analysis focuses on a specific subgroup (black females age 15), which further supports the consistency of treatment application across this group within each state.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" ] } ], - "execution_count": 7 + "execution_count": 6 }, { "metadata": {}, @@ -393,8 +187,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:18.436988200Z", - "start_time": "2026-04-29T10:06:18.424323Z" + "end_time": "2026-05-04T15:05:10.931450500Z", + "start_time": "2026-05-04T15:05:10.920131400Z" } }, "cell_type": "code", @@ -408,13 +202,13 @@ ], "id": "b170f0d603110f8b", "outputs": [], - "execution_count": 8 + "execution_count": 7 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:18.455694900Z", - "start_time": "2026-04-29T10:06:18.439915100Z" + "end_time": "2026-05-04T15:05:10.966200500Z", + "start_time": "2026-05-04T15:05:10.935494800Z" } }, "cell_type": "code", @@ -424,13 +218,13 @@ ], "id": "43f5cdd305531a14", "outputs": [], - "execution_count": 9 + "execution_count": 8 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:18.472045900Z", - "start_time": "2026-04-29T10:06:18.459007200Z" + "end_time": "2026-05-04T15:05:10.978019Z", + "start_time": "2026-05-04T15:05:10.966200500Z" } }, "cell_type": "code", @@ -440,13 +234,13 @@ ], "id": "64af04994333ad51", "outputs": [], - "execution_count": 10 + "execution_count": 9 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:18.534397800Z", - "start_time": "2026-04-29T10:06:18.474064Z" + "end_time": "2026-05-04T15:05:11.007020900Z", + "start_time": "2026-05-04T15:05:10.981944900Z" } }, "cell_type": "code", @@ -553,23 +347,23 @@ "" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 11 + "execution_count": 10 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:18.566474300Z", - "start_time": 
"2026-04-29T10:06:18.538373800Z" + "end_time": "2026-05-04T15:05:11.046268500Z", + "start_time": "2026-05-04T15:05:11.009065400Z" } }, "cell_type": "code", "source": [ - "# --- check_iv_relevance : Is nearc4 a strong instrument for educ ? ---\n", + "# --- check_iv_relevance: Is nearc4 a strong instrument for educ ? ---\n", "result_iv_relevance = check_iv_relevance(\n", " df=df_iv,\n", " treatment='educ',\n", @@ -588,13 +382,13 @@ ] } ], - "execution_count": 12 + "execution_count": 11 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:20.338112800Z", - "start_time": "2026-04-29T10:06:18.567613800Z" + "end_time": "2026-05-04T15:05:13.785041Z", + "start_time": "2026-05-04T15:05:11.054418300Z" } }, "cell_type": "code", @@ -625,17 +419,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_exclusion : {'passed': False, 'reasoning': 'The exclusion restriction is likely violated because growing up near a 4-year college may influence local labor market conditions, which could directly affect wages (lwage) independent of education (educ). This suggests that proximity to a college could have a direct effect on wages, thus undermining the validity of the instrument.', 'details': {'assumption': 'Exclusion restriction'}}\n" + "check_iv_exclusion : {'passed': False, 'reasoning': \"The exclusion restriction is likely violated because growing up near a 4-year college may influence local labor market conditions, which could directly affect wages (lwage) independent of education (educ). 
This suggests that the instrument 'nearc4' may have a direct effect on the outcome, undermining the validity of the causal inference.\", 'details': {'assumption': 'Exclusion restriction'}}\n" ] } ], - "execution_count": 13 + "execution_count": 12 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:22.725119800Z", - "start_time": "2026-04-29T10:06:20.355612200Z" + "end_time": "2026-05-04T15:05:16.034731200Z", + "start_time": "2026-05-04T15:05:13.785041Z" } }, "cell_type": "code", @@ -650,17 +444,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_exogeneity : {'passed': False, 'reasoning': \"The assumption of instrument exogeneity is likely violated in this context because 'nearc4' (growing up near a 4-year college) may be correlated with local labor market conditions, which can affect wages independently of education. This suggests that 'nearc4' could influence 'lwage' through pathways other than education, undermining the validity of the exclusion restriction.\", 'details': {'assumption': 'Instrument exogeneity (independence)'}}\n" + "check_iv_exogeneity : {'passed': False, 'reasoning': \"The assumption of instrument exogeneity is likely violated in this context because growing up near a 4-year college may correlate with local labor market conditions, which can affect wages independently of education. 
This suggests that the instrument 'nearc4' may not be as good as randomly assigned with respect to unobserved confounders affecting both education and wages.\", 'details': {'assumption': 'Instrument exogeneity (independence)'}}\n" ] } ], - "execution_count": 14 + "execution_count": 13 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:25.463365800Z", - "start_time": "2026-04-29T10:06:22.742494500Z" + "end_time": "2026-05-04T15:05:18.341953800Z", + "start_time": "2026-05-04T15:05:16.034731200Z" } }, "cell_type": "code", @@ -675,11 +469,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_monotonicity : {'passed': False, 'reasoning': 'The assumption of monotonicity is likely violated in this context because proximity to a 4-year college may lead some individuals to pursue more education while others may be discouraged or choose not to attend college despite proximity. This suggests the presence of defiers, as some individuals could be negatively influenced by the local educational environment, leading to a decrease in education despite being near a college.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + "check_iv_monotonicity : {'passed': False, 'reasoning': 'The assumption of monotonicity is likely violated in this context because proximity to a 4-year college may not uniformly increase education for all individuals. 
Some individuals may choose to pursue less education or drop out due to various factors, such as local labor market conditions or personal circumstances, which could lead to defiers who are negatively affected by the instrument.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" ] } ], - "execution_count": 15 + "execution_count": 14 }, { "metadata": {}, @@ -694,8 +488,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:25.500243300Z", - "start_time": "2026-04-29T10:06:25.487566300Z" + "end_time": "2026-05-04T15:05:18.354982200Z", + "start_time": "2026-05-04T15:05:18.345714200Z" } }, "cell_type": "code", @@ -713,13 +507,13 @@ ], "id": "2f246249bee3bdc7", "outputs": [], - "execution_count": 16 + "execution_count": 15 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:27.602556200Z", - "start_time": "2026-04-29T10:06:25.502361500Z" + "end_time": "2026-05-04T15:05:20.813547700Z", + "start_time": "2026-05-04T15:05:18.356733600Z" } }, "cell_type": "code", @@ -734,11 +528,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_iv_monotonicity : {'passed': True, 'reasoning': 'The assumption of monotonicity is plausibly satisfied in this context because proximity to a 4-year college is expected to either increase education or have no effect, but not decrease it. Given that the mechanism involves reduced costs associated with attending college, it is reasonable to conclude that no individuals would be deflected from pursuing education due to their proximity to a college.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" + "check_iv_monotonicity : {'passed': True, 'reasoning': 'The assumption of monotonicity is plausibly satisfied in this context because proximity to a 4-year college is expected to either increase education or have no effect, but not decrease it. 
Given that the mechanism involves reduced costs associated with attending college, it is unlikely that individuals would be less likely to pursue education due to being near a college.', 'details': {'assumption': 'Monotonicity (LATE)'}}\n" ] } ], - "execution_count": 17 + "execution_count": 16 }, { "metadata": {}, @@ -749,8 +543,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:27.749050400Z", - "start_time": "2026-04-29T10:06:27.678419200Z" + "end_time": "2026-05-04T15:05:20.828880500Z", + "start_time": "2026-05-04T15:05:20.817935600Z" } }, "cell_type": "code", @@ -764,13 +558,13 @@ ], "id": "d6268e8f3f29807b", "outputs": [], - "execution_count": 18 + "execution_count": 17 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:27.924696800Z", - "start_time": "2026-04-29T10:06:27.749050400Z" + "end_time": "2026-05-04T15:05:20.888580200Z", + "start_time": "2026-05-04T15:05:20.830926800Z" } }, "cell_type": "code", @@ -980,18 +774,18 @@ "" ] }, - "execution_count": 19, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 19 + "execution_count": 18 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:28.229146300Z", - "start_time": "2026-04-29T10:06:27.924696800Z" + "end_time": "2026-05-04T15:05:20.921774800Z", + "start_time": "2026-05-04T15:05:20.888580200Z" } }, "cell_type": "code", @@ -1016,13 +810,13 @@ ] } ], - "execution_count": 20 + "execution_count": 19 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:28.800155900Z", - "start_time": "2026-04-29T10:06:28.247110600Z" + "end_time": "2026-05-04T15:05:20.980700900Z", + "start_time": "2026-05-04T15:05:20.923323500Z" } }, "cell_type": "code", @@ -1052,13 +846,13 @@ ] } ], - "execution_count": 21 + "execution_count": 20 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:29.035604900Z", - "start_time": "2026-04-29T10:06:28.807552Z" + "end_time": "2026-05-04T15:05:20.996846300Z", + 
"start_time": "2026-05-04T15:05:20.982722600Z" } }, "cell_type": "code", @@ -1083,13 +877,13 @@ ] } ], - "execution_count": 22 + "execution_count": 21 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:31.028959300Z", - "start_time": "2026-04-29T10:06:29.041141500Z" + "end_time": "2026-05-04T15:05:23.898303900Z", + "start_time": "2026-05-04T15:05:21.000832600Z" } }, "cell_type": "code", @@ -1120,17 +914,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the staggered adoption of the Castle Doctrine laws across states. This staggered treatment could lead to differential attrition or selective entry/exit, as states that adopt the laws may differ systematically from those that do not, potentially affecting the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" + "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the nature of the treatment (adoption of Castle Doctrine laws) which may lead to differential attrition or selective entry/exit of states based on their homicide rates or political climates. 
States that adopt such laws may differ systematically from those that do not, potentially affecting the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" ] } ], - "execution_count": 23 + "execution_count": 22 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:33.366297200Z", - "start_time": "2026-04-29T10:06:31.030893900Z" + "end_time": "2026-05-04T15:05:25.942687300Z", + "start_time": "2026-05-04T15:05:23.902390200Z" } }, "cell_type": "code", @@ -1154,17 +948,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the staggered adoption of the Castle Doctrine laws across states. This staggered treatment could lead to differential attrition or selective entry/exit of states based on their homicide rates or other unobserved factors, which may change the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" + "{'passed': False, 'reasoning': 'The assumption of stable group composition is likely violated in this analysis due to the staggered adoption of the Castle Doctrine laws across states. 
This staggered treatment could lead to differential attrition or selective entry/exit, as states that adopt the laws may differ systematically from those that do not, potentially affecting the composition of treatment and control groups over time.', 'details': {'assumption': 'Stable group composition'}}\n" ] } ], - "execution_count": 24 + "execution_count": 23 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:34.970163500Z", - "start_time": "2026-04-29T10:06:33.388245500Z" + "end_time": "2026-05-04T15:05:27.833135700Z", + "start_time": "2026-05-04T15:05:25.942687300Z" } }, "cell_type": "code", @@ -1184,11 +978,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'passed': True, 'reasoning': 'The Castle Doctrine laws are state-level policies that do not directly affect other states, suggesting that interference between units is minimal. Additionally, the treatment (adoption of the law) is clearly defined and applied consistently across states, supporting the assumption of no hidden versions of the treatment.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + "{'passed': True, 'reasoning': 'The Castle Doctrine laws are state-level policies that do not directly affect neighboring states, suggesting minimal interference between units. 
Additionally, the treatment (adoption of the law) is clearly defined and consistently applied within each state, supporting the assumption of no hidden versions of the treatment.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" ] } ], - "execution_count": 25 + "execution_count": 24 }, { "metadata": {}, @@ -1199,8 +993,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:24.170673Z", - "start_time": "2026-04-29T10:33:09.808139600Z" + "end_time": "2026-05-04T15:05:27.842739600Z", + "start_time": "2026-05-04T15:05:27.833135700Z" } }, "cell_type": "code", @@ -1213,28 +1007,19 @@ ")" ], "id": "5134dec2de5e8b93", - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\beriv\\PycharmProjects\\causal-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "execution_count": 1 + "outputs": [], + "execution_count": 25 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:44.109339400Z", - "start_time": "2026-04-29T10:33:44.071005Z" + "end_time": "2026-05-04T15:05:27.855930600Z", + "start_time": "2026-05-04T15:05:27.847904600Z" } }, "cell_type": "code", "source": [ - "# Description classique : smoking → tar → cancer\n", + "# Description: smoking → tar → cancer\n", "frontdoor_description = (\n", " \"Observational study examining the effect of smoking (T) on lung cancer (Y). \"\n", " \"Tar deposits in lungs (M) are used as a mediator. 
The argument is that \"\n", @@ -1251,13 +1036,13 @@ ], "id": "c092dd995e2a558e", "outputs": [], - "execution_count": 3 + "execution_count": 26 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:48.050888200Z", - "start_time": "2026-04-29T10:33:45.844099500Z" + "end_time": "2026-05-04T15:05:30.675124800Z", + "start_time": "2026-05-04T15:05:27.858233500Z" } }, "cell_type": "code", @@ -1276,13 +1061,13 @@ ] } ], - "execution_count": 4 + "execution_count": 27 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:51.528185200Z", - "start_time": "2026-04-29T10:33:49.350818500Z" + "end_time": "2026-05-04T15:05:32.254733500Z", + "start_time": "2026-05-04T15:05:30.678552700Z" } }, "cell_type": "code", @@ -1297,17 +1082,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_frontdoor_no_TM_confounding : {'passed': False, 'reasoning': 'The assumption of no T-M confounding is not plausibly satisfied because there are unobserved genetic predispositions that could influence both smoking behavior and the accumulation of tar deposits in the lungs. These genetic factors could create a confounding relationship between the treatment (smoking) and the mediator (tar deposits), thus violating the assumption.', 'details': {'assumption': 'No T-M confounding'}}\n" + "check_frontdoor_no_TM_confounding : {'passed': False, 'reasoning': 'The assumption of no T-M confounding is not plausibly satisfied because there are unobserved genetic predispositions that may influence both smoking behavior and the accumulation of tar deposits in the lungs. 
These genetic factors could create a confounding relationship between the treatment (smoking) and the mediator (tar deposits), thus violating the assumption.', 'details': {'assumption': 'No T-M confounding'}}\n" ] } ], - "execution_count": 5 + "execution_count": 28 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:33:54.348785200Z", - "start_time": "2026-04-29T10:33:52.501899600Z" + "end_time": "2026-05-04T15:05:33.432380900Z", + "start_time": "2026-05-04T15:05:32.254733500Z" } }, "cell_type": "code", @@ -1322,22 +1107,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "check_frontdoor_T_blocks_MY : {'passed': False, 'reasoning': 'The assumption that T blocks M→Y confounding is not plausibly satisfied because there are unobserved genetic confounders that affect both smoking behavior (T) and lung cancer risk (Y). These unobserved confounders can create back-door paths between the mediator (M) and the outcome (Y), violating the assumption.', 'details': {'assumption': 'T blocks M→Y confounding'}}\n" + "check_frontdoor_T_blocks_MY : {'passed': False, 'reasoning': 'The assumption that T blocks M→Y confounding is not plausibly satisfied because there are unobserved genetic confounders that may influence both smoking behavior (T) and lung cancer risk (Y). 
These unobserved confounders can create back-door paths between the mediator (M) and the outcome (Y), thus violating the assumption.', 'details': {'assumption': 'T blocks M→Y confounding'}}\n" ] } ], - "execution_count": 6 + "execution_count": 29 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:35:16.476952500Z", - "start_time": "2026-04-29T10:35:16.336653Z" + "end_time": "2026-05-04T15:05:33.453811500Z", + "start_time": "2026-05-04T15:05:33.434072800Z" } }, "cell_type": "code", "source": [ - "from cais.methods.pre_model_assumption_utils import check_frontdoor_positivity\n", "import numpy as np\n", "import pandas as pd\n", "\n", @@ -1349,7 +1133,6 @@ " 'M': np.random.binomial(1, 0.6, n),\n", "})\n", "\n", - "# Verified\n", "result = check_frontdoor_positivity(df_fd, 'T', 'M')\n", "print(\"Normal case :\", result)" ], @@ -1363,13 +1146,13 @@ ] } ], - "execution_count": 8 + "execution_count": 30 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:35:20.022758900Z", - "start_time": "2026-04-29T10:35:19.920688800Z" + "end_time": "2026-05-04T15:05:33.469584600Z", + "start_time": "2026-05-04T15:05:33.453811500Z" } }, "cell_type": "code", @@ -1391,30 +1174,314 @@ ] } ], - "execution_count": 9 + "execution_count": 31 }, { "metadata": {}, "cell_type": "markdown", - "source": "## Post-modeling assumptions", - "id": "4c16b7a6b8ae03ef" + "source": "#### Regression Discontinuity Design (RDD)", + "id": "c67e21ef92017f3" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:06:41.952452300Z", - "start_time": "2026-04-29T10:06:41.925732700Z" + "end_time": "2026-05-04T15:36:34.675391500Z", + "start_time": "2026-05-04T15:36:34.645667800Z" } }, "cell_type": "code", "source": [ - "from cais.methods.post_model_assumption_utils import (\n", - " check_iv_overidentification, check_balance_after_matching, check_balance_after_weighting\n", + "from cais.methods.pre_model_assumption_utils import (\n", + " check_rdd_no_manipulation,\n", + " 
check_rdd_covariate_continuity,\n", + " check_rdd_continuity_potential_outcomes,\n", ")" ], - "id": "dd005b46b39be3", + "id": "2619f7606e322aa9", "outputs": [], - "execution_count": 31 + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:36:36.528550700Z", + "start_time": "2026-05-04T15:36:36.289594700Z" + } + }, + "cell_type": "code", + "source": [ + "df_hansen = pd.read_csv('../data/all_data/hansen.csv').dropna()\n", + "print(f\"Shape: {df_hansen.shape}\")\n", + "print(f\"Columns: {list(df_hansen.columns)}\")\n", + "print(f\"bac1 range: {df_hansen['bac1'].min():.3f} - {df_hansen['bac1'].max():.3f}\")" + ], + "id": "9294b4e106852031", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape: (214558, 11)\n", + "Columns: ['date', 'alcohol1', 'alcohol2', 'low_score', 'male', 'white', 'recidivism', 'acc', 'aged', 'year', 'bac1']\n", + "bac1 range: 0.000 - 0.449\n" + ] + } + ], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:36:42.257062400Z", + "start_time": "2026-05-04T15:36:42.121233800Z" + } + }, + "cell_type": "code", + "source": [ + "# --- No manipulation: density test around BAC=0.08 ---\n", + "result = check_rdd_no_manipulation(df_hansen, 'bac1', cutoff=0.08)\n", + "print(\"check_rdd_no_manipulation :\", result)" + ], + "id": "5e53a9e09bbd6e23", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_rdd_no_manipulation : {'passed': np.False_, 'reasoning': \"Density test around cutoff (0.08): 22101 below, 156949 above (p=0.0000). Note: This binomial test is a local approximation of density continuity and may be sensitive to the underlying distribution's slope. 
Significant density discontinuity — possible manipulation.\", 'details': {'n_below': 22101, 'n_above': 156949, 'p_value': np.float64(0.0), 'bandwidth': np.float64(0.11225), 'test_type': 'binomial_density_approximation'}}\n" + ] + } + ], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:36:44.186283900Z", + "start_time": "2026-05-04T15:36:44.079004200Z" + } + }, + "cell_type": "code", + "source": [ + "# --- Covariate continuity at cutoff ---\n", + "covariates_rdd = ['male', 'white', 'age', 'acc']\n", + "result = check_rdd_covariate_continuity(df_hansen, 'bac1', cutoff=0.08, covariates=covariates_rdd)\n", + "print(\"check_rdd_covariate_continuity :\", result)" + ], + "id": "3d7f37f2cc8e4e44", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_rdd_covariate_continuity : {'passed': False, 'reasoning': \"Covariate continuity at cutoff (0.08) on 3 covariates. Discontinuous: ['white'].\", 'details': {'covariate_tests': {'male': {'t_stat': -1.531368003869146, 'p_value': 0.12568985124790266}, 'white': {'t_stat': -2.8493159475536536, 'p_value': 0.0043845915062270405}, 'acc': {'t_stat': 0.46879313183613336, 'p_value': 0.6392211759827671}}, 'discontinuous': ['white'], 'bandwidth': np.float64(0.0449)}}\n" + ] + } + ], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:36.927328600Z", + "start_time": "2026-05-04T15:05:33.746957200Z" + } + }, + "cell_type": "code", + "source": [ + "# --- Continuity of potential outcomes (LLM-reasoned) ---\n", + "hansen_description = (\n", + " \"Hansen (2015). Administrative DUI records from Washington State. \"\n", + " \"Running variable is BAC (blood alcohol content) measured by breathalyzer. \"\n", + " \"Cutoff is 0.08 (legal DUI limit). Treatment is DUI conviction. \"\n", + " \"Outcome is recidivism (future DUI). 
Individuals cannot precisely \"\n", + " \"control their BAC level during a breathalyzer test.\"\n", + ")\n", + "\n", + "hansen_variables = {\n", + " 'treatment': 'DUI conviction (BAC >= 0.08)',\n", + " 'outcome': 'recidivism',\n", + " 'running_variable': 'bac1',\n", + " 'cutoff': 0.08,\n", + " 'covariates': covariates_rdd,\n", + "}\n", + "\n", + "result = check_rdd_continuity_potential_outcomes(hansen_description, hansen_variables, llm=llm)\n", + "print(\"check_rdd_continuity_potential_outcomes :\", result)" + ], + "id": "6fc9dd73d373108e", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_rdd_continuity_potential_outcomes : {'passed': True, 'reasoning': 'The assumption of continuity of potential outcomes at the cutoff is plausibly satisfied in this context because individuals cannot precisely control their BAC levels during a breathalyzer test. This suggests that those just above and just below the 0.08 cutoff are likely to be similar in unobserved characteristics, leading to the expectation that their potential outcomes (recidivism rates) would be continuous at the cutoff.', 'details': {'assumption': 'Continuity of potential outcomes at the cutoff'}}\n" + ] + } + ], + "execution_count": 36 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Post-modeling assumptions", + "id": "4c16b7a6b8ae03ef" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "#### Generalized Propensity Score", + "id": "16cc1e1b11f77077" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "from cais.methods.post_model_assumption_utils import check_gps_specification", + "id": "859f92ce3e3f008" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's compute Generalized Propensity Scores on `fulton.csv` with continuous treatment.", + "id": "b53060fafc441993" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:36.965088100Z", + 
"start_time": "2026-05-04T15:05:36.947836200Z" + } + }, + "cell_type": "code", + "source": [ + "df_fulton = pd.read_csv('../data/all_data/fulton.csv').dropna()\n", + "print(f\"Shape: {df_fulton.shape}\")\n", + "print(f\"Columns: {list(df_fulton.columns)}\")\n", + "print(f\"q (continuous treatment): min={df_fulton['q'].min():.2f}, max={df_fulton['q'].max():.2f}\")" + ], + "id": "c7988eb52ec49813", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape: (111, 12)\n", + "Columns: ['mon', 'tue', 'wed', 'thu', 'date', 'stormy', 'mixed', 'p', 'q', 'rainy', 'cold', 'wind']\n", + "q (continuous treatment): min=6.19, max=9.98\n" + ] + } + ], + "execution_count": 38 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:36.983832500Z", + "start_time": "2026-05-04T15:05:36.965088100Z" + } + }, + "cell_type": "code", + "source": [ + "# GPS model simulation: regress q on covariates and residuals extraction\n", + "import statsmodels.api as sm\n", + "\n", + "covariates_fulton = ['mon', 'tue', 'wed', 'thu', 'stormy']\n", + "X = sm.add_constant(df_fulton[covariates_fulton])\n", + "model = sm.OLS(df_fulton['q'], X).fit()\n", + "residuals = model.resid.values\n", + "\n", + "print(f\"Residuals: n={len(residuals)}, mean={residuals.mean():.4f}, std={residuals.std():.4f}\")" + ], + "id": "ef1e1084571cb9f3", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Residuals: n=111, mean=0.0000, std=0.6631\n" + ] + } + ], + "execution_count": 39 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:37.002237200Z", + "start_time": "2026-05-04T15:05:36.985053900Z" + } + }, + "cell_type": "code", + "source": [ + "# --- check_gps_specification: are the residuals normally distributed? 
---\n", + "result = check_gps_specification(residuals)\n", + "print(\"check_gps_specification (fulton) :\", result)" + ], + "id": "337039ca1ac9ec2f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_gps_specification (fulton) : {'passed': np.False_, 'reasoning': 'Shapiro-Wilk on GPS residuals: W=0.958, p=0.0014. Departure from normality — reconsider GPS model.', 'details': {'statistic': 0.9576404603447696, 'p_value': 0.0014026817630125956}}\n" + ] + } + ], + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:37.020844400Z", + "start_time": "2026-05-04T15:05:37.002237200Z" + } + }, + "cell_type": "code", + "source": [ + "# --- Comparison with artificially normal residuals ---\n", + "residuals_perfect = np.random.normal(0, 1, len(residuals))\n", + "result = check_gps_specification(residuals_perfect)\n", + "print(\"check_gps_specification (normal parfait) :\", result)" + ], + "id": "1a2b70f718bd42f2", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_gps_specification (normal parfait) : {'passed': np.True_, 'reasoning': 'Shapiro-Wilk on GPS residuals: W=0.983, p=0.1563. 
Residuals consistent with normality.', 'details': {'statistic': 0.9825489204292709, 'p_value': 0.15628318585891304}}\n" + ] + } + ], + "execution_count": 41 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T15:05:37.037838200Z", + "start_time": "2026-05-04T15:05:37.020844400Z" + } + }, + "cell_type": "code", + "source": [ + "# --- Comparison with highly non-normal residuals ---\n", + "residuals_bad = np.concatenate([np.random.exponential(2, 300), -np.random.exponential(2, 200)])\n", + "result = check_gps_specification(residuals_bad)\n", + "print(\"check_gps_specification (non-normal) :\", result)" + ], + "id": "a15764559b833770", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "check_gps_specification (non-normal) : {'passed': np.False_, 'reasoning': 'Shapiro-Wilk on GPS residuals: W=0.953, p=0.0000. Departure from normality — reconsider GPS model.', 'details': {'statistic': 0.9531326431078454, 'p_value': 1.6850353927245807e-11}}\n" + ] + } + ], + "execution_count": 42 }, { "metadata": {}, @@ -1422,17 +1489,28 @@ "source": "#### Balance checks (IPW, matching)", "id": "d707342d2932fb6c" }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from cais.methods.post_model_assumption_utils import (\n", + " check_balance_after_matching, check_balance_after_weighting)" + ], + "id": "92789430b3a8b298" + }, { "metadata": {}, "cell_type": "markdown", - "source": "Let's perform matching on the `abortion_bf15.csv` dataset", + "source": "Let's perform matching on the `abortion_bf15.csv` dataset.", "id": "33e3a46a49da647" }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:22:32.930882500Z", - "start_time": "2026-04-29T10:22:32.680917800Z" + "end_time": "2026-05-04T15:05:37.166284200Z", + "start_time": "2026-05-04T15:05:37.037838200Z" } }, "cell_type": "code", @@ -1475,13 +1553,13 @@ ] } ], - "execution_count": 45 + "execution_count": 43 }, { "metadata": { 
"ExecuteTime": { - "end_time": "2026-04-29T10:18:45.346005300Z", - "start_time": "2026-04-29T10:18:45.251915700Z" + "end_time": "2026-05-04T15:05:37.184860600Z", + "start_time": "2026-05-04T15:05:37.166856500Z" } }, "cell_type": "code", @@ -1500,7 +1578,7 @@ ] } ], - "execution_count": 41 + "execution_count": 44 }, { "metadata": {}, @@ -1511,8 +1589,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:26:38.848426400Z", - "start_time": "2026-04-29T10:26:38.545776200Z" + "end_time": "2026-05-04T15:05:37.249403100Z", + "start_time": "2026-05-04T15:05:37.187765600Z" } }, "cell_type": "code", @@ -1539,7 +1617,7 @@ ] } ], - "execution_count": 46 + "execution_count": 45 }, { "metadata": {}, @@ -1550,8 +1628,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:21:30.566220400Z", - "start_time": "2026-04-29T10:21:30.477705Z" + "end_time": "2026-05-04T15:05:37.275547Z", + "start_time": "2026-05-04T15:05:37.253965700Z" } }, "cell_type": "code", @@ -1563,18 +1641,18 @@ "ps = np.clip(ps, 0.01, 0.99)\n", "T = df_clean['repeal'].astype(int)\n", "\n", - "# Computing IPW weights\n", + "# computing IPW weights\n", "weights = np.where(T == 1, 1 / ps, 1 / (1 - ps))" ], "id": "dc0ecd56a3c12ad2", "outputs": [], - "execution_count": 43 + "execution_count": 46 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:21:32.376378700Z", - "start_time": "2026-04-29T10:21:32.239870800Z" + "end_time": "2026-05-04T15:05:37.344788Z", + "start_time": "2026-05-04T15:05:37.278665100Z" } }, "cell_type": "code", @@ -1593,7 +1671,7 @@ ] } ], - "execution_count": 44 + "execution_count": 47 }, { "metadata": {}, @@ -1604,8 +1682,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:28:50.053941300Z", - "start_time": "2026-04-29T10:28:49.903451700Z" + "end_time": "2026-05-04T15:05:37.387395Z", + "start_time": "2026-05-04T15:05:37.349403Z" } }, "cell_type": "code", @@ -1632,7 +1710,7 @@ ] } ], - "execution_count": 47 + "execution_count": 48 }, { "metadata": 
{}, @@ -1640,11 +1718,19 @@ "source": "#### Instrumental Variables (IVs)", "id": "32ed8f8ad4ab798b" }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "from cais.methods.post_model_assumption_utils import check_iv_overidentification", + "id": "ddd21c6cdb22b34b" + }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:12:59.044983Z", - "start_time": "2026-04-29T10:12:58.675274200Z" + "end_time": "2026-05-04T15:05:37.504831900Z", + "start_time": "2026-05-04T15:05:37.395820600Z" } }, "cell_type": "code", @@ -1669,13 +1755,13 @@ ] } ], - "execution_count": 34 + "execution_count": 49 }, { "metadata": { "ExecuteTime": { - "end_time": "2026-04-29T10:12:59.766273400Z", - "start_time": "2026-04-29T10:12:59.067029500Z" + "end_time": "2026-05-04T15:05:37.560162900Z", + "start_time": "2026-05-04T15:05:37.510625Z" } }, "cell_type": "code", @@ -1714,7 +1800,7 @@ ] } ], - "execution_count": 35 + "execution_count": 50 } ], "metadata": { From 005d6d4840ed795159a57414872b14e7751610e9 Mon Sep 17 00:00:00 2001 From: erivan Date: Mon, 4 May 2026 21:54:37 +0200 Subject: [PATCH 13/13] added sutva empirical check notebook --- examples/sutva_empirical_check.ipynb | 1557 ++++++++++++++++++++++++++ 1 file changed, 1557 insertions(+) create mode 100644 examples/sutva_empirical_check.ipynb diff --git a/examples/sutva_empirical_check.ipynb b/examples/sutva_empirical_check.ipynb new file mode 100644 index 0000000..78721f7 --- /dev/null +++ b/examples/sutva_empirical_check.ipynb @@ -0,0 +1,1557 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Evaluation of SUTVA pass/fail/inconclusive rate across all real datasets.", + "id": "36bfe5b22dbde1ed" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "In its very strict version:", + "id": "67063fb5e0afbcb8" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:49:42.302703600Z", + "start_time": 
"2026-05-04T19:49:42.209891300Z" + } + }, + "cell_type": "code", + "source": [ + "def check_strict_sutva(dataset_description, variables_summary, llm):\n", + " \"\"\"SUTVA check with strict prompt — flags theoretical spillovers.\"\"\"\n", + " from cais.methods.pre_model_assumption_utils import _llm_argue_assumption\n", + " return _llm_argue_assumption(\n", + " assumption_name=\"SUTVA (Stable Unit Treatment Value Assumption)\",\n", + " assumption_description=(\n", + " \"(1) No interference: one unit's treatment does not affect another unit's \"\n", + " \"potential outcomes. (2) No hidden versions of the treatment: the treatment \"\n", + " \"is administered consistently across treated units.\"\n", + " ),\n", + " dataset_description=dataset_description,\n", + " variables_summary=variables_summary,\n", + " llm=llm,\n", + " extra_context=(\n", + " \"Pay attention to: network/spillover effects (e.g., units in shared \"\n", + " \"schools, households, markets), partial compliance, treatment intensity \"\n", + " \"variation.\"\n", + " ),\n", + " )" + ], + "id": "9866f56c07fc89fd", + "outputs": [], + "execution_count": 4 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:49:42.482633900Z", + "start_time": "2026-05-04T19:49:42.337110100Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from cais.config import get_llm_client\n", + "\n", + "info = pd.read_csv('../data/real_info.csv', encoding='utf-8-sig')\n", + "\n", + "# some of the datasets appear several times, deleting duplicates\n", + "datasets = info.drop_duplicates(subset='data_files')[['data_files', 'data_description', 'method',\n", + " 'treatment', 'outcome', 'covariates',\n", + " 'instrument_var', 'temporal_var']].reset_index(drop=True)\n", + "\n", + "print(f\"Number of unique datasets: {len(datasets)}\\n\")" + ], + "id": "ad7f21df4482573b", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique datasets: 19\n", + "\n" + ] + } 
+ ], + "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:50:27.122826200Z", + "start_time": "2026-05-04T19:49:42.516528700Z" + } + }, + "cell_type": "code", + "source": [ + "llm = get_llm_client()\n", + "\n", + "results = []\n", + "\n", + "for i, row in datasets.iterrows():\n", + " dataset_name = row['data_files']\n", + " description = row['data_description']\n", + "\n", + " variables_summary = {\n", + " 'treatment': row['treatment'],\n", + " 'outcome': row['outcome'],\n", + " 'covariates': row['covariates'],\n", + " 'method': row['method'],\n", + " 'instrument': row.get('instrument_var', None),\n", + " 'time_variable': row.get('temporal_var', None),\n", + " }\n", + "\n", + " print(f\"[{i+1}/{len(datasets)}] {dataset_name}...\", end=\" \")\n", + "\n", + " strict_result = check_strict_sutva(description, variables_summary, llm=llm)\n", + "\n", + " status = {True: 'PASS', False: 'FAIL', None: 'INCONCLUSIVE'}[strict_result['passed']]\n", + " print(status)\n", + "\n", + " results.append({\n", + " 'dataset': dataset_name,\n", + " 'method': row['method'],\n", + " 'passed': strict_result['passed'],\n", + " 'status': status,\n", + " 'reasoning': strict_result['reasoning'],\n", + " 'missing_info': strict_result.get('details', {}).get('missing_info', None),\n", + " })" + ], + "id": "3361f9e8da06230b", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1/19] voter_turnout_data.csv... FAIL\n", + "[2/19] lalonde_data.csv... PASS\n", + "[3/19] lalonde_data_psid.csv... PASS\n", + "[4/19] vernby_2019.csv... PASS\n", + "[5/19] card_geographic.csv... INCONCLUSIVE\n", + "[6/19] lee_2008.csv... PASS\n", + "[7/19] abortion_bf15.csv... FAIL\n", + "[8/19] abortion_bm15.csv... FAIL\n", + "[9/19] broockman_intrinsic.csv... PASS\n", + "[10/19] castle.csv... FAIL\n", + "[11/19] gov_transfers.csv... FAIL\n", + "[12/19] organ_donations.csv... PASS\n", + "[13/19] thornton_hiv.csv... 
PASS\n", + "[14/19] close_elections.csv... FAIL\n", + "[15/19] electrification_data.csv... FAIL\n", + "[16/19] nan... FAIL\n", + "[17/19] fda_carpenter.csv... PASS\n", + "[18/19] fulton.csv... FAIL\n", + "[19/19] hansen.csv... PASS\n" + ] + } + ], + "execution_count": 6 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:50:27.202002Z", + "start_time": "2026-05-04T19:50:27.144718300Z" + } + }, + "cell_type": "code", + "source": [ + "strict_result_df = pd.DataFrame(results)\n", + "\n", + "# SUTVA SUMMARY ON REAL DATASETS\n", + "\n", + "print(f\"\\n{strict_result_df['status'].value_counts().to_string()}\")\n", + "print(f\"\\nPASS Rate: {(strict_result_df['status']=='PASS').mean():.0%}\")\n", + "print(f\"FAIL Rate: {(strict_result_df['status']=='FAIL').mean():.0%}\")\n", + "print(f\"INCONCLUSIVE Rate: {(strict_result_df['status']=='INCONCLUSIVE').mean():.0%}\")" + ], + "id": "d138f680ed3661eb", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "status\n", + "FAIL 9\n", + "PASS 9\n", + "INCONCLUSIVE 1\n", + "\n", + "PASS Rate: 47%\n", + "FAIL Rate: 47%\n", + "INCONCLUSIVE Rate: 5%\n" + ] + } + ], + "execution_count": 7 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:50:27.258990400Z", + "start_time": "2026-05-04T19:50:27.209978500Z" + } + }, + "cell_type": "code", + "source": [ + "# Detail\n", + "print(strict_result_df[['dataset', 'method', 'status', 'reasoning']].to_string(index=False))" + ], + "id": "dd21ce85fd2bf5dd", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " dataset method status reasoning\n", + " voter_turnout_data.csv ols FAIL The SUTVA assumption is likely violated due to potential interference among individuals within the same household. For example, the treatment groups that emphasize social pressure (like Neighbors and Self) could lead to discussions or influence among household members, affecting each other's voting behavior. 
Additionally, the presence of multiple individuals in a household could create variations in treatment exposure and response, further complicating the assumption of no interference. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lalonde_data.csv ols PASS The SUTVA assumption appears to be satisfied in the context of the NSW Demonstration study. The random assignment of participants into treatment and control groups minimizes the risk of interference between units, as each participant's treatment status is independent of others. Additionally, the treatment (employment program) is consistently administered to those in the treatment group, with no indication of hidden versions of the treatment affecting outcomes. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lalonde_data_psid.csv matching PASS The SUTVA assumption appears to be plausibly satisfied in this analysis. The treatment (NSW job training program) was randomly assigned, which minimizes the risk of interference between units. Additionally, since the control group is drawn from a separate observational dataset (PSID) and did not participate in the NSW program, it is unlikely that there are hidden versions of the treatment affecting the outcomes. Thus, the treatment is administered consistently across treated units. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " vernby_2019.csv ols PASS The SUTVA assumption appears to be satisfied in this study as the treatment (being an immigrant) is applied to individual candidates independently, and there is no indication of interference between candidates' outcomes. 
Additionally, the treatment is clearly defined and consistently applied across all applicants, with no hidden versions of the treatment suggested in the dataset description. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " card_geographic.csv iv INCONCLUSIVE The dataset description does not provide specific information about the treatment being analyzed, nor does it clarify how treatment is administered or whether there are potential interactions between units. Without knowing the nature of the treatment and its implementation, it is impossible to assess whether SUTVA is satisfied. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lee_2008.csv rdd PASS The dataset focuses on U.S. House of Representatives elections, where the treatment (e.g., Democratic candidate characteristics) is applied at the individual candidate level and does not interfere with other candidates' outcomes. Given the nature of elections, it is reasonable to assume that one candidate's treatment does not affect another's potential outcomes, satisfying the no interference condition of SUTVA. Additionally, the treatment appears to be consistently defined across candidates, addressing the no hidden versions condition. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " abortion_bf15.csv did FAIL The assumption of no interference is likely violated in this context, as the legalization of abortion in one state could influence behaviors and health outcomes in neighboring states, particularly among young females who may share social networks or access to healthcare resources. 
Additionally, the treatment (abortion legalization) may not be uniformly experienced across states, leading to variations in how it affects different populations, which challenges the assumption of no hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " abortion_bm15.csv did FAIL The assumption of no interference is likely violated in this context, as the legalization of abortion in one state could influence behaviors and health outcomes in neighboring states, particularly among adolescents who may share social networks or access to healthcare resources. Additionally, the treatment (abortion legalization) may not be uniformly experienced across states, leading to variations in how it affects gonorrhea incidence among different populations. This suggests potential spillover effects that undermine SUTVA. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " broockman_intrinsic.csv ols PASS The SUTVA assumption appears to be satisfied in this study as the treatment (emails from out-of-district senders) is unlikely to interfere with the potential outcomes of other legislators, given that each legislator operates independently in their response to emails. Additionally, the treatment is consistently applied across all treated units (emails sent from out-of-district senders), with no indication of hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " castle.csv did FAIL The assumption of no interference is likely violated in this context, as the implementation of castle-doctrine laws in one state could influence crime rates in neighboring states through spillover effects. 
Additionally, the treatment (enactment of the law) may not be uniformly experienced across states, as differences in law enforcement practices and public perception could lead to variations in how the law affects crime rates. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " gov_transfers.csv rdd FAIL The assumption of no interference is likely violated in this context, as households participating in the PANES program may influence the support perceptions of non-participating households through social networks or community interactions. Additionally, the treatment (participation) may not be uniformly experienced across households due to variations in how the program was implemented or perceived, suggesting potential hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " organ_donations.csv did PASS The SUTVA assumption appears to be plausibly satisfied in this context. The treatment (active-choice phrasing for organ donation sign-up) is applied uniformly across California, and there is no indication of interference between individuals' decisions to register for organ donation across states. Since the data is aggregated at the state level and focuses on registration rates, it minimizes the likelihood of spillover effects from one individual to another. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " thornton_hiv.csv ols PASS The SUTVA assumption appears to be plausibly satisfied in this study as the treatment (incentives) is randomly assigned to individuals, which minimizes the risk of interference between units. 
Additionally, the treatment is administered consistently across treated units, as all participants receiving a voucher are given the same type of incentive based on a random draw, reducing the likelihood of hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " close_elections.csv rdd FAIL The assumption of no interference is likely violated in this context, as the policy positions of elected officials can be influenced by the actions and outcomes of other officials within the same state or district, especially in closely contested elections. Additionally, the treatment (election outcomes) may not be administered consistently across units, as the political landscape and voter preferences can vary significantly between districts and states, leading to potential hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "electrification_data.csv iv FAIL The SUTVA assumption is likely violated due to potential interference among households in close proximity, particularly in rural settings where social and economic interactions are common. Households that are near each other may share resources, information, or even influence each other's decisions regarding expenditures and usage of electricity, which could affect their potential outcomes. Additionally, the treatment (electricity connection) may not be uniformly experienced due to variations in connection reliability or quality, leading to hidden versions of the treatment. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " NaN did FAIL The assumption of no interference is likely violated in this context, as fast food restaurants can influence each other's employment outcomes through market competition and labor mobility. For example, if one restaurant raises wages, it may attract employees from nearby restaurants, thereby affecting their employment levels. Additionally, the treatment (minimum wage increase) may not be uniformly experienced across all stores, especially if some stores are more affected by local labor market conditions than others. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " fda_carpenter.csv matching PASS The SUTVA assumption appears to be plausibly satisfied in this context. The treatment variable, 'demsnmaj', pertains to the political majority in the Senate, which is unlikely to directly influence the potential outcomes (approval times) of individual drug applications submitted to the FDA. Additionally, the treatment is consistently defined across all units (drug applications), as it reflects a binary state of the Senate majority without variations in its application. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " fulton.csv iv FAIL The assumption of no interference is likely violated in this context, as the sales of whiting fish are influenced by the actions of other sellers and buyers in the market. For example, if one seller lowers their price, it could affect the prices and quantities sold by other sellers, leading to spillover effects. Additionally, the treatment (quantity sold) may not be administered consistently across treated units due to individual seller pricing strategies. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " hansen.csv rdd PASS The SUTVA assumption appears to be plausibly satisfied in this context. The dataset describes a uniform enforcement of a legal BAC limit of 0.08 for DUI offenses, suggesting that the treatment (being charged with a DUI) is consistently applied across individuals. Additionally, since the analysis focuses on individual DUI incidents, it is unlikely that one driver's treatment would directly affect another's potential outcomes, thus minimizing concerns about interference. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n" + ] + } + ], + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:50:27.304951900Z", + "start_time": "2026-05-04T19:50:27.264050400Z" + } + }, + "cell_type": "code", + "source": [ + "strict_df = pd.DataFrame(strict_result_df)\n", + "\n", + "fails = strict_df[strict_df['status'] == 'FAIL']\n", + "print(f\"FAIL number (strict) : {len(fails)} / {len(strict_df)}\\n\")\n", + "\n", + "for _, row in fails.iterrows():\n", + " print(f\" Dataset : {row['dataset']}\")\n", + " print(f\" Method : {row['method']}\")\n", + " print(f\" Status : FAIL\")\n", + " print(f\" Reasoning :\")\n", + " print(f\" {row['reasoning']}\")\n", + " if row.get('missing_info'):\n", + " print(f\" Missing info : {row['missing_info']}\")\n", + " print(f\"\\n\")" + ], + "id": "1f6dd8201d7b5506", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FAIL number (strict) : 9 / 19\n", + "\n", + " Dataset : voter_turnout_data.csv\n", + " Method : ols\n", + " Status : FAIL\n", + " Reasoning :\n", + " The SUTVA assumption is likely violated due to potential interference among individuals within the same household. 
For example, the treatment groups that emphasize social pressure (like Neighbors and Self) could lead to discussions or influence among household members, affecting each other's voting behavior. Additionally, the presence of multiple individuals in a household could create variations in treatment exposure and response, further complicating the assumption of no interference. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : abortion_bf15.csv\n", + " Method : did\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as the legalization of abortion in one state could influence behaviors and health outcomes in neighboring states, particularly among young females who may share social networks or access to healthcare resources. Additionally, the treatment (abortion legalization) may not be uniformly experienced across states, leading to variations in how it affects different populations, which challenges the assumption of no hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : abortion_bm15.csv\n", + " Method : did\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as the legalization of abortion in one state could influence behaviors and health outcomes in neighboring states, particularly among adolescents who may share social networks or access to healthcare resources. Additionally, the treatment (abortion legalization) may not be uniformly experienced across states, leading to variations in how it affects gonorrhea incidence among different populations. This suggests potential spillover effects that undermine SUTVA. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : castle.csv\n", + " Method : did\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as the implementation of castle-doctrine laws in one state could influence crime rates in neighboring states through spillover effects. Additionally, the treatment (enactment of the law) may not be uniformly experienced across states, as differences in law enforcement practices and public perception could lead to variations in how the law affects crime rates. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : gov_transfers.csv\n", + " Method : rdd\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as households participating in the PANES program may influence the support perceptions of non-participating households through social networks or community interactions. Additionally, the treatment (participation) may not be uniformly experienced across households due to variations in how the program was implemented or perceived, suggesting potential hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : close_elections.csv\n", + " Method : rdd\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as the policy positions of elected officials can be influenced by the actions and outcomes of other officials within the same state or district, especially in closely contested elections. 
Additionally, the treatment (election outcomes) may not be administered consistently across units, as the political landscape and voter preferences can vary significantly between districts and states, leading to potential hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : electrification_data.csv\n", + " Method : iv\n", + " Status : FAIL\n", + " Reasoning :\n", + " The SUTVA assumption is likely violated due to potential interference among households in close proximity, particularly in rural settings where social and economic interactions are common. Households that are near each other may share resources, information, or even influence each other's decisions regarding expenditures and usage of electricity, which could affect their potential outcomes. Additionally, the treatment (electricity connection) may not be uniformly experienced due to variations in connection reliability or quality, leading to hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : nan\n", + " Method : did\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as fast food restaurants can influence each other's employment outcomes through market competition and labor mobility. For example, if one restaurant raises wages, it may attract employees from nearby restaurants, thereby affecting their employment levels. Additionally, the treatment (minimum wage increase) may not be uniformly experienced across all stores, especially if some stores are more affected by local labor market conditions than others. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : fulton.csv\n", + " Method : iv\n", + " Status : FAIL\n", + " Reasoning :\n", + " The assumption of no interference is likely violated in this context, as the sales of whiting fish are influenced by the actions of other sellers and buyers in the market. For example, if one seller lowers their price, it could affect the prices and quantities sold by other sellers, leading to spillover effects. Additionally, the treatment (quantity sold) may not be administered consistently across treated units due to individual seller pricing strategies. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n" + ] + } + ], + "execution_count": 9 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "In its permissive version:", + "id": "269cde721e35e623" + }, + { + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2026-05-04T19:50:27.353527600Z", + "start_time": "2026-05-04T19:50:27.308975900Z" + } + }, + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "from cais.methods.pre_model_assumption_utils import check_sutva\n", + "from cais.config import get_llm_client\n", + "\n", + "info = pd.read_csv('../data/real_info.csv', encoding='utf-8-sig')\n", + "\n", + "datasets = info.drop_duplicates(subset='data_files')[['data_files', 'data_description', 'method',\n", + " 'treatment', 'outcome', 'covariates',\n", + " 'instrument_var', 'temporal_var']].reset_index(drop=True)\n", + "\n", + "print(f\"Number of unique datasets: {len(datasets)}\\n\")" + ], + "id": "5b29ea2109a845b4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique datasets: 19\n", + "\n" + ] + } + ], + "execution_count": 10 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": 
"2026-05-04T19:51:08.244265300Z", + "start_time": "2026-05-04T19:50:27.355290400Z" + } + }, + "cell_type": "code", + "source": [ + "llm = get_llm_client()\n", + "\n", + "results = []\n", + "\n", + "for i, row in datasets.iterrows():\n", + " dataset_name = row['data_files']\n", + " description = row['data_description']\n", + "\n", + " variables_summary = {\n", + " 'treatment': row['treatment'],\n", + " 'outcome': row['outcome'],\n", + " 'covariates': row['covariates'],\n", + " 'method': row['method'],\n", + " 'instrument': row.get('instrument_var', None),\n", + " 'time_variable': row.get('temporal_var', None),\n", + " }\n", + "\n", + " print(f\"[{i+1}/{len(datasets)}] {dataset_name}...\", end=\" \")\n", + "\n", + " permissive_result = check_sutva(description, variables_summary, llm=llm)\n", + "\n", + " status = {True: 'PASS', False: 'FAIL', None: 'INCONCLUSIVE'}[permissive_result['passed']]\n", + " print(status)\n", + "\n", + " results.append({\n", + " 'dataset': dataset_name,\n", + " 'method': row['method'],\n", + " 'passed': permissive_result['passed'],\n", + " 'status': status,\n", + " 'reasoning': permissive_result['reasoning'],\n", + " 'missing_info': permissive_result.get('details', {}).get('missing_info', None),\n", + " })" + ], + "id": "fac7f15252be9178", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1/19] voter_turnout_data.csv... PASS\n", + "[2/19] lalonde_data.csv... PASS\n", + "[3/19] lalonde_data_psid.csv... PASS\n", + "[4/19] vernby_2019.csv... PASS\n", + "[5/19] card_geographic.csv... PASS\n", + "[6/19] lee_2008.csv... PASS\n", + "[7/19] abortion_bf15.csv... PASS\n", + "[8/19] abortion_bm15.csv... PASS\n", + "[9/19] broockman_intrinsic.csv... PASS\n", + "[10/19] castle.csv... PASS\n", + "[11/19] gov_transfers.csv... PASS\n", + "[12/19] organ_donations.csv... PASS\n", + "[13/19] thornton_hiv.csv... PASS\n", + "[14/19] close_elections.csv... PASS\n", + "[15/19] electrification_data.csv... PASS\n", + "[16/19] nan... 
PASS\n", + "[17/19] fda_carpenter.csv... PASS\n", + "[18/19] fulton.csv... PASS\n", + "[19/19] hansen.csv... PASS\n" + ] + } + ], + "execution_count": 11 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:08.285702200Z", + "start_time": "2026-05-04T19:51:08.244265300Z" + } + }, + "cell_type": "code", + "source": [ + "permissive_result_df = pd.DataFrame(results)\n", + "\n", + "# SUTVA SUMMARY ON REAL DATASETS\n", + "\n", + "print(f\"\\n{permissive_result_df['status'].value_counts().to_string()}\")\n", + "print(f\"\\nPASS Rate: {(permissive_result_df['status']=='PASS').mean():.0%}\")\n", + "print(f\"FAIL Rate: {(permissive_result_df['status']=='FAIL').mean():.0%}\")\n", + "print(f\"INCONCLUSIVE Rate: {(permissive_result_df['status']=='INCONCLUSIVE').mean():.0%}\")" + ], + "id": "c6ed6e8fc4781e3d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "status\n", + "PASS 19\n", + "\n", + "PASS Rate: 100%\n", + "FAIL Rate: 0%\n", + "INCONCLUSIVE Rate: 0%\n" + ] + } + ], + "execution_count": 12 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:08.329603900Z", + "start_time": "2026-05-04T19:51:08.285702200Z" + } + }, + "cell_type": "code", + "source": [ + "# Detail\n", + "print(permissive_result_df[['dataset', 'method', 'status', 'reasoning']].to_string(index=False))" + ], + "id": "6d08fbeccbd33c3f", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " dataset method status reasoning\n", + " voter_turnout_data.csv ols PASS The design of the experiment involves random assignment of households to treatment groups, which minimizes the risk of interference between units. Each treatment group received a distinct mailing that was consistent across participants, suggesting that there are no hidden versions of the treatment. Given the nature of the mailings and the focus on individual households, the assumption of no interference is reasonably satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lalonde_data.csv ols PASS The NSW Demonstration involved random assignment of participants to treatment and control groups, which minimizes the risk of interference between units. Additionally, the treatment (employment program) was consistently administered to those in the treatment group, suggesting that there are no hidden versions of the treatment. Therefore, SUTVA is plausibly satisfied in this context. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lalonde_data_psid.csv matching PASS The dataset is derived from a randomized controlled trial (the NSW Demonstration), which typically supports the SUTVA assumption by ensuring that treatment assignment is independent and does not influence other units. Additionally, the treatment (job training) is clearly defined and administered uniformly to participants, minimizing the likelihood of hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " vernby_2019.csv ols PASS The study involves a randomized field experiment where fictitious job applications are submitted independently across various restaurants and cafés. Given that the treatment (immigrant status) is applied to individual applications without any indication of interference between them, and the treatment is consistently defined across all applications, SUTVA is plausibly satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " card_geographic.csv iv PASS The National Longitudinal Survey of Young Men (NLSYM) primarily collects individual-level data on demographics, education, and employment outcomes without explicit indications of treatment interference among participants. Given that the treatment is not specified and the dataset focuses on individual characteristics, it is reasonable to assume that one individual's treatment does not affect another's outcomes. Additionally, there is no evidence of hidden versions of the treatment based on the provided variables. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " lee_2008.csv rdd PASS The dataset focuses on U.S. House of Representatives elections, where the treatment (e.g., Democratic candidate characteristics) is applied at the individual candidate level, and the outcomes (vote shares) are measured independently for each election. There is no indication of interference between candidates or elections, as each election is treated as a separate unit. Additionally, the treatment appears to be consistently defined across candidates, satisfying the no hidden versions aspect of SUTVA. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " abortion_bf15.csv did PASS The assumption of SUTVA appears to be plausibly satisfied in this analysis as the treatment (abortion legalization) is applied at the state level, which minimizes the likelihood of interference between individuals in different states. Additionally, the treatment is clearly defined as the repeal of abortion bans in early repeal states, suggesting a consistent application of the treatment across those states without hidden variations. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " abortion_bm15.csv did PASS The assumption of SUTVA appears to be plausibly satisfied in this analysis. The treatment, which is the repeal of abortion bans, is applied at the state level, and there is no indication of interference between states that would affect the gonorrhea incidence among 15–19-year-old males. Additionally, the treatment is clearly defined as the repeal of abortion prohibition, suggesting consistency in its application across treated units. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " broockman_intrinsic.csv ols PASS The study involves a randomized field experiment where fictional emails are sent to legislators, which minimizes the likelihood of interference between units. Each legislator's response is based solely on the treatment they receive (the out-of-district email), and there is no indication of treatment variations or hidden versions that could affect the outcomes. Thus, SUTVA appears to be reasonably satisfied in this context. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " castle.csv did PASS The dataset examines the impact of castle-doctrine statutes on violent crime rates at the state level, where the treatment (implementation of the law) is applied uniformly across states without direct interference from one state's treatment affecting another's outcomes. Additionally, the laws were enacted at different times, which suggests that the treatment is consistent across treated units, supporting the assumption of no hidden versions of the treatment. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " gov_transfers.csv rdd PASS The PANES program's eligibility was determined by a predicted income score, which suggests that treatment assignment was based on individual household characteristics rather than interactions between households. Additionally, the cash transfer program is unlikely to create significant interference among households, as the treatment is a direct financial transfer to eligible households. Therefore, the assumption of no interference is reasonably satisfied. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " organ_donations.csv did PASS The assumption of SUTVA appears to be satisfied in this context as the treatment (active-choice phrasing for organ donation sign-up) is applied uniformly across California without evidence of interference from other states. Additionally, there is no indication of hidden versions of the treatment, as the phrasing change is a clear and consistent policy implementation. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " thornton_hiv.csv ols PASS The study involves a randomized field experiment where participants are assigned to receive vouchers of varying values to encourage them to collect their HIV test results. Given the random assignment of treatment and the individual nature of the intervention (home-based HIV testing and voucher redemption), it is reasonable to assume that there is no interference between units. Additionally, the treatment (voucher value) is consistently administered across treated units, satisfying the no hidden versions of the treatment aspect of SUTVA. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " close_elections.csv rdd PASS The dataset focuses on the legislative behavior of elected officials in response to election outcomes, where the treatment (winning the election) is clearly defined and does not directly affect other units (other districts or states). There is no indication of interference between districts or states, and the treatment appears to be consistently applied across treated units (winning candidates). Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "electrification_data.csv iv PASS The dataset focuses on households that are either connected to the grid or not, with a clear eligibility criterion based on distance from the power pole. Given that the treatment (electrification) is applied at the household level and the study design minimizes overlap by excluding households in the ambiguous distance range, it is reasonable to assume that there is no significant interference between units. Additionally, the treatment appears to be consistently administered based on the defined eligibility criteria. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " NaN did PASS The dataset focuses on fast food restaurants in New Jersey and Pennsylvania, where the treatment (minimum wage increase) is applied uniformly across the state of New Jersey. Given the nature of the fast food industry, where operations are standardized and the treatment is consistent, it is reasonable to assume that there is no significant interference between units. Additionally, the treatment is clearly defined and does not suggest hidden variations. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " fda_carpenter.csv matching PASS The assumption of SUTVA appears to be plausibly satisfied in this context. The treatment variable, 'demsnmaj', pertains to the political climate (Democratic majority in the Senate), which is unlikely to directly interfere with the treatment outcomes of individual drug applications. Additionally, the treatment is consistently defined across all units (drug applications), suggesting no hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " fulton.csv iv PASS The dataset focuses on individual transactions of whiting fish at the Fulton Fish Market, where prices and quantities sold are determined by individual sellers without direct interference from other sellers' transactions. The treatment (quantity sold) and outcome (price) are measured at the level of individual transactions, suggesting that the treatment does not affect other units' outcomes. Additionally, the treatment is consistently defined across the dataset, as it pertains to the quantity sold of a specific fish type. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + " hansen.csv rdd PASS The dataset describes a uniform enforcement of a legal BAC limit of 0.08 for DUI offenses, suggesting that the treatment (being charged with a DUI) is consistently applied across individuals. Additionally, there is no indication of interference among individuals, as the treatment is based on individual BAC levels and does not depend on the actions or outcomes of other drivers. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n" + ] + } + ], + "execution_count": 13 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:08.370270800Z", + "start_time": "2026-05-04T19:51:08.329603900Z" + } + }, + "cell_type": "code", + "source": [ + "permissive_df = pd.DataFrame(permissive_result_df)\n", + "\n", + "passes = permissive_df[permissive_df['status'] == 'PASS']\n", + "print(f\"PASS Number (permissive) : {len(passes)} / {len(permissive_df)}\\n\")\n", + "\n", + "for _, row in passes.iterrows():\n", + " print(f\" Dataset : {row['dataset']}\")\n", + " print(f\" Method : {row['method']}\")\n", + " print(f\" Status : PASS\")\n", + " print(f\" Reasoning :\")\n", + " print(f\" {row['reasoning']}\")\n", + " if row.get('missing_info'):\n", + " print(f\" Missing info : {row['missing_info']}\")\n", + " print(f\"\\n\")" + ], + "id": "28a03ae80456a7", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PASS Number (permissive) : 19 / 19\n", + "\n", + " Dataset : voter_turnout_data.csv\n", + " Method : ols\n", + " Status : PASS\n", + " Reasoning :\n", + " The design of the experiment involves random assignment of households to treatment groups, which minimizes the risk of interference between units. Each treatment group received a distinct mailing that was consistent across participants, suggesting that there are no hidden versions of the treatment. Given the nature of the mailings and the focus on individual households, the assumption of no interference is reasonably satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : lalonde_data.csv\n", + " Method : ols\n", + " Status : PASS\n", + " Reasoning :\n", + " The NSW Demonstration involved random assignment of participants to treatment and control groups, which minimizes the risk of interference between units. Additionally, the treatment (employment program) was consistently administered to those in the treatment group, suggesting that there are no hidden versions of the treatment. Therefore, SUTVA is plausibly satisfied in this context. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : lalonde_data_psid.csv\n", + " Method : matching\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset is derived from a randomized controlled trial (the NSW Demonstration), which typically supports the SUTVA assumption by ensuring that treatment assignment is independent and does not influence other units. Additionally, the treatment (job training) is clearly defined and administered uniformly to participants, minimizing the likelihood of hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : vernby_2019.csv\n", + " Method : ols\n", + " Status : PASS\n", + " Reasoning :\n", + " The study involves a randomized field experiment where fictitious job applications are submitted independently across various restaurants and cafés. Given that the treatment (immigrant status) is applied to individual applications without any indication of interference between them, and the treatment is consistently defined across all applications, SUTVA is plausibly satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : card_geographic.csv\n", + " Method : iv\n", + " Status : PASS\n", + " Reasoning :\n", + " The National Longitudinal Survey of Young Men (NLSYM) primarily collects individual-level data on demographics, education, and employment outcomes without explicit indications of treatment interference among participants. Given that the treatment is not specified and the dataset focuses on individual characteristics, it is reasonable to assume that one individual's treatment does not affect another's outcomes. Additionally, there is no evidence of hidden versions of the treatment based on the provided variables. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : lee_2008.csv\n", + " Method : rdd\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset focuses on U.S. House of Representatives elections, where the treatment (e.g., Democratic candidate characteristics) is applied at the individual candidate level, and the outcomes (vote shares) are measured independently for each election. There is no indication of interference between candidates or elections, as each election is treated as a separate unit. Additionally, the treatment appears to be consistently defined across candidates, satisfying the no hidden versions aspect of SUTVA. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : abortion_bf15.csv\n", + " Method : did\n", + " Status : PASS\n", + " Reasoning :\n", + " The assumption of SUTVA appears to be plausibly satisfied in this analysis as the treatment (abortion legalization) is applied at the state level, which minimizes the likelihood of interference between individuals in different states. Additionally, the treatment is clearly defined as the repeal of abortion bans in early repeal states, suggesting a consistent application of the treatment across those states without hidden variations. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : abortion_bm15.csv\n", + " Method : did\n", + " Status : PASS\n", + " Reasoning :\n", + " The assumption of SUTVA appears to be plausibly satisfied in this analysis. The treatment, which is the repeal of abortion bans, is applied at the state level, and there is no indication of interference between states that would affect the gonorrhea incidence among 15–19-year-old males. Additionally, the treatment is clearly defined as the repeal of abortion prohibition, suggesting consistency in its application across treated units. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : broockman_intrinsic.csv\n", + " Method : ols\n", + " Status : PASS\n", + " Reasoning :\n", + " The study involves a randomized field experiment where fictional emails are sent to legislators, which minimizes the likelihood of interference between units. 
Each legislator's response is based solely on the treatment they receive (the out-of-district email), and there is no indication of treatment variations or hidden versions that could affect the outcomes. Thus, SUTVA appears to be reasonably satisfied in this context. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : castle.csv\n", + " Method : did\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset examines the impact of castle-doctrine statutes on violent crime rates at the state level, where the treatment (implementation of the law) is applied uniformly across states without direct interference from one state's treatment affecting another's outcomes. Additionally, the laws were enacted at different times, which suggests that the treatment is consistent across treated units, supporting the assumption of no hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : gov_transfers.csv\n", + " Method : rdd\n", + " Status : PASS\n", + " Reasoning :\n", + " The PANES program's eligibility was determined by a predicted income score, which suggests that treatment assignment was based on individual household characteristics rather than interactions between households. Additionally, the cash transfer program is unlikely to create significant interference among households, as the treatment is a direct financial transfer to eligible households. Therefore, the assumption of no interference is reasonably satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : organ_donations.csv\n", + " Method : did\n", + " Status : PASS\n", + " Reasoning :\n", + " The assumption of SUTVA appears to be satisfied in this context as the treatment (active-choice phrasing for organ donation sign-up) is applied uniformly across California without evidence of interference from other states. Additionally, there is no indication of hidden versions of the treatment, as the phrasing change is a clear and consistent policy implementation. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : thornton_hiv.csv\n", + " Method : ols\n", + " Status : PASS\n", + " Reasoning :\n", + " The study involves a randomized field experiment where participants are assigned to receive vouchers of varying values to encourage them to collect their HIV test results. Given the random assignment of treatment and the individual nature of the intervention (home-based HIV testing and voucher redemption), it is reasonable to assume that there is no interference between units. Additionally, the treatment (voucher value) is consistently administered across treated units, satisfying the no hidden versions of the treatment aspect of SUTVA. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : close_elections.csv\n", + " Method : rdd\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset focuses on the legislative behavior of elected officials in response to election outcomes, where the treatment (winning the election) is clearly defined and does not directly affect other units (other districts or states). 
There is no indication of interference between districts or states, and the treatment appears to be consistently applied across treated units (winning candidates). Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : electrification_data.csv\n", + " Method : iv\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset focuses on households that are either connected to the grid or not, with a clear eligibility criterion based on distance from the power pole. Given that the treatment (electrification) is applied at the household level and the study design minimizes overlap by excluding households in the ambiguous distance range, it is reasonable to assume that there is no significant interference between units. Additionally, the treatment appears to be consistently administered based on the defined eligibility criteria. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : nan\n", + " Method : did\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset focuses on fast food restaurants in New Jersey and Pennsylvania, where the treatment (minimum wage increase) is applied uniformly across the state of New Jersey. Given the nature of the fast food industry, where operations are standardized and the treatment is consistent, it is reasonable to assume that there is no significant interference between units. Additionally, the treatment is clearly defined and does not suggest hidden variations. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : fda_carpenter.csv\n", + " Method : matching\n", + " Status : PASS\n", + " Reasoning :\n", + " The assumption of SUTVA appears to be plausibly satisfied in this context. 
The treatment variable, 'demsnmaj', pertains to the political climate (Democratic majority in the Senate), which is unlikely to directly interfere with the treatment outcomes of individual drug applications. Additionally, the treatment is consistently defined across all units (drug applications), suggesting no hidden versions of the treatment. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : fulton.csv\n", + " Method : iv\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset focuses on individual transactions of whiting fish at the Fulton Fish Market, where prices and quantities sold are determined by individual sellers without direct interference from other sellers' transactions. The treatment (quantity sold) and outcome (price) are measured at the level of individual transactions, suggesting that the treatment does not affect other units' outcomes. Additionally, the treatment is consistently defined across the dataset, as it pertains to the quantity sold of a specific fish type. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n", + "\n", + "\n", + " Dataset : hansen.csv\n", + " Method : rdd\n", + " Status : PASS\n", + " Reasoning :\n", + " The dataset describes a uniform enforcement of a legal BAC limit of 0.08 for DUI offenses, suggesting that the treatment (being charged with a DUI) is consistently applied across individuals. Additionally, there is no indication of interference among individuals, as the treatment is based on individual BAC levels and does not depend on the actions or outcomes of other drivers. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "execution_count": 14
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "On a study we know for sure does not satisfy SUTVA: Twenty-Year Economic Effects of Deworming, Miguel & Kremer (2004)",
+   "id": "ad4de73f92b3e925"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-04T19:51:08.436492800Z",
+     "start_time": "2026-05-04T19:51:08.370270800Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "df = pd.read_stata('../data/all_data/Worm_Infection_Panel.dta')\n",
+    "print(df.shape)\n",
+    "print(df.columns.tolist())"
+   ],
+   "id": "ce632292cdfcaffb",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(2695, 29)\n",
+      "['pupid', 'klps_popweight', 'year', 'female', 'male', 'older', 'younger', 'treat_1999', 'treat_2001', 'psdpsch98', 'pup_pop', 'avgtest96', 'popT_6k', 'zoneidI2', 'zoneidI3', 'zoneidI4', 'zoneidI5', 'zoneidI6', 'zoneidI7', 'zoneidI8', 'std98_base_I2', 'std98_base_I3', 'std98_base_I4', 'std98_base_I5', 'std98_base_I6', 'any_moderate_who_1999', 'z_intensity_who_1999', 'any_moderate_who_2001', 'z_intensity_who_2001']\n"
+     ]
+    }
+   ],
+   "execution_count": 15
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2026-05-04T19:51:08.455161100Z",
+     "start_time": "2026-05-04T19:51:08.446309300Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "deworming_description2 = (\n",
+    "    \"Miguel & Kremer (2004) — School-based deworming program in Kenya. \"\n",
+    "    \"Children in 75 primary schools were randomly assigned to early or late treatment groups. \"\n",
+    "    \"Treatment consisted of mass deworming medication (albendazole + praziquantel). \"\n",
+    "    \"CRITICAL CONTEXT: Worm infections are transmitted between children through shared \"\n",
+    "    \"environments (soil, water). 
Treating children in one school reduces parasite prevalence \"\n", + " \"for untreated children in the same school AND in nearby schools (within 3km). \"\n", + " \"The original paper explicitly models and estimates these spillover/externality effects. \"\n", + ")\n", + "\n", + "deworming_description3 = (\n", + " \"Miguel & Kremer (2004) — School-based deworming program in Kenya. \"\n", + " \"Children in 75 primary schools were randomly assigned to early or late treatment groups. \"\n", + " \"Treatment consisted of mass deworming medication (albendazole + praziquantel). \"\n", + " \"Worm infections are transmitted between children through shared environments (soil, water).\"\n", + ")\n", + "\n", + "\n", + "deworming_description = (\n", + " \"Estimating the impact of child health investments on adult living standards entails multiple \"\n", + " \"methodological challenges, including the lack of experimental variation in health status, \"\n", + " \"an inability to track individuals over time, and accurately measuring living standards \"\n", + " \"and productivity in low-income settings. This study exploits a randomized school health \"\n", + " \"intervention that provided deworming treatment to Kenyan children, and uses longitudinal data \"\n", + " \"to estimate impacts on economic outcomes up to 20 years later. The effective respondent tracking \"\n", + " \"rate was 84%. Individuals who received two to three additional years of childhood deworming \"\n", + " \"experienced a 14% gain in consumption expenditures and 13% increase in hourly earnings. \"\n", + " \"There are also shifts in sectors of residence and employment: Treatment group individuals \"\n", + " \"are 9% more likely to live in urban areas, and experience a 9% increase in nonagricultural \"\n", + " \"work hours. Most effects are concentrated among males and older individuals. 
The observed \"\n", + " \"consumption and earnings benefits, together with deworming’s low cost when distributed at \"\n", + " \"scale, imply that a conservative estimate of its annualized social internal rate of return \"\n", + " \"is 37%, a high return by any standard.\"\n", + ")\n", + "\n", + "\n", + "deworming_variables = {\n", + " 'treatment': 'deworming treatment',\n", + " 'outcome': 'consumption expenditures, hourly earnings',\n", + " 'unit': 'individual children in schools',\n", + " 'method': 'RCT',\n", + "}" + ], + "id": "32e1aa309c448f41", + "outputs": [], + "execution_count": 16 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:09.658275500Z", + "start_time": "2026-05-04T19:51:08.455161100Z" + } + }, + "cell_type": "code", + "source": [ + "# Strict version\n", + "result_strict = check_strict_sutva(deworming_description, deworming_variables, llm=llm)\n", + "print(\"STRICT :\", result_strict)" + ], + "id": "17441751695f0f4e", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STRICT : {'passed': True, 'reasoning': 'The SUTVA assumption appears to be satisfied in this study as the randomized controlled trial design minimizes the likelihood of interference between individuals, given that deworming treatment is administered at the school level. Additionally, the treatment (deworming) is consistently applied across treated individuals, with no indication of hidden versions of the treatment affecting the outcomes. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 17 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:11.723218800Z", + "start_time": "2026-05-04T19:51:09.660028700Z" + } + }, + "cell_type": "code", + "source": [ + "# Strict version\n", + "result_strict = check_strict_sutva(deworming_description2, deworming_variables, llm=llm)\n", + "print(\"STRICT :\", result_strict)" + ], + "id": "b7cae92b4d3c1e57", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STRICT : {'passed': False, 'reasoning': 'The SUTVA assumption is not plausibly satisfied in this context due to the presence of interference among units. Since worm infections can be transmitted between children through shared environments, treating children in one school can affect the health outcomes of untreated children in the same school and nearby schools. This indicates that the treatment of one unit (child) can influence the potential outcomes of another unit, violating the no interference condition of SUTVA. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 18 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:13.487626500Z", + "start_time": "2026-05-04T19:51:11.733223100Z" + } + }, + "cell_type": "code", + "source": [ + "# Strict version\n", + "result_strict = check_strict_sutva(deworming_description3, deworming_variables, llm=llm)\n", + "print(\"STRICT :\", result_strict)" + ], + "id": "7074eb398a510606", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "STRICT : {'passed': False, 'reasoning': \"The assumption of no interference is likely violated in this context because worm infections can be transmitted between children through shared environments, meaning that one child's treatment could affect the health outcomes of others in the same school. Additionally, the treatment is administered to groups of children, which raises concerns about consistent treatment effects across individuals. Therefore, the SUTVA assumption is not plausibly satisfied. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\", 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 19 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:14.823143100Z", + "start_time": "2026-05-04T19:51:13.497339900Z" + } + }, + "cell_type": "code", + "source": [ + "# Permissive version\n", + "result_permissive = check_sutva(deworming_description, deworming_variables, llm=llm)\n", + "print(\"PERMISSIVE :\", result_permissive)" + ], + "id": "91acc9700889ad5d", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PERMISSIVE : {'passed': True, 'reasoning': 'The study employs a randomized controlled trial (RCT) design to evaluate the effects of deworming treatment on economic outcomes, which typically supports the SUTVA assumption. The treatment is administered consistently to individuals in schools, and there is no indication of interference among treated and untreated individuals, as the outcomes measured (consumption expenditures and hourly earnings) are individual-level metrics. Additionally, the study does not suggest any significant spillover effects or variations in treatment application. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 20 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:16.727425Z", + "start_time": "2026-05-04T19:51:14.825663200Z" + } + }, + "cell_type": "code", + "source": [ + "# Permissive version\n", + "result_permissive = check_sutva(deworming_description2, deworming_variables, llm=llm)\n", + "print(\"PERMISSIVE :\", result_permissive)" + ], + "id": "f09ab118438c680e", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PERMISSIVE : {'passed': False, 'reasoning': 'The assumption of no interference is violated in this context because worm infections are transmitted between children through shared environments, meaning that treating children in one school can affect the health outcomes of untreated children in the same school and nearby schools. This creates a clear mechanism for spillover effects, which the original paper explicitly models and estimates. 
Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.', 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 21 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:51:18.494833Z", + "start_time": "2026-05-04T19:51:16.734120500Z" + } + }, + "cell_type": "code", + "source": [ + "# Permissive version\n", + "result_permissive = check_sutva(deworming_description3, deworming_variables, llm=llm)\n", + "print(\"PERMISSIVE :\", result_permissive)" + ], + "id": "369244d84364a4ab", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PERMISSIVE : {'passed': False, 'reasoning': \"The assumption of no interference is violated in this context because worm infections can be transmitted between children through shared environments, such as soil and water. This suggests that one child's treatment could affect another child's potential outcomes, particularly in a school setting where children interact closely. Additionally, the treatment is not administered uniformly across all children, as some receive early treatment while others receive late treatment, which could lead to variations in treatment effects. Note: this assessment relies on LLM reasoning and is sensitive to the quality and completeness of the dataset description provided.\", 'details': {'assumption': 'SUTVA (Stable Unit Treatment Value Assumption)'}}\n" + ] + } + ], + "execution_count": 22 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "**Conclusion**:\n", + "\n", + "The LLM-based SUTVA check is **only as good as the dataset description provided**.\n", + "When the description omits key mechanistic details (e.g., transmission pathways, shared environments), the check may return false negatives even on well-known SUTVA violations such as Miguel & Kremer (2004)." 
+ ], + "id": "43fa2d99ea3d2cf5" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## On synthetic data", + "id": "295451f08dfc13c3" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:52:41.701747100Z", + "start_time": "2026-05-04T19:51:18.494833Z" + } + }, + "cell_type": "code", + "source": [ + "info = pd.read_csv('../data/synthetic_info.csv', encoding='utf-8-sig')\n", + "\n", + "print(f\"Number of synthetic datasets : {len(info)}\")\n", + "\n", + "llm = get_llm_client()\n", + "\n", + "results = []\n", + "\n", + "for i, row in info.iterrows():\n", + " dataset_name = row['data_files']\n", + " description = row['data_description']\n", + " method = row['method']\n", + "\n", + " # No columns treatment/outcome/covariates separated in synthetic_info,\n", + " # everything is in data_description\n", + " variables_summary = {\n", + " 'method': method,\n", + " 'query': row['natural_language_query'],\n", + " }\n", + "\n", + " print(f\"[{i+1}/{len(info)}] {dataset_name} ({method})...\", end=\" \")\n", + "\n", + " result = check_sutva(description, variables_summary, llm=llm)\n", + "\n", + " status = {True: 'PASS', False: 'FAIL', None: 'INCONCLUSIVE'}[result['passed']]\n", + " print(status)\n", + "\n", + " results.append({\n", + " 'dataset': dataset_name,\n", + " 'method': method,\n", + " 'passed': result['passed'],\n", + " 'status': status,\n", + " 'reasoning': result['reasoning'],\n", + " 'missing_info': result.get('details', {}).get('missing_info', None),\n", + " })\n", + "\n", + "results_df = pd.DataFrame(results)\n", + "\n", + "\n", + "# SUTVA SUMMARY ON SYNTHETIC DATASETS\n", + "\n", + "print(f\"\\nGlobal :\")\n", + "print(results_df['status'].value_counts().to_string())\n", + "print(f\"\\nPASS Rate: {(results_df['status']=='PASS').mean():.0%}\")\n", + "print(f\"FAIL Rate: {(results_df['status']=='FAIL').mean():.0%}\")\n", + "print(f\"INCONCLUSIVE Rate: {(results_df['status']=='INCONCLUSIVE').mean():.0%}\")\n", + "\n", + 
"print(f\"\\nPer method:\")\n", + "print(results_df.groupby('method')['status'].value_counts().unstack(fill_value=0).to_string())" + ], + "id": "e21d5d86e9940465", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of synthetic datasets : 45\n", + "[1/45] did_canonical_data_0.csv (did_canonical)... PASS\n", + "[2/45] did_canonical_data_1.csv (did_canonical)... PASS\n", + "[3/45] did_canonical_data_3.csv (did_canonical)... PASS\n", + "[4/45] did_canonical_data_2.csv (did_canonical)... PASS\n", + "[5/45] did_canonical_data_4.csv (did_canonical)... PASS\n", + "[6/45] did_twfe_data_4.csv (did_twfe)... PASS\n", + "[7/45] did_twfe_data_2.csv (did_twfe)... PASS\n", + "[8/45] did_twfe_data_3.csv (did_twfe)... PASS\n", + "[9/45] did_twfe_data_1.csv (did_twfe)... PASS\n", + "[10/45] did_twfe_data_0.csv (did_twfe)... PASS\n", + "[11/45] iv_encouragement_data_4.csv (iv_encouragement)... PASS\n", + "[12/45] iv_encouragement_data_0.csv (iv_encouragement)... FAIL\n", + "[13/45] iv_encouragement_data_1.csv (iv_encouragement)... PASS\n", + "[14/45] iv_encouragement_data_3.csv (iv_encouragement)... PASS\n", + "[15/45] iv_encouragement_data_2.csv (iv_encouragement)... PASS\n", + "[16/45] iv_data_0.csv (iv)... PASS\n", + "[17/45] iv_data_1.csv (iv)... PASS\n", + "[18/45] iv_data_3.csv (iv)... PASS\n", + "[19/45] iv_data_2.csv (iv)... PASS\n", + "[20/45] iv_data_4.csv (iv)... PASS\n", + "[21/45] observational_data_8.csv (observational)... PASS\n", + "[22/45] observational_data_9.csv (observational)... PASS\n", + "[23/45] observational_data_2.csv (observational)... PASS\n", + "[24/45] observational_data_3.csv (observational)... PASS\n", + "[25/45] observational_data_1.csv (observational)... PASS\n", + "[26/45] observational_data_0.csv (observational)... PASS\n", + "[27/45] observational_data_4.csv (observational)... PASS\n", + "[28/45] observational_data_5.csv (observational)... PASS\n", + "[29/45] observational_data_7.csv (observational)... 
PASS\n", + "[30/45] observational_data_6.csv (observational)... PASS\n", + "[31/45] rct_data_8.csv (rct)... PASS\n", + "[32/45] rct_data_9.csv (rct)... PASS\n", + "[33/45] rct_data_2.csv (rct)... PASS\n", + "[34/45] rct_data_3.csv (rct)... PASS\n", + "[35/45] rct_data_1.csv (rct)... PASS\n", + "[36/45] rct_data_0.csv (rct)... PASS\n", + "[37/45] rct_data_4.csv (rct)... PASS\n", + "[38/45] rct_data_5.csv (rct)... PASS\n", + "[39/45] rct_data_7.csv (rct)... PASS\n", + "[40/45] rct_data_6.csv (rct)... PASS\n", + "[41/45] rdd_data_2.csv (rdd)... PASS\n", + "[42/45] rdd_data_3.csv (rdd)... PASS\n", + "[43/45] rdd_data_1.csv (rdd)... PASS\n", + "[44/45] rdd_data_0.csv (rdd)... PASS\n", + "[45/45] rdd_data_4.csv (rdd)... PASS\n", + "\n", + "Global :\n", + "status\n", + "PASS 44\n", + "FAIL 1\n", + "\n", + "PASS Rate: 98%\n", + "FAIL Rate: 2%\n", + "INCONCLUSIVE Rate: 0%\n", + "\n", + "Per method:\n", + "status FAIL PASS\n", + "method \n", + "did_canonical 0 5\n", + "did_twfe 0 5\n", + "iv 0 5\n", + "iv_encouragement 1 4\n", + "observational 0 10\n", + "rct 0 10\n", + "rdd 0 5\n" + ] + } + ], + "execution_count": 23 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:52:41.738208100Z", + "start_time": "2026-05-04T19:52:41.703681300Z" + } + }, + "cell_type": "code", + "source": [ + "# Detail\n", + "for _, row in results_df.iterrows():\n", + " print(f\"\\n {row['dataset']} ({row['method']}) → {row['status']}\")\n", + " print(f\" {row['reasoning'][:150]}...\")" + ], + "id": "b6ff9e5f86f4292", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " did_canonical_data_0.csv (did_canonical) → PASS\n", + " The dataset describes a study conducted in public schools where the treatment (biannual health check-ups) is applied uniformly across students in scho...\n", + "\n", + " did_canonical_data_1.csv (did_canonical) → PASS\n", + " The dataset describes a study of factories where the treatment (adoption of the industrial reform 
policy) is applied at the factory level, and there i...\n", + "\n", + " did_canonical_data_3.csv (did_canonical) → PASS\n", + " The dataset involves multiple schools with unique identifiers, and the treatment (tutoring initiative) is applied at the school level, suggesting that...\n", + "\n", + " did_canonical_data_2.csv (did_canonical) → PASS\n", + " The dataset describes a government program providing free solar panels to selected households, suggesting a clear treatment assignment without interfe...\n", + "\n", + " did_canonical_data_4.csv (did_canonical) → PASS\n", + " The dataset involves retail stores in two different states, with a clear distinction between those affected by the tax policy and those not affected. ...\n", + "\n", + " did_twfe_data_4.csv (did_twfe) → PASS\n", + " The staggered rollout of the policy across different regions suggests that treatment effects are likely isolated to the regions where the policy is in...\n", + "\n", + " did_twfe_data_2.csv (did_twfe) → PASS\n", + " The dataset consists of independent retail stores, and the treatment (adoption of e-commerce) is applied individually to each store without indication...\n", + "\n", + " did_twfe_data_3.csv (did_twfe) → PASS\n", + " The dataset describes individual retail stores and their attributes, with treatment (online platform usage) applied at the store level. There is no in...\n", + "\n", + " did_twfe_data_1.csv (did_twfe) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this context. 
The staggered installation of air purification systems across different hea...\n", + "\n", + " did_twfe_data_0.csv (did_twfe) → PASS\n", + " The dataset describes a study where the treatment (new teaching methodology) is applied at the school level, and each school operates independently in...\n", + "\n", + " iv_encouragement_data_4.csv (iv_encouragement) → PASS\n", + " The dataset describes a scenario where families are individually assigned to receive encouragement letters to switch to energy-efficient appliances, s...\n", + "\n", + " iv_encouragement_data_0.csv (iv_encouragement) → FAIL\n", + " The assumption of no interference is violated because the treatment (encouragement) is not consistently received by all assigned students due to facto...\n", + "\n", + " iv_encouragement_data_1.csv (iv_encouragement) → PASS\n", + " The dataset describes a randomized controlled trial where patients were assigned to receive personal consultations from psychologists or not, which mi...\n", + "\n", + " iv_encouragement_data_3.csv (iv_encouragement) → PASS\n", + " The dataset describes a health and wellness study where participants were randomly selected and invited to participate in a fitness program, with thei...\n", + "\n", + " iv_encouragement_data_2.csv (iv_encouragement) → PASS\n", + " The dataset describes a randomized controlled trial where the training program was offered to some employees, suggesting that treatment assignment was...\n", + "\n", + " iv_data_0.csv (iv) → PASS\n", + " The dataset describes individual farms with specific characteristics and treatments (fertilizer use) that are likely to be independent of one another....\n", + "\n", + " iv_data_1.csv (iv) → PASS\n", + " The dataset description does not indicate any mechanisms of interference between individuals, such as shared housing situations or communal financial ...\n", + "\n", + " iv_data_3.csv (iv) → PASS\n", + " The dataset does not indicate any explicit mechanisms for interference between 
individuals, such as shared environments or social networks that could ...\n", + "\n", + " iv_data_2.csv (iv) → PASS\n", + " The dataset does not indicate any direct mechanisms of interference between respondents, such as shared work environments or collaborative roles that ...\n", + "\n", + " iv_data_4.csv (iv) → PASS\n", + " The dataset describes individual-level data collected from a health survey, where each individual's treatment (hours of physical activity) is independ...\n", + "\n", + " observational_data_8.csv (observational) → PASS\n", + " The dataset consists of individual survey responses from residents, and there is no indication of interference between respondents' treatment (homeown...\n", + "\n", + " observational_data_9.csv (observational) → PASS\n", + " The dataset does not indicate any mechanisms of interference among respondents, as it is based on individual self-reported data collected through a na...\n", + "\n", + " observational_data_2.csv (observational) → PASS\n", + " The dataset focuses on individual respondents and their home ownership status, which suggests that the treatment (home ownership) is applied at the in...\n", + "\n", + " observational_data_3.csv (observational) → PASS\n", + " The dataset involves an observational study where the treatment (meditation practice) is assessed at the individual level without any indication of in...\n", + "\n", + " observational_data_1.csv (observational) → PASS\n", + " The dataset description does not indicate any mechanisms of interference between respondents, such as shared environments or social networks that coul...\n", + "\n", + " observational_data_0.csv (observational) → PASS\n", + " The dataset is based on individual health screenings where subjects provided information about their health and lifestyle, suggesting that treatment (...\n", + "\n", + " observational_data_4.csv (observational) → PASS\n", + " The dataset description does not indicate any mechanisms of interference between 
respondents, such as shared environments or social networks that coul...\n", + "\n", + " observational_data_5.csv (observational) → PASS\n", + " The dataset is based on a nation-wide survey where participants voluntarily provided information about their lifestyle choices, including dietary adhe...\n", + "\n", + " observational_data_7.csv (observational) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this context. The dataset focuses on individual respondents and their distance to public ...\n", + "\n", + " observational_data_6.csv (observational) → PASS\n", + " The dataset is observational and does not indicate any direct interactions between participants that could lead to interference in treatment effects. ...\n", + "\n", + " rct_data_8.csv (rct) → PASS\n", + " The study is a randomized control trial (RCT), which typically minimizes interference between units by randomly assigning participants to treatment an...\n", + "\n", + " rct_data_9.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically helps to mitigate interference between units. 
Since participants were ra...\n", + "\n", + " rct_data_2.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between participants, as treatment assignment is ...\n", + "\n", + " rct_data_3.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between units, as students are assigned to treatm...\n", + "\n", + " rct_data_1.csv (rct) → PASS\n", + " The study is a randomized controlled trial (RCT) where subjects were randomly assigned to either a treatment group (new diet plan) or a control group ...\n", + "\n", + " rct_data_0.csv (rct) → PASS\n", + " The randomized controlled trial design minimizes the risk of interference between units, as patients are assigned to treatment or placebo groups indep...\n", + "\n", + " rct_data_4.csv (rct) → PASS\n", + " The study is a randomized controlled trial (RCT), which typically minimizes the risk of interference between participants' treatment assignments. Sinc...\n", + "\n", + " rct_data_5.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between units, as participants are randomly assig...\n", + "\n", + " rct_data_7.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically helps to mitigate concerns about interference between units. 
Given that ...\n", + "\n", + " rct_data_6.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically helps to satisfy the no interference condition of SUTVA by ensuring that...\n", + "\n", + " rdd_data_2.csv (rdd) → PASS\n", + " The assumption of SUTVA appears to be satisfied in this study as the treatment (medication for high cholesterol) is administered based on a clear thre...\n", + "\n", + " rdd_data_3.csv (rdd) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this context. The implementation of the policy to reduce class sizes is likely to affect ...\n", + "\n", + " rdd_data_1.csv (rdd) → PASS\n", + " The dataset focuses on individual loan applicants and their credit scores, with no indication of interference between applicants. Each applicant's tre...\n", + "\n", + " rdd_data_0.csv (rdd) → PASS\n", + " The SUTVA assumption appears to be satisfied in this context as the tutoring program is implemented at the school level, and there is no indication of...\n", + "\n", + " rdd_data_4.csv (rdd) → PASS\n", + " The dataset describes a survey of companies regarding their eligibility for a 'green' tax incentive, focusing on individual company characteristics an...\n" + ] + } + ], + "execution_count": 24 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:54:07.543383800Z", + "start_time": "2026-05-04T19:52:41.748399Z" + } + }, + "cell_type": "code", + "source": [ + "results_strict = []\n", + "\n", + "for i, row in info.iterrows():\n", + " dataset_name = row['data_files']\n", + " description = row['data_description']\n", + " method = row['method']\n", + "\n", + " variables_summary = {\n", + " 'method': method,\n", + " 'query': row['natural_language_query'],\n", + " }\n", + "\n", + " print(f\"[{i+1}/{len(info)}] {dataset_name} ({method})...\", end=\" \")\n", + "\n", + " result = check_strict_sutva(description, variables_summary, llm=llm)\n", + "\n", + " status = {True: 'PASS', 
False: 'FAIL', None: 'INCONCLUSIVE'}[result['passed']]\n", + " print(status)\n", + "\n", + " results_strict.append({\n", + " 'dataset': dataset_name,\n", + " 'method': method,\n", + " 'passed': result['passed'],\n", + " 'status': status,\n", + " 'reasoning': result['reasoning'],\n", + " 'missing_info': result.get('details', {}).get('missing_info', None),\n", + " })\n", + "\n", + "results_strict_df = pd.DataFrame(results_strict)\n", + "\n", + "\n", + "# SUTVA SUMMARY ON SYNTHETIC DATASETS\n", + "\n", + "print(f\"\\nGlobal :\")\n", + "print(results_strict_df['status'].value_counts().to_string())\n", + "print(f\"\\nPASS Rate: {(results_strict_df['status']=='PASS').mean():.0%}\")\n", + "print(f\"FAIL Rate: {(results_strict_df['status']=='FAIL').mean():.0%}\")\n", + "print(f\"INCONCLUSIVE Rate: {(results_strict_df['status']=='INCONCLUSIVE').mean():.0%}\")\n", + "\n", + "print(f\"\\nPer method:\")\n", + "print(results_strict_df.groupby('method')['status'].value_counts().unstack(fill_value=0).to_string())" + ], + "id": "5ceb43957afb01cc", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1/45] did_canonical_data_0.csv (did_canonical)... FAIL\n", + "[2/45] did_canonical_data_1.csv (did_canonical)... FAIL\n", + "[3/45] did_canonical_data_3.csv (did_canonical)... FAIL\n", + "[4/45] did_canonical_data_2.csv (did_canonical)... FAIL\n", + "[5/45] did_canonical_data_4.csv (did_canonical)... FAIL\n", + "[6/45] did_twfe_data_4.csv (did_twfe)... FAIL\n", + "[7/45] did_twfe_data_2.csv (did_twfe)... FAIL\n", + "[8/45] did_twfe_data_3.csv (did_twfe)... FAIL\n", + "[9/45] did_twfe_data_1.csv (did_twfe)... FAIL\n", + "[10/45] did_twfe_data_0.csv (did_twfe)... FAIL\n", + "[11/45] iv_encouragement_data_4.csv (iv_encouragement)... FAIL\n", + "[12/45] iv_encouragement_data_0.csv (iv_encouragement)... FAIL\n", + "[13/45] iv_encouragement_data_1.csv (iv_encouragement)... PASS\n", + "[14/45] iv_encouragement_data_3.csv (iv_encouragement)... 
FAIL\n", + "[15/45] iv_encouragement_data_2.csv (iv_encouragement)... FAIL\n", + "[16/45] iv_data_0.csv (iv)... FAIL\n", + "[17/45] iv_data_1.csv (iv)... FAIL\n", + "[18/45] iv_data_3.csv (iv)... FAIL\n", + "[19/45] iv_data_2.csv (iv)... PASS\n", + "[20/45] iv_data_4.csv (iv)... FAIL\n", + "[21/45] observational_data_8.csv (observational)... FAIL\n", + "[22/45] observational_data_9.csv (observational)... FAIL\n", + "[23/45] observational_data_2.csv (observational)... FAIL\n", + "[24/45] observational_data_3.csv (observational)... FAIL\n", + "[25/45] observational_data_1.csv (observational)... FAIL\n", + "[26/45] observational_data_0.csv (observational)... FAIL\n", + "[27/45] observational_data_4.csv (observational)... FAIL\n", + "[28/45] observational_data_5.csv (observational)... FAIL\n", + "[29/45] observational_data_7.csv (observational)... FAIL\n", + "[30/45] observational_data_6.csv (observational)... FAIL\n", + "[31/45] rct_data_8.csv (rct)... PASS\n", + "[32/45] rct_data_9.csv (rct)... PASS\n", + "[33/45] rct_data_2.csv (rct)... PASS\n", + "[34/45] rct_data_3.csv (rct)... FAIL\n", + "[35/45] rct_data_1.csv (rct)... PASS\n", + "[36/45] rct_data_0.csv (rct)... PASS\n", + "[37/45] rct_data_4.csv (rct)... PASS\n", + "[38/45] rct_data_5.csv (rct)... PASS\n", + "[39/45] rct_data_7.csv (rct)... PASS\n", + "[40/45] rct_data_6.csv (rct)... PASS\n", + "[41/45] rdd_data_2.csv (rdd)... FAIL\n", + "[42/45] rdd_data_3.csv (rdd)... FAIL\n", + "[43/45] rdd_data_1.csv (rdd)... PASS\n", + "[44/45] rdd_data_0.csv (rdd)... FAIL\n", + "[45/45] rdd_data_4.csv (rdd)... 
FAIL\n", + "\n", + "Global :\n", + "status\n", + "FAIL 33\n", + "PASS 12\n", + "\n", + "PASS Rate: 27%\n", + "FAIL Rate: 73%\n", + "INCONCLUSIVE Rate: 0%\n", + "\n", + "Per method:\n", + "status FAIL PASS\n", + "method \n", + "did_canonical 5 0\n", + "did_twfe 5 0\n", + "iv 4 1\n", + "iv_encouragement 4 1\n", + "observational 10 0\n", + "rct 1 9\n", + "rdd 4 1\n" + ] + } + ], + "execution_count": 25 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2026-05-04T19:54:07.689139500Z", + "start_time": "2026-05-04T19:54:07.557966400Z" + } + }, + "cell_type": "code", + "source": [ + "# Detail\n", + "for _, row in results_strict_df.iterrows():\n", + " print(f\"\\n {row['dataset']} ({row['method']}) → {row['status']}\")\n", + " print(f\" {row['reasoning'][:150]}...\")" + ], + "id": "7f6ab7d9dfd52577", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " did_canonical_data_0.csv (did_canonical) → FAIL\n", + " The assumption of no interference is likely violated in this study because students within the same school may influence each other's health behaviors...\n", + "\n", + " did_canonical_data_1.csv (did_canonical) → FAIL\n", + " The assumption of no interference is likely violated in this context, as factories may influence each other's production outputs through shared labor ...\n", + "\n", + " did_canonical_data_3.csv (did_canonical) → FAIL\n", + " The assumption of no interference is likely violated in this context, as schools may influence each other's outcomes through shared students, resource...\n", + "\n", + " did_canonical_data_2.csv (did_canonical) → FAIL\n", + " The assumption of no interference is likely violated in this context, as households in rural areas may share resources or influence each other's energ...\n", + "\n", + " did_canonical_data_4.csv (did_canonical) → FAIL\n", + " The assumption of no interference is likely violated in this context, as retail stores in close proximity may influence each other's 
sales due to shar...\n", + "\n", + " did_twfe_data_4.csv (did_twfe) → FAIL\n", + " The staggered rollout of the policy across different regions raises concerns about potential interference, as the treatment in one region could influe...\n", + "\n", + " did_twfe_data_2.csv (did_twfe) → FAIL\n", + " The assumption of no interference is likely violated in this context, as retail stores may influence each other's sales through shared customer bases,...\n", + "\n", + " did_twfe_data_3.csv (did_twfe) → FAIL\n", + " The assumption of no interference is likely violated in this context, as stores within the same chain may influence each other's sales through shared ...\n", + "\n", + " did_twfe_data_1.csv (did_twfe) → FAIL\n", + " The assumption of no interference is likely violated in this context, as health clubs in close proximity may influence each other's members' workout d...\n", + "\n", + " did_twfe_data_0.csv (did_twfe) → FAIL\n", + " The assumption of no interference is likely violated in this context, as students within the same school may influence each other's performance, espec...\n", + "\n", + " iv_encouragement_data_4.csv (iv_encouragement) → FAIL\n", + " The assumption of no interference is likely violated in this context, as families in close proximity may influence each other's decisions regarding en...\n", + "\n", + " iv_encouragement_data_0.csv (iv_encouragement) → FAIL\n", + " The assumption of no interference is violated due to the presence of partial compliance, as not all students who were assigned encouragement actually ...\n", + "\n", + " iv_encouragement_data_1.csv (iv_encouragement) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this study. 
The random assignment of patients to receive personal consultations suggests ...\n", + "\n", + " iv_encouragement_data_3.csv (iv_encouragement) → FAIL\n", + " The assumption of no interference is likely violated due to the nature of the fitness program, where participants' decisions to join may be influenced...\n", + "\n", + " iv_encouragement_data_2.csv (iv_encouragement) → FAIL\n", + " The assumption of no interference is likely violated in this context, as employees within the same workplace may influence each other's productivity t...\n", + "\n", + " iv_data_0.csv (iv) → FAIL\n", + " The assumption of no interference is likely violated in this context, as farms may be influenced by shared environmental factors such as regional rain...\n", + "\n", + " iv_data_1.csv (iv) → FAIL\n", + " The assumption of no interference is likely violated in this context, as individuals may influence each other's housing costs through shared living ar...\n", + "\n", + " iv_data_3.csv (iv) → FAIL\n", + " The assumption of no interference is likely violated in this context, as individuals living in the same city may influence each other's healthcare beh...\n", + "\n", + " iv_data_2.csv (iv) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this analysis. 
The treatment, which involves providing paid leave days for training, is l...\n", + "\n", + " iv_data_4.csv (iv) → FAIL\n", + " The assumption of no interference is likely violated in this context, as individuals may influence each other's physical activity levels through socia...\n", + "\n", + " observational_data_8.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this context, as the life satisfaction of one resident may be influenced by the housing status...\n", + "\n", + " observational_data_9.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this study, as vehicle ownership could influence not only the owner's happiness but also the h...\n", + "\n", + " observational_data_2.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this context, as home ownership can have spillover effects on neighbors and community members,...\n", + "\n", + " observational_data_3.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this study, as individuals may influence each other's stress levels through shared environment...\n", + "\n", + " observational_data_1.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this context, as individuals' health outcomes may be influenced by shared environments, such a...\n", + "\n", + " observational_data_0.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this dataset, as individuals' health behaviors, such as exercise, can influence one another, e...\n", + "\n", + " observational_data_4.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this context, as vaccination status could influence the health outcomes of others in close con...\n", + "\n", + " observational_data_5.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this 
dataset, as lifestyle choices such as dietary adherence can be influenced by social netwo...\n", + "\n", + " observational_data_7.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this context, as young adults living in close proximity to each other may influence one anothe...\n", + "\n", + " observational_data_6.csv (observational) → FAIL\n", + " The assumption of no interference is likely violated in this observational study, as participants may influence each other's health behaviors, such as...\n", + "\n", + " rct_data_8.csv (rct) → PASS\n", + " The study is a randomized control trial (RCT) where participants were randomly assigned to either the treatment or control group, which minimizes the ...\n", + "\n", + " rct_data_9.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between units, as participants are randomly assig...\n", + "\n", + " rct_data_2.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between units, as participants are treated indepe...\n", + "\n", + " rct_data_3.csv (rct) → FAIL\n", + " The assumption of no interference is likely violated in this study because students are clustered within the same schools, which may lead to spillover...\n", + "\n", + " rct_data_1.csv (rct) → PASS\n", + " The study design involves random assignment of subjects to either a treatment or control group, which minimizes the risk of interference between units...\n", + "\n", + " rct_data_0.csv (rct) → PASS\n", + " The randomized controlled trial design suggests that the treatment assignment was independent and that the experimental drug was administered consiste...\n", + "\n", + " rct_data_4.csv (rct) → PASS\n", + " The study is a randomized controlled trial (RCT) where participants are assigned to either the new medication or a placebo, which minimizes the risk o...\n", + "\n", 
+ " rct_data_5.csv (rct) → PASS\n", + " The assumption of SUTVA appears to be plausibly satisfied in this study as the treatment (cognitive therapy) is administered to a randomly selected su...\n", + "\n", + " rct_data_7.csv (rct) → PASS\n", + " The SUTVA assumption appears to be plausibly satisfied in this study as it is designed as a randomized controlled trial (RCT), which typically minimiz...\n", + "\n", + " rct_data_6.csv (rct) → PASS\n", + " The study employs a randomized controlled trial (RCT) design, which typically minimizes interference between units, as participants are randomly assig...\n", + "\n", + " rdd_data_2.csv (rdd) → FAIL\n", + " The assumption of no interference is likely violated in this study because the treatment (medication for high cholesterol) could affect patients' bloo...\n", + "\n", + " rdd_data_3.csv (rdd) → FAIL\n", + " The assumption of no interference is likely violated in this context, as students within the same school may influence each other's performance, espec...\n", + "\n", + " rdd_data_1.csv (rdd) → PASS\n", + " The SUTVA assumption appears to be plausibly satisfied in this context as the treatment (credit score) is an individual characteristic that does not d...\n", + "\n", + " rdd_data_0.csv (rdd) → FAIL\n", + " The assumption of no interference is likely violated in this context, as schools within the same district may influence each other's outcomes through ...\n", + "\n", + " rdd_data_4.csv (rdd) → FAIL\n", + " The assumption of no interference is likely violated in this context, as companies may influence each other's emissions through shared supply chains, ...\n" + ] + } + ], + "execution_count": 26 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}