diff --git a/README.md b/README.md new file mode 100644 index 0000000..57785cb --- /dev/null +++ b/README.md @@ -0,0 +1,83 @@ +# Segmentation Plus + +Customer segmentation and persona pipeline using the `segplus` module and a single Excel input file. + +## What This Project Uses + +- Input data: **Excel file** +- Core code: `segplus/` +- Final runnable notebook: `segplus/final_segplus_pipeline2.ipynb` +- Required support files: + - `requirements.txt` + - `main.py` + +## Minimal Project Structure + +```text +Segmentation_Plus/ +├── final_enterprise_clustering_dataset_single_sheet.xlsx # Excel input +├── requirements.txt +├── segplus/ +│ ├── clustering.py +│ ├── config.py +│ ├── data_input.py +│ ├── evaluation.py +│ ├── experiment_log.py +│ ├── explainability.py +│ ├── feature_engineering.py +│ ├── modeling_loop.py +│ ├── ollama_client.py +│ ├── persona_generation.py +│ ├── persona_generator.py +│ ├── pipeline.py +│ ├── types.py +│ ├── visualization.py +│ └── final_segplus_pipeline2.ipynb # Run this notebook +└── README.md +``` + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Ollama (Local LLM) + +```bash +ollama serve +ollama list +ollama pull qwen2.5:7b +``` + +The notebook is configured for local Ollama and selects `qwen2.5:7b`. + +## Run + +1. Open `segplus/final_segplus_pipeline2.ipynb` +2. Run cells top-to-bottom +3. Ensure the Excel file path is correct in the config cell + +## Outputs + +Pipeline outputs are written to `segplus_output/` and include: + +- `clustered_customers.csv` +- `cluster_profiles.csv` +- `ordered_feature_drivers.csv` +- `shap_feature_importance_pct.csv` +- `shap_summary.png` +- `shap_summary_bar.png` +- `shap_interpretation.csv` +- `pc_feature_map.json` +- `personas.csv` +- `personas.json` +- `business_grounding.json` +- `experiment_log.json` + +## Notes + +- Persona names are generated by the model (no hardcoded cluster names). +- `profile_descriptor` and `description` are included in persona outputs. 
+- If grounding times out, fallback logic still returns persona outputs. + diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e48c250 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +pandas>=2.0.0 +numpy>=1.24.0 +scikit-learn>=1.3.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +plotly>=5.15.0 +shap>=0.42.0 +pyyaml>=6.0 +openpyxl>=3.1.0 +requests>=2.31.0 +scipy>=1.11.0 +google-generativeai>=0.3.0 +kaleido>=0.2.1 diff --git a/segplus/clustering.py b/segplus/clustering.py new file mode 100644 index 0000000..fa9423b --- /dev/null +++ b/segplus/clustering.py @@ -0,0 +1,131 @@ +"""Clustering algorithm runners: K-Means, DBSCAN, GMM.""" +from __future__ import annotations + +import logging + +import numpy as np +from sklearn.cluster import DBSCAN, KMeans +from sklearn.metrics import silhouette_score +from sklearn.mixture import GaussianMixture +from sklearn.neighbors import NearestNeighbors + +from .types import ClusteringConfig, ClusterRunResult + +log = logging.getLogger("segplus.clustering") + + +def run_kmeans(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult: + """Run K-Means clustering.""" + model = KMeans( + n_clusters=config.k, + init=config.kmeans_init, + n_init=config.kmeans_n_init, + max_iter=config.kmeans_max_iter, + random_state=config.random_state, + ) + labels = model.fit_predict(X) + return ClusterRunResult( + algorithm="kmeans", + labels=labels, + n_clusters=len(set(labels)), + model=model, + extra={"inertia": float(model.inertia_)}, + ) + + +def run_dbscan(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult: + """Run DBSCAN density-based clustering.""" + model = DBSCAN(eps=config.dbscan_eps, min_samples=config.dbscan_min_samples) + labels = model.fit_predict(X) + n_clusters = len(set(labels)) - (1 if -1 in labels else 0) + noise_count = int((labels == -1).sum()) + return ClusterRunResult( + algorithm="dbscan", + 
labels=labels, + n_clusters=max(n_clusters, 1), + model=model, + extra={"noise_count": noise_count, "noise_pct": noise_count / len(labels)}, + ) + + +def run_gmm(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult: + """Run Gaussian Mixture Model clustering.""" + model = GaussianMixture( + n_components=config.k, + covariance_type=config.gmm_covariance_type, + n_init=config.gmm_n_init, + random_state=config.random_state, + ) + labels = model.fit_predict(X) + probs = model.predict_proba(X) + return ClusterRunResult( + algorithm="gmm", + labels=labels, + n_clusters=len(set(labels)), + model=model, + probabilities=probs, + extra={"bic": float(model.bic(X)), "aic": float(model.aic(X))}, + ) + + +def run_all_algorithms(X: np.ndarray, config: ClusteringConfig) -> dict[str, ClusterRunResult]: + """Run all three clustering algorithms, applying feature subset if configured.""" + X_work = X + if config.feature_subset_indices is not None: + X_work = X[:, config.feature_subset_indices] + + results = {} + for name, runner in [("kmeans", run_kmeans), ("dbscan", run_dbscan), ("gmm", run_gmm)]: + try: + results[name] = runner(X_work, config) + log.info( + " [%s] clusters=%d", + name, results[name].n_clusters, + ) + except Exception as e: + log.warning(" [%s] failed: %s", name, e) + return results + + +def find_optimal_k( + X: np.ndarray, + k_range: tuple[int, int], + random_state: int = 42, +) -> tuple[int, dict[int, float]]: + """Silhouette sweep to find optimal K for K-Means.""" + best_k, best_score = k_range[0], -1.0 + scores: dict[int, float] = {} + + for k in range(k_range[0], k_range[1] + 1): + km = KMeans(n_clusters=k, init="k-means++", n_init=5, random_state=random_state) + labels = km.fit_predict(X) + if len(set(labels)) < 2: + continue + s = silhouette_score(X, labels, sample_size=min(2000, len(X))) + scores[k] = round(s, 4) + if s > best_score: + best_score, best_k = s, k + + log.info("K search: scores=%s | best k=%d (sil=%.4f)", scores, best_k, best_score) + 
return best_k, scores + + +def estimate_dbscan_eps(X: np.ndarray, min_samples: int = 5) -> float: + """Estimate DBSCAN eps using k-distance knee detection.""" + nn = NearestNeighbors(n_neighbors=min_samples) + nn.fit(X) + distances, _ = nn.kneighbors(X) + k_dist = np.sort(distances[:, -1]) + + # Simple knee detection: max second derivative + if len(k_dist) < 10: + return float(np.median(k_dist)) + + diffs = np.diff(k_dist) + diffs2 = np.diff(diffs) + knee_idx = int(np.argmax(diffs2)) + 2 + eps = float(k_dist[min(knee_idx, len(k_dist) - 1)]) + eps = max(eps, 0.1) # floor + + log.info("DBSCAN eps estimated: %.3f (knee at index %d)", eps, knee_idx) + return round(eps, 3) diff --git a/segplus/config.py b/segplus/config.py new file mode 100644 index 0000000..e38c9c9 --- /dev/null +++ b/segplus/config.py @@ -0,0 +1,198 @@ +"""Pipeline and domain configuration management.""" +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +import pandas as pd +import yaml + +log = logging.getLogger("segplus.config") + +_DOMAINS_DIR = Path(__file__).resolve().parent.parent / "domains" + + +# ── Domain Config (parsed from YAML) ──────────────────────────────────────── + +@dataclass +class FeatureEngineeringRule: + name: str + formula: str + bins: list[str] | None = None + description: str = "" + + +@dataclass +class DomainConfig: + domain_key: str + display_name: str + features: dict[str, list[str]] = field(default_factory=dict) + required_columns: list[str] = field(default_factory=list) + feature_engineering: list[FeatureEngineeringRule] = field(default_factory=list) + eda_analyses: list[str] = field(default_factory=list) + persona_prompt_template: str = "" + scaling_exclude: list[str] = field(default_factory=list) + categorical_columns: list[str] = field(default_factory=list) + metadata: dict = field(default_factory=dict) + + @property + def all_feature_columns(self) -> list[str]: + 
cols: list[str] = [] + for group_cols in self.features.values(): + cols.extend(group_cols) + return cols + + @property + def numerical_columns(self) -> list[str]: + return [c for c in self.all_feature_columns if c not in self.categorical_columns] + + +def load_domain_config(domain_key: str, domains_dir: Path | None = None) -> DomainConfig: + """Load a domain configuration from YAML. Falls back to empty direct-ingest config if missing.""" + d = domains_dir or _DOMAINS_DIR + yaml_path = d / f"{domain_key}.yaml" + if not yaml_path.exists(): + log.warning( + "Domain YAML '%s' not found. Falling back to direct-ingest config (no YAML rules).", + yaml_path, + ) + return DomainConfig( + domain_key=domain_key, + display_name=domain_key.replace("_", " ").title(), + features={}, + required_columns=[], + feature_engineering=[], + eda_analyses=[], + persona_prompt_template="", + scaling_exclude=[], + categorical_columns=[], + metadata={"source": "direct_no_yaml_fallback"}, + ) + + with open(yaml_path, "r", encoding="utf-8") as f: + raw = yaml.safe_load(f) + + fe_rules = [] + for rule in raw.get("feature_engineering", []): + fe_rules.append(FeatureEngineeringRule( + name=rule["name"], + formula=rule["formula"], + bins=rule.get("bins"), + description=rule.get("description", ""), + )) + + eda_analyses = [] + eda_block = raw.get("eda", {}) + if isinstance(eda_block, dict): + eda_analyses = eda_block.get("domain_analyses", []) + + return DomainConfig( + domain_key=raw.get("domain", domain_key), + display_name=raw.get("display_name", domain_key), + features=raw.get("features", {}), + required_columns=raw.get("required_columns", []), + feature_engineering=fe_rules, + eda_analyses=eda_analyses, + persona_prompt_template=raw.get("persona_prompt_template", ""), + scaling_exclude=raw.get("scaling_exclude", []), + categorical_columns=raw.get("categorical_columns", []), + metadata=raw.get("metadata", {}), + ) + + +def list_available_domains(domains_dir: Path | None = None) -> list[str]: + d 
= domains_dir or _DOMAINS_DIR + return sorted(f.stem for f in d.glob("*.yaml") if not f.stem.startswith("_")) + + +def build_domain_config_from_dataframe( + df: pd.DataFrame, + domain_key: str = "direct_ingest", + exclude_cols: list[str] | None = None, +) -> DomainConfig: + """ + Build a DomainConfig directly from dataframe schema (no YAML dependency). + This is the recommended path when users provide only an Excel file. + """ + excludes = set(c.lower() for c in (exclude_cols or [])) + cols = [c for c in df.columns if c.lower() not in excludes] + + categorical_cols: list[str] = [] + for c in cols: + s = df[c] + n_unique = s.nunique(dropna=True) + ratio = n_unique / max(len(s), 1) + is_cat = ( + pd.api.types.is_object_dtype(s) + or pd.api.types.is_categorical_dtype(s) + or pd.api.types.is_bool_dtype(s) + or (pd.api.types.is_numeric_dtype(s) and (n_unique <= 10 and ratio < 0.05)) + ) + if is_cat: + categorical_cols.append(c) + + return DomainConfig( + domain_key=domain_key, + display_name=domain_key.replace("_", " ").title(), + features={"all": cols}, + required_columns=[], + feature_engineering=[], + eda_analyses=[], + persona_prompt_template="", + scaling_exclude=[], + categorical_columns=categorical_cols, + metadata={"source": "direct_dataframe_schema"}, + ) + + +# ── Pipeline Config ────────────────────────────────────────────────────────── + +@dataclass +class PipelineConfig: + """All tunable pipeline parameters with sensible defaults.""" + + # Data + data_path: str = "final_enterprise_clustering_dataset.xlsx" + domain_key: str = "financial_services" + sheet_name: Optional[str] = None + exclude_cols: list[str] = field(default_factory=lambda: ["customer_id"]) + + # Feature Engineering + pca_variance_threshold: float = 0.85 + winsorize_lower: float = 0.01 + winsorize_upper: float = 0.99 + imputation_strategy: str = "median" + + # Clustering + k_range: tuple[int, int] = (2, 8) + random_state: int = 42 + + # Evaluation Gate + silhouette_threshold: float = 0.15 + 
davies_bouldin_threshold: float = 2.5 + stability_ari_threshold: float = 0.7 + stability_n_bootstraps: int = 30 + + # Modeling Loop + max_iterations: int = 5 + + # Explainability + n_top_features: int = 10 + shap_n_repeats: int = 10 + + # Ollama + ollama_host: str = "http://localhost:11434" + ollama_model: str = "llama3" + ollama_timeout: int = 120 + + # Business Objective + business_objective: str = ( + "Identify distinct customer segments to personalise marketing campaigns, " + "improve retention for high-value customers, and convert mid-tier customers " + "to premium products." + ) + + # Output + output_dir: str = "segplus_output" diff --git a/segplus/data_input.py b/segplus/data_input.py new file mode 100644 index 0000000..931975b --- /dev/null +++ b/segplus/data_input.py @@ -0,0 +1,140 @@ +"""Data loading, schema inference, and validation.""" +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Optional + +import pandas as pd + +from .config import DomainConfig +from .types import ColumnMeta, DataQualityReport, DataSchema + +log = logging.getLogger("segplus.data_input") + + +def load_data(file_path: str | Path, sheet_name: Optional[str] = None) -> pd.DataFrame: + """Load data from CSV, Excel, or Parquet.""" + p = Path(file_path) + if not p.exists(): + raise FileNotFoundError(f"Data file not found: {p}") + + suffix = p.suffix.lower() + if suffix == ".csv": + df = pd.read_csv(p) + elif suffix in (".xlsx", ".xls"): + df = pd.read_excel(p, sheet_name=sheet_name or 0, engine="openpyxl") + elif suffix == ".parquet": + df = pd.read_parquet(p) + else: + raise ValueError(f"Unsupported file format: {suffix}") + + log.info("Loaded %s: %d rows x %d cols", p.name, len(df), len(df.columns)) + return df + + +def infer_schema(df: pd.DataFrame) -> DataSchema: + """Infer column types and build a DataSchema.""" + columns: list[ColumnMeta] = [] + for col in df.columns: + s = df[col] + n_unique = s.nunique() + if 
pd.api.types.is_bool_dtype(s): + dtype = "boolean" + elif pd.api.types.is_datetime64_any_dtype(s): + dtype = "datetime" + elif pd.api.types.is_numeric_dtype(s): + dtype = "numeric" + elif n_unique / max(len(s), 1) < 0.05 or n_unique <= 20: + dtype = "categorical" + else: + dtype = "text" + columns.append(ColumnMeta(col, dtype, round(s.isna().mean(), 4), n_unique)) + return DataSchema(len(df), len(df.columns), columns) + + +def validate_schema( + df: pd.DataFrame, + domain_config: DomainConfig, + schema: DataSchema, +) -> DataQualityReport: + """Validate the dataframe against domain requirements.""" + report = DataQualityReport( + total_rows=len(df), + total_columns=len(df.columns), + ) + + # Column type map + report.column_types = {c.name: c.dtype for c in schema.columns} + + # Missing values + missing = df.isnull().sum() + report.missing_values = {c: int(v) for c, v in missing.items() if v > 0} + report.missing_pct = {c: round(v / len(df), 4) for c, v in report.missing_values.items()} + + # Duplicates + report.duplicate_rows = int(df.duplicated().sum()) + if report.duplicate_rows > 0: + report.warnings.append(f"{report.duplicate_rows} duplicate rows found") + + # Required columns check + df_cols_lower = {c.lower(): c for c in df.columns} + for req in domain_config.required_columns: + if req.lower() not in df_cols_lower: + report.errors.append(f"Required column missing: '{req}'") + report.passed = False + + # Row count check + if len(df) < 50: + report.warnings.append(f"Very few rows ({len(df)}). 
Results may be unreliable.") + + # High-missing columns + for col, pct in report.missing_pct.items(): + if pct > 0.5: + report.warnings.append(f"Column '{col}' is {pct:.0%} missing") + + if report.errors: + report.passed = False + + return report + + +def auto_map_columns( + df: pd.DataFrame, + domain_config: DomainConfig, +) -> tuple[pd.DataFrame, dict[str, str]]: + """Case-insensitive column mapping to domain config names.""" + df_cols_lower = {c.lower().replace(" ", "_").replace("-", "_"): c for c in df.columns} + mapping: dict[str, str] = {} + + all_expected = set(domain_config.all_feature_columns + domain_config.required_columns) + + for expected in all_expected: + norm = expected.lower().replace(" ", "_").replace("-", "_") + if norm in df_cols_lower and df_cols_lower[norm] != expected: + mapping[df_cols_lower[norm]] = expected + + if mapping: + df = df.rename(columns=mapping) + log.info("Auto-mapped %d columns: %s", len(mapping), mapping) + + return df, mapping + + +def import_and_validate( + file_path: str | Path, + domain_config: DomainConfig, + auto_map: bool = True, + sheet_name: Optional[str] = None, +) -> tuple[pd.DataFrame, DataQualityReport, DataSchema]: + """Full import pipeline: load -> auto-map -> infer schema -> validate.""" + df = load_data(file_path, sheet_name) + + if auto_map: + df, _ = auto_map_columns(df, domain_config) + + schema = infer_schema(df) + report = validate_schema(df, domain_config, schema) + + log.info("Data quality: %s", "PASSED" if report.passed else "FAILED") + return df, report, schema diff --git a/segplus/evaluation.py b/segplus/evaluation.py new file mode 100644 index 0000000..a14feb8 --- /dev/null +++ b/segplus/evaluation.py @@ -0,0 +1,212 @@ +"""Cluster evaluation: scoring, pass/fail gate, stability testing.""" +from __future__ import annotations + +import logging + +import numpy as np +from sklearn.cluster import KMeans +from sklearn.metrics import ( + adjusted_rand_score, + calinski_harabasz_score, + 
davies_bouldin_score, + silhouette_score, +) + +from .config import PipelineConfig +from .types import ClusterRunResult, EvaluationResult, StabilityResult + +log = logging.getLogger("segplus.evaluation") + + +class ClusterEvaluator: + """Evaluates clustering results and applies the pass/fail gate.""" + + def __init__(self, config: PipelineConfig): + self.config = config + + def evaluate( + self, + X: np.ndarray, + results: dict[str, ClusterRunResult], + ) -> EvaluationResult: + """Score all algorithm results, pick the best via composite ranking, apply pass/fail gate.""" + # Filter to valid results first (>= 2 non-noise clusters) + valid_results = {name: res for name, res in results.items() if res.is_valid} + if not valid_results: + log.warning("No algorithm produced >= 2 valid clusters; scoring all results as fallback.") + valid_results = results + + scored: dict[str, dict[str, float]] = {} + + for name, res in valid_results.items(): + s = self._score_one(X, res) + s["n_clusters"] = float(res.n_clusters) + # Coverage: fraction of data points actually clustered (penalises heavy noise) + n_clustered = int((res.labels != -1).sum()) + s["coverage"] = n_clustered / max(len(res.labels), 1) + scored[name] = s + log.info( + " [%s] k=%d | sil=%.4f | DB=%.4f | CH=%.1f | cov=%.2f", + name, res.n_clusters, + s["silhouette"], s["davies_bouldin"], s["calinski_harabasz"], + s["coverage"], + ) + + # Pick best using composite rank across all three metrics + coverage + best_name = self._pick_best_composite(scored) + best_s = scored[best_name] + best_res = valid_results[best_name] + + passes = self._check_pass(best_s["silhouette"], best_s["davies_bouldin"]) + + return EvaluationResult( + algorithm=best_name, + labels=best_res.labels, + n_clusters=best_res.n_clusters, + silhouette=best_s["silhouette"], + davies_bouldin=best_s["davies_bouldin"], + calinski_harabasz=best_s["calinski_harabasz"], + passes=passes, + all_scores=scored, + model=best_res.model, + ) + + def 
_pick_best_composite(self, scored: dict[str, dict[str, float]]) -> str: + """ + Rank-based composite selection using all computed metrics. + + Each algorithm is ranked per metric (1=best), ranks are normalised to [0,1], + then blended: + composite = 0.40 * silhouette_rank (higher is better) + + 0.25 * db_rank (lower is better → inverted) + + 0.20 * ch_rank (higher is better) + + 0.15 * coverage_rank (higher is better, penalises DBSCAN noise) + + If there is only one algorithm, it wins by default. + """ + names = list(scored.keys()) + if len(names) == 1: + return names[0] + + n = len(names) + + def _rank_higher_better(metric: str) -> dict[str, float]: + """Rank so that higher value → rank 1 (best).""" + ordered = sorted(names, key=lambda nm: scored[nm][metric], reverse=True) + return {nm: (i + 1) for i, nm in enumerate(ordered)} + + def _rank_lower_better(metric: str) -> dict[str, float]: + """Rank so that lower value → rank 1 (best).""" + ordered = sorted(names, key=lambda nm: scored[nm][metric]) + return {nm: (i + 1) for i, nm in enumerate(ordered)} + + sil_ranks = _rank_higher_better("silhouette") + db_ranks = _rank_lower_better("davies_bouldin") + ch_ranks = _rank_higher_better("calinski_harabasz") + cov_ranks = _rank_higher_better("coverage") + + # Normalise ranks to [0,1] where 1=best + def _norm(rank: float) -> float: + return 1.0 - (rank - 1.0) / max(n - 1, 1) + + composite: dict[str, float] = {} + for nm in names: + composite[nm] = ( + 0.40 * _norm(sil_ranks[nm]) + + 0.25 * _norm(db_ranks[nm]) + + 0.20 * _norm(ch_ranks[nm]) + + 0.15 * _norm(cov_ranks[nm]) + ) + + best = max(composite, key=lambda nm: composite[nm]) + log.info( + "Composite ranking: %s", + " | ".join(f"{nm}={composite[nm]:.3f}" for nm in names), + ) + log.info("Winner: %s (composite=%.3f)", best, composite[best]) + return best + + def _score_one(self, X: np.ndarray, result: ClusterRunResult) -> dict[str, float]: + """Compute metrics for a single clustering result.""" + labels = result.labels 
+ valid_mask = labels != -1 + X_v = X[valid_mask] + labels_v = labels[valid_mask] + + if len(set(labels_v)) < 2 or len(X_v) < 10: + return {"silhouette": -1.0, "davies_bouldin": 99.0, "calinski_harabasz": 0.0} + + sample_size = min(2000, len(X_v)) + sil = silhouette_score(X_v, labels_v, sample_size=sample_size) + db = davies_bouldin_score(X_v, labels_v) + ch = calinski_harabasz_score(X_v, labels_v) + + return { + "silhouette": round(sil, 4), + "davies_bouldin": round(db, 4), + "calinski_harabasz": round(ch, 2), + } + + def _check_pass(self, silhouette: float, davies_bouldin: float) -> bool: + """Pure pass/fail gate.""" + return ( + silhouette >= self.config.silhouette_threshold + and davies_bouldin <= self.config.davies_bouldin_threshold + ) + + +def run_stability_test( + X: np.ndarray, + labels: np.ndarray, + n_clusters: int, + config: PipelineConfig, + algorithm: str = "kmeans", + model: object = None, +) -> StabilityResult: + """Bootstrap ARI stability test using the winning algorithm.""" + from sklearn.mixture import GaussianMixture + + rng = np.random.default_rng(config.random_state) + n = len(X) + ari_scores: list[float] = [] + + for _ in range(config.stability_n_bootstraps): + idx = rng.choice(n, size=n, replace=True) + X_boot = X[idx] + + if algorithm == "gmm": + cov_type = ( + model.covariance_type + if model is not None and hasattr(model, "covariance_type") + else "full" + ) + boot_model = GaussianMixture( + n_components=n_clusters, covariance_type=cov_type, + n_init=3, random_state=config.random_state, + ) + else: + # KMeans for kmeans winner; also used as proxy for DBSCAN + # (DBSCAN is density-based so bootstrap changes density structure) + boot_model = KMeans( + n_clusters=n_clusters, n_init=5, + random_state=config.random_state, + ) + + boot_labels = boot_model.fit_predict(X_boot) + ari = adjusted_rand_score(labels[idx], boot_labels) + ari_scores.append(ari) + + ari_mean = float(np.mean(ari_scores)) + ari_std = float(np.std(ari_scores)) + stable = 
ari_mean >= config.stability_ari_threshold + + log.info( + "Stability (%s): ARI=%.3f +/- %.3f (threshold=%.2f, stable=%s)", + algorithm, ari_mean, ari_std, config.stability_ari_threshold, stable, + ) + return StabilityResult( + ari_mean=round(ari_mean, 4), + ari_std=round(ari_std, 4), + n_bootstraps=config.stability_n_bootstraps, + stable=stable, + ) diff --git a/segplus/experiment_log.py b/segplus/experiment_log.py new file mode 100644 index 0000000..a6fc6c1 --- /dev/null +++ b/segplus/experiment_log.py @@ -0,0 +1,135 @@ +"""Experiment tracking: logs every modeling loop iteration.""" +from __future__ import annotations + +import json +import logging +from pathlib import Path + +import pandas as pd + +from .types import ExperimentRecord + +log = logging.getLogger("segplus.experiment_log") + + +def _to_native(value): + """Recursively convert numpy/pandas scalars and containers to JSON-safe Python types.""" + # Dict + if isinstance(value, dict): + return {str(k): _to_native(v) for k, v in value.items()} + # List / tuple + if isinstance(value, (list, tuple)): + return [_to_native(v) for v in value] + # Numpy scalars / arrays (without importing numpy directly) + if hasattr(value, "item") and callable(getattr(value, "item")): + try: + return value.item() + except Exception: + pass + if hasattr(value, "tolist") and callable(getattr(value, "tolist")): + try: + return value.tolist() + except Exception: + pass + # Path + if isinstance(value, Path): + return str(value) + return value + + +def _json_default(obj): + """Fallback serializer for json.dump to handle numpy/pandas scalar objects.""" + if hasattr(obj, "item") and callable(getattr(obj, "item")): + try: + return obj.item() + except Exception: + pass + if hasattr(obj, "tolist") and callable(getattr(obj, "tolist")): + try: + return obj.tolist() + except Exception: + pass + return str(obj) + + +class ExperimentLog: + """Tracks config, metrics, and results for every modeling loop iteration.""" + + def __init__(self) -> 
None: + self._records: list[ExperimentRecord] = [] + + @property + def records(self) -> list[ExperimentRecord]: + return list(self._records) + + def add(self, record: ExperimentRecord) -> None: + self._records.append(record) + log.info( + "Experiment %d: %s k=%d sil=%.4f pass=%s", + record.iteration, record.best_algorithm, + record.n_clusters, record.silhouette, record.passed, + ) + + def to_dataframe(self) -> pd.DataFrame: + if not self._records: + return pd.DataFrame() + rows = [] + for r in self._records: + rows.append({ + "iteration": r.iteration, + "timestamp": r.timestamp, + "k": r.config_k, + "eps": r.config_eps, + "gmm_cov": r.config_gmm_cov, + "best_algorithm": r.best_algorithm, + "n_clusters": r.n_clusters, + "silhouette": r.silhouette, + "davies_bouldin": r.davies_bouldin, + "calinski_harabasz": r.calinski_harabasz, + "passed": r.passed, + "reconfig_strategy": r.reconfiguration_strategy, + }) + return pd.DataFrame(rows) + + def to_json(self, path: Path) -> None: + data = [ + { + "iteration": int(r.iteration), + "timestamp": r.timestamp, + "config": { + "k": int(r.config_k), + "eps": float(r.config_eps), + "gmm_cov": str(r.config_gmm_cov), + }, + "best_algorithm": r.best_algorithm, + "n_clusters": int(r.n_clusters), + "silhouette": float(r.silhouette), + "davies_bouldin": float(r.davies_bouldin), + "calinski_harabasz": float(r.calinski_harabasz), + "passed": bool(r.passed), + "reconfiguration_strategy": r.reconfiguration_strategy, + } + for r in self._records + ] + data = _to_native(data) + with open(path, "w", encoding="utf-8") as f: + json.dump(data, f, indent=2, default=_json_default) + log.info("Experiment log saved: %s", path) + + def summary(self) -> str: + if not self._records: + return "No experiments recorded." 
+ lines = ["Experiment Log Summary:", f" Total iterations: {len(self._records)}"] + for r in self._records: + status = "PASS" if r.passed else "FAIL" + strategy = f" (reconfig: {r.reconfiguration_strategy})" if r.reconfiguration_strategy else "" + lines.append( + f" [{r.iteration}] {r.best_algorithm:8s} k={r.n_clusters} " + f"sil={r.silhouette:.4f} DB={r.davies_bouldin:.4f} " + f"{status}{strategy}" + ) + best = max(self._records, key=lambda r: r.silhouette) + lines.append( + f" Best: iter {best.iteration} ({best.best_algorithm}, sil={best.silhouette:.4f})" + ) + return "\n".join(lines) diff --git a/segplus/explainability.py b/segplus/explainability.py new file mode 100644 index 0000000..1f07a30 --- /dev/null +++ b/segplus/explainability.py @@ -0,0 +1,300 @@ +"""Explainability: SHAP importance, PCA loadings, inertia curve, cluster profiles.""" +from __future__ import annotations + +import logging +from typing import Optional + +import numpy as np +import pandas as pd +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import silhouette_score + +from .types import ExplainabilityReport + +log = logging.getLogger("segplus.explainability") + + +def compute_shap_importance( + X: np.ndarray, + labels: np.ndarray, + feature_names: list[str], + random_state: int = 42, +) -> dict[str, float]: + """ + Feature importance using three-tier fallback: + 1. TreeSHAP (if shap is installed) + 2. RF feature_importances_ + 3. 
def compute_shap_importance(
    X: np.ndarray,
    labels: np.ndarray,
    feature_names: list[str],
    random_state: int = 42,
) -> dict[str, float]:
    """Feature importance for cluster assignments with a three-tier fallback.

    Tier 1: TreeSHAP on a RandomForest surrogate trained to predict labels.
    Tier 2: RandomForest impurity-based ``feature_importances_``.
    Tier 3: Permutation importance (silhouette drop per shuffled feature).

    NOTE(review): the ``def`` line was reconstructed from the call in
    ``build_explainability_report`` — confirm the ``random_state`` default.

    Returns a dict feature -> importance score, sorted descending.
    """
    # Exclude noise points (label -1, e.g. produced by DBSCAN) from the fit.
    valid_mask = labels != -1
    Xv, lv = X[valid_mask], labels[valid_mask]

    # Importance is undefined when fewer than two clusters remain.
    if len(set(lv)) < 2:
        return {f: 0.0 for f in feature_names}

    # Surrogate classifier: learn to predict cluster labels from features.
    rf = RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=random_state, n_jobs=-1
    )
    rf.fit(Xv, lv)

    # Tier 1: TreeSHAP on a bounded sample (SHAP is costly on large data).
    try:
        import shap

        explainer = shap.TreeExplainer(rf)
        sample_size = min(500, len(Xv))
        rng = np.random.default_rng(random_state)
        idx = rng.choice(len(Xv), size=sample_size, replace=False)
        shap_values = explainer.shap_values(Xv[idx])

        # Multiclass SHAP returns one array per class; average across classes.
        if isinstance(shap_values, list):
            mean_abs = np.mean([np.abs(sv).mean(axis=0) for sv in shap_values], axis=0)
        else:
            mean_abs = np.abs(shap_values).mean(axis=0)

        importances = {
            name: round(float(mean_abs[i]), 5) for i, name in enumerate(feature_names)
        }
        log.info("SHAP importance computed via TreeSHAP")
        return dict(sorted(importances.items(), key=lambda x: -x[1]))

    except ImportError:
        log.info("shap not installed, falling back to RF feature_importances_")
    except Exception as e:
        log.warning("TreeSHAP failed (%s), falling back to RF feature_importances_", e)

    # Tier 2: impurity-based importances from the surrogate forest.
    try:
        imp = rf.feature_importances_
        importances = {
            name: round(float(imp[i]), 5) for i, name in enumerate(feature_names)
        }
        log.info("Feature importance computed via RF feature_importances_")
        return dict(sorted(importances.items(), key=lambda x: -x[1]))
    except Exception as e:
        log.warning("RF importances failed (%s), falling back to permutation", e)

    # Tier 3: model-free permutation importance.
    return _compute_permutation_importance(Xv, lv, feature_names, random_state=random_state)


def to_importance_percentages(importances: dict[str, float]) -> dict[str, float]:
    """Convert absolute importance scores to percentage contributions.

    Negative scores are clipped to zero before normalising; if every score
    is non-positive the result maps each feature to 0.0.
    """
    total = float(sum(max(v, 0.0) for v in importances.values()))
    if total <= 0:
        return {k: 0.0 for k in importances.keys()}
    return {k: round((max(v, 0.0) / total) * 100.0, 4) for k, v in importances.items()}


def _compute_permutation_importance(
    X: np.ndarray,
    labels: np.ndarray,
    feature_names: list[str],
    n_repeats: int = 10,
    random_state: int = 0,
) -> dict[str, float]:
    """Permutation importance: mean silhouette drop when a feature is shuffled.

    Fix: ``silhouette_score`` subsamples when ``sample_size`` is given; the
    original omitted ``random_state`` there, so repeated runs with the same
    seed could disagree. Passing it makes the scores reproducible.
    """
    rng = np.random.default_rng(random_state)
    sample_size = min(1000, len(X))
    baseline = silhouette_score(X, labels, sample_size=sample_size, random_state=random_state)
    importances: dict[str, float] = {}

    for i, fname in enumerate(feature_names):
        drops = []
        for _ in range(n_repeats):
            Xp = X.copy()
            rng.shuffle(Xp[:, i])  # destroy only this feature's signal
            s = silhouette_score(Xp, labels, sample_size=sample_size, random_state=random_state)
            drops.append(baseline - s)
        importances[fname] = round(float(np.mean(drops)), 5)

    log.info("Feature importance computed via permutation importance")
    return dict(sorted(importances.items(), key=lambda x: -x[1]))


def compute_pca_loadings(
    pca: PCA,
    feature_names: list[str],
) -> pd.DataFrame:
    """Absolute PCA component loadings as a (component x feature) DataFrame."""
    n_components = pca.n_components_
    return pd.DataFrame(
        np.abs(pca.components_),
        columns=feature_names,
        index=[f"PC{i+1}" for i in range(n_components)],
    )


def compute_inertia_curve(
    X: np.ndarray,
    k_range: tuple[int, int],
    random_state: int = 42,
) -> dict[int, float]:
    """Elbow curve: mapping k -> K-Means inertia over the inclusive k range."""
    curve: dict[int, float] = {}
    for k in range(k_range[0], k_range[1] + 1):
        km = KMeans(n_clusters=k, n_init=5, random_state=random_state)
        km.fit(X)
        curve[k] = float(km.inertia_)
    return curve


def build_pc_feature_map(
    pca_loadings: pd.DataFrame,
    top_n: int = 5,
) -> dict[str, list[str]]:
    """Map each principal component to its top-N contributing original features."""
    pc_map: dict[str, list[str]] = {}
    for pc in pca_loadings.index:
        top_features = (
            pca_loadings.loc[pc]
            .sort_values(ascending=False)
            .head(top_n)
            .index
            .tolist()
        )
        pc_map[str(pc)] = top_features
    return pc_map


def build_ordered_feature_drivers(
    feature_importances: dict[str, float],
    pca_loadings: pd.DataFrame,
    pca_variance_ratio: list[float],
    inertia_curve: dict[int, float] | None,
) -> pd.DataFrame:
    """
    Build ordered feature drivers of convergence by combining:
    - SHAP/RF feature importance
    - Weighted PCA loading strength
    - Global inertia elbow strength (reported as context, not per feature)
    """
    shap_series = pd.Series(feature_importances, dtype=float)

    # Weighted PCA contribution score per original feature; weights come from
    # explained-variance ratios (uniform if none were supplied).
    if len(pca_variance_ratio) > 0:
        comp_weights = np.array(pca_variance_ratio[: len(pca_loadings.index)], dtype=float)
    else:
        comp_weights = np.ones(len(pca_loadings.index), dtype=float)
    comp_weights = comp_weights / max(comp_weights.sum(), 1e-12)
    pca_score = (pca_loadings.T * comp_weights).sum(axis=1)

    df = pd.DataFrame({
        "feature": sorted(set(shap_series.index).union(set(pca_score.index))),
    })
    df["shap_importance"] = df["feature"].map(shap_series).fillna(0.0).astype(float)
    df["pca_weighted_loading"] = df["feature"].map(pca_score).fillna(0.0).astype(float)

    # Rank-normalize both signals to [0,1] (1 = most important), then blend.
    df["shap_rank"] = df["shap_importance"].rank(ascending=False, method="average")
    df["pca_rank"] = df["pca_weighted_loading"].rank(ascending=False, method="average")
    n = max(len(df), 1)
    df["shap_rank_norm"] = 1.0 - (df["shap_rank"] - 1.0) / max(n - 1, 1)
    df["pca_rank_norm"] = 1.0 - (df["pca_rank"] - 1.0) / max(n - 1, 1)

    # Inertia is cluster-level evidence, not feature-specific: report the max
    # absolute second difference of the elbow curve as a context column.
    elbow_strength = 0.0
    if inertia_curve and len(inertia_curve) >= 3:
        ks = sorted(inertia_curve.keys())
        vals = np.array([inertia_curve[k] for k in ks], dtype=float)
        second_diff = np.diff(vals, n=2)
        if len(second_diff) > 0:
            elbow_strength = float(np.max(np.abs(second_diff)))
    df["inertia_elbow_strength"] = elbow_strength

    df["convergence_score"] = (
        0.6 * df["shap_rank_norm"]
        + 0.4 * df["pca_rank_norm"]
    )
    df = df.sort_values("convergence_score", ascending=False).reset_index(drop=True)
    return df


def build_cluster_profiles(
    df: pd.DataFrame,
    labels: np.ndarray,
    feature_names: list[str],
) -> pd.DataFrame:
    """Cluster x feature mean table on raw/unscaled data (noise rows excluded)."""
    profiled = df.copy()
    profiled["_cluster"] = labels
    # Use only numeric columns that actually exist in the dataframe; fall back
    # to all numeric columns when none of the requested names are usable.
    num_cols = [c for c in feature_names if c in profiled.columns and pd.api.types.is_numeric_dtype(profiled[c])]
    if not num_cols:
        num_cols = [c for c in profiled.select_dtypes(include="number").columns if c != "_cluster"]
    profiles = profiled[profiled["_cluster"] != -1].groupby("_cluster")[num_cols].mean().round(2)
    return profiles


def build_explainability_report(
    X: np.ndarray,
    labels: np.ndarray,
    feature_names: list[str],
    df_raw: pd.DataFrame,
    pca: Optional[PCA],
    k_range: tuple[int, int],
    random_state: int = 42,
    n_top: int = 10,
    X_original: np.ndarray | None = None,
    original_feature_names: list[str] | None = None,
) -> ExplainabilityReport:
    """Orchestrate all explainability analyses into one report.

    Runs feature importance, PCA loadings, the inertia elbow curve, cluster
    profiles, and the blended "ordered feature drivers" ranking, and packs
    the results into an ``ExplainabilityReport``.
    """
    log.info("Computing feature importances...")
    # Prefer the original-variable matrix when supplied so importances stay
    # interpretable even if clustering ran on a PCA/latent representation.
    shap_X = X_original if X_original is not None else X
    shap_feature_names = (
        original_feature_names
        if original_feature_names is not None and len(original_feature_names) == shap_X.shape[1]
        else feature_names
    )
    importances = compute_shap_importance(shap_X, labels, shap_feature_names, random_state)
    importance_pct = to_importance_percentages(importances)
    # (Fix: removed a dead `top_features = ...` assignment here — it was
    # unconditionally overwritten after the drivers table is built below.)

    # PCA loadings are defined on the original pre-PCA feature space.
    # When clustering uses PCA output, `feature_names` may be ["PC1", ...],
    # so derive a compatible name list for loadings to avoid shape mismatch.
    n_pca_input_features = int(pca.components_.shape[1]) if pca is not None else len(feature_names)
    numeric_raw_cols = [c for c in df_raw.columns if pd.api.types.is_numeric_dtype(df_raw[c])]
    if len(feature_names) == n_pca_input_features:
        loading_feature_names = feature_names
    elif len(numeric_raw_cols) == n_pca_input_features:
        loading_feature_names = numeric_raw_cols
    else:
        loading_feature_names = [f"feature_{i+1}" for i in range(n_pca_input_features)]

    log.info("Computing PCA loadings...")
    if pca is not None:
        pca_loadings = compute_pca_loadings(pca, loading_feature_names)
        pca_var = pca.explained_variance_ratio_.tolist()
    else:
        # No fitted PCA was passed: fit a quick one just for loadings analysis.
        n_comp = min(5, len(feature_names))
        pca_temp = PCA(n_components=n_comp, random_state=random_state)
        pca_temp.fit(X)
        pca_loadings = compute_pca_loadings(pca_temp, loading_feature_names)
        pca_var = pca_temp.explained_variance_ratio_.tolist()

    log.info("Computing inertia curve...")
    inertia = compute_inertia_curve(X, k_range, random_state)

    log.info("Computing cluster profiles...")
    profiles = build_cluster_profiles(df_raw, labels, feature_names)

    log.info("Building ordered convergence drivers...")
    ordered_drivers = build_ordered_feature_drivers(
        feature_importances=importances,
        pca_loadings=pca_loadings,
        pca_variance_ratio=pca_var,
        inertia_curve=inertia,
    )
    pc_map = build_pc_feature_map(pca_loadings, top_n=5)
    top_features = ordered_drivers["feature"].head(n_top).tolist()

    return ExplainabilityReport(
        top_features=top_features,
        feature_importances=importances,
        feature_importance_pct=importance_pct,
        pca_loadings=pca_loadings,
        cluster_profiles=profiles,
        pca_variance_ratio=pca_var,
        inertia_curve=inertia,
        ordered_feature_drivers=ordered_drivers,
        pc_feature_map=pc_map,
    )
inertia_curve=inertia, + ordered_feature_drivers=ordered_drivers, + pc_feature_map=pc_map, + ) diff --git a/segplus/feature_engineering.py b/segplus/feature_engineering.py new file mode 100644 index 0000000..77910c6 --- /dev/null +++ b/segplus/feature_engineering.py @@ -0,0 +1,141 @@ +"""Feature engineering: domain rules, imputation, encoding, scaling, PCA.""" +from __future__ import annotations + +import logging +from typing import Optional + +import numpy as np +import pandas as pd +from sklearn.decomposition import PCA +from sklearn.impute import SimpleImputer +from sklearn.preprocessing import LabelEncoder, StandardScaler + +from .config import DomainConfig, PipelineConfig +from .types import DataSchema, FeatureEngineeringResult + +log = logging.getLogger("segplus.feature_engineering") + + +class FeatureEngineer: + """Full feature engineering pipeline: FE rules -> impute -> encode -> winsorize -> scale -> PCA.""" + + def __init__(self, domain_config: DomainConfig, pipeline_config: PipelineConfig): + self.domain = domain_config + self.config = pipeline_config + self._scaler: StandardScaler | None = None + self._pca: PCA | None = None + self._imputer: SimpleImputer | None = None + self._label_encoders: dict[str, LabelEncoder] = {} + self._feature_cols: list[str] = [] + + def run(self, df: pd.DataFrame) -> FeatureEngineeringResult: + """Execute the full FE pipeline and return results.""" + df_original = df.copy() + + # Step 1: Apply domain-specific FE rules + df = self._apply_domain_rules(df) + + # Step 2: Drop excluded columns + drop_cols = [c for c in self.config.exclude_cols if c in df.columns] + df_work = df.drop(columns=drop_cols, errors="ignore") + + # Step 3: Identify column types + num_cols = [c for c in df_work.columns if pd.api.types.is_numeric_dtype(df_work[c])] + cat_cols = [c for c in self.domain.categorical_columns if c in df_work.columns] + + # Step 4: Encode categoricals + for col in cat_cols: + le = LabelEncoder() + df_work[col] = 
le.fit_transform(df_work[col].astype(str)) + self._label_encoders[col] = le + if col not in num_cols: + num_cols.append(col) + + # Step 5: Select feature columns (numeric + encoded categoricals) + self._feature_cols = [c for c in num_cols if c in df_work.columns] + X = df_work[self._feature_cols].copy() + + # Step 6: Impute missing values + self._imputer = SimpleImputer(strategy=self.config.imputation_strategy) + X_arr = self._imputer.fit_transform(X) + X = pd.DataFrame(X_arr, columns=self._feature_cols, index=df_work.index) + + # Step 7: Winsorize outliers + X = self._winsorize(X, self._feature_cols) + + df_engineered = X.copy() + + # Step 8: Scale + self._scaler = StandardScaler() + X_scaled = self._scaler.fit_transform(X) + + # Step 9: PCA (optional) + pca = None + if self.config.pca_variance_threshold < 1.0: + pca, X_scaled = self._apply_pca(X_scaled) + pca_names = [f"PC{i+1}" for i in range(X_scaled.shape[1])] + log.info( + "PCA: %d components explain %.1f%% variance", + X_scaled.shape[1], + sum(pca.explained_variance_ratio_[:X_scaled.shape[1]]) * 100, + ) + else: + pca_names = None + + feature_names = pca_names if pca_names else self._feature_cols + + log.info("Feature engineering complete: %d features -> %d-dim output", len(self._feature_cols), X_scaled.shape[1]) + + return FeatureEngineeringResult( + df_original=df_original, + df_engineered=df_engineered, + X_scaled=X_scaled, + feature_names=feature_names if pca_names else list(self._feature_cols), + pca=pca, + scaler=self._scaler, + label_encoders=self._label_encoders, + ) + + def _apply_domain_rules(self, df: pd.DataFrame) -> pd.DataFrame: + """Apply feature engineering rules from domain config using df.eval().""" + df = df.copy() + for rule in self.domain.feature_engineering: + try: + result = df.eval(rule.formula) + if rule.bins: + n_bins = len(rule.bins) + df[rule.name] = pd.qcut(result, q=n_bins, labels=rule.bins, duplicates="drop") + else: + df[rule.name] = result + log.info(" FE rule applied: %s", 
rule.name) + except Exception as e: + log.warning(" FE rule '%s' failed: %s", rule.name, e) + return df + + def _winsorize(self, X: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame: + """Clip outliers to [lower, upper] percentiles.""" + for col in num_cols: + if col in X.columns: + lo = X[col].quantile(self.config.winsorize_lower) + hi = X[col].quantile(self.config.winsorize_upper) + X[col] = X[col].clip(lo, hi) + return X + + def _apply_pca(self, X_scaled: np.ndarray) -> tuple[PCA, np.ndarray]: + """Apply PCA with automatic component selection.""" + pca = PCA(random_state=self.config.random_state) + pca.fit(X_scaled) + + cumvar = np.cumsum(pca.explained_variance_ratio_) + n_components = int(np.searchsorted(cumvar, self.config.pca_variance_threshold) + 1) + n_components = max(2, min(n_components, X_scaled.shape[1])) + + pca_final = PCA(n_components=n_components, random_state=self.config.random_state) + X_pca = pca_final.fit_transform(X_scaled) + self._pca = pca_final + return pca_final, X_pca + + @property + def original_feature_names(self) -> list[str]: + """Original feature column names before PCA.""" + return list(self._feature_cols) diff --git a/segplus/final_segplus_pipeline2.ipynb b/segplus/final_segplus_pipeline2.ipynb new file mode 100644 index 0000000..22bec32 --- /dev/null +++ b/segplus/final_segplus_pipeline2.ipynb @@ -0,0 +1,1290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c0c4075e", + "metadata": {}, + "source": [ + "# SegPlus Final End-to-End Pipeline\n", + "\n", + "This notebook runs the full architecture:\n", + "- Data input and validation\n", + "- Feature engineering\n", + "- KMeans + DBSCAN + GMM modeling loop\n", + "- Cluster evaluation + stability\n", + "- Explainability (feature drivers, PCA loadings, profiles)\n", + "- Persona generation and business grounding (Ollama with fallback)\n", + "- Visualization and export\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "64579db4", + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing required: ['scikit-learn', 'pyyaml']\n", + "Missing optional: []\n" + ] + } + ], + "source": [ + "# Optional dependency installer/check\n", + "import importlib\n", + "import subprocess\n", + "import sys\n", + "\n", + "REQUIRED_PACKAGES = [\n", + " \"numpy\", \"pandas\", \"scikit-learn\", \"matplotlib\", \"seaborn\",\n", + " \"pyyaml\", \"openpyxl\", \"requests\", \"scipy\"\n", + "]\n", + "OPTIONAL_PACKAGES = [\"shap\"]\n", + "AUTO_INSTALL_MISSING = False # set True if you want notebook to install missing packages\n", + "\n", + "missing_required = []\n", + "missing_optional = []\n", + "\n", + "for pkg in REQUIRED_PACKAGES:\n", + " mod = pkg.replace(\"-\", \"_\")\n", + " try:\n", + " importlib.import_module(mod)\n", + " except Exception:\n", + " missing_required.append(pkg)\n", + "\n", + "for pkg in OPTIONAL_PACKAGES:\n", + " try:\n", + " importlib.import_module(pkg)\n", + " except Exception:\n", + " missing_optional.append(pkg)\n", + "\n", + "print(\"Missing required:\", missing_required)\n", + "print(\"Missing optional:\", missing_optional)\n", + "\n", + "if missing_required and AUTO_INSTALL_MISSING:\n", + " cmd = [sys.executable, \"-m\", \"pip\", \"install\", *missing_required]\n", + " print(\"Installing:\", \" \".join(missing_required))\n", + " subprocess.check_call(cmd)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32c77bb2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project root: c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\n" + ] + } + ], + "source": [ + "import logging\n", + "import json\n", + "from pathlib import Path\n", + "import sys\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Ensure project root is importable when notebook is inside segplus/\n", + "cwd = Path.cwd()\n", + "project_root = cwd.parent if cwd.name.lower() == 
\"segplus\" else cwd\n", + "if str(project_root) not in sys.path:\n", + " sys.path.insert(0, str(project_root))\n", + "\n", + "from segplus.config import PipelineConfig, build_domain_config_from_dataframe\n", + "from segplus.data_input import load_data, infer_schema, validate_schema\n", + "from segplus.feature_engineering import FeatureEngineer\n", + "from segplus.modeling_loop import modeling_loop\n", + "from segplus.evaluation import run_stability_test\n", + "from segplus.experiment_log import ExperimentLog\n", + "from segplus.explainability import build_explainability_report\n", + "from segplus.ollama_client import OllamaClient\n", + "from segplus.persona_generation import PersonaGenerator\n", + "from segplus.visualization import (\n", + " plot_cluster_scatter_2d,\n", + " plot_shap_importance,\n", + " plot_shap_summary,\n", + " plot_pca_loadings_heatmap,\n", + " plot_cluster_profiles_heatmap,\n", + " plot_cluster_sizes,\n", + " plot_radar_charts,\n", + " plot_experiment_history,\n", + " plot_elbow_curve,\n", + ")\n", + "\n", + "logging.basicConfig(\n", + " level=logging.INFO,\n", + " format=\"%(asctime)s | %(levelname)-8s | %(name)-30s | %(message)s\",\n", + ")\n", + "\n", + "print(\"Project root:\", project_root)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e0196175", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data: c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\final_enterprise_clustering_dataset_single_sheet.xlsx\n", + "Output: c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\n", + "Ollama host: http://localhost:11434\n", + "Primary LLM: qwen2.5:7b\n" + ] + } + ], + "source": [ + "# Configuration (Excel direct-ingest, no domain YAML)\n", + "default_data_path = project_root / \"final_enterprise_clustering_dataset_single_sheet.xlsx\"\n", + "default_output_dir = project_root / \"segplus_output\"\n", + "\n", + 
"config = PipelineConfig(\n", + " data_path=str(default_data_path),\n", + " domain_key=\"direct_ingest\",\n", + " sheet_name=None,\n", + " k_range=(2, 8),\n", + " max_iterations=5,\n", + " pca_variance_threshold=0.85,\n", + " output_dir=str(default_output_dir),\n", + " ollama_host=\"http://localhost:11434\",\n", + " ollama_model=\"qwen2.5:7b\",\n", + " ollama_timeout=240,\n", + " business_objective=(\n", + " \"Identify distinct customer segments to personalise marketing campaigns, \"\n", + " \"improve retention for high-value customers, and convert mid-tier customers \"\n", + " \"to premium products.\"\n", + " ),\n", + ")\n", + "\n", + "# Preferred model order for local Ollama auto-selection\n", + "PREFERRED_OLLAMA_MODELS = [\n", + " \"qwen2.5\",\n", + " \"qwen2\",\n", + " \"llama3.2\",\n", + " \"llama3.1\",\n", + " \"llama3\",\n", + " \"mistral\",\n", + "]\n", + "\n", + "output_dir = Path(config.output_dir)\n", + "output_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print(\"Data:\", config.data_path)\n", + "print(\"Output:\", output_dir)\n", + "print(\"Ollama host:\", config.ollama_host)\n", + "print(\"Primary LLM:\", config.ollama_model)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "85b24976", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:43:26,707 | INFO | segplus.data_input | Loaded final_enterprise_clustering_dataset_single_sheet.xlsx: 120000 rows x 27 cols\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data Quality Report\n", + " Rows: 120,000 | Columns: 27\n", + " Duplicates: 0\n", + " Status: PASSED\n", + "Raw shape: (120000, 27)\n", + "Inferred categorical columns: ['delinquency_flag_x', 'CreditCard', 'Investment', 'Loan']\n" + ] + } + ], + "source": [ + "# 1) Data Input + Validation (no YAML)\n", + "df_raw = load_data(config.data_path, sheet_name=config.sheet_name)\n", + "\n", + "# Build domain config directly from dataframe 
schema\n", + "_domain_key = config.domain_key if config.domain_key else \"direct_ingest\"\n", + "domain_config = build_domain_config_from_dataframe(\n", + " df_raw,\n", + " domain_key=_domain_key,\n", + " exclude_cols=config.exclude_cols,\n", + ")\n", + "\n", + "schema = infer_schema(df_raw)\n", + "quality_report = validate_schema(df_raw, domain_config, schema)\n", + "\n", + "print(quality_report.summary())\n", + "if not quality_report.passed:\n", + " raise ValueError(\"Data quality checks failed. Fix input data and rerun.\")\n", + "\n", + "print(\"Raw shape:\", df_raw.shape)\n", + "print(\"Inferred categorical columns:\", domain_config.categorical_columns[:15])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7e4bf552", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:43:27,584 | INFO | segplus.feature_engineering | PCA: 7 components explain 87.2% variance\n", + "2026-03-09 12:43:27,585 | INFO | segplus.feature_engineering | Feature engineering complete: 26 features -> 7-dim output\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Engineered dataframe shape: (120000, 26)\n", + "Model matrix shape used for clustering: (120000, 7)\n", + "Feature count used for clustering: 7\n", + "SHAP source feature count (original vars): 26\n", + "First SHAP source vars: ['month', 'monthly_spend_x', 'utilization_ratio_x', 'delinquency_flag_x', 'age', 'annual_income', 'risk_score', 'credit_score', 'digital_affinity', 'tenure_months']\n" + ] + } + ], + "source": [ + "# 2) Feature Engineering\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "feature_engineer = FeatureEngineer(domain_config, config)\n", + "fe_result = feature_engineer.run(df_raw)\n", + "\n", + "# Original-variable matrix for SHAP (never PCA names)\n", + "original_feature_names_for_shap = fe_result.df_engineered.columns.tolist()\n", + "X_original_for_shap = 
StandardScaler().fit_transform(fe_result.df_engineered.values)\n", + "\n", + "# Optional latent representation via local autoencoder (if tensorflow is installed)\n", + "USE_AUTOENCODER_REPRESENTATION = False\n", + "AE_LATENT_DIM = 8\n", + "AE_EPOCHS = 40\n", + "AE_BATCH_SIZE = 256\n", + "\n", + "X = fe_result.X_scaled\n", + "feature_names = fe_result.feature_names\n", + "pca_for_explainability = fe_result.pca\n", + "\n", + "if USE_AUTOENCODER_REPRESENTATION:\n", + " try:\n", + " import tensorflow as tf\n", + " from tensorflow.keras import Model\n", + " from tensorflow.keras.layers import Dense, Input\n", + " from tensorflow.keras.callbacks import EarlyStopping\n", + "\n", + " tf.random.set_seed(config.random_state)\n", + " input_dim = X.shape[1]\n", + " latent_dim = max(2, min(AE_LATENT_DIM, input_dim - 1))\n", + "\n", + " inp = Input(shape=(input_dim,))\n", + " x = Dense(max(16, input_dim // 2), activation=\"relu\")(inp)\n", + " latent = Dense(latent_dim, activation=\"linear\", name=\"latent\")(x)\n", + " x = Dense(max(16, input_dim // 2), activation=\"relu\")(latent)\n", + " out = Dense(input_dim, activation=\"linear\")(x)\n", + "\n", + " autoencoder = Model(inp, out)\n", + " encoder = Model(inp, latent)\n", + " autoencoder.compile(optimizer=\"adam\", loss=\"mse\")\n", + " autoencoder.fit(\n", + " X,\n", + " X,\n", + " epochs=AE_EPOCHS,\n", + " batch_size=min(AE_BATCH_SIZE, len(X)),\n", + " verbose=0,\n", + " callbacks=[EarlyStopping(monitor=\"loss\", patience=5, restore_best_weights=True)],\n", + " )\n", + "\n", + " X = encoder.predict(X, verbose=0)\n", + " feature_names = [f\"AE{i+1}\" for i in range(X.shape[1])]\n", + " pca_for_explainability = None # PCA loadings no longer match AE latent space\n", + " print(f\"Autoencoder latent representation enabled: {X.shape}\")\n", + " except Exception as e:\n", + " print(f\"Autoencoder path unavailable ({e}); continuing with FE output.\")\n", + "\n", + "print(\"Engineered dataframe shape:\", 
fe_result.df_engineered.shape)\n", + "print(\"Model matrix shape used for clustering:\", X.shape)\n", + "print(\"Feature count used for clustering:\", len(feature_names))\n", + "print(\"SHAP source feature count (original vars):\", len(original_feature_names_for_shap))\n", + "print(\"First SHAP source vars:\", original_feature_names_for_shap[:10])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ca7f19aa", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\UmairAhmed\\anaconda3\\Lib\\site-packages\\joblib\\externals\\loky\\backend\\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:\n", + "[WinError 2] The system cannot find the file specified\n", + "Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.\n", + " warnings.warn(\n", + " File \"c:\\Users\\UmairAhmed\\anaconda3\\Lib\\site-packages\\joblib\\externals\\loky\\backend\\context.py\", line 257, in _count_physical_cores\n", + " cpu_info = subprocess.run(\n", + " \"wmic CPU Get NumberOfCores /Format:csv\".split(),\n", + " capture_output=True,\n", + " text=True,\n", + " )\n", + " File \"c:\\Users\\UmairAhmed\\anaconda3\\Lib\\subprocess.py\", line 554, in run\n", + " with Popen(*popenargs, **kwargs) as process:\n", + " ~~~~~^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"c:\\Users\\UmairAhmed\\anaconda3\\Lib\\subprocess.py\", line 1039, in __init__\n", + " self._execute_child(args, executable, preexec_fn, close_fds,\n", + " ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " pass_fds, cwd, env,\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " ...<5 lines>...\n", + " gid, gids, uid, umask,\n", + " ^^^^^^^^^^^^^^^^^^^^^^\n", + " start_new_session, process_group)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"c:\\Users\\UmairAhmed\\anaconda3\\Lib\\subprocess.py\", line 1554, in 
_execute_child\n", + " hp, ht, pid, tid = _winapi.CreateProcess(executable, args,\n", + " ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^\n", + " # no special security\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " ...<4 lines>...\n", + " cwd,\n", + " ^^^^\n", + " startupinfo)\n", + " ^^^^^^^^^^^^\n", + "2026-03-09 12:43:30,308 | INFO | segplus.clustering | K search: scores={2: np.float64(0.1996), 3: np.float64(0.2036), 4: np.float64(0.1656), 5: np.float64(0.1648), 6: np.float64(0.1443), 7: np.float64(0.1333), 8: np.float64(0.1348)} | best k=3 (sil=0.2036)\n", + "2026-03-09 12:43:31,440 | INFO | segplus.clustering | DBSCAN eps estimated: 1.661 (knee at index 119999)\n", + "2026-03-09 12:43:31,442 | INFO | segplus.modeling_loop | ============================================================\n", + "2026-03-09 12:43:31,442 | INFO | segplus.modeling_loop | Modeling Loop - Iteration 1/5 | k=3\n", + "2026-03-09 12:43:31,443 | INFO | segplus.modeling_loop | ============================================================\n", + "2026-03-09 12:43:31,810 | INFO | segplus.clustering | [kmeans] clusters=3\n", + "2026-03-09 12:43:48,045 | INFO | segplus.clustering | [dbscan] clusters=178\n", + "2026-03-09 12:44:03,897 | INFO | segplus.clustering | [gmm] clusters=3\n", + "2026-03-09 12:44:04,007 | INFO | segplus.evaluation | [kmeans] k=3 | sil=0.2117 | DB=1.5251 | CH=31125.2 | cov=1.00\n", + "2026-03-09 12:44:04,207 | INFO | segplus.evaluation | [dbscan] k=178 | sil=-0.1921 | DB=0.8652 | CH=142.0 | cov=1.00\n", + "2026-03-09 12:44:04,304 | INFO | segplus.evaluation | [gmm] k=3 | sil=0.0720 | DB=3.5552 | CH=13484.7 | cov=1.00\n", + "2026-03-09 12:44:04,304 | INFO | segplus.evaluation | Composite ranking: kmeans=0.875 | dbscan=0.325 | gmm=0.300\n", + "2026-03-09 12:44:04,305 | INFO | segplus.evaluation | Winner: kmeans (composite=0.875)\n", + "2026-03-09 12:44:04,305 | INFO | segplus.experiment_log | Experiment 1: kmeans k=3 sil=0.2117 pass=True\n", + "2026-03-09 12:44:04,306 | INFO | 
segplus.modeling_loop | Best: kmeans | sil=0.2117 | DB=1.5251 | PASS=True\n", + "2026-03-09 12:44:04,306 | INFO | segplus.modeling_loop | Evaluation PASSED on iteration 1.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best algorithm: kmeans\n", + "Clusters: 3\n", + "Silhouette: 0.2117\n", + "Davies-Bouldin: 1.5251\n", + "Passes gate: True\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iterationtimestampkepsgmm_covbest_algorithmn_clusterssilhouettedavies_bouldincalinski_harabaszpassedreconfig_strategy
012026-03-09T12:44:04.30592331.661fullkmeans30.21171.525131125.2TrueNone
\n", + "
" + ], + "text/plain": [ + " iteration timestamp k eps gmm_cov best_algorithm \\\n", + "0 1 2026-03-09T12:44:04.305923 3 1.661 full kmeans \n", + "\n", + " n_clusters silhouette davies_bouldin calinski_harabasz passed \\\n", + "0 3 0.2117 1.5251 31125.2 True \n", + "\n", + " reconfig_strategy \n", + "0 None " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3) Modeling Loop (KMeans + DBSCAN + GMM with reconfiguration)\n", + "experiment_log = ExperimentLog()\n", + "\n", + "best_eval, best_cfg = modeling_loop(\n", + " X=X,\n", + " feature_names=feature_names,\n", + " config=config,\n", + " experiment_log=experiment_log,\n", + ")\n", + "\n", + "print(\"Best algorithm:\", best_eval.algorithm)\n", + "print(\"Clusters:\", best_eval.n_clusters)\n", + "print(\"Silhouette:\", best_eval.silhouette)\n", + "print(\"Davies-Bouldin:\", best_eval.davies_bouldin)\n", + "print(\"Passes gate:\", best_eval.passes)\n", + "\n", + "exp_df = experiment_log.to_dataframe()\n", + "exp_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7e9bafbb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:44:11,558 | INFO | segplus.evaluation | Stability (kmeans): ARI=0.988 +/- 0.006 (threshold=0.70, stable=True)\n", + "2026-03-09 12:44:11,561 | INFO | segplus.explainability | Computing feature importances...\n", + "2026-03-09 12:44:18,121 | WARNING | segplus.explainability | TreeSHAP failed (only length-1 arrays can be converted to Python scalars), falling back to RF feature_importances_\n", + "2026-03-09 12:44:18,142 | INFO | segplus.explainability | Feature importance computed via RF feature_importances_\n", + "2026-03-09 12:44:18,146 | INFO | segplus.explainability | Computing PCA loadings...\n", + "2026-03-09 12:44:18,146 | INFO | segplus.explainability | Computing inertia curve...\n", + "2026-03-09 12:44:20,124 | INFO | segplus.explainability | 
Computing cluster profiles...\n", + "2026-03-09 12:44:20,189 | INFO | segplus.explainability | Building ordered convergence drivers...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stability ARI mean: 0.988\n", + "Stability algorithm used: kmeans\n", + "Ordered top drivers: ['risk_score', 'risk_score_cluster', 'risk_behavior_score', 'delinquency_flag_y', 'credit_score', 'credit_score_cluster', 'monthly_spend_y', 'value_score', 'annual_income', 'log_income']\n", + "Top SHAP % (original variables):\n", + " credit_score: 19.58%\n", + " credit_score_cluster: 19.05%\n", + " risk_score: 9.32%\n", + " risk_score_cluster: 7.35%\n", + " risk_behavior_score: 6.92%\n", + " monthly_spend_y: 5.59%\n", + " delinquency_flag_y: 5.43%\n", + " value_score: 5.40%\n", + " monthly_spend_x: 3.53%\n", + " annual_income: 3.00%\n", + "PC to original feature map (top 5):\n", + " PC1: ['credit_score', 'credit_score_cluster', 'risk_score', 'risk_score_cluster', 'log_income']\n", + " PC2: ['value_score', 'monthly_spend_y', 'monthly_spend_x', 'lifetime_value_proxy', 'utilization_ratio_x']\n", + " PC3: ['age', 'age_cluster', 'digital_affinity_cluster', 'digital_affinity', 'delinquency_flag_y']\n", + " PC4: ['tenure_months', 'tenure_months_cluster', 'lifetime_value_proxy', 'utilization_ratio_x', 'utilization_ratio_y']\n", + " PC5: ['annual_income', 'annual_income_cluster', 'log_income', 'delinquency_flag_y', 'risk_behavior_score']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
featureshap_importancepca_weighted_loadingshap_rankpca_rankshap_rank_normpca_rank_norminertia_elbow_strengthconvergence_score
0risk_score0.093210.1696243.03.50.920.90158504.1408420.912
1risk_score_cluster0.073450.1696244.03.50.880.90158504.1408420.888
2risk_behavior_score0.069180.1701175.02.00.840.96158504.1408420.888
3delinquency_flag_y0.054290.1702847.01.00.761.00158504.1408420.856
4credit_score0.195770.1500411.012.51.000.54158504.1408420.816
5credit_score_cluster0.190510.1500412.012.50.960.54158504.1408420.792
6monthly_spend_y0.055910.1504126.010.50.800.62158504.1408420.728
7value_score0.053970.1504128.010.50.720.62158504.1408420.680
8annual_income0.030040.15700110.08.00.640.72158504.1408420.672
9log_income0.029880.15923811.07.00.600.76158504.1408420.664
10utilization_ratio_y0.028070.16146912.06.00.560.80158504.1408420.656
11monthly_spend_x0.035330.1490169.014.00.680.48158504.1408420.600
12utilization_ratio_x0.020780.16152515.05.00.440.84158504.1408420.600
13annual_income_cluster0.027650.15700113.09.00.520.68158504.1408420.584
14lifetime_value_proxy0.024850.14514214.015.00.480.44158504.1408420.464
\n", + "
" + ], + "text/plain": [ + " feature shap_importance pca_weighted_loading shap_rank \\\n", + "0 risk_score 0.09321 0.169624 3.0 \n", + "1 risk_score_cluster 0.07345 0.169624 4.0 \n", + "2 risk_behavior_score 0.06918 0.170117 5.0 \n", + "3 delinquency_flag_y 0.05429 0.170284 7.0 \n", + "4 credit_score 0.19577 0.150041 1.0 \n", + "5 credit_score_cluster 0.19051 0.150041 2.0 \n", + "6 monthly_spend_y 0.05591 0.150412 6.0 \n", + "7 value_score 0.05397 0.150412 8.0 \n", + "8 annual_income 0.03004 0.157001 10.0 \n", + "9 log_income 0.02988 0.159238 11.0 \n", + "10 utilization_ratio_y 0.02807 0.161469 12.0 \n", + "11 monthly_spend_x 0.03533 0.149016 9.0 \n", + "12 utilization_ratio_x 0.02078 0.161525 15.0 \n", + "13 annual_income_cluster 0.02765 0.157001 13.0 \n", + "14 lifetime_value_proxy 0.02485 0.145142 14.0 \n", + "\n", + " pca_rank shap_rank_norm pca_rank_norm inertia_elbow_strength \\\n", + "0 3.5 0.92 0.90 158504.140842 \n", + "1 3.5 0.88 0.90 158504.140842 \n", + "2 2.0 0.84 0.96 158504.140842 \n", + "3 1.0 0.76 1.00 158504.140842 \n", + "4 12.5 1.00 0.54 158504.140842 \n", + "5 12.5 0.96 0.54 158504.140842 \n", + "6 10.5 0.80 0.62 158504.140842 \n", + "7 10.5 0.72 0.62 158504.140842 \n", + "8 8.0 0.64 0.72 158504.140842 \n", + "9 7.0 0.60 0.76 158504.140842 \n", + "10 6.0 0.56 0.80 158504.140842 \n", + "11 14.0 0.68 0.48 158504.140842 \n", + "12 5.0 0.44 0.84 158504.140842 \n", + "13 9.0 0.52 0.68 158504.140842 \n", + "14 15.0 0.48 0.44 158504.140842 \n", + "\n", + " convergence_score \n", + "0 0.912 \n", + "1 0.888 \n", + "2 0.888 \n", + "3 0.856 \n", + "4 0.816 \n", + "5 0.792 \n", + "6 0.728 \n", + "7 0.680 \n", + "8 0.672 \n", + "9 0.664 \n", + "10 0.656 \n", + "11 0.600 \n", + "12 0.600 \n", + "13 0.584 \n", + "14 0.464 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 4) Stability + Explainability\n", + "stability = run_stability_test(\n", + " X=X,\n", + " labels=best_eval.labels,\n", + " 
n_clusters=best_eval.n_clusters,\n", + " config=config,\n", + " algorithm=best_eval.algorithm,\n", + " model=best_eval.model,\n", + ")\n", + "\n", + "explainability = build_explainability_report(\n", + " X=X,\n", + " labels=best_eval.labels,\n", + " feature_names=feature_names,\n", + " df_raw=fe_result.df_engineered,\n", + " pca=pca_for_explainability,\n", + " k_range=config.k_range,\n", + " random_state=config.random_state,\n", + " n_top=config.n_top_features,\n", + " X_original=X_original_for_shap,\n", + " original_feature_names=original_feature_names_for_shap,\n", + ")\n", + "\n", + "print(\"Stability ARI mean:\", stability.ari_mean)\n", + "print(\"Stability algorithm used:\", best_eval.algorithm)\n", + "print(\"Ordered top drivers:\", explainability.top_features[:10])\n", + "print(\"Top SHAP % (original variables):\")\n", + "for k, v in list(explainability.feature_importance_pct.items())[:10]:\n", + " print(f\" {k}: {v:.2f}%\")\n", + "\n", + "print(\"PC to original feature map (top 5):\")\n", + "for pc, cols in list(explainability.pc_feature_map.items())[:5]:\n", + " print(f\" {pc}: {cols}\")\n", + "\n", + "explainability.ordered_feature_drivers.head(15)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "000fb2d4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installed Ollama models: ['qwen2.5:7b']\n", + "Selected model tag: qwen2.5:7b\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:44:24,482 | INFO | segplus.ollama_client | Ollama reachable. 
Model 'qwen2.5:7b' available.\n", + "2026-03-09 12:47:02,703 | INFO | segplus.ollama_client | Ollama response received (695 chars)\n", + "2026-03-09 12:47:26,233 | INFO | segplus.ollama_client | Ollama response received (12 chars)\n", + "2026-03-09 12:47:26,236 | INFO | segplus.persona_generation | Focused naming for Cluster A: 'High Rollers'\n", + "2026-03-09 12:47:26,247 | INFO | segplus.persona_generation | Persona generated: Cluster A -> High Rollers\n", + "2026-03-09 12:49:15,034 | INFO | segplus.ollama_client | Ollama response received (625 chars)\n", + "2026-03-09 12:49:39,299 | INFO | segplus.ollama_client | Ollama response received (18 chars)\n", + "2026-03-09 12:49:39,307 | INFO | segplus.persona_generation | Persona generated: Cluster B -> Credit Risk Strategists\n", + "2026-03-09 12:51:31,781 | INFO | segplus.ollama_client | Ollama response received (659 chars)\n", + "2026-03-09 12:51:54,394 | INFO | segplus.ollama_client | Ollama response received (16 chars)\n", + "2026-03-09 12:51:54,396 | INFO | segplus.persona_generation | Persona generated: Cluster C -> Credit Risk Strategists\n", + "2026-03-09 12:51:54,421 | INFO | segplus.persona_generation | Grounding personas in business objective...\n", + "2026-03-09 12:55:23,911 | INFO | segplus.ollama_client | Ollama response received (2919 chars)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generated personas: 3\n", + "Executive summary: Our analysis reveals three distinct customer segments: High Rollers, Credit Risk Strategists with Higher Credit Scores, and Credit Risk Strategists with Lower Credit Scores. High Rollers represent the highest value and should be prioritized for premium product offerings. Credit Risk Strategists with Higher Credit Scores are loyal and should be retained through personalized loyalty programs. 
Lower Credit Score Strategists present a risk but have potential for growth through targeted credit enhancement programs.\n", + "persona_df columns: ['cluster', 'genai_persona_name', 'profile_descriptor', 'description', 'cluster_size', 'cluster_pct', 'ordered_top_features', 'genai_recommendations', 'categorization_basis', 'naming_rationale']\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
clustergenai_persona_nameprofile_descriptordescriptioncluster_sizecluster_pctordered_top_featuresgenai_recommendationscategorization_basisnaming_rationale
0Cluster AHigh RollersHigh Value Score & High Monthly Spend YHigh-Value Premium Spenders are among the most...87740.0731value_score:0.1, monthly_spend_y:25414.84, mon...Tailor premium offers and services to enhance ...value_score, monthly_spend_y, monthly_spend_x,...Second-pass focused LLM naming from profile: H...
1Cluster BCredit Risk Strategists – Higher Credit Score ...High Credit Score & Low Risk ScoreCreditworthy Loyalists are high-income, low-ri...567820.4732credit_score:827.07, credit_score_cluster:827....Target with premium product promotions | Offer...credit_score, credit_score_cluster, risk_score...LLM returned generic name; using business-styl...
2Cluster CCredit Risk Strategists – Lower Credit Score C...Low Credit Score & High Risk ScoreThese customers have slightly lower credit sco...544440.4537credit_score:692.54, credit_score_cluster:692....Offer personalized financial education and bud...credit_score, credit_score_cluster, risk_score...LLM returned generic name; using business-styl...
\n", + "
" + ], + "text/plain": [ + " cluster genai_persona_name \\\n", + "0 Cluster A High Rollers \n", + "1 Cluster B Credit Risk Strategists – Higher Credit Score ... \n", + "2 Cluster C Credit Risk Strategists – Lower Credit Score C... \n", + "\n", + " profile_descriptor \\\n", + "0 High Value Score & High Monthly Spend Y \n", + "1 High Credit Score & Low Risk Score \n", + "2 Low Credit Score & High Risk Score \n", + "\n", + " description cluster_size \\\n", + "0 High-Value Premium Spenders are among the most... 8774 \n", + "1 Creditworthy Loyalists are high-income, low-ri... 56782 \n", + "2 These customers have slightly lower credit sco... 54444 \n", + "\n", + " cluster_pct ordered_top_features \\\n", + "0 0.0731 value_score:0.1, monthly_spend_y:25414.84, mon... \n", + "1 0.4732 credit_score:827.07, credit_score_cluster:827.... \n", + "2 0.4537 credit_score:692.54, credit_score_cluster:692.... \n", + "\n", + " genai_recommendations \\\n", + "0 Tailor premium offers and services to enhance ... \n", + "1 Target with premium product promotions | Offer... \n", + "2 Offer personalized financial education and bud... \n", + "\n", + " categorization_basis \\\n", + "0 value_score, monthly_spend_y, monthly_spend_x,... \n", + "1 credit_score, credit_score_cluster, risk_score... \n", + "2 credit_score, credit_score_cluster, risk_score... \n", + "\n", + " naming_rationale \n", + "0 Second-pass focused LLM naming from profile: H... \n", + "1 LLM returned generic name; using business-styl... \n", + "2 LLM returned generic name; using business-styl... 
" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 5) Persona Generation + Business Grounding (Ollama local, with fallback)\n", + "import requests\n", + "\n", + "\n", + "def _list_local_ollama_models(host: str) -> list[str]:\n", + " try:\n", + " r = requests.get(f\"{host.rstrip('/')}/api/tags\", timeout=8)\n", + " r.raise_for_status()\n", + " return [m.get(\"name\", \"\") for m in r.json().get(\"models\", []) if m.get(\"name\")]\n", + " except Exception as e:\n", + " print(f\"Ollama model discovery failed: {e}\")\n", + " return []\n", + "\n", + "\n", + "def _choose_ollama_model(installed: list[str], preferred: list[str], configured: str) -> str:\n", + " if configured and configured in installed:\n", + " return configured\n", + " if configured:\n", + " matches = [m for m in installed if configured in m]\n", + " if matches:\n", + " return matches[0]\n", + " for pref in preferred:\n", + " matches = [m for m in installed if m.startswith(pref) or pref in m]\n", + " if matches:\n", + " return matches[0]\n", + " return installed[0] if installed else (configured or \"qwen2.5:7b\")\n", + "\n", + "\n", + "installed_models = _list_local_ollama_models(config.ollama_host)\n", + "selected_model = _choose_ollama_model(installed_models, PREFERRED_OLLAMA_MODELS, config.ollama_model)\n", + "config.ollama_model = selected_model\n", + "\n", + "print(\"Installed Ollama models:\", installed_models if installed_models else \"None found / Ollama offline\")\n", + "print(\"Selected model tag:\", config.ollama_model)\n", + "\n", + "ollama_client = OllamaClient(\n", + " host=config.ollama_host,\n", + " model=config.ollama_model,\n", + " timeout=config.ollama_timeout,\n", + ")\n", + "persona_generator = PersonaGenerator(ollama_client, config)\n", + "\n", + "# Step A: Cluster personas\n", + "personas = persona_generator.generate_personas(\n", + " evaluation=best_eval,\n", + " explainability=explainability,\n", + " 
raw_df=fe_result.df_original,\n", + " schema=schema,\n", + ")\n", + "\n", + "# Step B: Business grounding + GenAI per-cluster naming/actions\n", + "grounding = persona_generator.ground_in_business_objective(personas)\n", + "personas = persona_generator.apply_grounding_to_personas(personas, grounding)\n", + "\n", + "persona_df = pd.DataFrame([\n", + " {\n", + " \"cluster\": p.persona_name,\n", + " \"genai_persona_name\": p.archetype,\n", + " \"profile_descriptor\": getattr(p, \"profile_descriptor\", \"\"),\n", + " \"description\": getattr(p, \"description\", \"\"),\n", + " \"cluster_size\": p.cluster_size,\n", + " \"cluster_pct\": round(p.cluster_pct, 4),\n", + " \"ordered_top_features\": \", \".join([f\"{k}:{v}\" for k, v in p.top_features.items()]),\n", + " \"genai_recommendations\": \" | \".join(p.business_recommendations),\n", + " \"categorization_basis\": \", \".join(getattr(p, \"categorization_basis\", [])),\n", + " \"naming_rationale\": getattr(p, \"naming_rationale\", \"\"),\n", + " }\n", + " for p in personas\n", + "])\n", + "\n", + "# Exact output schema check\n", + "required_persona_cols = [\n", + " \"cluster\", \"genai_persona_name\", \"profile_descriptor\", \"description\",\n", + " \"cluster_size\", \"cluster_pct\", \"ordered_top_features\",\n", + " \"genai_recommendations\", \"categorization_basis\", \"naming_rationale\",\n", + "]\n", + "missing_cols = [c for c in required_persona_cols if c not in persona_df.columns]\n", + "if missing_cols:\n", + " raise ValueError(f\"persona_df missing required columns: {missing_cols}\")\n", + "\n", + "print(\"Generated personas:\", len(personas))\n", + "print(\"Executive summary:\", grounding.executive_summary)\n", + "print(\"persona_df columns:\", list(persona_df.columns))\n", + "persona_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "707cdc91", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:55:27,692 | INFO | 
segplus.visualization | Saved: cluster_scatter.png\n", + "2026-03-09 12:55:32,434 | INFO | segplus.visualization | Saved: feature_importance.png\n", + "2026-03-09 12:56:01,854 | INFO | segplus.visualization | Saved: shap_summary.png, shap_summary_bar.png, shap_interpretation.csv\n", + "2026-03-09 12:56:03,099 | INFO | segplus.visualization | Saved: pca_loadings.png\n", + "2026-03-09 12:56:03,653 | INFO | segplus.visualization | Saved: cluster_profiles.png\n", + "2026-03-09 12:56:03,848 | INFO | segplus.visualization | Saved: cluster_sizes.png\n", + "2026-03-09 12:56:04,758 | INFO | segplus.visualization | Saved: persona_radar.png\n", + "2026-03-09 12:56:05,102 | INFO | segplus.visualization | Saved: experiment_history.png\n", + "2026-03-09 12:56:05,498 | INFO | segplus.visualization | Saved: elbow_curve.png\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved charts to: c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\n" + ] + } + ], + "source": [ + "# 6) Visualizations\n", + "plot_cluster_scatter_2d(X, best_eval.labels, output_dir, best_eval)\n", + "plot_shap_importance(\n", + " explainability.feature_importances, output_dir, top_n=15,\n", + " importance_pct=explainability.feature_importance_pct,\n", + ")\n", + "plot_shap_summary(\n", + " X_original_for_shap,\n", + " best_eval.labels,\n", + " original_feature_names_for_shap,\n", + " output_dir,\n", + " random_state=config.random_state,\n", + " max_display=15,\n", + ")\n", + "plot_pca_loadings_heatmap(explainability.pca_loadings, output_dir)\n", + "plot_cluster_profiles_heatmap(explainability.cluster_profiles, explainability.top_features, output_dir)\n", + "plot_cluster_sizes(best_eval.labels, personas, output_dir)\n", + "plot_radar_charts(fe_result.df_original, best_eval.labels, personas, explainability.top_features, output_dir)\n", + "plot_experiment_history(experiment_log, output_dir)\n", + "plot_elbow_curve(explainability.inertia_curve, 
output_dir)\n", + "\n", + "print(\"Saved charts to:\", output_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "969da7d8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2026-03-09 12:56:08,644 | INFO | segplus.experiment_log | Experiment log saved: c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\experiment_log.json\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Exported:\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\clustered_customers.csv\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\cluster_profiles.csv\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\ordered_feature_drivers.csv\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\shap_feature_importance_pct.csv\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\pc_feature_map.json\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\personas.csv\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\personas.json\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\business_grounding.json\n", + " - c:\\Users\\UmairAhmed\\OneDrive - Blend 360\\Documents\\Segmentation_Plus\\segplus_output\\experiment_log.json\n" + ] + } + ], + "source": [ + "# 7) Export Outputs\n", + "clustered_df = fe_result.df_original.copy()\n", + "clustered_df[\"cluster\"] = best_eval.labels\n", + "\n", + "clustered_path = output_dir / \"clustered_customers.csv\"\n", + "profiles_path = output_dir / \"cluster_profiles.csv\"\n", + "ordered_drivers_path = output_dir / 
\"ordered_feature_drivers.csv\"\n", + "shap_pct_path = output_dir / \"shap_feature_importance_pct.csv\"\n", + "pc_feature_map_path = output_dir / \"pc_feature_map.json\"\n", + "personas_path = output_dir / \"personas.csv\"\n", + "personas_json_path = output_dir / \"personas.json\"\n", + "grounding_path = output_dir / \"business_grounding.json\"\n", + "exp_log_path = output_dir / \"experiment_log.json\"\n", + "\n", + "clustered_df.to_csv(clustered_path, index=False)\n", + "explainability.cluster_profiles.to_csv(profiles_path)\n", + "if explainability.ordered_feature_drivers is not None:\n", + " explainability.ordered_feature_drivers.to_csv(ordered_drivers_path, index=False)\n", + "\n", + "shap_pct_df = pd.DataFrame([\n", + " {\"feature\": k, \"importance_pct\": v}\n", + " for k, v in explainability.feature_importance_pct.items()\n", + "]).sort_values(\"importance_pct\", ascending=False)\n", + "shap_pct_df.to_csv(shap_pct_path, index=False)\n", + "\n", + "with open(pc_feature_map_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(explainability.pc_feature_map, f, indent=2)\n", + "\n", + "# Export exact persona schema\n", + "persona_df.to_csv(personas_path, index=False)\n", + "\n", + "personas_payload = [\n", + " {\n", + " \"cluster\": p.persona_name,\n", + " \"genai_persona_name\": p.archetype,\n", + " \"profile_descriptor\": getattr(p, \"profile_descriptor\", \"\"),\n", + " \"description\": getattr(p, \"description\", \"\"),\n", + " \"key_traits\": p.key_traits,\n", + " \"genai_recommendations\": p.business_recommendations,\n", + " \"cluster_size\": p.cluster_size,\n", + " \"cluster_pct\": p.cluster_pct,\n", + " \"top_features\": p.top_features,\n", + " \"categorization_basis\": getattr(p, \"categorization_basis\", []),\n", + " \"naming_rationale\": getattr(p, \"naming_rationale\", \"\"),\n", + " }\n", + " for p in personas\n", + "]\n", + "with open(personas_json_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(personas_payload, f, indent=2)\n", + 
"\n", + "experiment_log.to_json(exp_log_path)\n", + "\n", + "with open(grounding_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(\n", + " {\n", + " \"executive_summary\": grounding.executive_summary,\n", + " \"cluster_priorities\": grounding.cluster_priorities,\n", + " \"cluster_actions\": grounding.cluster_actions,\n", + " \"quick_wins\": grounding.quick_wins,\n", + " },\n", + " f,\n", + " indent=2,\n", + " )\n", + "\n", + "print(\"Exported:\")\n", + "print(\" -\", clustered_path)\n", + "print(\" -\", profiles_path)\n", + "print(\" -\", ordered_drivers_path)\n", + "print(\" -\", shap_pct_path)\n", + "print(\" -\", pc_feature_map_path)\n", + "print(\" -\", personas_path)\n", + "print(\" -\", personas_json_path)\n", + "print(\" -\", grounding_path)\n", + "print(\" -\", exp_log_path)\n" + ] + }, + { + "cell_type": "markdown", + "id": "772cf844", + "metadata": {}, + "source": [ + "## Notes\n", + "\n", + "- Architecture implemented: data input -> FE -> KMeans/DBSCAN/GMM -> eval gate loop -> explainability -> ordered drivers -> persona generation -> business grounding -> per-cluster rationale outputs.\n", + "- Local Ollama is first-class. 
`qwen2.5` is primary model and auto-selected when installed.\n", + "- Ordered feature driver list combines SHAP importance + weighted PCA loadings, with inertia elbow context.\n", + "- `pc_feature_map.json` provides exact mapping from each PC to top original variables.\n", + "- SHAP summary plot is saved as `shap_summary.png` when `shap` is installed.\n", + "- Optional autoencoder latent representation is available via `USE_AUTOENCODER_REPRESENTATION=True`.\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.x" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/segplus/modeling_loop.py b/segplus/modeling_loop.py new file mode 100644 index 0000000..8ec42b7 --- /dev/null +++ b/segplus/modeling_loop.py @@ -0,0 +1,273 @@ +"""Core modeling loop: configure -> cluster -> evaluate -> reconfigure retry cycle.""" +from __future__ import annotations + +import logging +from datetime import datetime +from enum import Enum + +import numpy as np +from sklearn.metrics import silhouette_score + +from .clustering import ( + estimate_dbscan_eps, + find_optimal_k, + run_all_algorithms, +) +from .config import PipelineConfig +from .evaluation import ClusterEvaluator +from .experiment_log import ExperimentLog +from .types import ClusteringConfig, EvaluationResult, ExperimentRecord + +log = logging.getLogger("segplus.modeling_loop") + + +class ReconfigStrategy(str, Enum): + ADJUST_K = "adjust_k" + ADJUST_EPS = "adjust_eps" + FEATURE_SUBSET = "feature_subset" + CHANGE_GMM_COV = "change_gmm_covariance" + WIDEN_K = "widen_k_range" + + +class ClusteringConfigurator: + """Initial configuration and smart reconfiguration on evaluation failure.""" + + def __init__(self, config: PipelineConfig, feature_names: list[str]): + self.config = config + self.feature_names = feature_names + self._k_scores: dict[int, float] = {} + + def initial_configure(self, 
X: np.ndarray) -> ClusteringConfig: + """Silhouette-guided K selection + DBSCAN eps estimation.""" + best_k, self._k_scores = find_optimal_k( + X, self.config.k_range, self.config.random_state + ) + eps = estimate_dbscan_eps(X, min_samples=5) + + return ClusteringConfig( + k=best_k, + dbscan_eps=eps, + dbscan_min_samples=5, + random_state=self.config.random_state, + ) + + def reconfigure( + self, + X: np.ndarray, + current: ClusteringConfig, + eval_result: EvaluationResult, + iteration: int, + history: list[ExperimentRecord], + ) -> tuple[ClusteringConfig, str]: + """Smart reconfiguration. Returns (new_config, strategy_name).""" + strategy = self._select_strategy(iteration, eval_result, history) + new = ClusteringConfig( + k=current.k, + dbscan_eps=current.dbscan_eps, + dbscan_min_samples=current.dbscan_min_samples, + gmm_covariance_type=current.gmm_covariance_type, + random_state=current.random_state, + feature_subset_indices=current.feature_subset_indices, + ) + + if strategy == ReconfigStrategy.ADJUST_K: + new = self._adjust_k(new, eval_result) + elif strategy == ReconfigStrategy.ADJUST_EPS: + new = self._adjust_eps(new, eval_result) + elif strategy == ReconfigStrategy.FEATURE_SUBSET: + new = self._try_feature_subset(X, eval_result, new) + elif strategy == ReconfigStrategy.CHANGE_GMM_COV: + new = self._change_gmm_cov(new) + elif strategy == ReconfigStrategy.WIDEN_K: + new = self._widen_k(X, new) + + log.info( + "Reconfigured (strategy=%s): k=%d, eps=%.3f, gmm_cov=%s, subset=%s", + strategy.value, new.k, new.dbscan_eps, new.gmm_covariance_type, + new.feature_subset_indices is not None, + ) + return new, strategy.value + + def _select_strategy( + self, iteration: int, eval_result: EvaluationResult, history: list[ExperimentRecord] + ) -> ReconfigStrategy: + """Deterministic strategy selection based on iteration and history.""" + strategies = [ + ReconfigStrategy.ADJUST_K, + ReconfigStrategy.ADJUST_EPS, + ReconfigStrategy.FEATURE_SUBSET, + 
ReconfigStrategy.CHANGE_GMM_COV, + ReconfigStrategy.WIDEN_K, + ] + idx = min(iteration - 1, len(strategies) - 1) + return strategies[idx] + + def _adjust_k(self, cfg: ClusteringConfig, eval_result: EvaluationResult) -> ClusteringConfig: + """Adjust K based on silhouette trend.""" + # If we have k_scores from initial scan, use them to decide direction + current_k = cfg.k + k_min, k_max = self.config.k_range + + # Check if higher K had better scores during initial scan + higher_better = any( + self._k_scores.get(k, -1) > self._k_scores.get(current_k, -1) + for k in range(current_k + 1, k_max + 1) + ) + lower_better = any( + self._k_scores.get(k, -1) > self._k_scores.get(current_k, -1) + for k in range(k_min, current_k) + ) + + if higher_better and current_k < k_max: + cfg.k = current_k + 1 + elif lower_better and current_k > k_min: + cfg.k = current_k - 1 + elif current_k < k_max: + cfg.k = current_k + 1 + else: + cfg.k = max(k_min, current_k - 1) + + return cfg + + def _adjust_eps(self, cfg: ClusteringConfig, eval_result: EvaluationResult) -> ClusteringConfig: + """Adjust DBSCAN eps based on noise ratio.""" + dbscan_scores = eval_result.all_scores.get("dbscan", {}) + # If DBSCAN silhouette was very low or negative, try larger eps + dbscan_sil = dbscan_scores.get("silhouette", -1) + if dbscan_sil < 0: + cfg.dbscan_eps = round(cfg.dbscan_eps * 1.5, 3) + else: + cfg.dbscan_eps = round(cfg.dbscan_eps * 1.2, 3) + cfg.dbscan_min_samples = max(3, cfg.dbscan_min_samples - 1) + return cfg + + def _try_feature_subset( + self, X: np.ndarray, eval_result: EvaluationResult, cfg: ClusteringConfig + ) -> ClusteringConfig: + """Drop lowest-importance features using quick permutation check.""" + labels = eval_result.labels + valid_mask = labels != -1 + Xv, lv = X[valid_mask], labels[valid_mask] + + if len(set(lv)) < 2 or Xv.shape[1] <= 3: + return cfg + + rng = np.random.default_rng(0) + baseline = silhouette_score(Xv, lv, sample_size=min(1000, len(Xv))) + + drops = [] + for i in 
range(Xv.shape[1]): + Xp = Xv.copy() + rng.shuffle(Xp[:, i]) + s = silhouette_score(Xp, lv, sample_size=min(1000, len(Xv))) + drops.append(baseline - s) + + # Keep features whose removal hurts silhouette the most + n_keep = max(3, int(len(drops) * 0.7)) + top_indices = sorted(range(len(drops)), key=lambda i: -drops[i])[:n_keep] + cfg.feature_subset_indices = sorted(top_indices) + return cfg + + def _change_gmm_cov(self, cfg: ClusteringConfig) -> ClusteringConfig: + """Cycle GMM covariance type.""" + cov_types = ["full", "tied", "diag", "spherical"] + current_idx = cov_types.index(cfg.gmm_covariance_type) if cfg.gmm_covariance_type in cov_types else 0 + cfg.gmm_covariance_type = cov_types[(current_idx + 1) % len(cov_types)] + return cfg + + def _widen_k(self, X: np.ndarray, cfg: ClusteringConfig) -> ClusteringConfig: + """Widen K range and re-search.""" + wider_min = max(2, self.config.k_range[0] - 1) + wider_max = min(self.config.k_range[1] + 2, len(X) // 10) + best_k, scores = find_optimal_k(X, (wider_min, wider_max), self.config.random_state) + self._k_scores.update(scores) + cfg.k = best_k + return cfg + + +def modeling_loop( + X: np.ndarray, + feature_names: list[str], + config: PipelineConfig, + experiment_log: ExperimentLog, +) -> tuple[EvaluationResult, ClusteringConfig]: + """ + Core retry loop: + 1. Configure initial clustering params + 2. Run all 3 algorithms (KMeans, DBSCAN, GMM) + 3. Evaluate -> pass/fail gate + 4. If fail -> reconfigure (smart strategy) -> goto 2 + 5. 
If pass or max_iterations -> return best result + """ + configurator = ClusteringConfigurator(config, feature_names) + evaluator = ClusterEvaluator(config) + + cluster_config = configurator.initial_configure(X) + best_eval: EvaluationResult | None = None + best_config: ClusteringConfig | None = None + + for iteration in range(1, config.max_iterations + 1): + log.info("=" * 60) + log.info("Modeling Loop - Iteration %d/%d | k=%d", iteration, config.max_iterations, cluster_config.k) + log.info("=" * 60) + + # Run all algorithms + results = run_all_algorithms(X, cluster_config) + if not results: + log.warning("No clustering algorithms succeeded in iteration %d", iteration) + continue + + # Evaluate + eval_result = evaluator.evaluate(X, results) + + # Log experiment + record = ExperimentRecord( + iteration=iteration, + timestamp=datetime.now().isoformat(), + config_k=cluster_config.k, + config_eps=cluster_config.dbscan_eps, + config_gmm_cov=cluster_config.gmm_covariance_type, + best_algorithm=eval_result.algorithm, + n_clusters=eval_result.n_clusters, + silhouette=eval_result.silhouette, + davies_bouldin=eval_result.davies_bouldin, + calinski_harabasz=eval_result.calinski_harabasz, + passed=eval_result.passes, + ) + experiment_log.add(record) + + # Track global best + if best_eval is None or eval_result.silhouette > best_eval.silhouette: + best_eval = eval_result + best_config = cluster_config + + log.info( + "Best: %s | sil=%.4f | DB=%.4f | PASS=%s", + eval_result.algorithm, eval_result.silhouette, + eval_result.davies_bouldin, eval_result.passes, + ) + + # Gate check + if eval_result.passes: + log.info("Evaluation PASSED on iteration %d.", iteration) + break + + # Reconfigure for next iteration + if iteration < config.max_iterations: + log.info("Evaluation FAILED. 
Reconfiguring...") + cluster_config, strategy = configurator.reconfigure( + X, cluster_config, eval_result, iteration, experiment_log.records + ) + # Update the record with reconfiguration strategy + record.reconfiguration_strategy = strategy + + if best_eval is None: + raise RuntimeError("No valid clustering result produced across all iterations.") + + if not best_eval.passes: + log.warning( + "Max iterations reached without passing gate. Using best result " + "(sil=%.4f, algorithm=%s).", best_eval.silhouette, best_eval.algorithm, + ) + + return best_eval, best_config diff --git a/segplus/ollama_client.py b/segplus/ollama_client.py new file mode 100644 index 0000000..7edb462 --- /dev/null +++ b/segplus/ollama_client.py @@ -0,0 +1,162 @@ +"""Robust HTTP client for Ollama's local LLM API.""" +from __future__ import annotations + +import json +import logging +import re +import time +from typing import Iterator, Optional + +import requests + +log = logging.getLogger("segplus.ollama_client") + + +class OllamaClient: + """ + Production-grade client for Ollama's local LLM API. + Features: health checks, exponential backoff retries, timeout, + streaming support, structured JSON extraction. + """ + + def __init__(self, host: str, model: str, timeout: int = 120): + self.host = host.rstrip("/") + self.model = model + self.timeout = timeout + self._endpoint = f"{self.host}/api/generate" + + def health_check(self) -> bool: + """Check if Ollama is reachable and the model is available.""" + try: + r = requests.get(f"{self.host}/api/tags", timeout=5) + r.raise_for_status() + models = [m["name"] for m in r.json().get("models", [])] + available = any(self.model in m for m in models) + if not available: + log.warning("Model '%s' not found. Available: %s", self.model, models) + else: + log.info("Ollama reachable. 
Model '%s' available.", self.model) + return available + except requests.RequestException as e: + log.error("Ollama not reachable at %s: %s", self.host, e) + return False + + def generate( + self, + prompt: str, + system: Optional[str] = None, + temperature: float = 0.3, + max_retries: int = 2, + max_tokens: int = 2048, + ) -> str: + """Generate a response with retry logic. Returns full text.""" + payload: dict = { + "model": self.model, + "prompt": prompt, + "stream": False, + "options": { + "temperature": temperature, + "num_predict": max_tokens, + }, + } + if system: + payload["system"] = system + + last_error = None + for attempt in range(1, max_retries + 2): + try: + r = requests.post( + self._endpoint, + json=payload, + timeout=self.timeout, + ) + r.raise_for_status() + response_text = r.json().get("response", "").strip() + log.info("Ollama response received (%d chars)", len(response_text)) + return response_text + + except requests.Timeout: + last_error = f"Timeout after {self.timeout}s" + except requests.RequestException as e: + last_error = str(e) + + if attempt <= max_retries: + wait = 2 ** attempt + log.warning( + "Ollama attempt %d failed (%s). 
Retrying in %ds...", + attempt, last_error, wait, + ) + time.sleep(wait) + + raise RuntimeError(f"Ollama failed after {max_retries + 1} attempts: {last_error}") + + def generate_stream( + self, + prompt: str, + system: Optional[str] = None, + temperature: float = 0.3, + max_tokens: int = 2048, + ) -> Iterator[str]: + """Streaming generation, yields tokens as they arrive.""" + payload: dict = { + "model": self.model, + "prompt": prompt, + "stream": True, + "options": { + "temperature": temperature, + "num_predict": max_tokens, + }, + } + if system: + payload["system"] = system + + r = requests.post( + self._endpoint, + json=payload, + timeout=self.timeout, + stream=True, + ) + r.raise_for_status() + + for line in r.iter_lines(): + if line: + data = json.loads(line) + token = data.get("response", "") + if token: + yield token + if data.get("done", False): + break + + def extract_json(self, text: str) -> dict: + """Extract the first valid JSON object from LLM output. + Handles markdown fences, preamble text, and trailing text. 
+ """ + # Strip markdown code fences + cleaned = re.sub(r"```(?:json)?\s*", "", text) + cleaned = re.sub(r"```", "", cleaned) + + # Try to find a JSON object + match = re.search(r"(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})", cleaned, re.DOTALL) + if match: + try: + return json.loads(match.group(1)) + except json.JSONDecodeError: + pass + + # Broader match: find outermost braces + depth = 0 + start = None + for i, ch in enumerate(cleaned): + if ch == "{": + if depth == 0: + start = i + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0 and start is not None: + try: + return json.loads(cleaned[start:i + 1]) + except json.JSONDecodeError: + start = None + + raise ValueError(f"No valid JSON found in LLM response:\n{text[:400]}") diff --git a/segplus/persona_generation.py b/segplus/persona_generation.py new file mode 100644 index 0000000..1981d2d --- /dev/null +++ b/segplus/persona_generation.py @@ -0,0 +1,675 @@ +"""LLM persona generation and business objective grounding.""" +from __future__ import annotations + +import logging +import re + +import pandas as pd + +from .config import PipelineConfig +from .ollama_client import OllamaClient +from .types import ( + BusinessGrounding, + DataSchema, + EvaluationResult, + ExplainabilityReport, + PersonaResult, +) + +log = logging.getLogger("segplus.persona_generation") + +CLUSTER_LETTERS = {0: "A", 1: "B", 2: "C", 3: "D", 4: "E", 5: "F", 6: "G", 7: "H"} + +PERSONA_SYSTEM_PROMPT = """You are a senior customer analytics expert and strategic advisor. +You analyse customer cluster profiles and generate precise, actionable personas. +Always respond with ONLY valid JSON - no preamble, no explanation, no markdown fences. + +CRITICAL NAMING RULES: +- The archetype MUST be a unique, descriptive 2-5 word business label. +- Base the name on the ACTUAL DATA VALUES shown (high/low/above/below average), not just feature column names. 
+- NEVER use generic names like 'Segment A', 'Cluster Profile', 'Credit Score Optimizers', or names that merely repeat feature column names. +- NEVER include raw column names like 'credit_score' or 'risk_score' directly in the archetype. +- Good examples: 'Premium Low-Risk Elite', 'Budget-Conscious Rebuilders', 'High-Value Dormant Accounts', 'Creditworthy Loyalists', 'At-Risk High-Spenders' +- The name should instantly convey WHO these customers are to a business stakeholder. + +The JSON must contain exactly these keys: + archetype: string (2-5 word evocative business label reflecting this cluster's actual value profile) + description: string (2-3 sentences describing who this customer is, referencing specific data patterns) + key_traits: list of 4 strings (concise behavioural characteristics derived from the data) + business_recommendations: list of 3 strings (specific, actionable strategies tied to this cluster's profile) +""" + +GROUNDING_SYSTEM_PROMPT = """You are a Chief Strategy Officer synthesising customer segmentation results +into an executive-level strategic report. 
+Respond with ONLY valid JSON containing: + executive_summary: string (3-4 sentences - overall findings and strategic implication) + cluster_priority: list of objects, each with: + cluster: string + archetype: string + priority: string ("High" | "Medium" | "Low") + strategic_action: string (one concrete next action) + cluster_actions: list of objects, each with: + cluster: string + grounded_persona_name: string (business-ready segment label) + grounded_recommendations: list of 3 strings (specific actions for this cluster) + quick_wins: list of 3 strings (immediate actions any team can take this week) +""" + + +def build_cluster_profile_text( + cluster_letter: str, + profile_row: pd.Series, + cluster_pct: float, + top_features: dict[str, float], + top_feature_names: list[str], + business_objective: str, + global_means: pd.Series | None = None, + global_stds: pd.Series | None = None, +) -> str: + """Build a rich textual summary of a cluster's statistical profile with population comparison.""" + lines = [ + f"CLUSTER {cluster_letter} PROFILE", + f" Size: {int(cluster_pct * 100)}% of total customers", + "", + f" Business Objective: {business_objective}", + "", + " Key metrics (cluster mean vs population mean):", + ] + for feat in top_feature_names[:8]: + if feat in profile_row.index: + val = profile_row[feat] + if global_means is not None and feat in global_means.index: + pop_mean = float(global_means[feat]) + if pop_mean != 0: + pct_diff = ((val - pop_mean) / abs(pop_mean)) * 100 + direction = "above" if val > pop_mean else "below" + lines.append( + f" - {feat}: {val:.2f} ({direction} pop. avg {pop_mean:.2f}, {pct_diff:+.0f}%)" + ) + else: + lines.append(f" - {feat}: {val:.2f} (pop. 
avg {pop_mean:.2f})") + else: + lines.append(f" - {feat}: {val:.2f}") + + lines += ["", " Feature importances (contribution to cluster separation):"] + for feat, imp in list(top_features.items())[:5]: + lines.append(f" - {feat}: {imp:.4f}") + + return "\n".join(lines) + + +class PersonaGenerator: + """Generate LLM-powered personas for each cluster and ground in business objective.""" + + def __init__(self, client: OllamaClient, config: PipelineConfig): + self.client = client + self.config = config + self._is_available: bool | None = None + + def _check_availability(self) -> bool: + if self._is_available is None: + self._is_available = self.client.health_check() + return self._is_available + + def generate_personas( + self, + evaluation: EvaluationResult, + explainability: ExplainabilityReport, + raw_df: pd.DataFrame, + schema: DataSchema, + ) -> list[PersonaResult]: + """For each cluster: build profile -> call LLM -> parse JSON -> create PersonaResult.""" + use_fallback = not self._check_availability() + labels = evaluation.labels + total = int((labels != -1).sum()) + cluster_ids = sorted(c for c in set(labels) if c != -1) + + num_cols = [c for c in schema.numeric_cols if c in raw_df.columns] + global_means = raw_df[num_cols].mean() if num_cols else pd.Series(dtype=float) + global_stds = raw_df[num_cols].std().replace(0, 1.0) if num_cols else pd.Series(dtype=float) + self._global_means = global_means + self._global_stds = global_stds + personas: list[PersonaResult] = [] + + for cid in cluster_ids: + mask = labels == cid + size = int(mask.sum()) + pct = size / total + letter = CLUSTER_LETTERS.get(cid, str(cid)) + persona_name = f"Cluster {letter}" + + # Compute profile from raw data + profile_row = raw_df[mask][num_cols].mean() + top_features_dict = { + k: v for k, v in explainability.feature_importances.items() if v > 0 + } + cluster_top_features = self._cluster_specific_feature_order( + profile_row=profile_row, + global_means=global_means, + 
global_stds=global_stds, + candidate_features=list(top_features_dict.keys()), + ) + + if use_fallback: + persona = self._build_fallback_persona( + cid, persona_name, size, pct, profile_row, top_features_dict, cluster_top_features + ) + else: + persona = self._generate_llm_persona( + cid, letter, persona_name, size, pct, + profile_row, top_features_dict, cluster_top_features, + ) + + personas.append(persona) + log.info("Persona generated: %s -> %s", persona_name, persona.archetype) + + return self._ensure_unique_archetypes(personas) + + def _generate_focused_name(self, profile_descriptor: str, letter: str) -> str: + """Second-pass LLM call: given a data profile, generate a catchy 2-4 word business name.""" + prompt = ( + f"A customer segment has this data profile: {profile_descriptor}\n\n" + f"Business context: {self.config.business_objective}\n\n" + f"Generate ONE catchy 2-4 word business persona name for this segment.\n" + f"Rules:\n" + f"- Must be a memorable business label, e.g. 'Premium Loyalists', 'Budget Pragmatists', " + f"'Rising Stars', 'At-Risk Big Spenders', 'Creditworthy Savers'\n" + f"- Do NOT use generic words like 'Segment', 'Cluster', 'Group', 'Profile', 'Optimizers'\n" + f"- Do NOT just repeat data column names or metrics\n" + f"- Respond with ONLY the persona name, nothing else. No quotes, no explanation." 
+ ) + try: + raw = self.client.generate(prompt, temperature=0.6, max_tokens=30) + name = raw.strip().strip('"').strip("'").strip() + # Remove any JSON wrapper if LLM over-formats + if "{" in name or "}" in name: + return "" + # Validate length + words = name.split() + if 2 <= len(words) <= 5 and not self._is_generic_archetype(name): + log.info("Focused naming for Cluster %s: '%s'", letter, name) + return name + return "" + except Exception as e: + log.debug("Focused naming failed for Cluster %s: %s", letter, e) + return "" + + def _generate_llm_persona( + self, + cid: int, + letter: str, + persona_name: str, + size: int, + pct: float, + profile_row: pd.Series, + top_features_dict: dict[str, float], + top_feature_names: list[str], + ) -> PersonaResult: + """Generate persona via Ollama LLM with two-pass naming.""" + # Always compute the data-driven profile descriptor + profile_descriptor = self._derive_data_driven_archetype(profile_row, top_feature_names, letter) + + profile_text = build_cluster_profile_text( + cluster_letter=letter, + profile_row=profile_row, + cluster_pct=pct, + top_features=top_features_dict, + top_feature_names=top_feature_names, + business_objective=self.config.business_objective, + global_means=self._global_means, + global_stds=self._global_stds, + ) + + prompt = ( + f"{profile_text}\n\n" + f"Generate a customer persona for this cluster. " + f"Respond ONLY with valid JSON." + ) + + try: + raw = self.client.generate(prompt, system=PERSONA_SYSTEM_PROMPT, temperature=0.3) + parsed = self.client.extract_json(raw) + llm_archetype = parsed.get("archetype", f"Segment {letter}") + naming_rationale = ( + f"LLM-generated persona name using cluster-specific drivers: " + f"{', '.join(top_feature_names[:3])}." 
+ ) + + # If the first pass returned a generic name, try a focused second pass + if self._is_generic_archetype(llm_archetype): + focused_name = self._generate_focused_name(profile_descriptor, letter) + if focused_name: + llm_archetype = focused_name + naming_rationale = ( + f"Second-pass focused LLM naming from profile: {profile_descriptor}." + ) + else: + llm_archetype = self._business_fallback_name(top_feature_names, letter) + naming_rationale = ( + f"LLM returned generic name; using business-style fallback from drivers: " + f"{', '.join(top_feature_names[:2])}." + ) + elif self._needs_business_rewrite(llm_archetype): + focused_name = self._generate_focused_name(profile_descriptor, letter) + if focused_name: + llm_archetype = focused_name + naming_rationale = ( + f"LLM returned metric-literal name; rewritten via focused naming from profile: " + f"{profile_descriptor}." + ) + else: + llm_archetype = self._business_fallback_name(top_feature_names, letter) + naming_rationale = ( + f"LLM returned metric-literal name; rewritten to business-style fallback " + f"from drivers: {', '.join(top_feature_names[:2])}." 
+ ) + + return PersonaResult( + cluster_id=cid, + persona_name=persona_name, + archetype=llm_archetype, + description=parsed.get("description", ""), + key_traits=parsed.get("key_traits", []), + business_recommendations=parsed.get("business_recommendations", []), + cluster_size=size, + cluster_pct=pct, + top_features={ + k: round(float(profile_row.get(k, 0)), 2) + for k in top_feature_names[:5] + if k in profile_row.index + }, + naming_rationale=naming_rationale, + categorization_basis=top_feature_names[:5], + profile_descriptor=profile_descriptor, + ) + except Exception as e: + log.warning("LLM persona generation failed for cluster %s: %s", letter, e) + return self._build_fallback_persona( + cid, persona_name, size, pct, profile_row, top_features_dict, top_feature_names + ) + + def _is_generic_archetype(self, name: str) -> bool: + n = (name or "").strip().lower() + if not n: + return True + generic_tokens = ["segment", "cluster", "profile", "group", "persona", "a", "b", "c", "d"] + if n in {"segment a", "segment b", "segment c", "cluster a", "cluster b", "cluster c"}: + return True + if sum(1 for t in generic_tokens if t in n) >= 2: + return True + # Reject names that are just column names glued together (e.g. 
"Credit Score Risk Score Optimizers") + raw_col_tokens = set() + for tok in n.replace("-", " ").replace("_", " ").split(): + raw_col_tokens.add(tok) + filler_words = {"optimizers", "drivers", "segment", "cluster", "profile", "group", "and", "&", "the"} + meaningful_words = raw_col_tokens - filler_words + if len(meaningful_words) < 2: + return True + # Reject literal metric-style labels; force focused renaming pass + if "&" in n and ("high " in n or "low " in n): + return True + if " score" in n and ("high " in n or "low " in n): + return True + return False + + def _needs_business_rewrite(self, name: str) -> bool: + """Detect literal metric-style names that should be rewritten as business labels.""" + n = (name or "").strip().lower() + if not n: + return True + bad_patterns = [ + r"\bhigh\b.*\blow\b", + r"\blow\b.*\bhigh\b", + r"\bcredit score\b", + r"\brisk score\b", + r"&", + ] + return any(re.search(p, n) for p in bad_patterns) + + def _business_fallback_name(self, top_feature_names: list[str], letter: str) -> str: + """Create a compact business-style name from top drivers when LLM naming is literal/generic.""" + roots = [] + for f in self._deduplicate_feature_roots(top_feature_names): + pretty = f.replace("_score", "").replace("_cluster", "").replace("_", " ").title().strip() + if pretty: + roots.append(pretty.split()[0]) + if len(roots) >= 2: + break + if len(roots) >= 2: + return f"{roots[0]} {roots[1]} Strategists" + if len(roots) == 1: + return f"{roots[0]} Navigators" + return f"Business Segment {letter}" + + @staticmethod + def _deduplicate_feature_roots(features: list[str]) -> list[str]: + """Remove semantically duplicate features (e.g. 
credit_score vs credit_score_cluster).""" + seen_roots: set[str] = set() + unique: list[str] = [] + for f in features: + # Strip common suffixes that create near-duplicate feature names + root = re.sub( + r'_(cluster|group|bin|cat|flag|encoded|scaled|norm|raw|[xy]|bucket|band)$', + '', f.lower(), + ) + root = re.sub(r'_+$', '', root) + if root not in seen_roots: + seen_roots.add(root) + unique.append(f) + return unique + + def _classify_feature_level(self, feat: str, profile_row: pd.Series) -> str: + """Classify a feature value as High/Moderate/Low relative to global population.""" + if not hasattr(self, '_global_means') or self._global_means is None: + return "" + if feat not in self._global_means.index or feat not in self._global_stds.index: + return "" + if feat not in profile_row.index: + return "" + + value = float(profile_row[feat]) + mean = float(self._global_means[feat]) + std = float(self._global_stds[feat]) + if std < 1e-10: + return "" + + z = (value - mean) / std + if z >= 0.5: + return "High" + elif z <= -0.5: + return "Low" + else: + return "Moderate" + + def _derive_data_driven_archetype( + self, + profile_row: pd.Series, + top_feature_names: list[str], + letter: str, + ) -> str: + """Derive a descriptive archetype using feature VALUES (not just names).""" + usable = [f for f in top_feature_names if f in profile_row.index] + # Remove semantically duplicate features (credit_score vs credit_score_cluster) + usable = self._deduplicate_feature_roots(usable) + if not usable: + return f"Strategic Segment {letter}" + + parts: list[str] = [] + for feat in usable[:3]: + pretty = feat.replace("_", " ").title() + level = self._classify_feature_level(feat, profile_row) + if level: + parts.append(f"{level} {pretty}") + else: + parts.append(pretty) + if len(parts) >= 2: + break + + if len(parts) >= 2: + return f"{parts[0]} & {parts[1]}" + if len(parts) == 1: + return f"{parts[0]} Segment" + return f"Strategic Segment {letter}" + + def _build_fallback_persona( + 
self, + cid: int, + persona_name: str, + size: int, + pct: float, + profile_row: pd.Series, + top_features_dict: dict[str, float], + top_feature_names: list[str] | None = None, + ) -> PersonaResult: + """Rule-based fallback when Ollama is unavailable.""" + letter = CLUSTER_LETTERS.get(cid, str(cid)) + + # Derive data-driven descriptor from top feature values + top_feats = top_feature_names if top_feature_names else list(top_features_dict.keys())[:5] + if top_feats: + profile_descriptor = self._derive_data_driven_archetype(profile_row, top_feats, letter) + else: + profile_descriptor = f"Cluster {letter} Profile" + + # Try focused LLM naming even in fallback (persona JSON failed but naming might work) + archetype = profile_descriptor + naming_rationale = ( + "Fallback rule-based name from cluster-specific top movers: " + + ", ".join(top_feats[:3]) + ) + if self._check_availability(): + focused_name = self._generate_focused_name(profile_descriptor, letter) + if focused_name: + archetype = focused_name + naming_rationale = f"Focused LLM naming from profile: {profile_descriptor}." + + return PersonaResult( + cluster_id=cid, + persona_name=persona_name, + archetype=archetype, + description=f"Cluster {letter} represents {pct:.0%} of customers. 
" + f"Key differentiators: {', '.join(top_feats[:3])}.", + key_traits=[f"Distinctive {f.replace('_', ' ').title()}" for f in top_feats[:4]], + business_recommendations=[ + "Run Ollama to generate detailed recommendations", + f"Investigate {top_feats[0].replace('_', ' ') if top_feats else 'key features'} patterns", + "Conduct qualitative research with cluster members", + ], + cluster_size=size, + cluster_pct=pct, + top_features={ + k: round(float(profile_row.get(k, 0)), 2) + for k in top_feats[:5] + if k in profile_row.index + }, + naming_rationale=naming_rationale, + categorization_basis=top_feats[:5], + profile_descriptor=profile_descriptor, + ) + + def _cluster_specific_feature_order( + self, + profile_row: pd.Series, + global_means: pd.Series, + global_stds: pd.Series, + candidate_features: list[str], + ) -> list[str]: + """Sort features by absolute standardized movement for this cluster vs total population.""" + scored: list[tuple[str, float]] = [] + for f in candidate_features: + if f in profile_row.index and f in global_means.index and f in global_stds.index: + z_move = abs(float(profile_row[f] - global_means[f]) / float(global_stds[f])) + scored.append((f, z_move)) + scored.sort(key=lambda x: x[1], reverse=True) + return [f for f, _ in scored] + + def ground_in_business_objective( + self, + personas: list[PersonaResult], + ) -> BusinessGrounding: + """LLM synthesises executive summary mapping personas to business objective.""" + if not self._check_availability(): + return BusinessGrounding( + executive_summary="Ollama offline. 
Start Ollama and re-run for LLM-generated strategic grounding.", + cluster_priorities=[ + {"cluster": p.persona_name, "archetype": p.archetype, + "priority": "TBD", "strategic_action": "Run Ollama to generate"} + for p in personas + ], + quick_wins=[ + "Start Ollama: ollama serve", + f"Pull model: ollama pull {self.config.ollama_model}", + "Re-run notebook", + ], + cluster_actions=[ + { + "cluster": p.persona_name, + "grounded_persona_name": p.archetype, + "grounded_recommendations": p.business_recommendations[:3], + } + for p in personas + ], + ) + + persona_summaries = [] + for p in personas: + persona_summaries.append( + f" {p.persona_name} - {p.archetype}\n" + f" Size: {p.cluster_size:,} ({p.cluster_pct * 100:.1f}%)\n" + f" Description: {p.description[:140]}\n" + f" Top traits: {'; '.join(p.key_traits[:2])}\n" + f" Drivers: {', '.join(p.categorization_basis[:3])}" + ) + + prompt = ( + f"Business Objective:\n{self.config.business_objective}\n\n" + f"Identified Customer Personas:\n" + "\n\n".join(persona_summaries) + "\n\n" + f"Ground these personas in the stated business objective. " + f"Identify priorities and strategic actions. Respond ONLY with valid JSON." 
+ ) + + try: + log.info("Grounding personas in business objective...") + raw = self.client.generate( + prompt, + system=GROUNDING_SYSTEM_PROMPT, + temperature=0.2, + max_tokens=900, + ) + parsed = self.client.extract_json(raw) + return BusinessGrounding( + executive_summary=parsed.get("executive_summary", ""), + cluster_priorities=parsed.get("cluster_priority", []), + quick_wins=parsed.get("quick_wins", []), + cluster_actions=parsed.get("cluster_actions", []), + ) + except Exception as e: + log.warning("Business grounding failed: %s", e) + return BusinessGrounding( + executive_summary=f"LLM grounding failed: {e}", + cluster_priorities=[], + quick_wins=[], + cluster_actions=[], + ) + + def apply_grounding_to_personas( + self, + personas: list[PersonaResult], + grounding: BusinessGrounding, + ) -> list[PersonaResult]: + """Apply grounded persona names and recommendations returned by the grounding LLM step.""" + if not grounding.cluster_actions: + return self._ensure_unique_archetypes(personas) + + action_map = {} + for a in grounding.cluster_actions: + cluster_key = str(a.get("cluster", "")).strip().lower() + if cluster_key: + action_map[cluster_key] = a + + updated: list[PersonaResult] = [] + for p in personas: + key = str(p.persona_name).strip().lower() + action = action_map.get(key) + if action: + grounded_name = action.get("grounded_persona_name") + grounded_recs = action.get("grounded_recommendations", []) + if isinstance(grounded_name, str) and grounded_name.strip(): + p.archetype = grounded_name.strip() + p.naming_rationale = ( + p.naming_rationale + " " + + "Business grounding renamed this persona to align with objective." 
+ ).strip() + if isinstance(grounded_recs, list) and grounded_recs: + p.business_recommendations = [str(x) for x in grounded_recs[:3]] + updated.append(p) + return self._ensure_unique_archetypes(updated) + + def _ensure_unique_archetypes(self, personas: list[PersonaResult]) -> list[PersonaResult]: + """Enforce unique archetype names using distinguishing cluster characteristics.""" + groups: dict[str, list[PersonaResult]] = {} + for p in personas: + base = self._clean_archetype_text(p.archetype) + p.archetype = base + key = self._archetype_key(base) + groups.setdefault(key, []).append(p) + + for _, group in groups.items(): + if len(group) <= 1: + continue + self._differentiate_duplicate_group(group) + + # Final hard guarantee of uniqueness + seen_exact: dict[str, int] = {} + for p in personas: + k = p.archetype.strip().lower() + seen_exact[k] = seen_exact.get(k, 0) + 1 + if seen_exact[k] > 1: + cluster_tag = str(p.persona_name).replace("Cluster ", "").strip() or str(seen_exact[k]) + p.archetype = f"{p.archetype} ({cluster_tag})" + p.naming_rationale = ( + p.naming_rationale + " " + + f"Appended cluster tag '{cluster_tag}' to guarantee unique persona naming." 
+ ).strip() + return personas + + def _differentiate_duplicate_group(self, group: list[PersonaResult]) -> None: + """Rename duplicate archetypes using the feature that most distinguishes them.""" + # Collect all features present across the group's top_features + all_feats: set[str] = set() + for p in group: + all_feats.update(p.top_features.keys()) + + # Deduplicate feature roots to avoid credit_score vs credit_score_cluster + deduped_feats = self._deduplicate_feature_roots(list(all_feats)) + + # Find the feature with the largest value spread across the duplicate group + best_feat, best_spread = None, -1.0 + for feat in deduped_feats: + values = [p.top_features.get(feat, 0.0) for p in group] + spread = max(values) - min(values) + if spread > best_spread: + best_spread = spread + best_feat = feat + + if best_feat is None or best_spread < 1e-10: + # No distinguishing feature found; fall back to cluster letters + for p in group: + letter = str(p.persona_name).replace("Cluster ", "").strip() + p.archetype = f"{p.archetype} ({letter})" + p.naming_rationale += f" Differentiated by cluster label '{letter}'." + return + + # Rank clusters by the distinguishing feature and assign descriptive modifiers + ranked = sorted(group, key=lambda p: p.top_features.get(best_feat, 0.0), reverse=True) + pretty_feat = best_feat.replace("_", " ").title() + n = len(ranked) + + for i, p in enumerate(ranked): + val = p.top_features.get(best_feat, 0.0) + if n == 2: + modifier = "Higher" if i == 0 else "Lower" + else: + if i == 0: + modifier = "High" + elif i == n - 1: + modifier = "Low" + else: + modifier = "Mid" + p.archetype = f"{p.archetype} – {modifier} {pretty_feat}" + p.naming_rationale += ( + f" Differentiated by {best_feat} (value={val:.2f}, " + f"ranked {i + 1}/{n} across duplicate names)." 
+ ) + + def _clean_archetype_text(self, name: str) -> str: + n = (name or "").strip() + n = re.sub(r"\s+", " ", n) + n = re.sub(r"[-\s]+$", "", n) # remove trailing hyphen/space + return n or "Strategic Segment" + + def _archetype_key(self, name: str) -> str: + n = name.lower() + n = re.sub(r"[^a-z0-9\s]", " ", n) + n = re.sub(r"\s+", " ", n).strip() + return n diff --git a/segplus/persona_generator.py b/segplus/persona_generator.py new file mode 100644 index 0000000..5fb2c6e --- /dev/null +++ b/segplus/persona_generator.py @@ -0,0 +1,5 @@ +"""Compatibility wrapper for persona generator naming.""" + +from .persona_generation import PersonaGenerator + +__all__ = ["PersonaGenerator"] diff --git a/segplus/pipeline.py b/segplus/pipeline.py new file mode 100644 index 0000000..5a0f863 --- /dev/null +++ b/segplus/pipeline.py @@ -0,0 +1,16 @@ +"""Pipeline entrypoint wrapper for segplus notebook workflow.""" + +from dataclasses import dataclass + +from .config import PipelineConfig + + +@dataclass +class Pipeline: + """Thin pipeline wrapper used by notebook and future CLI orchestration.""" + + config: PipelineConfig + + +def build_pipeline(config: PipelineConfig) -> Pipeline: + return Pipeline(config=config) diff --git a/segplus/types.py b/segplus/types.py new file mode 100644 index 0000000..a7dd9b1 --- /dev/null +++ b/segplus/types.py @@ -0,0 +1,210 @@ +"""Shared dataclasses and type definitions for the segplus pipeline.""" +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional + +import numpy as np +import pandas as pd + + +# ── Data Layer ─────────────────────────────────────────────────────────────── + +@dataclass +class ColumnMeta: + name: str + dtype: Literal["numeric", "categorical", "datetime", "text", "boolean"] + null_rate: float + n_unique: int + + +@dataclass +class DataSchema: + n_rows: int + n_cols: int + columns: list[ColumnMeta] = field(default_factory=list) + + @property + def numeric_cols(self) 
-> list[str]: + return [c.name for c in self.columns if c.dtype == "numeric"] + + @property + def categorical_cols(self) -> list[str]: + return [c.name for c in self.columns if c.dtype == "categorical"] + + +@dataclass +class DataQualityReport: + total_rows: int + total_columns: int + missing_values: dict[str, int] = field(default_factory=dict) + missing_pct: dict[str, float] = field(default_factory=dict) + duplicate_rows: int = 0 + column_types: dict[str, str] = field(default_factory=dict) + warnings: list[str] = field(default_factory=list) + errors: list[str] = field(default_factory=list) + passed: bool = True + + def summary(self) -> str: + lines = [ + f"Data Quality Report", + f" Rows: {self.total_rows:,} | Columns: {self.total_columns}", + f" Duplicates: {self.duplicate_rows:,}", + ] + if self.missing_values: + top = sorted(self.missing_pct.items(), key=lambda x: -x[1])[:5] + lines.append(" Top missing:") + for col, pct in top: + if pct > 0: + lines.append(f" {col}: {pct:.1%}") + if self.warnings: + lines.append(f" Warnings ({len(self.warnings)}):") + for w in self.warnings[:5]: + lines.append(f" - {w}") + if self.errors: + lines.append(f" Errors ({len(self.errors)}):") + for e in self.errors[:5]: + lines.append(f" - {e}") + lines.append(f" Status: {'PASSED' if self.passed else 'FAILED'}") + return "\n".join(lines) + + +# ── Feature Engineering Layer ──────────────────────────────────────────────── + +@dataclass +class FeatureEngineeringResult: + df_original: pd.DataFrame + df_engineered: pd.DataFrame + X_scaled: np.ndarray + feature_names: list[str] + pca: object | None = None # Optional[PCA] + scaler: object | None = None # Optional[StandardScaler] + label_encoders: dict = field(default_factory=dict) + + +# ── Clustering Layer ───────────────────────────────────────────────────────── + +@dataclass +class ClusteringConfig: + k: int = 3 + kmeans_init: str = "k-means++" + kmeans_n_init: int = 10 + kmeans_max_iter: int = 300 + dbscan_eps: float = 0.5 + 
dbscan_min_samples: int = 5 + gmm_covariance_type: str = "full" + gmm_n_init: int = 5 + random_state: int = 42 + feature_subset_indices: list[int] | None = None + + +@dataclass +class ClusterRunResult: + algorithm: str + labels: np.ndarray + n_clusters: int + model: object + probabilities: np.ndarray | None = None + extra: dict = field(default_factory=dict) + + @property + def is_valid(self) -> bool: + unique = set(self.labels) + unique.discard(-1) + return len(unique) >= 2 + + +# ── Evaluation Layer ───────────────────────────────────────────────────────── + +@dataclass +class EvaluationResult: + algorithm: str + labels: np.ndarray + n_clusters: int + silhouette: float + davies_bouldin: float + calinski_harabasz: float + passes: bool + all_scores: dict[str, dict[str, float]] = field(default_factory=dict) + model: object = None + + +@dataclass +class StabilityResult: + ari_mean: float + ari_std: float + n_bootstraps: int + stable: bool + + +# ── Explainability Layer ───────────────────────────────────────────────────── + +@dataclass +class ExplainabilityReport: + top_features: list[str] + feature_importances: dict[str, float] + pca_loadings: pd.DataFrame + cluster_profiles: pd.DataFrame + pca_variance_ratio: list[float] + inertia_curve: dict[int, float] | None = None + feature_importance_pct: dict[str, float] = field(default_factory=dict) + ordered_feature_drivers: pd.DataFrame | None = None + pc_feature_map: dict[str, list[str]] = field(default_factory=dict) + + +# ── Persona Layer ──────────────────────────────────────────────────────────── + +@dataclass +class PersonaResult: + cluster_id: int | str + persona_name: str + archetype: str + description: str + key_traits: list[str] + business_recommendations: list[str] + cluster_size: int + cluster_pct: float + top_features: dict[str, float] = field(default_factory=dict) + naming_rationale: str = "" + categorization_basis: list[str] = field(default_factory=list) + profile_descriptor: str = "" # Data-driven descriptor 
(e.g. "High Credit Score & Low Risk") + + +@dataclass +class BusinessGrounding: + executive_summary: str + cluster_priorities: list[dict] = field(default_factory=list) + quick_wins: list[str] = field(default_factory=list) + cluster_actions: list[dict] = field(default_factory=list) + + +# ── Experiment Tracking ────────────────────────────────────────────────────── + +@dataclass +class ExperimentRecord: + iteration: int + timestamp: str + config_k: int + config_eps: float + config_gmm_cov: str + best_algorithm: str + n_clusters: int + silhouette: float + davies_bouldin: float + calinski_harabasz: float + passed: bool + reconfiguration_strategy: str | None = None + + +# ── Pipeline Result ────────────────────────────────────────────────────────── + +@dataclass +class PipelineResult: + data_quality: DataQualityReport + feature_engineering: FeatureEngineeringResult + best_evaluation: EvaluationResult + stability: StabilityResult + explainability: ExplainabilityReport + personas: list[PersonaResult] + grounding: BusinessGrounding + experiment_log: list[ExperimentRecord] diff --git a/segplus/visualization.py b/segplus/visualization.py new file mode 100644 index 0000000..a1274c7 --- /dev/null +++ b/segplus/visualization.py @@ -0,0 +1,439 @@ +"""Visualization: all matplotlib/seaborn charts for the pipeline.""" +from __future__ import annotations + +import logging +from pathlib import Path + +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from sklearn.decomposition import PCA +from sklearn.ensemble import RandomForestClassifier + +from .experiment_log import ExperimentLog +from .types import ExplainabilityReport, EvaluationResult, PersonaResult + +log = logging.getLogger("segplus.visualization") + +PALETTE = ["#4C72B0", "#DD8452", "#55A868", "#C44E52", "#8172B2", "#937860", "#DA8BC3"] + + +def _setup_style() -> None: + sns.set_theme(style="whitegrid", palette=PALETTE) + 
def plot_cluster_scatter_2d(
    X: np.ndarray,
    labels: np.ndarray,
    output_dir: Path,
    eval_result: EvaluationResult | None = None,
) -> None:
    """PCA 2D scatter plot of clusters with optional silhouette comparison.

    Parameters
    ----------
    X : feature matrix (n_samples, n_features); assumed already scaled upstream — TODO confirm.
    labels : cluster label per sample; -1 is treated as DBSCAN noise.
    output_dir : directory that receives ``cluster_scatter.png``.
    eval_result : when given, a second panel compares silhouette scores
        across all algorithms recorded in ``eval_result.all_scores``.
    """
    _setup_style()
    pca2 = PCA(n_components=2, random_state=42)
    X_2d = pca2.fit_transform(X)
    var_explained = pca2.explained_variance_ratio_

    n_subplots = 2 if eval_result else 1
    fig, axes = plt.subplots(1, n_subplots, figsize=(8 * n_subplots, 6))
    if n_subplots == 1:
        axes = [axes]  # normalize: plt.subplots returns a bare Axes for a single panel

    # Cluster scatter.
    unique_labels = sorted(set(labels))
    # FIX: cycle the palette instead of truncating it. Previously
    # zip(unique_labels, PALETTE[:len(unique_labels)]) silently dropped any
    # cluster beyond the 7 palette entries from the plot.
    for i, lbl in enumerate(unique_labels):
        col = PALETTE[i % len(PALETTE)]
        mask = labels == lbl
        name = f"Cluster {lbl}" if lbl != -1 else "Noise"
        axes[0].scatter(X_2d[mask, 0], X_2d[mask, 1], c=col, alpha=0.55, s=18,
                        label=name, edgecolors="none")

    algo = eval_result.algorithm.upper() if eval_result else "Clustering"
    sil = f" (sil={eval_result.silhouette:.3f})" if eval_result else ""
    axes[0].set_title(f"Clusters - {algo}{sil}")
    axes[0].set_xlabel(f"PC1 ({var_explained[0] * 100:.1f}% var)")
    axes[0].set_ylabel(f"PC2 ({var_explained[1] * 100:.1f}% var)")
    axes[0].legend(framealpha=0.8)

    # Silhouette comparison bar chart; the winning algorithm is highlighted green.
    if eval_result and eval_result.all_scores:
        algos = list(eval_result.all_scores.keys())
        sil_scores = [eval_result.all_scores[a].get("silhouette", 0) for a in algos]
        bar_colors = [PALETTE[2] if a == eval_result.algorithm else PALETTE[0] for a in algos]
        bars = axes[1].bar(algos, sil_scores, color=bar_colors, width=0.4, edgecolor="white")
        axes[1].set_title("Silhouette Score by Algorithm")
        axes[1].set_ylabel("Silhouette Score")
        for bar, val in zip(bars, sil_scores):
            axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.005,
                         f"{val:.3f}", ha="center", va="bottom", fontsize=10)

    plt.tight_layout()
    fig.savefig(output_dir / "cluster_scatter.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: cluster_scatter.png")
"cluster_scatter.png", dpi=150, bbox_inches="tight") + plt.close(fig) + log.info("Saved: cluster_scatter.png") + + +def plot_shap_importance( + importances: dict[str, float], + output_dir: Path, + top_n: int = 15, + importance_pct: dict[str, float] | None = None, +) -> None: + """Horizontal bar chart of feature importances with percentage labels.""" + _setup_style() + names = list(importances.keys())[:top_n] + scores = list(importances.values())[:top_n] + + # Compute percentages from raw scores if not provided + if importance_pct: + pcts = [importance_pct.get(n, 0.0) for n in names] + else: + total = sum(abs(s) for s in scores) or 1.0 + pcts = [(abs(s) / total) * 100 for s in scores] + + fig, ax = plt.subplots(figsize=(10, max(4, len(names) * 0.45))) + colors = [PALETTE[2] if s > 0 else PALETTE[3] for s in scores] + bars = ax.barh(names[::-1], scores[::-1], color=colors[::-1], edgecolor="white") + + # Add percentage labels on each bar + for bar, pct in zip(bars, pcts[::-1]): + width = bar.get_width() + label_x = width + (ax.get_xlim()[1] - ax.get_xlim()[0]) * 0.01 + ax.text( + label_x, bar.get_y() + bar.get_height() / 2, + f"{pct:.1f}%", + va="center", ha="left", fontsize=9, fontweight="bold", color="#333333", + ) + + ax.set_title("Feature Importance (Cluster Separation Drivers)") + ax.set_xlabel("Importance Score") + ax.axvline(0, color="gray", linewidth=0.8) + # Extend x-axis slightly for percentage labels + xlim = ax.get_xlim() + ax.set_xlim(xlim[0], xlim[1] * 1.15) + + plt.tight_layout() + fig.savefig(output_dir / "feature_importance.png", dpi=150, bbox_inches="tight") + plt.close(fig) + log.info("Saved: feature_importance.png") + + +def plot_shap_summary( + X: np.ndarray, + labels: np.ndarray, + feature_names: list[str], + output_dir: Path, + random_state: int = 42, + max_display: int = 15, +) -> None: + """SHAP summary plots (beeswarm + bar) with robust multiclass handling.""" + valid_mask = labels != -1 + Xv, lv = X[valid_mask], labels[valid_mask] + if 
len(set(lv)) < 2: + log.warning("Skipping SHAP summary: need at least 2 clusters after excluding noise.") + return + + try: + import shap + except ImportError: + log.warning("Skipping SHAP summary: shap is not installed.") + return + + try: + model = RandomForestClassifier( + n_estimators=120, max_depth=12, random_state=random_state, n_jobs=-1 + ) + model.fit(Xv, lv) + + sample_n = min(1000, len(Xv)) + rng = np.random.default_rng(random_state) + idx = rng.choice(len(Xv), size=sample_n, replace=False) + X_sample = Xv[idx] + + explainer = shap.TreeExplainer(model) + raw_shap = explainer.shap_values(X_sample) + + # Normalize shap outputs to 2D (n_samples, n_features) for clean beeswarm plots. + major_class = pd.Series(lv).value_counts().index[0] + if isinstance(raw_shap, list): + class_idx = int(major_class) if int(major_class) < len(raw_shap) else 0 + shap_2d = np.array(raw_shap[class_idx], dtype=float) + else: + arr = np.array(raw_shap) + if arr.ndim == 2: + shap_2d = arr + elif arr.ndim == 3: + # Common shape: (n_samples, n_features, n_classes) + if arr.shape[0] == X_sample.shape[0] and arr.shape[1] == X_sample.shape[1]: + class_idx = int(major_class) if int(major_class) < arr.shape[2] else 0 + shap_2d = arr[:, :, class_idx] + # Alternate shape: (n_classes, n_samples, n_features) + elif arr.shape[1] == X_sample.shape[0] and arr.shape[2] == X_sample.shape[1]: + class_idx = int(major_class) if int(major_class) < arr.shape[0] else 0 + shap_2d = arr[class_idx, :, :] + else: + # Last-resort aggregation across class axis. 
+ shap_2d = np.mean(arr, axis=-1) + if shap_2d.ndim != 2: + shap_2d = shap_2d.reshape(X_sample.shape[0], X_sample.shape[1]) + else: + log.warning("Unexpected SHAP output shape %s; skipping SHAP summary.", arr.shape) + return + + # Human-readable labels (no hardcoding; generated from column names) + pretty_feature_names = [f.replace("_", " ").title() for f in feature_names] + display_n = min(max_display, len(pretty_feature_names)) + plot_h = max(6, int(display_n * 0.45)) + + # Build interpretation table: importance % + directionality hint. + mean_abs = np.mean(np.abs(shap_2d), axis=0) + total_abs = float(np.sum(mean_abs)) if np.sum(mean_abs) > 0 else 1.0 + rows = [] + for i, fname in enumerate(feature_names): + xi = X_sample[:, i] + si = shap_2d[:, i] + if np.std(xi) > 1e-12 and np.std(si) > 1e-12: + corr = float(np.corrcoef(xi, si)[0, 1]) + else: + corr = 0.0 + if corr > 0.1: + effect = "Higher value tends to increase model score" + elif corr < -0.1: + effect = "Higher value tends to decrease model score" + else: + effect = "Mixed / nonlinear effect" + rows.append({ + "feature": fname, + "feature_pretty": pretty_feature_names[i], + "mean_abs_shap": float(mean_abs[i]), + "importance_pct": float((mean_abs[i] / total_abs) * 100.0), + "value_shap_corr": corr, + "effect_hint": effect, + }) + interp_df = pd.DataFrame(rows).sort_values("mean_abs_shap", ascending=False).reset_index(drop=True) + interp_df.to_csv(output_dir / "shap_interpretation.csv", index=False) + + # Beeswarm summary + plt.figure(figsize=(12, plot_h)) + shap.summary_plot( + shap_2d, + features=X_sample, + feature_names=pretty_feature_names, + max_display=display_n, + show=False, + plot_size=(12, plot_h), + ) + plt.tight_layout() + plt.savefig(output_dir / "shap_summary.png", dpi=180, bbox_inches="tight") + plt.close() + + # Custom bar summary with percentage labels (replaces shap's default bar chart) + top_interp = interp_df.head(display_n).copy() + fig_bar, ax_bar = plt.subplots(figsize=(11, max(5, 
int(display_n * 0.45)))) + bar_names = top_interp["feature_pretty"].tolist()[::-1] + bar_pcts = top_interp["importance_pct"].tolist()[::-1] + bar_colors = [PALETTE[2]] * len(bar_names) + bars = ax_bar.barh(bar_names, bar_pcts, color=bar_colors, edgecolor="white", height=0.6) + + for bar, pct in zip(bars, bar_pcts): + label_x = bar.get_width() + 0.3 + ax_bar.text( + label_x, bar.get_y() + bar.get_height() / 2, + f"{pct:.1f}%", + va="center", ha="left", fontsize=10, fontweight="bold", color="#333333", + ) + + ax_bar.set_xlabel("Importance (%)", fontsize=11) + ax_bar.set_title("SHAP Feature Importance (%)", fontsize=13, fontweight="bold") + xlim = ax_bar.get_xlim() + ax_bar.set_xlim(xlim[0], xlim[1] * 1.12) + plt.tight_layout() + fig_bar.savefig(output_dir / "shap_summary_bar.png", dpi=180, bbox_inches="tight") + plt.close(fig_bar) + + log.info("Saved: shap_summary.png, shap_summary_bar.png, shap_interpretation.csv") + except Exception as e: + log.warning("SHAP summary plot failed: %s", e) + + +def plot_pca_loadings_heatmap( + loadings: pd.DataFrame, + output_dir: Path, +) -> None: + """Heatmap of PCA component loadings.""" + _setup_style() + fig, ax = plt.subplots(figsize=(max(10, len(loadings.columns) * 0.6), max(4, len(loadings) * 0.8))) + sns.heatmap(loadings, ax=ax, cmap="YlOrRd", annot=True, fmt=".2f", + linewidths=0.5, cbar_kws={"label": "|Loading|"}) + ax.set_title("PCA Component Loadings") + ax.set_ylabel("Component") + + plt.tight_layout() + fig.savefig(output_dir / "pca_loadings.png", dpi=150, bbox_inches="tight") + plt.close(fig) + log.info("Saved: pca_loadings.png") + + +def plot_cluster_profiles_heatmap( + profiles: pd.DataFrame, + top_features: list[str], + output_dir: Path, +) -> None: + """Z-scored cluster profile heatmap for top features.""" + _setup_style() + cols = [c for c in top_features if c in profiles.columns][:10] + if not cols: + return + + sub = profiles[cols] + z_scored = (sub - sub.mean()) / sub.std().replace(0, 1) + + fig, ax = 
def plot_cluster_sizes(
    labels: np.ndarray,
    personas: list[PersonaResult] | None,
    output_dir: Path,
) -> None:
    """Pie chart of the cluster size distribution, saved as ``cluster_sizes.png``.

    Wedges are labeled with persona names when one persona exists per cluster,
    otherwise with plain cluster ids. Noise points (label -1) are excluded.
    """
    _setup_style()
    cluster_ids = sorted(c for c in set(labels) if c != -1)

    fig, ax = plt.subplots(figsize=(8, 6))
    counts = [int((labels == cid).sum()) for cid in cluster_ids]

    if personas and len(personas) == len(cluster_ids):
        wedge_labels = [f"{p.persona_name}\n{p.archetype[:20]}" for p in personas]
    else:
        wedge_labels = [f"Cluster {cid}" for cid in cluster_ids]

    wedges, texts, autotexts = ax.pie(
        counts,
        labels=wedge_labels,
        colors=PALETTE[:len(cluster_ids)],
        autopct="%1.1f%%",
        startangle=140,
        wedgeprops={"edgecolor": "white", "linewidth": 1.5},
    )
    for pct_text in autotexts:
        pct_text.set_fontsize(10)
    ax.set_title("Cluster Size Distribution", fontsize=13, pad=15)

    plt.tight_layout()
    fig.savefig(output_dir / "cluster_sizes.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: cluster_sizes.png")


def plot_radar_charts(
    df_raw: pd.DataFrame,
    labels: np.ndarray,
    personas: list[PersonaResult],
    top_features: list[str],
    output_dir: Path,
) -> None:
    """Radar chart per persona (up to 3) over up to 6 top numeric features.

    Each cluster's feature means are min-max normalized against the full
    dataset so axes are comparable across personas. Saves ``persona_radar.png``.
    Skips entirely when fewer than 3 numeric top features are available.
    """
    _setup_style()
    radar_feats = [
        f for f in top_features
        if f in df_raw.columns and pd.api.types.is_numeric_dtype(df_raw[f])
    ][:6]
    n_axes = len(radar_feats)
    if n_axes < 3:
        log.warning("Not enough numeric features for radar chart (need >= 3, got %d)", n_axes)
        return

    n_shown = min(len(personas), 3)
    fig = plt.figure(figsize=(6 * n_shown, 6))
    # Spoke angles; first angle repeated at the end to close each polygon.
    spokes = np.linspace(0, 2 * np.pi, n_axes, endpoint=False).tolist()
    spokes = spokes + spokes[:1]

    feat_min = df_raw[radar_feats].min()
    feat_max = df_raw[radar_feats].max()

    for pos, persona in enumerate(personas[:n_shown]):
        ax = fig.add_subplot(1, n_shown, pos + 1, polar=True)
        cluster_means = df_raw[labels == persona.cluster_id][radar_feats].mean()
        # Min-max normalize vs. the whole dataset; epsilon avoids divide-by-zero.
        scaled = ((cluster_means - feat_min) / (feat_max - feat_min + 1e-9)).values.tolist()
        scaled = scaled + scaled[:1]

        ax.plot(spokes, scaled, color=PALETTE[pos], linewidth=2)
        ax.fill(spokes, scaled, color=PALETTE[pos], alpha=0.25)
        ax.set_xticks(spokes[:-1])
        ax.set_xticklabels(radar_feats, size=8)
        ax.set_ylim(0, 1)
        ax.set_title(f"{persona.persona_name}\n{persona.archetype[:18]}",
                     size=10, pad=14, color=PALETTE[pos], fontweight="bold")

    plt.suptitle("Customer Persona Profiles", fontsize=15, fontweight="bold", y=1.01)
    plt.tight_layout()
    fig.savefig(output_dir / "persona_radar.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: persona_radar.png")


def plot_experiment_history(
    exp_log: ExperimentLog,
    output_dir: Path,
) -> None:
    """Line chart of silhouette score across modeling loop iterations.

    Passed iterations get a green square marker, failed ones a red circle;
    reconfiguration strategies are annotated next to their points. Saves
    ``experiment_history.png``; no-ops on an empty log.
    """
    _setup_style()
    history = exp_log.to_dataframe()
    if history.empty:
        return

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(history["iteration"], history["silhouette"], "o-",
            color=PALETTE[0], linewidth=2, markersize=8)

    for _, record in history.iterrows():
        did_pass = bool(record["passed"])
        ax.scatter(
            record["iteration"], record["silhouette"],
            c=PALETTE[2] if did_pass else PALETTE[3],
            s=100, zorder=5,
            marker="s" if did_pass else "o",
        )
        strategy = record.get("reconfig_strategy")
        if strategy:
            ax.annotate(strategy, (record["iteration"], record["silhouette"]),
                        textcoords="offset points", xytext=(5, 10),
                        fontsize=8, alpha=0.7)

    ax.set_xlabel("Iteration")
    ax.set_ylabel("Silhouette Score")
    ax.set_title("Modeling Loop - Silhouette Score per Iteration")
    ax.legend(handles=[
        mpatches.Patch(color=PALETTE[2], label="Passed"),
        mpatches.Patch(color=PALETTE[3], label="Failed"),
    ])

    plt.tight_layout()
    fig.savefig(output_dir / "experiment_history.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: experiment_history.png")


def plot_elbow_curve(
    inertia_curve: dict[int, float],
    output_dir: Path,
) -> None:
    """Elbow curve (K vs inertia) for K-Means, saved as ``elbow_curve.png``."""
    _setup_style()
    k_values = sorted(inertia_curve)
    inertia_values = [inertia_curve[k] for k in k_values]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(k_values, inertia_values, "o-", color=PALETTE[0], linewidth=2, markersize=8)
    ax.set_xlabel("Number of Clusters (K)")
    ax.set_ylabel("Inertia")
    ax.set_title("Elbow Curve - K-Means Inertia")
    ax.set_xticks(k_values)

    plt.tight_layout()
    fig.savefig(output_dir / "elbow_curve.png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: elbow_curve.png")