Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Segmentation Plus

Customer segmentation and persona pipeline using the `segplus` module and a single Excel input file.

## What This Project Uses

- Input data: **Excel file**
- Core code: `segplus/`
- Final runnable notebook: `segplus/final_segplus_pipeline2.ipynb`
- Required support files:
- `requirements.txt`
- `main.py`

## Minimal Project Structure

```text
Segmentation_Plus/
├── final_enterprise_clustering_dataset_single_sheet.xlsx # Excel input
├── requirements.txt
├── segplus/
│ ├── clustering.py
│ ├── config.py
│ ├── data_input.py
│ ├── evaluation.py
│ ├── experiment_log.py
│ ├── explainability.py
│ ├── feature_engineering.py
│ ├── modeling_loop.py
│ ├── ollama_client.py
│ ├── persona_generation.py
│ ├── persona_generator.py
│ ├── pipeline.py
│ ├── types.py
│ ├── visualization.py
│ └── final_segplus_pipeline2.ipynb # Run this notebook
└── README.md
```

## Setup

```bash
pip install -r requirements.txt
```

## Ollama (Local LLM)

```bash
ollama serve
ollama list
ollama pull qwen2.5:7b
```

The notebook is configured for local Ollama and selects `qwen2.5:7b`.

## Run

1. Open `segplus/final_segplus_pipeline2.ipynb`
2. Ensure the Excel file path is correct in the config cell
3. Run the cells top-to-bottom

## Outputs

Pipeline outputs are written to `segplus_output/` and include:

- `clustered_customers.csv`
- `cluster_profiles.csv`
- `ordered_feature_drivers.csv`
- `shap_feature_importance_pct.csv`
- `shap_summary.png`
- `shap_summary_bar.png`
- `shap_interpretation.csv`
- `pc_feature_map.json`
- `personas.csv`
- `personas.json`
- `business_grounding.json`
- `experiment_log.json`

## Notes

- Persona names are generated by the model (no hardcoded cluster names).
- `profile_descriptor` and `description` are included in persona outputs.
- If grounding times out, fallback logic still returns persona outputs.

Empty file added data/.gitkeep
Empty file.
13 changes: 13 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
pandas>=2.0.0
numpy>=1.24.0
scikit-learn>=1.3.0
matplotlib>=3.7.0
seaborn>=0.12.0
plotly>=5.15.0
shap>=0.42.0
pyyaml>=6.0
openpyxl>=3.1.0
requests>=2.31.0
scipy>=1.11.0
google-generativeai>=0.3.0
kaleido>=0.2.1
131 changes: 131 additions & 0 deletions segplus/clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
"""Clustering algorithm runners: K-Means, DBSCAN, GMM."""
from __future__ import annotations

import logging

import numpy as np
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

from .types import ClusteringConfig, ClusterRunResult

log = logging.getLogger("segplus.clustering")


def run_kmeans(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult:
    """Cluster X with K-Means using the hyperparameters carried by config.

    Returns a ClusterRunResult holding the fitted estimator, the hard
    labels, the distinct-label count, and the final inertia in `extra`.
    """
    estimator = KMeans(
        n_clusters=config.k,
        init=config.kmeans_init,
        n_init=config.kmeans_n_init,
        max_iter=config.kmeans_max_iter,
        random_state=config.random_state,
    )
    assigned = estimator.fit_predict(X)
    return ClusterRunResult(
        algorithm="kmeans",
        labels=assigned,
        n_clusters=len(np.unique(assigned)),
        model=estimator,
        extra={"inertia": float(estimator.inertia_)},
    )


def run_dbscan(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult:
    """Cluster X with DBSCAN; sklearn marks noise points with label -1.

    `extra` reports the absolute and fractional noise counts; the reported
    n_clusters excludes the noise label and is floored at 1.
    """
    estimator = DBSCAN(eps=config.dbscan_eps, min_samples=config.dbscan_min_samples)
    assigned = estimator.fit_predict(X)
    distinct = set(assigned)
    # Do not count the noise label as a cluster.
    real_clusters = len(distinct) - (1 if -1 in distinct else 0)
    n_noise = int(np.count_nonzero(assigned == -1))
    return ClusterRunResult(
        algorithm="dbscan",
        labels=assigned,
        n_clusters=max(real_clusters, 1),
        model=estimator,
        extra={"noise_count": n_noise, "noise_pct": n_noise / len(assigned)},
    )


def run_gmm(X: np.ndarray, config: ClusteringConfig) -> ClusterRunResult:
    """Cluster X with a Gaussian mixture model.

    Besides the hard labels, the result carries the per-sample soft
    assignment probabilities and BIC/AIC in `extra` for model comparison.
    """
    mixture = GaussianMixture(
        n_components=config.k,
        covariance_type=config.gmm_covariance_type,
        n_init=config.gmm_n_init,
        random_state=config.random_state,
    )
    hard_labels = mixture.fit_predict(X)
    soft_probs = mixture.predict_proba(X)
    return ClusterRunResult(
        algorithm="gmm",
        labels=hard_labels,
        n_clusters=len(np.unique(hard_labels)),
        model=mixture,
        probabilities=soft_probs,
        extra={"bic": float(mixture.bic(X)), "aic": float(mixture.aic(X))},
    )


def run_all_algorithms(X: np.ndarray, config: ClusteringConfig) -> dict[str, ClusterRunResult]:
    """Run K-Means, DBSCAN, and GMM on X, returning whichever succeed.

    When config.feature_subset_indices is set, clustering runs on that
    column subset of X. A failing algorithm is logged and skipped so one
    error cannot abort the whole sweep.
    """
    if config.feature_subset_indices is None:
        data = X
    else:
        data = X[:, config.feature_subset_indices]

    runners = {"kmeans": run_kmeans, "dbscan": run_dbscan, "gmm": run_gmm}
    outcomes: dict[str, ClusterRunResult] = {}
    for algo_name, runner in runners.items():
        try:
            outcome = runner(data, config)
        except Exception as exc:  # best-effort: keep going with the other algorithms
            log.warning(" [%s] failed: %s", algo_name, exc)
            continue
        outcomes[algo_name] = outcome
        log.info(
            " [%s] clusters=%d",
            algo_name, outcome.n_clusters,
        )
    return outcomes


def find_optimal_k(
    X: np.ndarray,
    k_range: tuple[int, int],
    random_state: int = 42,
) -> tuple[int, dict[int, float]]:
    """Silhouette sweep to find optimal K for K-Means.

    Args:
        X: Feature matrix of shape (n_samples, n_features).
        k_range: Inclusive (min_k, max_k) bounds for the sweep.
        random_state: Seed for both the KMeans fits and the silhouette
            subsample, so repeated runs pick the same k.

    Returns:
        (best_k, scores) where scores maps each evaluated k to its
        silhouette score rounded to 4 decimals. Falls back to k_range[0]
        when no k yields at least two clusters.
    """
    best_k, best_score = k_range[0], -1.0
    scores: dict[int, float] = {}

    for k in range(k_range[0], k_range[1] + 1):
        km = KMeans(n_clusters=k, init="k-means++", n_init=5, random_state=random_state)
        labels = km.fit_predict(X)
        # Degenerate fit (a single cluster): silhouette is undefined, skip.
        if len(set(labels)) < 2:
            continue
        # BUG FIX: pass random_state so the silhouette subsample is
        # reproducible; without it the score (and the chosen k) varied
        # run-to-run on datasets larger than the 2000-sample cap.
        s = silhouette_score(
            X, labels, sample_size=min(2000, len(X)), random_state=random_state
        )
        scores[k] = round(s, 4)
        if s > best_score:
            best_score, best_k = s, k

    log.info("K search: scores=%s | best k=%d (sil=%.4f)", scores, best_k, best_score)
    return best_k, scores


def estimate_dbscan_eps(X: np.ndarray, min_samples: int = 5) -> float:
    """Estimate DBSCAN eps from the knee of the sorted k-distance curve.

    Fits a k-NN model with k = min_samples, sorts each point's distance to
    its k-th neighbour, and picks the value at the curve's knee (the index
    of the largest second difference). The result is floored at 0.1 and
    rounded to 3 decimals.
    """
    neighbors = NearestNeighbors(n_neighbors=min_samples)
    neighbors.fit(X)
    dists, _ = neighbors.kneighbors(X)
    curve = np.sort(dists[:, -1])

    # Too few points for a meaningful knee — fall back to the median distance.
    if len(curve) < 10:
        return float(np.median(curve))

    first_diff = np.diff(curve)
    second_diff = np.diff(first_diff)
    # +2 shifts the second-difference index back onto the original curve.
    knee = int(np.argmax(second_diff)) + 2
    chosen = float(curve[min(knee, len(curve) - 1)])
    chosen = max(chosen, 0.1)  # floor

    log.info("DBSCAN eps estimated: %.3f (knee at index %d)", chosen, knee)
    return round(chosen, 3)
Loading