diff --git a/biome.json b/biome.json
index 600b130..d2510ac 100644
--- a/biome.json
+++ b/biome.json
@@ -4,7 +4,11 @@
"linter": {
"enabled": true,
"rules": {
- "recommended": true
+ "recommended": true,
+ "style": {
+ "noNonNullAssertion": "off",
+ "noInferrableTypes": "off"
+ }
}
},
"formatter": {
diff --git a/playground/index.html b/playground/index.html
index 2004305..22a76f1 100644
--- a/playground/index.html
+++ b/playground/index.html
@@ -116,6 +116,36 @@
diff --git a/src/bicluster/bicluster.ts b/src/bicluster/bicluster.ts
new file mode 100644
index 0000000..37d9c59
--- /dev/null
+++ b/src/bicluster/bicluster.ts
@@ -0,0 +1,214 @@
+/**
+ * Biclustering algorithms: SpectralBiclustering and SpectralCoclustering.
+ * Port of sklearn.cluster.bicluster
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+function svd2( // Truncated SVD by power iteration with deflation. NOTE: overwrites `matrix` in place with the residual.
+ matrix: Float64Array[],
+ nComponents: number,
+): { U: Float64Array[]; S: Float64Array; Vt: Float64Array[] } {
+ const m = matrix.length;
+ const n = matrix[0]?.length ?? 0;
+ const k = Math.min(nComponents, Math.min(m, n)); // never more components than min(m, n)
+ const U: Float64Array[] = Array.from({ length: m }, () => new Float64Array(k));
+ const S = new Float64Array(k);
+ const Vt: Float64Array[] = Array.from({ length: k }, () => new Float64Array(n));
+ for (let c = 0; c < k; c++) {
+ let v = new Float64Array(n);
+ v[c % n] = 1; // start from the c-th unit basis vector
+ for (let _iter = 0; _iter < 30; _iter++) { // fixed 30 rounds of v <- normalize(A^T (A v))
+ const u = new Float64Array(m);
+ for (let i = 0; i < m; i++) {
+ for (let j = 0; j < n; j++) u[i] += (matrix[i]?.[j] ?? 0) * (v[j] ?? 0);
+ }
+ const newV = new Float64Array(n);
+ for (let i = 0; i < m; i++) {
+ for (let j = 0; j < n; j++) newV[j] += (matrix[i]?.[j] ?? 0) * (u[i] ?? 0);
+ }
+ let norm = 0;
+ for (let j = 0; j < n; j++) norm += (newV[j] ?? 0) ** 2;
+ norm = Math.sqrt(norm);
+ if (norm < 1e-12) break; // residual is numerically zero along v; keep the current v
+ for (let j = 0; j < n; j++) v[j] = (newV[j] ?? 0) / norm;
+ }
+ const u = new Float64Array(m); // u = A v; its length is the singular value
+ for (let i = 0; i < m; i++) {
+ for (let j = 0; j < n; j++) u[i] += (matrix[i]?.[j] ?? 0) * (v[j] ?? 0);
+ }
+ let sigma = 0;
+ for (let i = 0; i < m; i++) sigma += (u[i] ?? 0) ** 2;
+ sigma = Math.sqrt(sigma);
+ S[c] = sigma;
+ if (sigma > 1e-12) { // otherwise column c of U stays all zeros
+ for (let i = 0; i < m; i++) U[i]![c] = (u[i] ?? 0) / sigma;
+ }
+ for (let j = 0; j < n; j++) Vt[c]![j] = v[j] ?? 0;
+ for (let i = 0; i < m; i++) { // deflate: subtract the rank-one term sigma * u * v^T before the next component
+ for (let j = 0; j < n; j++) {
+ matrix[i]![j] = (matrix[i]?.[j] ?? 0) - (U[i]?.[c] ?? 0) * sigma * (Vt[c]?.[j] ?? 0);
+ }
+ }
+ }
+ return { U, S, Vt }; // U is m x k (row-major), S has k entries, Vt is k x n
+}
+
+/** Lloyd's k-means seeded with the first k rows of X (cycled when k > n); returns one cluster index in [0, k) per row. */
+function kmeansSimple(X: Float64Array[], k: number, maxIter = 100): Int32Array {
+  const n = X.length;
+  const d = X[0]?.length ?? 0;
+  const labels = new Int32Array(n);
+  const centers: Float64Array[] = Array.from({ length: k }, (_, i) => (X[i % n] ?? new Float64Array(d)).slice());
+  for (let iter = 0; iter < maxIter; iter++) {
+    // The zero-filled initial labels are not a real assignment, so the first pass always counts as a change.
+    let changed = iter === 0 && k > 0;
+    // Assignment step: attach every row to its nearest center (ties go to the lowest index).
+    for (let i = 0; i < n; i++) {
+      let best = 0;
+      let bestDist = Number.POSITIVE_INFINITY;
+      for (let j = 0; j < k; j++) {
+        let dist = 0;
+        for (let l = 0; l < d; l++) { const diff = (X[i]?.[l] ?? 0) - (centers[j]?.[l] ?? 0); dist += diff * diff; }
+        if (dist < bestDist) { bestDist = dist; best = j; }
+      }
+      if (labels[i] !== best) { labels[i] = best; changed = true; }
+    }
+    if (!changed) break;
+    const counts = new Int32Array(k);
+    const sums: Float64Array[] = Array.from({ length: k }, () => new Float64Array(d));
+    for (let i = 0; i < n; i++) {
+      const c = labels[i]!;
+      counts[c]++;
+      for (let l = 0; l < d; l++) sums[c]![l]! += X[i]?.[l] ?? 0;
+    }
+    for (let j = 0; j < k; j++) {
+      const cnt = counts[j] ?? 0;
+      // An empty cluster keeps its previous center instead of collapsing onto the origin.
+      if (cnt > 0) centers[j] = sums[j]!.map((s) => s / cnt);
+    }
+  }
+  return labels;
+}
+
+export interface SpectralBiclusteringParams { // constructor options for SpectralBiclustering
+ nClusters?: number | [number, number]; // one count for both axes, or [rowClusters, columnClusters]; default 3
+ method?: "bistochastic" | "scale" | "log"; // default "bistochastic"; stored on the instance but not read by fit()
+ nComponents?: number; // number of singular vectors requested; default 6
+ nInit?: number; // default 10; stored on the instance but not read by fit()
+}
+
+/** Spectral biclustering (after sklearn.cluster.SpectralBiclustering): k-means on the singular vectors of X, separately for rows and columns. */
+export class SpectralBiclustering {
+ nClusters: number | [number, number];
+ method: string; // NOTE(review): stored but never consulted by fit()
+ nComponents: number;
+ nInit: number; // NOTE(review): stored but never consulted by fit()
+ rowLabels_?: Int32Array;
+ columnLabels_?: Int32Array;
+ biclusters_?: [Int32Array, Int32Array][]; // [rowIndices, columnIndices] per bicluster; set by fit()
+
+ constructor(params: SpectralBiclusteringParams = {}) {
+ this.nClusters = params.nClusters ?? 3;
+ this.method = params.method ?? "bistochastic";
+ this.nComponents = params.nComponents ?? 6;
+ this.nInit = params.nInit ?? 10;
+ }
+
+ fit(X: Float64Array[]): this { // fills rowLabels_, columnLabels_ and biclusters_; X itself is not modified
+ const nRows = X.length;
+ const nCols = X[0]?.length ?? 0;
+ const [nRowClusters, nColClusters] = Array.isArray(this.nClusters)
+ ? this.nClusters
+ : [this.nClusters, this.nClusters];
+ const normalized = X.map((row) => row.slice()); // plain copy (no normalization is applied); svd2 overwrites it
+ const k = Math.min(this.nComponents, Math.min(nRows, nCols));
+ const { U, Vt } = svd2(normalized, k);
+ const rowVecs = U.slice(0, nRows);
+ const colVecs = Array.from({ length: nCols }, (_, j) => { // transpose Vt: one k-dimensional vector per column of X
+ const v = new Float64Array(k);
+ for (let c = 0; c < k; c++) v[c] = Vt[c]?.[j] ?? 0;
+ return v;
+ });
+ this.rowLabels_ = kmeansSimple(rowVecs, nRowClusters, 100);
+ this.columnLabels_ = kmeansSimple(colVecs, nColClusters, 100);
+ this.biclusters_ = [];
+ for (let r = 0; r < nRowClusters; r++) { // biclusters are stored row-cluster-major: index = r * nColClusters + c
+ for (let c = 0; c < nColClusters; c++) {
+ const rowIdx = Array.from({ length: nRows }, (_, i) => i).filter((i) => this.rowLabels_![i] === r);
+ const colIdx = Array.from({ length: nCols }, (_, j) => j).filter((j) => this.columnLabels_![j] === c);
+ this.biclusters_.push([new Int32Array(rowIdx), new Int32Array(colIdx)]);
+ }
+ }
+ return this;
+ }
+
+ getBicluster(i: number): [Int32Array, Int32Array] { // throws NotFittedError before fit(); NOTE(review): i is not range-checked
+ if (!this.biclusters_) throw new NotFittedError("SpectralBiclustering");
+ return this.biclusters_[i]!;
+ }
+}
+
+export interface SpectralCoclusteringParams { // constructor options for SpectralCoclustering
+ nClusters?: number; // number of co-clusters; default 3
+ nSvdVecs?: number | null; // NOTE(review): declared here but not read by the SpectralCoclustering constructor
+ nInit?: number; // default 10; stored on the instance but not read by fit()
+}
+
+/** Spectral co-clustering (after sklearn.cluster.SpectralCoclustering): rows and columns share one set of nClusters labels. */
+export class SpectralCoclustering {
+ nClusters: number;
+ nInit: number; // NOTE(review): stored but never consulted by fit()
+ rowLabels_?: Int32Array;
+ columnLabels_?: Int32Array;
+ biclusters_?: [Int32Array, Int32Array][]; // [rowIndices, columnIndices] per cluster; set by fit()
+
+ constructor(params: SpectralCoclusteringParams = {}) {
+ this.nClusters = params.nClusters ?? 3;
+ this.nInit = params.nInit ?? 10;
+ }
+
+ fit(X: Float64Array[]): this { // fills rowLabels_, columnLabels_ and biclusters_; X itself is not modified
+ const nRows = X.length;
+ const nCols = X[0]?.length ?? 0;
+ const k = this.nClusters;
+ const rowSums = new Float64Array(nRows);
+ const colSums = new Float64Array(nCols);
+ for (let i = 0; i < nRows; i++) {
+ for (let j = 0; j < nCols; j++) {
+ rowSums[i] += X[i]?.[j] ?? 0;
+ colSums[j] += X[i]?.[j] ?? 0;
+ }
+ }
+ const normalized = X.map((row, i) => { // X[i][j] / sqrt(rowSum_i * colSum_j); a zero sum is treated as 1
+ const nr = new Float64Array(nCols);
+ const rs = Math.sqrt(rowSums[i]! || 1);
+ for (let j = 0; j < nCols; j++) {
+ const cs = Math.sqrt(colSums[j]! || 1);
+ nr[j] = (row[j] ?? 0) / (rs * cs);
+ }
+ return nr;
+ });
+ const { U, Vt } = svd2(normalized, k + 1); // one extra component because the first one is discarded below
+ const rowVecs = U.slice(0, nRows).map((u) => u.slice(1)); // drop the first singular vector
+ const colVecs = Array.from({ length: nCols }, (_, j) => {
+ const v = new Float64Array(k);
+ for (let c = 1; c <= k; c++) v[c - 1] = Vt[c]?.[j] ?? 0; // components svd2 did not return are left at 0
+ return v;
+ });
+ this.rowLabels_ = kmeansSimple(rowVecs, k, 100);
+ this.columnLabels_ = kmeansSimple(colVecs, k, 100); // NOTE(review): rows and columns are clustered independently
+ this.biclusters_ = [];
+ for (let c = 0; c < k; c++) { // bicluster c pairs the rows labelled c with the columns labelled c
+ const rowIdx = Array.from({ length: nRows }, (_, i) => i).filter((i) => this.rowLabels_![i] === c);
+ const colIdx = Array.from({ length: nCols }, (_, j) => j).filter((j) => this.columnLabels_![j] === c);
+ this.biclusters_.push([new Int32Array(rowIdx), new Int32Array(colIdx)]);
+ }
+ return this;
+ }
+
+ getBicluster(i: number): [Int32Array, Int32Array] { // throws NotFittedError before fit(); NOTE(review): i is not range-checked
+ if (!this.biclusters_) throw new NotFittedError("SpectralCoclustering");
+ return this.biclusters_[i]!;
+ }
+}
diff --git a/src/bicluster/bicluster_ext.ts b/src/bicluster/bicluster_ext.ts
new file mode 100644
index 0000000..9d04493
--- /dev/null
+++ b/src/bicluster/bicluster_ext.ts
@@ -0,0 +1,133 @@
+/**
+ * Bicluster extensions: SpectralCoClustering and SpectralBiclusteringExt.
+ */
+
+/** Spectral co-clustering: power-iteration singular vectors of the degree-normalised matrix, then seeded k-means on rows and columns. */
+export class SpectralCoClustering {
+  rowLabels_: Int32Array = new Int32Array(0);
+  columnLabels_: Int32Array = new Int32Array(0);
+  biclusters_: Array<[boolean[], boolean[]]> = [];
+
+  constructor(
+    private readonly nClusters = 3,
+    private readonly svdMethod: "randomized" | "arpack" = "randomized",
+    private readonly seed = 42
+  ) {
+    void this.svdMethod;
+  }
+
+  /** Fits row/column labels and boolean bicluster masks for X (rows = samples); deterministic for a given seed. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const m = X[0]?.length ?? 1;
+    // Normalize: D_row^(-1/2) X D_col^(-1/2)
+    const rowSums = X.map((row) => Math.sqrt(Math.max(row.reduce((a, b) => a + b, 0), 1e-10)));
+    const colSums = new Float64Array(m);
+    for (const row of X) for (let j = 0; j < m; j++) colSums[j] = (colSums[j] ?? 0) + (row[j] ?? 0);
+    for (let j = 0; j < m; j++) colSums[j] = Math.sqrt(Math.max(colSums[j] ?? 1, 1e-10));
+    const An = X.map((row, i) => new Float64Array(row.map((v, j) => v / Math.max(rowSums[i] ?? 1, 1e-10) / Math.max(colSums[j] ?? 1, 1e-10))));
+    // SVD (simplified): nClusters - 1 singular vector pairs by power iteration; svdMethod is accepted but not used.
+    const nVecs = this.nClusters - 1;
+    const rng = this._seededRng(this.seed);
+    const rowVecs: Float64Array[] = [];
+    const colVecs: Float64Array[] = [];
+    for (let k = 0; k < nVecs; k++) {
+      let v = new Float64Array(m).map(() => rng() - 0.5);
+      for (let iter = 0; iter < 20; iter++) {
+        const u = new Float64Array(n); // u = A * v, then normalised
+        for (let i = 0; i < n; i++) for (let j = 0; j < m; j++) u[i] = (u[i] ?? 0) + (An[i]?.[j] ?? 0) * (v[j] ?? 0);
+        const uNorm = Math.sqrt(u.reduce((a, b) => a + b * b, 0));
+        for (let i = 0; i < n; i++) u[i] = (u[i] ?? 0) / Math.max(uNorm, 1e-10);
+        v = new Float64Array(m); // v = A^T * u
+        for (let j = 0; j < m; j++) for (let i = 0; i < n; i++) v[j] = (v[j] ?? 0) + (An[i]?.[j] ?? 0) * (u[i] ?? 0);
+        // Deflate: project out the unit right singular vectors already found, so each pass converges to a new one.
+        for (const ov of colVecs) {
+          let dot = 0;
+          for (let j = 0; j < m; j++) dot += (ov[j] ?? 0) * (v[j] ?? 0);
+          for (let j = 0; j < m; j++) v[j] = (v[j] ?? 0) - dot * (ov[j] ?? 0);
+        }
+        const vNorm = Math.sqrt(v.reduce((a, b) => a + b * b, 0));
+        for (let j = 0; j < m; j++) v[j] = (v[j] ?? 0) / Math.max(vNorm, 1e-10);
+      }
+      const rowVec = new Float64Array(n); // row embedding: An * v
+      for (let i = 0; i < n; i++) for (let j = 0; j < m; j++) rowVec[i] = (rowVec[i] ?? 0) + (An[i]?.[j] ?? 0) * (v[j] ?? 0);
+      rowVecs.push(rowVec);
+      colVecs.push(v);
+    }
+    // K-means on the per-row and per-column embeddings (a single zero feature when no vectors were extracted)
+    this.rowLabels_ = this._kmeans(rowVecs.length > 0 ? X.map((_, i) => new Float64Array(rowVecs.map((rv) => rv[i] ?? 0))) : X.map(() => new Float64Array(1).fill(0)));
+    this.columnLabels_ = this._kmeans(Array.from({ length: m }, (_, j) => new Float64Array(colVecs.map((cv) => cv[j] ?? 0))));
+    // Build biclusters: mask k selects the rows and columns labelled k
+    this.biclusters_ = Array.from({ length: this.nClusters }, (_, k) => {
+      const rowMask = Array.from({ length: n }, (__, i) => this.rowLabels_[i] === k);
+      const colMask = Array.from({ length: m }, (__, j) => this.columnLabels_[j] === k);
+      return [rowMask, colMask] as [boolean[], boolean[]];
+    });
+    return this;
+  }
+
+  /** Lloyd's k-means with nClusters centers drawn from X by the seeded generator; at most 50 iterations. */
+  private _kmeans(X: Float64Array[]): Int32Array {
+    const n = X.length;
+    const k = this.nClusters;
+    const rng = this._seededRng(this.seed + 1);
+    let centers = Array.from({ length: k }, () => X[Math.floor(rng() * n)] ?? new Float64Array(1));
+    let labels = new Int32Array(n);
+    for (let iter = 0; iter < 50; iter++) {
+      const newLabels = new Int32Array(n);
+      for (let i = 0; i < n; i++) {
+        let best = 0, bestD = Number.POSITIVE_INFINITY;
+        for (let c = 0; c < k; c++) {
+          let d = 0;
+          const xi = X[i]!;
+          const ci = centers[c]!;
+          for (let f = 0; f < xi.length; f++) d += ((xi[f] ?? 0) - (ci[f] ?? 0)) ** 2;
+          if (d < bestD) { bestD = d; best = c; }
+        }
+        newLabels[i] = best;
+      }
+      const nF = X[0]?.length ?? 1; // feature count for the center update below
+      const newCenters = Array.from({ length: k }, () => ({ sum: new Float64Array(nF), cnt: 0 }));
+      for (let i = 0; i < n; i++) {
+        const c = newLabels[i]!;
+        newCenters[c]!.cnt++;
+        const xi = X[i]!;
+        for (let f = 0; f < nF; f++) newCenters[c]!.sum[f] = (newCenters[c]!.sum[f] ?? 0) + (xi[f] ?? 0);
+      }
+      centers = newCenters.map((nc) => new Float64Array(nc.sum.map((v) => v / Math.max(nc.cnt, 1))));
+      const changed = newLabels.some((l, i) => l !== labels[i]);
+      labels = newLabels;
+      if (!changed) break;
+    }
+    return labels;
+  }
+
+  /** Linear congruential generator; dividing by 2^32 keeps every draw in [0, 1) so floor(rng() * n) is always a valid index. */
+  private _seededRng(seed: number): () => number {
+    let s = seed;
+    return () => { s = (s * 1664525 + 1013904223) & 0xffffffff; return (s >>> 0) / 0x100000000; };
+  }
+
+  /** Returns the [rowMask, colMask] pair of bicluster i, or a pair of empty arrays when i is out of range or fit() has not run. */
+  getBicluster(i: number): [boolean[], boolean[]] {
+    return this.biclusters_[i] ?? [[], []];
+  }
+}
+
+export class SpectralBiclusteringExt { // Row/column labels derived from one SpectralCoClustering fit; nClusters is [rows, cols] or one shared count.
+ rowLabels_: Int32Array = new Int32Array(0);
+ columnLabels_: Int32Array = new Int32Array(0);

+ constructor(private readonly nClusters: [number, number] | number = [3, 3]) {}

+ fit(X: Float64Array[]): this {
+ const nRowClusters = Array.isArray(this.nClusters) ? this.nClusters[0]! : this.nClusters;
+ const nColClusters = Array.isArray(this.nClusters) ? this.nClusters[1]! : this.nClusters;
+ const coClust = new SpectralCoClustering(Math.max(nRowClusters, nColClusters)); // fit with the larger of the two counts
+ coClust.fit(X);
+ // Remap to correct number of clusters (labels are folded with a modulo, so distinct co-clusters can merge)
+ this.rowLabels_ = new Int32Array(coClust.rowLabels_.map((l) => l % nRowClusters));
+ this.columnLabels_ = new Int32Array(coClust.columnLabels_.map((l) => l % nColClusters));
+ return this;
+ }
+}
diff --git a/src/bicluster/index.ts b/src/bicluster/index.ts
new file mode 100644
index 0000000..50ad235
--- /dev/null
+++ b/src/bicluster/index.ts
@@ -0,0 +1 @@
+export * from "./bicluster.js";
diff --git a/src/bicluster/spectral_bicluster_ext.ts b/src/bicluster/spectral_bicluster_ext.ts
new file mode 100644
index 0000000..32ed364
--- /dev/null
+++ b/src/bicluster/spectral_bicluster_ext.ts
@@ -0,0 +1,150 @@
+/**
+ * Extended biclustering utilities: consensus biclustering, evaluation metrics.
+ * Port of sklearn.cluster.bicluster extensions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** Overlap in [0, 1] of two biclusters given as label arrays (label 1 = member): shared cells / sqrt(|a| * |b|); 0 if either is empty. */
+export function consensusScore(
+  a: { rowLabels: Int32Array; colLabels: Int32Array },
+  b: { rowLabels: Int32Array; colLabels: Int32Array },
+): number {
+  const nRows = a.rowLabels.length;
+  const nCols = a.colLabels.length;
+  const aRows = new Set<number>();
+  const bRows = new Set<number>();
+  for (let i = 0; i < nRows; i++) {
+    if ((a.rowLabels[i] ?? 0) === 1) aRows.add(i);
+    if ((b.rowLabels[i] ?? 0) === 1) bRows.add(i);
+  }
+  const aCols = new Set<number>();
+  const bCols = new Set<number>();
+  for (let j = 0; j < nCols; j++) {
+    if ((a.colLabels[j] ?? 0) === 1) aCols.add(j);
+    if ((b.colLabels[j] ?? 0) === 1) bCols.add(j);
+  }
+  const rowInter = [...aRows].filter((r) => bRows.has(r)).length;
+  const colInter = [...aCols].filter((c) => bCols.has(c)).length;
+  const aSize = aRows.size * aCols.size;
+  const bSize = bRows.size * bCols.size;
+  if (aSize === 0 || bSize === 0) return 0;
+  return (rowInter * colInter) / Math.sqrt(aSize * bSize);
+}
+
+/** Returns true only if every cluster id in [0, nClusters) labels at least one row and at least one column. */
+export function checkBiclustersNonDegenerate(
+ rowLabels: Int32Array,
+ colLabels: Int32Array,
+ nClusters: number,
+): boolean {
+ for (let k = 0; k < nClusters; k++) {
+ let rowCount = 0;
+ let colCount = 0;
+ for (let i = 0; i < rowLabels.length; i++) {
+ if ((rowLabels[i] ?? 0) === k) rowCount++;
+ }
+ for (let j = 0; j < colLabels.length; j++) {
+ if ((colLabels[j] ?? 0) === k) colCount++;
+ }
+ if (rowCount === 0 || colCount === 0) return false; // cluster k is missing on one axis
+ }
+ return true; // also the result when nClusters <= 0, since the loop never runs
+}
+
+/** Evaluates biclusters of a labelled data matrix; fit() stores references to its arguments without copying them. */
+export class BiclusterEvaluator {
+ private rowLabels_: Int32Array | null = null;
+ private colLabels_: Int32Array | null = null;
+ private data_: Float64Array[] | null = null;
+
+ fit(
+ data: Float64Array[],
+ rowLabels: Int32Array,
+ colLabels: Int32Array,
+ ): this {
+ this.data_ = data;
+ this.rowLabels_ = rowLabels;
+ this.colLabels_ = colLabels;
+ return this;
+ }
+
+ /** Mean squared residue of the bicluster whose rows and columns carry label `clusterId` (lower is better); 0 if it has no rows or no columns. */
+ averageResidue(clusterId: number): number {
+ if (this.data_ === null || this.rowLabels_ === null || this.colLabels_ === null) {
+ throw new NotFittedError("BiclusterEvaluator is not fitted.");
+ }
+ const rows: number[] = [];
+ const cols: number[] = [];
+ for (let i = 0; i < this.rowLabels_.length; i++) {
+ if ((this.rowLabels_[i] ?? 0) === clusterId) rows.push(i);
+ }
+ for (let j = 0; j < this.colLabels_.length; j++) {
+ if ((this.colLabels_[j] ?? 0) === clusterId) cols.push(j);
+ }
+ if (rows.length === 0 || cols.length === 0) return 0;
+ let grandMean = 0; // mean over every cell of the submatrix
+ for (const i of rows) {
+ for (const j of cols) {
+ grandMean += this.data_[i]?.[j] ?? 0;
+ }
+ }
+ grandMean /= rows.length * cols.length;
+ const rowMeans = rows.map((i) => { // mean of each selected row over the selected columns
+ let s = 0;
+ for (const j of cols) s += this.data_![i]?.[j] ?? 0;
+ return s / cols.length;
+ });
+ const colMeans = cols.map((j) => { // mean of each selected column over the selected rows
+ let s = 0;
+ for (const i of rows) s += this.data_![i]?.[j] ?? 0;
+ return s / rows.length;
+ });
+ let residue = 0;
+ for (let ri = 0; ri < rows.length; ri++) {
+ for (let ci = 0; ci < cols.length; ci++) {
+ const val = this.data_[rows[ri]!]?.[cols[ci]!] ?? 0;
+ const r =
+ val -
+ (rowMeans[ri] ?? 0) -
+ (colMeans[ci] ?? 0) +
+ grandMean;
+ residue += r * r;
+ }
+ }
+ return residue / (rows.length * cols.length);
+ }
+}
+
+/** Builds a 0/1 block-pattern test matrix with round-robin row/column labels plus optional seeded uniform noise; deterministic per seed. */
+export function makeCheckerboard(
+ shape: [number, number],
+ nClusters: [number, number],
+ noise = 0.0,
+ seed = 0,
+): { data: Float64Array[]; rowLabels: Int32Array; colLabels: Int32Array } {
+ const [nRows, nCols] = shape;
+ const [nRowClusters, nColClusters] = nClusters;
+ const rowLabels = new Int32Array(nRows);
+ const colLabels = new Int32Array(nCols);
+ for (let i = 0; i < nRows; i++) {
+ rowLabels[i] = i % nRowClusters; // labels cycle 0, 1, ..., nRowClusters - 1
+ }
+ for (let j = 0; j < nCols; j++) {
+ colLabels[j] = j % nColClusters;
+ }
+ let rng = seed;
+ const rand = (): number => { // linear congruential generator; NOTE(review): can return exactly 1 because it divides by 2^32 - 1
+ rng = (rng * 1664525 + 1013904223) & 0xffffffff;
+ return (rng >>> 0) / 0xffffffff;
+ };
+ const data: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
+ const row = new Float64Array(nCols);
+ for (let j = 0; j < nCols; j++) {
+ const same = (rowLabels[i] ?? 0) === (colLabels[j] ?? 0) % nRowClusters ? 1 : 0; // 1 where the row label equals the column label mod nRowClusters
+ row[j] = same + noise * (rand() - 0.5); // noise term is uniform in roughly [-noise/2, +noise/2]
+ }
+ return row;
+ });
+ return { data, rowLabels, colLabels };
+}
diff --git a/src/calibration/calibration.ts b/src/calibration/calibration.ts
new file mode 100644
index 0000000..948aa5f
--- /dev/null
+++ b/src/calibration/calibration.ts
@@ -0,0 +1,141 @@
+/**
+ * Probability calibration.
+ * Mirrors sklearn.calibration.CalibratedClassifierCV.
+ * Only Platt scaling (a logistic fit on held-out predictions) is implemented; the `method` option is stored but not consulted.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+interface Classifier { // minimal estimator contract required of the wrapped base estimator
+ fit(X: Float64Array[], y: Float64Array): this;
+ predict(X: Float64Array[]): Float64Array; // one output per row; CalibratedClassifierCV feeds these to the Platt fit as scores
+ score?(X: Float64Array[], y: Float64Array): number; // optional; not called anywhere in this file
+}
+
+function sigmoid(x: number): number { // logistic function 1 / (1 + e^-x)
+ return 1 / (1 + Math.exp(-x));
+}
+
+/** Platt scaling: fits p = sigmoid(A * score + B) to 0/1 targets `y` by 1000 gradient-descent steps; returns [A, B] ([0, 0] for empty input). */
+function plattScale(scores: Float64Array, y: Float64Array): [number, number] {
+  const n = scores.length;
+  let A = 0;
+  let B = 0;
+  const lr = 0.01;
+  if (n === 0) return [A, B]; // no calibration data: averaging over n would give NaN, so keep the neutral sigmoid
+  for (let iter = 0; iter < 1000; iter++) {
+    let gradA = 0;
+    let gradB = 0;
+    for (let i = 0; i < n; i++) {
+      const p = sigmoid(A * (scores[i] ?? 0) + B);
+      const err = p - (y[i] ?? 0);
+      gradA += err * (scores[i] ?? 0);
+      gradB += err;
+    }
+    A -= lr * gradA / n;
+    B -= lr * gradB / n;
+  }
+
+  return [A, B];
+}
+
+export class CalibratedClassifierCV { // Binary probability calibration: one (estimator, Platt sigmoid) pair per contiguous fold, averaged at prediction time.
+ baseEstimator: Classifier;
+ method: string; // NOTE(review): stored but never consulted; the Platt fit is always used
+ cv: number;

+ calibratedEstimators_: {
+ estimator: Classifier;
+ A: number;
+ B: number;
+ }[] | null = null;
+ classes_: Float64Array | null = null; // sorted distinct labels seen by fit()

+ constructor(
+ baseEstimator: Classifier,
+ options: { method?: string; cv?: number } = {},
+ ) {
+ this.baseEstimator = baseEstimator;
+ this.method = options.method ?? "sigmoid";
+ this.cv = options.cv ?? 5;
+ }

+ fit(X: Float64Array[], y: Float64Array): this {
+ const n = X.length;
+ const uniqueClasses = Array.from(new Set(Array.from(y))).sort((a, b) => a - b);
+ this.classes_ = new Float64Array(uniqueClasses);
+ const posClass = uniqueClasses[uniqueClasses.length - 1] ?? 1; // the largest label is treated as the positive class

+ const yBin = new Float64Array(y.map((yi) => (yi === posClass ? 1 : 0)));

+ // Simple hold-out calibration
+ const foldSize = Math.floor(n / this.cv); // NOTE(review): 0 when n < cv, which leaves all folds but the last with no held-out rows
+ this.calibratedEstimators_ = [];

+ for (let fold = 0; fold < this.cv; fold++) {
+ const testStart = fold * foldSize;
+ const testEnd = fold === this.cv - 1 ? n : testStart + foldSize; // the last fold absorbs the remainder

+ const trainIdx: number[] = [];
+ const testIdx: number[] = [];
+ for (let i = 0; i < n; i++) {
+ if (i >= testStart && i < testEnd) testIdx.push(i);
+ else trainIdx.push(i);
+ }

+ const XTrain = trainIdx.map((i) => X[i] ?? new Float64Array(0));
+ const yTrain = new Float64Array(trainIdx.map((i) => y[i] ?? 0)); // the estimator is trained on the original labels
+ const XTest = testIdx.map((i) => X[i] ?? new Float64Array(0));
+ const yTest = new Float64Array(testIdx.map((i) => yBin[i] ?? 0)); // the sigmoid is fitted on binarised labels

+ const est = Object.create(Object.getPrototypeOf(this.baseEstimator) as object) as Classifier; // shallow clone: same prototype, own properties copied by reference
+ Object.assign(est, this.baseEstimator);
+ est.fit(XTrain, yTrain);

+ const testPred = est.predict(XTest);
+ const [A, B] = plattScale(testPred, yTest);

+ this.calibratedEstimators_.push({ estimator: est, A, B });
+ }

+ return this;
+ }

+ predictProba(X: Float64Array[]): Float64Array[] { // one [P(negative), P(positive)] pair per row, averaged over the folds
+ if (this.calibratedEstimators_ === null) throw new NotFittedError("CalibratedClassifierCV");

+ const n = X.length;
+ const probs = new Float64Array(n);

+ for (const { estimator, A, B } of this.calibratedEstimators_) {
+ const scores = estimator.predict(X);
+ for (let i = 0; i < n; i++) {
+ probs[i] = (probs[i] ?? 0) + sigmoid(A * (scores[i] ?? 0) + B);
+ }
+ }

+ const k = this.calibratedEstimators_.length;
+ return Array.from({ length: n }, (_, i) => {
+ const p = (probs[i] ?? 0) / k;
+ return new Float64Array([1 - p, p]);
+ });
+ }

+ predict(X: Float64Array[]): Float64Array { // thresholds the positive probability at 0.5
+ if (this.classes_ === null) throw new NotFittedError("CalibratedClassifierCV");
+ const classes = this.classes_;
+ const proba = this.predictProba(X);
+ const posClass = classes[classes.length - 1] ?? 1;
+ const negClass = classes[0] ?? 0; // smallest label; any labels between the two are never predicted
+ return new Float64Array(proba.map((p) => ((p[1] ?? 0) >= 0.5 ? posClass : negClass)));
+ }

+ score(X: Float64Array[], y: Float64Array): number { // accuracy; NOTE(review): evaluates 0 / 0 when y is empty
+ const pred = this.predict(X);
+ let correct = 0;
+ for (let i = 0; i < y.length; i++) {
+ if (pred[i] === y[i]) correct++;
+ }
+ return correct / y.length;
+ }
+}
diff --git a/src/calibration/calibration_ext.ts b/src/calibration/calibration_ext.ts
new file mode 100644
index 0000000..ed4b883
--- /dev/null
+++ b/src/calibration/calibration_ext.ts
@@ -0,0 +1,183 @@
+/**
+ * Calibration extensions: TemperatureScaling, PlattScaling, BetaCalibration, IsotonicCalibration.
+ */
+
+export class TemperatureScaling { // Binary temperature scaling: learns T so that sigmoid(logit / T) fits the labels; only element 0 of each logits row is read.
+ private temperature = 1.0;

+ fit(logits: Float64Array[], y: Int32Array, maxIter = 100): this { // gradient descent on the mean log-loss, restarted from T = 1
+ let T = 1.0;
+ const lr = 0.01;
+ for (let iter = 0; iter < maxIter; iter++) {
+ let grad = 0;
+ for (let i = 0; i < logits.length; i++) {
+ const scaled = (logits[i]![0] ?? 0) / T;
+ const p = 1 / (1 + Math.exp(-scaled));
+ const yi = y[i] ?? 0;
+ grad += (p - yi) * (-scaled / T); // chain rule: d(loss)/dT = (p - y) * d(logit / T)/dT
+ }
+ T = Math.max(0.01, T - lr * grad / Math.max(logits.length, 1)); // clamp keeps T strictly positive
+ }
+ this.temperature = T;
+ return this;
+ }

+ calibrate(logits: Float64Array[]): Float64Array { // sigmoid(logit / T) for each row
+ return new Float64Array(logits.map((l) => {
+ const scaled = (l[0] ?? 0) / this.temperature;
+ return 1 / (1 + Math.exp(-scaled));
+ }));
+ }

+ getTemperature(): number { return this.temperature; }
+}
+
+/** Platt scaling: fits P(y = 1 | s) = 1 / (1 + exp(a * s + b)) with damped Newton steps on smoothed targets. */
+export class PlattScaling {
+  private a = 0;
+  private b = 0;
+
+  /** Fits (a, b) to decision `scores` and labels `y` (1 = positive, anything else = negative) in at most `maxIter` Newton steps. */
+  fit(scores: Float64Array, y: Int32Array, maxIter = 100): this {
+    const n = scores.length;
+    let nPos = 0;
+    for (let i = 0; i < n; i++) if ((y[i] ?? 0) === 1) nPos++;
+    const nNeg = n - nPos;
+    // Smoothed targets (instead of hard 0/1) keep the optimum finite on separable data.
+    const hiTarget = (nPos + 1) / (nPos + 2);
+    const loTarget = 1 / (nNeg + 2);
+    const t = Float64Array.from(scores, (_, i) => ((y[i] ?? 0) === 1 ? hiTarget : loTarget));
+    // Negative log-likelihood of the targets under (A, B), written in an overflow-safe form.
+    const nll = (A: number, B: number): number => {
+      let f = 0;
+      for (let i = 0; i < n; i++) {
+        const fApB = (scores[i] ?? 0) * A + B, ti = t[i] ?? 0;
+        f += fApB >= 0 ? ti * fApB + Math.log(1 + Math.exp(-fApB)) : (ti - 1) * fApB + Math.log(1 + Math.exp(fApB));
+      }
+      return f;
+    };
+    let a = 0;
+    let b = Math.log((nNeg + 1) / (nPos + 1));
+    let fval = nll(a, b);
+    for (let iter = 0; iter < maxIter; iter++) {
+      // Gradient (g1, g2) and Hessian (h11, h21, h22); the small ridge keeps the Hessian positive definite.
+      let h11 = 1e-12, h22 = 1e-12, h21 = 0, g1 = 0, g2 = 0;
+      for (let i = 0; i < n; i++) {
+        const s = scores[i] ?? 0;
+        const fApB = s * a + b;
+        const e = Math.exp(-Math.abs(fApB));
+        const p = fApB >= 0 ? e / (1 + e) : 1 / (1 + e);
+        const d2 = p * (1 - p);
+        const d1 = (t[i] ?? 0) - p;
+        h11 += s * s * d2; h22 += d2; h21 += s * d2;
+        g1 += s * d1; g2 += d1;
+      }
+      if (Math.abs(g1) < 1e-5 && Math.abs(g2) < 1e-5) break; // gradient vanished: converged
+      const det = h11 * h22 - h21 * h21;
+      const dA = -(h22 * g1 - h21 * g2) / det;
+      const dB = -(-h21 * g1 + h11 * g2) / det;
+      const gd = g1 * dA + g2 * dB;
+      // Backtracking line search: halve the step until the objective shows a sufficient decrease.
+      let stepsize = 1.0;
+      let accepted = false;
+      while (stepsize >= 1e-10 && !accepted) {
+        const newA = a + stepsize * dA, newB = b + stepsize * dB;
+        const newF = nll(newA, newB);
+        if (newF < fval + 1e-4 * stepsize * gd) { a = newA; b = newB; fval = newF; accepted = true; }
+        else stepsize /= 2;
+      }
+      if (!accepted) break; // line search failed (also covers a degenerate Hessian); keep the current parameters
+    }
+    this.a = a; this.b = b;
+    return this;
+  }
+
+  /** Maps raw scores to calibrated probabilities of the positive class with the fitted sigmoid (overflow-safe form). */
+  calibrate(scores: Float64Array): Float64Array {
+    return scores.map((s) => {
+      const fApB = s * this.a + this.b;
+      return fApB >= 0 ? Math.exp(-fApB) / (1 + Math.exp(-fApB)) : 1 / (1 + Math.exp(fApB));
+    });
+  }
+}
+
+export class BetaCalibration { // Maps a score s in (0, 1) to sigmoid(a * ln(s) - b * ln(1 - s) + c); parameters are set in closed form by fit().
+ private a = 1.0;
+ private b = 1.0;
+ private c = 0.0;

+ fit(scores: Float64Array, y: Int32Array): this {
+ const eps = 1e-7; // scores are clamped to [eps, 1 - eps] so both logarithms stay finite
+ let sumA = 0, sumB = 0, sumC = 0;
+ const n = scores.length;
+ for (let i = 0; i < n; i++) {
+ const s = Math.max(eps, Math.min(1 - eps, scores[i] ?? 0));
+ const yi = y[i] ?? 0;
+ sumA += yi * Math.log(s);
+ sumB += yi * Math.log(1 - s);
+ sumC += yi;
+ }
+ this.a = Math.max(0.01, sumA / Math.max(n, 1)); // NOTE(review): ln(s) < 0, so for 0/1 labels sumA <= 0 and a always ends up 0.01
+ this.b = Math.max(0.01, -sumB / Math.max(n, 1));
+ this.c = sumC / Math.max(n, 1); // mean label; NOTE(review): used directly as the intercept, no likelihood fit is performed
+ return this;
+ }

+ calibrate(scores: Float64Array): Float64Array {
+ const eps = 1e-7;
+ return new Float64Array(scores.map((s) => {
+ const sc = Math.max(eps, Math.min(1 - eps, s));
+ const logOdds = this.a * Math.log(sc) - this.b * Math.log(1 - sc) + this.c;
+ return 1 / (1 + Math.exp(-logOdds));
+ }));
+ }
+}
+
+export class IsotonicCalibration { // Isotonic (non-decreasing) calibration: pooled block values at knots xs/ys, linearly interpolated by calibrate().
+ private xs: Float64Array = new Float64Array(0);
+ private ys: Float64Array = new Float64Array(0);

+ fit(scores: Float64Array, y: Int32Array): this {
+ const n = scores.length;
+ const idx = Array.from({ length: n }, (_, i) => i).sort((a, b) => (scores[a] ?? 0) - (scores[b] ?? 0));
+ const sortedX = new Float64Array(idx.map((i) => scores[i] ?? 0));
+ const sortedY = new Float64Array(idx.map((i) => y[i] ?? 0));
+ // Pool adjacent violators
+ const pooled = Array.from({ length: n }, (_, i) => ({ x: sortedX[i] ?? 0, y: sortedY[i] ?? 0, cnt: 1 }));
+ let changed = true;
+ while (changed) { // repeat full passes until no adjacent pair is out of order
+ changed = false;
+ for (let i = 0; i < pooled.length - 1; i++) {
+ const a = pooled[i];
+ const b = pooled[i + 1];
+ if (a !== undefined && b !== undefined && a.y > b.y) {
+ const newY = (a.y * a.cnt + b.y * b.cnt) / (a.cnt + b.cnt); // count-weighted mean of the two blocks
+ a.y = newY;
+ a.cnt += b.cnt;
+ pooled.splice(i + 1, 1); // the merged block keeps the x of its first (smallest-score) member
+ changed = true;
+ }
+ }
+ }
+ this.xs = new Float64Array(pooled.map((p) => p.x));
+ this.ys = new Float64Array(pooled.map((p) => p.y));
+ return this;
+ }

+ calibrate(scores: Float64Array): Float64Array {
+ return new Float64Array(scores.map((s) => {
+ if (this.xs.length === 0) return s; // not fitted (or fitted on empty input): scores pass through unchanged
+ if (s <= (this.xs[0] ?? 0)) return this.ys[0] ?? 0; // clamp below the first knot
+ if (s >= (this.xs[this.xs.length - 1] ?? 0)) return this.ys[this.ys.length - 1] ?? 0; // clamp above the last knot
+ for (let i = 0; i < this.xs.length - 1; i++) {
+ if (s >= (this.xs[i] ?? 0) && s <= (this.xs[i + 1] ?? 0)) {
+ const dx = (this.xs[i + 1] ?? 0) - (this.xs[i] ?? 0);
+ if (Math.abs(dx) < 1e-10) return this.ys[i] ?? 0; // coincident knots: avoid dividing by ~0
+ const t = (s - (this.xs[i] ?? 0)) / dx;
+ return (this.ys[i] ?? 0) + t * ((this.ys[i + 1] ?? 0) - (this.ys[i] ?? 0));
+ }
+ }
+ return s;
+ }));
+ }
+}
diff --git a/src/calibration/calibration_ext3.ts b/src/calibration/calibration_ext3.ts
new file mode 100644
index 0000000..ea46498
--- /dev/null
+++ b/src/calibration/calibration_ext3.ts
@@ -0,0 +1,189 @@
+/**
+ * Calibration extensions: TemperatureScaling, BetaCalibration, IsotonicCalibratorExt, CalibratedClassifierCVExt
+ * Port of sklearn.calibration extensions
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** Temperature scaling for multi-class logits: learns one scalar T > 0 so that softmax(logits / T) is better calibrated. */
+export class TemperatureScaling {
+  maxIter: number;
+  lr: number;
+
+  private temperature_ = 1.0;
+
+  constructor(opts: { maxIter?: number; lr?: number } = {}) {
+    this.maxIter = opts.maxIter ?? 100;
+    this.lr = opts.lr ?? 0.01;
+  }
+
+  /** Numerically stable softmax of `logits / temperature` (the maximum is subtracted before exponentiating). */
+  private softmax(logits: Float64Array, temperature: number): Float64Array {
+    const scaled = logits.map((v) => v / temperature);
+    const max = scaled.reduce((a, b) => Math.max(a, b), Number.NEGATIVE_INFINITY);
+    const exps = scaled.map((v) => Math.exp(v - max));
+    const sum = exps.reduce((a, b) => a + b, 0);
+    return exps.map((v) => v / (sum + 1e-15));
+  }
+
+  /** Minimises the mean negative log-likelihood of the class indices `yTrue` over T by gradient descent, starting from the current T. */
+  fit(logits: Float64Array[], yTrue: Int32Array): this {
+    const n = logits.length;
+    if (n === 0) return this; // nothing to learn from; keep the current temperature instead of producing NaN
+    let t = this.temperature_;
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      let gradient = 0;
+      for (let i = 0; i < n; i++) {
+        const row = logits[i]!;
+        const probs = this.softmax(row, t);
+        const expected = probs.reduce((s, pj, j) => s + pj * (row[j] ?? 0), 0);
+        // d(NLL_i)/dT = (z_true - E_p[z]) / T^2, where z are the raw logits and p = softmax(z / T)
+        gradient += ((row[yTrue[i] ?? 0] ?? 0) - expected) / (t * t);
+      }
+      // Descent step on the mean gradient; the clamp keeps T strictly positive.
+      t = Math.max(0.01, t - (this.lr * gradient) / n);
+    }
+    this.temperature_ = t;
+    return this;
+  }
+
+  /** Returns softmax(logits / T) for every row, using the fitted temperature (T = 1 before fit). */
+  predict(logits: Float64Array[]): Float64Array[] {
+    return logits.map((l) => this.softmax(l, this.temperature_));
+  }
+
+  get temperature(): number { return this.temperature_; }
+}
+
+export class BetaCalibration { // Beta calibration: p = sigmoid(a * ln(x) - b * ln(1 - x) + c), fitted by gradient ascent on the log-likelihood.
+ private a_ = 1.0;
+ private b_ = 1.0;
+ private c_ = 0.0;

+ fit(scores: Float64Array, yTrue: Int32Array): this {
+ const n = scores.length; // NOTE(review): the updates below divide by n, so empty input yields NaN parameters
+ let a = 1.0;
+ let b = 1.0;
+ let c = 0.0;
+ for (let iter = 0; iter < 100; iter++) { // fixed 100 steps with step size 0.001
+ let dA = 0;
+ let dB = 0;
+ let dC = 0;
+ for (let i = 0; i < n; i++) {
+ const x = Math.max(1e-15, Math.min(1 - 1e-15, scores[i] ?? 0.5)); // clamp so both logarithms stay finite
+ const logx = Math.log(x);
+ const log1mx = Math.log(1 - x);
+ const logit = a * logx - b * log1mx + c;
+ const p = 1 / (1 + Math.exp(-logit));
+ const err = (yTrue[i] ?? 0) - p;
+ dA += err * logx;
+ dB += err * (-log1mx);
+ dC += err;
+ }
+ a += 0.001 * dA / n;
+ b += 0.001 * dB / n;
+ c += 0.001 * dC / n;
+ a = Math.max(0.01, a); // a and b are kept at or above 0.01
+ b = Math.max(0.01, b);
+ void iter;
+ }
+ this.a_ = a;
+ this.b_ = b;
+ this.c_ = c;
+ return this;
+ }

+ predict(scores: Float64Array): Float64Array { // calibrated probability for each score
+ return Float64Array.from(scores.map(x => {
+ const xClamped = Math.max(1e-15, Math.min(1 - 1e-15, x ?? 0.5));
+ const logit = this.a_ * Math.log(xClamped) - this.b_ * Math.log(1 - xClamped) + this.c_;
+ return 1 / (1 + Math.exp(-logit));
+ }));
+ }
+}
+
+/** Isotonic calibration: fits a non-decreasing step function of the score and linearly interpolates between training scores. */
+export class IsotonicCalibratorExt {
+  private isotonic_: Float64Array | null = null;
+  private thresholds_: Float64Array | null = null;
+
+  /** Fits the least-squares non-decreasing sequence to `yTrue` ordered by `scores` (exact pool-adjacent-violators, linear after the sort). */
+  fit(scores: Float64Array, yTrue: Int32Array): this {
+    const n = scores.length;
+    const order = Array.from({ length: n }, (_, i) => i).sort((p, q) => (scores[p] ?? 0) - (scores[q] ?? 0));
+    // Stack of pooled blocks as (label sum, size); a new point is merged leftwards while the block means are out of order.
+    const sums: number[] = [], counts: number[] = [];
+    for (const idx of order) {
+      let sum = yTrue[idx] ?? 0, count = 1;
+      while (sums.length > 0 && sums[sums.length - 1]! / counts[counts.length - 1]! > sum / count) {
+        sum += sums.pop()!; count += counts.pop()!;
+      }
+      sums.push(sum); counts.push(count);
+    }
+    const fitted = new Float64Array(n);
+    let start = 0;
+    for (let b = 0; b < sums.length; b++) { fitted.fill(sums[b]! / counts[b]!, start, start + counts[b]!); start += counts[b]!; }
+    this.thresholds_ = Float64Array.from(order, (idx) => scores[idx] ?? 0);
+    this.isotonic_ = fitted;
+    return this;
+  }
+
+  /** Calibrates each score by linear interpolation between fitted points, clamped to the end values; throws NotFittedError before fit(). */
+  predict(scores: Float64Array): Float64Array {
+    const xs = this.thresholds_, ys = this.isotonic_;
+    if (!xs || !ys) throw new NotFittedError("IsotonicCalibratorExt not fitted.");
+    const n = xs.length;
+    return scores.map((s) => {
+      if (s <= (xs[0] ?? 0)) return ys[0] ?? 0;
+      if (s >= (xs[n - 1] ?? 0)) return ys[n - 1] ?? 0;
+      for (let i = 0; i < n - 1; i++) {
+        if (s >= (xs[i] ?? 0) && s <= (xs[i + 1] ?? 0)) {
+          const t = (s - (xs[i] ?? 0)) / ((xs[i + 1] ?? 0) - (xs[i] ?? 0) + 1e-15); // epsilon guards tied scores
+          return (1 - t) * (ys[i] ?? 0) + t * (ys[i + 1] ?? 0);
+        }
+      }
+      return ys[n - 1] ?? 0;
+    });
+  }
+}
+
+export class CalibratedClassifierCVExt { // Score calibrator p = sigmoid(a * score + b); NOTE(review): only method "sigmoid" is fitted, other methods leave a = 1, b = 0.
+ method: "sigmoid" | "isotonic" | "temperature";
+ cv: number; // NOTE(review): stored but never consulted by fit() or predict()

+ private a_ = 1.0;
+ private b_ = 0.0;

+ constructor(opts: { method?: "sigmoid" | "isotonic" | "temperature"; cv?: number } = {}) {
+ this.method = opts.method ?? "sigmoid";
+ this.cv = opts.cv ?? 5;
+ }

+ fit(scores: Float64Array, yTrue: Int32Array): this {
+ const n = scores.length; // NOTE(review): the updates below divide by n, so empty input yields NaN parameters
+ if (this.method === "sigmoid") {
+ let a = 1.0;
+ let b = 0.0;
+ for (let iter = 0; iter < 200; iter++) { // 200 gradient-ascent steps on the mean log-likelihood, step size 0.01
+ let da = 0;
+ let db = 0;
+ for (let i = 0; i < n; i++) {
+ const p = 1 / (1 + Math.exp(-(a * (scores[i] ?? 0) + b)));
+ const err = (yTrue[i] ?? 0) - p;
+ da += err * (scores[i] ?? 0);
+ db += err;
+ }
+ a += 0.01 * da / n;
+ b += 0.01 * db / n;
+ void iter;
+ }
+ this.a_ = a;
+ this.b_ = b;
+ }
+ return this;
+ }

+ predict(scores: Float64Array): Float64Array { // sigmoid(a * score + b) for each score
+ return Float64Array.from(scores.map(s => 1 / (1 + Math.exp(-(this.a_ * (s ?? 0) + this.b_)))));
+ }
+}
diff --git a/src/calibration/calibration_ext4.ts b/src/calibration/calibration_ext4.ts
new file mode 100644
index 0000000..4f09c8a
--- /dev/null
+++ b/src/calibration/calibration_ext4.ts
@@ -0,0 +1,157 @@
+/**
+ * Calibration extensions: histogram binning, isotonic calibration.
+ * Port of sklearn.calibration extensions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Reliability-diagram statistics: bin the predicted probabilities and report,
+ * per bin, the observed positive fraction, the mean prediction and the count.
+ *
+ * Empty bins are kept in the output with zeros. A probability that is not
+ * below any upper edge (for example exactly 1) lands in the last bin.
+ *
+ * @param yTrue Labels; their sum within a bin divided by the bin count gives the positive fraction.
+ * @param yProb Predicted probabilities aligned with `yTrue`.
+ * @param nBins Number of bins (default 5).
+ * @param strategy "uniform" uses equal-width edges on [0, 1]; "quantile" takes the inner edges from the sorted probabilities.
+ * @returns Per-bin positive fraction, mean predicted value and sample count.
+ */
+export function calibrationCurveExt(
+  yTrue: Int32Array,
+  yProb: Float64Array,
+  nBins = 5,
+  strategy: "uniform" | "quantile" = "uniform",
+): { fractionPositive: Float64Array; meanPredictedValue: Float64Array; binCounts: Int32Array } {
+  const sampleCount = yTrue.length;
+
+  const edges: number[] = [];
+  if (strategy === "uniform") {
+    for (let k = 0; k <= nBins; k++) edges.push(k / nBins);
+  } else {
+    const ordered = Float64Array.from(yProb).sort();
+    edges.push(0);
+    for (let k = 1; k < nBins; k++) {
+      edges.push(ordered[Math.floor((k * sampleCount) / nBins)] ?? 0);
+    }
+    edges.push(1);
+  }
+
+  const fractionPositive = new Float64Array(nBins);
+  const meanPredictedValue = new Float64Array(nBins);
+  const binCounts = new Int32Array(nBins);
+
+  // First pass: accumulate raw sums per bin.
+  for (let i = 0; i < sampleCount; i++) {
+    const prob = yProb[i] ?? 0;
+    // Walk right until the probability falls below the bin's upper edge.
+    let bin = 0;
+    while (bin < nBins - 1 && !(prob < (edges[bin + 1] ?? 1))) bin++;
+    binCounts[bin] = (binCounts[bin] ?? 0) + 1;
+    fractionPositive[bin] = (fractionPositive[bin] ?? 0) + (yTrue[i] ?? 0);
+    meanPredictedValue[bin] = (meanPredictedValue[bin] ?? 0) + prob;
+  }
+
+  // Second pass: turn sums into means for the populated bins.
+  binCounts.forEach((count, bin) => {
+    if (count <= 0) return;
+    fractionPositive[bin] = (fractionPositive[bin] ?? 0) / count;
+    meanPredictedValue[bin] = (meanPredictedValue[bin] ?? 0) / count;
+  });
+
+  return { fractionPositive, meanPredictedValue, binCounts };
+}
+
+/**
+ * Temperature scaling for binary logits: `p = sigmoid(logit / T)`.
+ *
+ * `T` is chosen by grid search over 0.1, 0.2, …, 10.0 to minimise the mean
+ * binary negative log-likelihood of the data passed to `fit`.
+ */
+export class TemperatureScaling {
+  private temperature_ = 1.0;
+  private fitted_ = false;
+
+  /**
+   * Select the grid temperature with the lowest mean negative log-likelihood.
+   *
+   * With empty `logits` every candidate scores NaN, so the temperature stays
+   * at 1.0.
+   *
+   * @param logits Uncalibrated logits.
+   * @param y Labels aligned with `logits`; only the value 1 counts as positive.
+   * @returns This instance, for chaining.
+   */
+  fit(logits: Float64Array, y: Int32Array): this {
+    const gridSteps = 100;
+    let bestNll = Number.POSITIVE_INFINITY;
+    let bestTemp = 1.0;
+    // Derive each candidate from an integer step. Accumulating `t += 0.1`
+    // drifts, which made the reported temperature e.g. 1.8000000000000005.
+    for (let step = 1; step <= gridSteps; step++) {
+      const t = step / 10;
+      let nll = 0;
+      for (let i = 0; i < logits.length; i++) {
+        const scaled = (logits[i] ?? 0) / t;
+        const p = 1 / (1 + Math.exp(-scaled));
+        const label = (y[i] ?? 0) === 1 ? 1 : 0;
+        // Clamp the probabilities so a saturated sigmoid cannot produce log(0).
+        nll -= label * Math.log(Math.max(p, 1e-15)) + (1 - label) * Math.log(Math.max(1 - p, 1e-15));
+      }
+      nll /= logits.length;
+      if (nll < bestNll) {
+        bestNll = nll;
+        bestTemp = t;
+      }
+    }
+    this.temperature_ = bestTemp;
+    this.fitted_ = true;
+    return this;
+  }
+
+  /**
+   * Convert logits to probabilities using the fitted temperature.
+   *
+   * @param logits Uncalibrated logits.
+   * @returns A new array of probabilities, one per logit.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  transform(logits: Float64Array): Float64Array {
+    if (!this.fitted_) throw new NotFittedError("TemperatureScaling is not fitted.");
+    const temperature = this.temperature_;
+    return logits.map((l) => 1 / (1 + Math.exp(-(l / temperature))));
+  }
+
+  /** The selected temperature (1.0 until `fit` is called). */
+  get temperature(): number {
+    return this.temperature_;
+  }
+}
+
+/**
+ * Platt scaling: fits `P(y = 1 | score) = sigmoid(A * score + B)` to decision
+ * scores using Platt's smoothed targets.
+ */
+export class PlattScaling {
+  private A_ = 0;
+  private B_ = 0;
+  private fitted_ = false;
+
+  /**
+   * Fit `A` and `B` by damped Newton iterations with a backtracking line
+   * search on the cross-entropy against the smoothed targets.
+   *
+   * @param decisionScores Raw decision scores.
+   * @param y Labels aligned with the scores; only the value 1 counts as positive.
+   * @returns This instance, for chaining.
+   */
+  fit(decisionScores: Float64Array, y: Int32Array): this {
+    const n = decisionScores.length;
+    let nPos = 0;
+    for (let i = 0; i < n; i++) if ((y[i] ?? 0) === 1) nPos++;
+    const nNeg = n - nPos;
+
+    // Platt's prior correction: targets are pulled away from 0 and 1, which
+    // keeps the optimum finite even for perfectly separable scores.
+    const tPos = (nPos + 1) / (nPos + 2);
+    const tNeg = 1 / (nNeg + 2);
+    const targets = new Float64Array(n);
+    for (let i = 0; i < n; i++) targets[i] = (y[i] ?? 0) === 1 ? tPos : tNeg;
+
+    let A = 0;
+    // Under p = sigmoid(A * s + B) this start makes sigmoid(B) the smoothed
+    // positive-class prior. (The reversed ratio encodes the negative prior.)
+    let B = Math.log((nPos + 1) / (nNeg + 1));
+
+    // log(1 + exp(x)) without overflow for large |x|.
+    const softplus = (x: number): number =>
+      x > 0 ? x + Math.log1p(Math.exp(-x)) : Math.log1p(Math.exp(x));
+    const loss = (a: number, b: number): number => {
+      let total = 0;
+      for (let i = 0; i < n; i++) {
+        const z = a * (decisionScores[i] ?? 0) + b;
+        const t = targets[i] ?? 0;
+        total += t * softplus(-z) + (1 - t) * softplus(z);
+      }
+      return total;
+    };
+
+    const maxIter = 100;
+    const ridge = 1e-12; // keeps the 2x2 Hessian invertible (e.g. constant scores)
+    const gradTol = 1e-5;
+    const minStep = 1e-10;
+    let current = loss(A, B);
+
+    for (let iter = 0; iter < maxIter; iter++) {
+      let h11 = ridge;
+      let h22 = ridge;
+      let h21 = 0;
+      let gA = 0;
+      let gB = 0;
+      for (let i = 0; i < n; i++) {
+        const s = decisionScores[i] ?? 0;
+        const p = 1 / (1 + Math.exp(-(A * s + B)));
+        const w = p * (1 - p);
+        const err = p - (targets[i] ?? 0);
+        h11 += s * s * w;
+        h22 += w;
+        h21 += s * w;
+        gA += err * s;
+        gB += err;
+      }
+      if (Math.abs(gA) < gradTol && Math.abs(gB) < gradTol) break;
+
+      const det = h11 * h22 - h21 * h21;
+      if (!(det > 0)) break;
+      // Newton direction: -H^{-1} g.
+      const dA = -(h22 * gA - h21 * gB) / det;
+      const dB = -(h11 * gB - h21 * gA) / det;
+      const slope = gA * dA + gB * dB;
+
+      // Halve the step until the loss decreases sufficiently.
+      let step = 1;
+      let accepted = false;
+      while (step >= minStep) {
+        const nextA = A + step * dA;
+        const nextB = B + step * dB;
+        const next = loss(nextA, nextB);
+        if (next < current + 1e-4 * step * slope) {
+          A = nextA;
+          B = nextB;
+          current = next;
+          accepted = true;
+          break;
+        }
+        step /= 2;
+      }
+      if (!accepted) break;
+    }
+
+    this.A_ = A;
+    this.B_ = B;
+    this.fitted_ = true;
+    return this;
+  }
+
+  /**
+   * Convert decision scores to probabilities with the fitted sigmoid.
+   *
+   * @param decisionScores Raw decision scores.
+   * @returns A new array of probabilities, one per score.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  transform(decisionScores: Float64Array): Float64Array {
+    if (!this.fitted_) throw new NotFittedError("PlattScaling is not fitted.");
+    const A = this.A_;
+    const B = this.B_;
+    return decisionScores.map((s) => 1 / (1 + Math.exp(-(A * s + B))));
+  }
+}
+
+/**
+ * Expected calibration error (ECE): the count-weighted average, over the bins
+ * of `calibrationCurveExt`, of |positive fraction − mean predicted value|.
+ *
+ * @param yTrue Labels.
+ * @param yProb Predicted probabilities aligned with `yTrue`.
+ * @param nBins Number of bins (default 10).
+ * @returns The ECE; 0 when every bin is empty.
+ */
+export function expectedCalibrationError(
+  yTrue: Int32Array,
+  yProb: Float64Array,
+  nBins = 10,
+): number {
+  const curve = calibrationCurveExt(yTrue, yProb, nBins);
+  const total = yTrue.length;
+  let weightedGap = 0;
+  for (let bin = 0; bin < nBins; bin++) {
+    const count = curve.binCounts[bin] ?? 0;
+    if (count !== 0) {
+      const observed = curve.fractionPositive[bin] ?? 0;
+      const predicted = curve.meanPredictedValue[bin] ?? 0;
+      weightedGap += (count / total) * Math.abs(observed - predicted);
+    }
+  }
+  return weightedGap;
+}
diff --git a/src/calibration/index.ts b/src/calibration/index.ts
new file mode 100644
index 0000000..e03c3f7
--- /dev/null
+++ b/src/calibration/index.ts
@@ -0,0 +1 @@
+export * from "./calibration.js";
diff --git a/src/cluster/affinity_propagation.ts b/src/cluster/affinity_propagation.ts
new file mode 100644
index 0000000..1228a23
--- /dev/null
+++ b/src/cluster/affinity_propagation.ts
@@ -0,0 +1,199 @@
+/**
+ * AffinityPropagation clustering.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** Constructor options for `AffinityPropagation`; every field is optional. */
+export interface AffinityPropagationOptions {
+ /** Weight kept from the previous message value on each update (default 0.5). */
+ dampingFactor?: number;
+ /** Upper bound on message-passing iterations (default 200). */
+ maxIter?: number;
+ /** Consecutive iterations with an unchanged exemplar set required to stop early (default 15). */
+ convergenceIter?: number;
+ /** Self-similarity placed on the diagonal; when omitted, the median pairwise similarity is used. */
+ preference?: number;
+}
+
+/**
+ * Affinity propagation clustering on negative squared Euclidean similarity.
+ *
+ * Responsibility and availability messages are exchanged with damping until
+ * the set of chosen exemplars is unchanged for `convergenceIter` consecutive
+ * iterations or `maxIter` is reached. Each sample is then labelled with the
+ * candidate maximising availability + responsibility.
+ */
+export class AffinityPropagation {
+  private dampingFactor: number;
+  private maxIter: number;
+  private convergenceIter: number;
+  private preference: number | undefined;
+
+  /** Cluster label per sample, renumbered to 0..k-1; null before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Sorted sample indices chosen as exemplars; null before `fit`. */
+  clusterCentersIndices_: Int32Array | null = null;
+  /** Message-passing iterations executed by the most recent `fit`. */
+  nIter_ = 0;
+
+  constructor(options: AffinityPropagationOptions = {}) {
+    this.dampingFactor = options.dampingFactor ?? 0.5;
+    this.maxIter = options.maxIter ?? 200;
+    this.convergenceIter = options.convergenceIter ?? 15;
+    this.preference = options.preference;
+  }
+
+  /**
+   * Cluster the rows of `X`.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns This instance with `labels_`, `clusterCentersIndices_` and `nIter_` set.
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    // Reset so a refit never reports the previous run's count.
+    this.nIter_ = 0;
+    if (n === 0) {
+      this.labels_ = new Int32Array(0);
+      this.clusterCentersIndices_ = new Int32Array(0);
+      return this;
+    }
+
+    const S = AffinityPropagation.buildSimilarity(X, this.preference);
+    const R = AffinityPropagation.zeros(n);
+    const A = AffinityPropagation.zeros(n);
+    const damping = this.dampingFactor;
+    let stableCount = 0;
+    let prevExemplars = new Set<number>();
+
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      AffinityPropagation.updateResponsibilities(S, A, R, damping);
+      AffinityPropagation.updateAvailabilities(R, A, damping);
+      this.nIter_ = iter + 1;
+
+      // Converged once the exemplar set has stopped changing for long enough.
+      const exemplars = new Set<number>(AffinityPropagation.argmaxEvidence(A, R));
+      const unchanged =
+        exemplars.size === prevExemplars.size &&
+        [...exemplars].every((e) => prevExemplars.has(e));
+      if (unchanged) {
+        stableCount++;
+        if (stableCount >= this.convergenceIter) break;
+      } else {
+        stableCount = 0;
+      }
+      prevExemplars = exemplars;
+    }
+
+    // Label each sample with its preferred exemplar, then renumber to 0..k-1
+    // in ascending order of exemplar index.
+    const labels = AffinityPropagation.argmaxEvidence(A, R);
+    const centers = Int32Array.from([...new Set<number>(labels)].sort((a, b) => a - b));
+    const rank = new Map<number, number>();
+    centers.forEach((c, idx) => rank.set(c, idx));
+    for (let i = 0; i < n; i++) labels[i] = rank.get(labels[i] ?? 0) ?? 0;
+
+    this.labels_ = labels;
+    this.clusterCentersIndices_ = centers;
+    return this;
+  }
+
+  /**
+   * Placeholder: the training data is not retained, so new samples cannot be
+   * assigned to an exemplar.
+   *
+   * @param X Samples to label.
+   * @returns An array of `-1`, one entry per row of `X`.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    if (!this.labels_ || !this.clusterCentersIndices_)
+      throw new NotFittedError("AffinityPropagation");
+    // Not supported post-fit without stored data.
+    return new Int32Array(X.length).fill(-1);
+  }
+
+  /** Allocate an n x n matrix of zeros. */
+  private static zeros(n: number): Float64Array[] {
+    return Array.from({ length: n }, () => new Float64Array(n));
+  }
+
+  /** Build S(i,j) = -||xi - xj||^2 with the preference on the diagonal. */
+  private static buildSimilarity(
+    X: Float64Array[],
+    preference: number | undefined,
+  ): Float64Array[] {
+    const n = X.length;
+    const S = AffinityPropagation.zeros(n);
+    for (let i = 0; i < n; i++) {
+      const xi = X[i] ?? new Float64Array(0);
+      for (let j = i; j < n; j++) {
+        const xj = X[j] ?? new Float64Array(0);
+        let d = 0;
+        for (let k = 0; k < xi.length; k++) d += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2;
+        S[i]![j] = -d;
+        S[j]![i] = -d;
+      }
+    }
+
+    let pref = preference;
+    if (pref === undefined) {
+      // Median (upper-middle element) of the off-diagonal similarities.
+      const vals: number[] = [];
+      for (let i = 0; i < n; i++) {
+        for (let j = i + 1; j < n; j++) vals.push(S[i]![j] ?? 0);
+      }
+      vals.sort((a, b) => a - b);
+      pref = vals[Math.floor(vals.length / 2)] ?? -1;
+    }
+    for (let i = 0; i < n; i++) S[i]![i] = pref;
+    return S;
+  }
+
+  /** R(i,k) <- damped [ S(i,k) - max_{k' != k} (A(i,k') + S(i,k')) ]. */
+  private static updateResponsibilities(
+    S: Float64Array[],
+    A: Float64Array[],
+    R: Float64Array[],
+    damping: number,
+  ): void {
+    const n = S.length;
+    for (let i = 0; i < n; i++) {
+      const Si = S[i]!;
+      const Ai = A[i]!;
+      const Ri = R[i]!;
+      // Track the two largest A + S values so the max over k' != k is O(1).
+      let top = Number.NEGATIVE_INFINITY;
+      let second = Number.NEGATIVE_INFINITY;
+      let topAt = -1;
+      for (let k = 0; k < n; k++) {
+        const v = (Ai[k] ?? 0) + (Si[k] ?? 0);
+        if (v > top) {
+          second = top;
+          top = v;
+          topAt = k;
+        } else if (v > second) {
+          second = v;
+        }
+      }
+      for (let k = 0; k < n; k++) {
+        const rival = k === topAt ? second : top;
+        const fresh = (Si[k] ?? 0) - rival;
+        Ri[k] = damping * (Ri[k] ?? 0) + (1 - damping) * fresh;
+      }
+    }
+  }
+
+  /**
+   * A(k,k) <- damped sum of positive R(i',k), i' != k;
+   * A(i,k) <- damped min(0, R(k,k) + sum of positive R(i',k), i' not in {i,k}).
+   */
+  private static updateAvailabilities(
+    R: Float64Array[],
+    A: Float64Array[],
+    damping: number,
+  ): void {
+    const n = R.length;
+    for (let k = 0; k < n; k++) {
+      let positiveSum = 0;
+      for (let i = 0; i < n; i++) {
+        if (i === k) continue;
+        const r = R[i]![k] ?? 0;
+        if (r > 0) positiveSum += r;
+      }
+      const selfResp = R[k]![k] ?? 0;
+      for (let i = 0; i < n; i++) {
+        const Ai = A[i]!;
+        let fresh: number;
+        if (i === k) {
+          fresh = positiveSum;
+        } else {
+          const r = R[i]![k] ?? 0;
+          const sumWithout = positiveSum - (r > 0 ? r : 0);
+          fresh = Math.min(0, selfResp + sumWithout);
+        }
+        Ai[k] = damping * (Ai[k] ?? 0) + (1 - damping) * fresh;
+      }
+    }
+  }
+
+  /** For every sample, the candidate k maximising A(i,k) + R(i,k) (first wins ties). */
+  private static argmaxEvidence(A: Float64Array[], R: Float64Array[]): Int32Array {
+    const n = A.length;
+    const picks = new Int32Array(n);
+    for (let i = 0; i < n; i++) {
+      const Ai = A[i]!;
+      const Ri = R[i]!;
+      let best = Number.NEGATIVE_INFINITY;
+      let bestK = 0;
+      for (let k = 0; k < n; k++) {
+        const v = (Ai[k] ?? 0) + (Ri[k] ?? 0);
+        if (v > best) {
+          best = v;
+          bestK = k;
+        }
+      }
+      picks[i] = bestK;
+    }
+    return picks;
+  }
+}
diff --git a/src/cluster/agglomerative.ts b/src/cluster/agglomerative.ts
new file mode 100644
index 0000000..68eddcf
--- /dev/null
+++ b/src/cluster/agglomerative.ts
@@ -0,0 +1,198 @@
+/**
+ * AgglomerativeClustering and MiniBatchKMeans.
+ * Mirrors sklearn.cluster.AgglomerativeClustering and MiniBatchKMeans.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Euclidean distance between `a` and `b`, taken over the indices of `a`;
+ * entries missing from `b` count as 0.
+ */
+function euclidean(a: Float64Array, b: Float64Array): number {
+  const squared = a.reduce((acc, ai, i) => {
+    const delta = ai - (b[i] ?? 0);
+    return acc + delta ** 2;
+  }, 0);
+  return Math.sqrt(squared);
+}
+
+/**
+ * Inter-cluster distance rule. In `AgglomerativeClustering`, "single" uses the
+ * minimum pairwise distance, "complete" the maximum, and both "average" and
+ * "ward" the mean pairwise distance.
+ */
+export type Linkage = "ward" | "complete" | "average" | "single";
+
+/** Constructor options for `AgglomerativeClustering`. */
+export interface AgglomerativeClusteringOptions {
+ /** Number of clusters at which merging stops (default 2). */
+ nClusters?: number;
+ /** Linkage rule used to compare clusters (default "ward"). */
+ linkage?: Linkage;
+}
+
+/**
+ * Bottom-up hierarchical clustering: every sample starts as its own cluster
+ * and the closest pair of clusters is merged until `nClusters` remain.
+ *
+ * Distances between clusters are recomputed from the raw points on every
+ * pass, so the cost grows quickly with the number of samples.
+ */
+export class AgglomerativeClustering {
+  /** Number of clusters at which merging stops. */
+  nClusters: number;
+  /** Rule used to measure the distance between two clusters. */
+  linkage: Linkage;
+
+  /** Cluster index per sample; null before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Number of clusters actually produced by the last `fit`. */
+  nClusters_: number = 0;
+
+  constructor(options: AgglomerativeClusteringOptions = {}) {
+    this.nClusters = options.nClusters ?? 2;
+    this.linkage = options.linkage ?? "ward";
+  }
+
+  /**
+   * Cluster the rows of `X`.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns This instance with `labels_` and `nClusters_` set.
+   * @throws RangeError when `nClusters` is not at least 1.
+   */
+  fit(X: Float64Array[]): this {
+    // A target below 1 can never be reached: once a single cluster is left
+    // there is nothing to merge and the loop below would spin forever.
+    if (!(this.nClusters >= 1)) {
+      throw new RangeError(`nClusters must be at least 1, got ${this.nClusters}`);
+    }
+
+    const n = X.length;
+    // Each cluster is the list of sample indices it contains.
+    const clusters: number[][] = X.map((_, i) => [i]);
+
+    while (clusters.length > this.nClusters) {
+      // Locate the closest pair (first pair wins ties).
+      let minD = Number.POSITIVE_INFINITY;
+      let mergeI = 0;
+      let mergeJ = 1;
+      for (let i = 0; i < clusters.length; i++) {
+        for (let j = i + 1; j < clusters.length; j++) {
+          const d = this.clusterDistance(X, clusters[i]!, clusters[j]!);
+          if (d < minD) {
+            minD = d;
+            mergeI = i;
+            mergeJ = j;
+          }
+        }
+      }
+      // Fold the later cluster into the earlier one, preserving cluster order.
+      clusters[mergeI] = clusters[mergeI]!.concat(clusters[mergeJ]!);
+      clusters.splice(mergeJ, 1);
+    }
+
+    const labels = new Int32Array(n);
+    clusters.forEach((members, k) => {
+      for (const idx of members) labels[idx] = k;
+    });
+    this.labels_ = labels;
+    this.nClusters_ = clusters.length;
+    return this;
+  }
+
+  /**
+   * Fit and return the labels.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns The cluster index of every sample.
+   */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_!;
+  }
+
+  /** Distance between two clusters (given as index lists) under `this.linkage`. */
+  private clusterDistance(X: Float64Array[], a: number[], b: number[]): number {
+    if (this.linkage === "single") {
+      let min = Number.POSITIVE_INFINITY;
+      for (const i of a) for (const j of b) min = Math.min(min, euclidean(X[i]!, X[j]!));
+      return min;
+    }
+    if (this.linkage === "complete") {
+      let max = Number.NEGATIVE_INFINITY;
+      for (const i of a) for (const j of b) max = Math.max(max, euclidean(X[i]!, X[j]!));
+      return max;
+    }
+    // "average" and "ward" both use the mean pairwise distance (simplified).
+    let sum = 0;
+    for (const i of a) for (const j of b) sum += euclidean(X[i]!, X[j]!);
+    return sum / (a.length * b.length);
+  }
+}
+
+/** Constructor options for `MiniBatchKMeans`. */
+export interface MiniBatchKMeansOptions {
+ /** Number of cluster centres (default 8). */
+ nClusters?: number;
+ /** Samples drawn, with replacement, for each mini-batch (default 100). */
+ batchSize?: number;
+ /** Number of mini-batches processed by `fit` (default 100). */
+ maxIter?: number;
+ /** Convergence tolerance (default 1e-4); stored by the class but not read by `fit`. */
+ tol?: number;
+}
+
+/**
+ * Mini-batch k-means: centres are nudged towards randomly drawn samples with
+ * a per-centre learning rate of 1 / (samples assigned so far).
+ *
+ * Sampling uses `Math.random`, so results are not reproducible across runs.
+ */
+export class MiniBatchKMeans {
+  nClusters: number;
+  batchSize: number;
+  maxIter: number;
+  /** Stored for API compatibility; `fit` always runs `maxIter` batches. */
+  tol: number;
+
+  /** Fitted centres; null before `fit`. */
+  clusterCenters_: Float64Array[] | null = null;
+  /** Nearest-centre index per training sample; null before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Sum of squared distances from training samples to their centre. */
+  inertia_: number = 0;
+
+  constructor(options: MiniBatchKMeansOptions = {}) {
+    this.nClusters = options.nClusters ?? 8;
+    this.batchSize = options.batchSize ?? 100;
+    this.maxIter = options.maxIter ?? 100;
+    this.tol = options.tol ?? 1e-4;
+  }
+
+  /**
+   * Copy `nClusters` distinct random rows of `X` as starting centres.
+   *
+   * Uses a partial Fisher-Yates shuffle, which always terminates (rejection
+   * sampling cannot when more centres than rows are requested).
+   */
+  private _initCenters(X: Float64Array[]): Float64Array[] {
+    const order = Array.from({ length: X.length }, (_, i) => i);
+    const k = Math.min(this.nClusters, order.length);
+    for (let i = 0; i < k; i++) {
+      const j = i + Math.floor(Math.random() * (order.length - i));
+      const picked = order[j]!;
+      order[j] = order[i]!;
+      order[i] = picked;
+    }
+    return order.slice(0, k).map((i) => new Float64Array(X[i]!));
+  }
+
+  /** Index of, and distance to, the centre closest to `x` (first wins ties). */
+  private _nearest(
+    x: Float64Array,
+    centers: Float64Array[],
+  ): { index: number; distance: number } {
+    let index = 0;
+    let distance = Number.POSITIVE_INFINITY;
+    for (let k = 0; k < centers.length; k++) {
+      const d = euclidean(x, centers[k]!);
+      if (d < distance) {
+        distance = d;
+        index = k;
+      }
+    }
+    return { index, distance };
+  }
+
+  /**
+   * Fit the centres and label the training samples.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns This instance with `clusterCenters_`, `labels_` and `inertia_` set.
+   * @throws Error when `X` is empty or `nClusters` is not an integer in [1, X.length].
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    if (n === 0) throw new Error("Empty input");
+    const k = this.nClusters;
+    // More centres than samples cannot be initialised from distinct rows.
+    if (!Number.isInteger(k) || k < 1 || k > n) {
+      throw new Error(`nClusters must be an integer in [1, ${n}], got ${k}`);
+    }
+    const nFeatures = X[0]?.length ?? 0;
+
+    const centers = this._initCenters(X);
+    const counts = new Float64Array(k);
+
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      for (let b = 0; b < this.batchSize; b++) {
+        const x = X[Math.floor(Math.random() * n)]!;
+        const { index } = this._nearest(x, centers);
+        counts[index] = (counts[index] ?? 0) + 1;
+        // Running mean: each new sample moves the centre by 1 / count.
+        const lr = 1 / (counts[index] ?? 1);
+        const c = centers[index]!;
+        for (let j = 0; j < nFeatures; j++) {
+          c[j] = (c[j] ?? 0) * (1 - lr) + (x[j] ?? 0) * lr;
+        }
+      }
+    }
+
+    const labels = new Int32Array(n);
+    let inertia = 0;
+    for (let i = 0; i < n; i++) {
+      const { index, distance } = this._nearest(X[i]!, centers);
+      labels[i] = index;
+      inertia += distance * distance;
+    }
+    this.clusterCenters_ = centers;
+    this.labels_ = labels;
+    this.inertia_ = inertia;
+    return this;
+  }
+
+  /**
+   * Assign each row of `X` to its nearest fitted centre.
+   *
+   * @param X Samples to label.
+   * @returns The nearest-centre index of every row.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    const centers = this.clusterCenters_;
+    if (!centers) throw new NotFittedError("MiniBatchKMeans");
+    const out = new Int32Array(X.length);
+    for (let i = 0; i < X.length; i++) out[i] = this._nearest(X[i]!, centers).index;
+    return out;
+  }
+
+  /**
+   * Fit and return the training labels.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns The nearest-centre index of every training sample.
+   */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_!;
+  }
+}
diff --git a/src/cluster/bisecting_kmeans.ts b/src/cluster/bisecting_kmeans.ts
new file mode 100644
index 0000000..bc4e6d5
--- /dev/null
+++ b/src/cluster/bisecting_kmeans.ts
@@ -0,0 +1,204 @@
+/**
+ * BisectingKMeans: divisive hierarchical clustering using k-means bisection.
+ * Mirrors sklearn.cluster.BisectingKMeans.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Euclidean distance between `a` and `b`, taken over the indices of `a`;
+ * entries missing from `b` count as 0.
+ */
+function euclidean(a: Float64Array, b: Float64Array): number {
+  let sumOfSquares = 0;
+  a.forEach((value, idx) => {
+    const gap = value - (b[idx] ?? 0);
+    sumOfSquares += gap ** 2;
+  });
+  return Math.sqrt(sumOfSquares);
+}
+
+/**
+ * Component-wise mean of `points`. The dimension is taken from the first
+ * point; an empty list yields a zero-length vector.
+ */
+function clusterMean(points: Float64Array[]): Float64Array {
+  const count = points.length;
+  if (count === 0) return new Float64Array(0);
+
+  const dim = (points[0] ?? new Float64Array(0)).length;
+  const totals = new Float64Array(dim);
+  for (const pt of points) {
+    for (let j = 0; j < dim; j++) totals[j] = (totals[j] ?? 0) + (pt[j] ?? 0);
+  }
+  return totals.map((total) => total / count);
+}
+
+/**
+ * Sum of squared differences between every point and `center`; components
+ * missing from `center` count as 0.
+ */
+function clusterSSE(points: Float64Array[], center: Float64Array): number {
+  return points.reduce(
+    (running, pt) =>
+      pt.reduce((acc, value, j) => acc + (value - (center[j] ?? 0)) ** 2, running),
+    0,
+  );
+}
+
+/**
+ * Split `points` in two with k-means (k = 2).
+ *
+ * The starting centres are the points at indices `|rng| % n` and
+ * `(|rng| + 1) % n`. Iteration stops after `maxIter` passes or as soon as a
+ * pass leaves every assignment unchanged. With at most one point, all labels
+ * are 0 and the second centre is a zero vector.
+ *
+ * @param points Samples to split.
+ * @param maxIter Maximum number of assign/update passes.
+ * @param rng Integer used to choose the two starting points.
+ * @returns A 0/1 label per point and the two centres.
+ */
+function bisect(
+  points: Float64Array[],
+  maxIter: number,
+  rng: number,
+): { labels: Int32Array; centers: Float64Array[] } {
+  const count = points.length;
+  const dim = (points[0] ?? new Float64Array(0)).length;
+  const blank = (): Float64Array => new Float64Array(dim);
+
+  if (count <= 1) {
+    return { labels: new Int32Array(count), centers: [clusterMean(points), blank()] };
+  }
+
+  // Seed the two centres with copies of consecutive points.
+  const seed = Math.abs(rng);
+  let centerA = new Float64Array(points[seed % count] ?? blank());
+  let centerB = new Float64Array(points[(seed + 1) % count] ?? blank());
+  let labels = new Int32Array(count);
+
+  for (let pass = 0; pass < maxIter; pass++) {
+    // Assignment step: a point moves to side 1 only when strictly closer to it.
+    const assigned = new Int32Array(count);
+    for (let i = 0; i < count; i++) {
+      const pt = points[i] ?? blank();
+      const toA = euclidean(pt, centerA);
+      const toB = euclidean(pt, centerB);
+      assigned[i] = toB < toA ? 1 : 0;
+    }
+
+    // Update step: an empty side keeps its previous centre.
+    const sideA = points.filter((_, i) => assigned[i] === 0);
+    const sideB = points.filter((_, i) => assigned[i] === 1);
+    const nextA = sideA.length > 0 ? clusterMean(sideA) : centerA;
+    const nextB = sideB.length > 0 ? clusterMean(sideB) : centerB;
+
+    const moved = assigned.some((label, i) => label !== labels[i]);
+    labels = assigned;
+    centerA = nextA;
+    centerB = nextB;
+    if (!moved) break;
+  }
+
+  return { labels, centers: [centerA, centerB] };
+}
+
+/**
+ * BisectingKMeans: hierarchical divisive clustering.
+ *
+ * Starts with one cluster holding every sample and repeatedly splits a
+ * chosen cluster in two with `bisect` until `nClusters` clusters exist (or no
+ * cluster with more than one sample is left to split).
+ * Mirrors sklearn.cluster.BisectingKMeans.
+ */
+export class BisectingKMeans {
+  /** Requested number of clusters (capped at the number of samples). */
+  nClusters: number;
+  /** Maximum k-means passes per bisection. */
+  maxIter: number;
+  /** Seed of the linear congruential sequence that picks starting points. */
+  randomState: number;
+  /** Split the cluster with the largest SSE, or the one with the most samples. */
+  bisectingStrategy: "biggest_inertia" | "largest_cluster";
+
+  /** One centre per cluster; null before `fit`. */
+  clusterCenters_: Float64Array[] | null = null;
+  /** Cluster index per training sample; null before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Sum of squared distances from training samples to their centre. */
+  inertia_: number = 0;
+  /** Number of bisections performed by the most recent `fit`. */
+  nIter_: number = 0;
+
+  constructor(
+    options: {
+      nClusters?: number;
+      maxIter?: number;
+      randomState?: number;
+      bisectingStrategy?: "biggest_inertia" | "largest_cluster";
+    } = {},
+  ) {
+    this.nClusters = options.nClusters ?? 8;
+    this.maxIter = options.maxIter ?? 300;
+    this.randomState = options.randomState ?? 42;
+    this.bisectingStrategy = options.bisectingStrategy ?? "biggest_inertia";
+  }
+
+  /**
+   * Cluster the rows of `X`.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns This instance with `labels_`, `clusterCenters_`, `inertia_` and `nIter_` set.
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const k = Math.min(this.nClusters, n);
+
+    // Start: all points in one cluster.
+    const clusterLabels = new Int32Array(n);
+    const clusterCenters: Float64Array[] = [clusterMean(X)];
+    let nClusters = 1;
+    let rng = this.randomState;
+    // Count bisections for this call only; without the reset a second fit
+    // would report the running total of all previous fits.
+    this.nIter_ = 0;
+
+    while (nClusters < k) {
+      // Pick the cluster to split; clusters with a single sample are skipped.
+      let targetCluster = 0;
+      let bestCrit = Number.NEGATIVE_INFINITY;
+      for (let c = 0; c < nClusters; c++) {
+        const pts = X.filter((_, i) => clusterLabels[i] === c);
+        if (pts.length <= 1) continue;
+        const crit =
+          this.bisectingStrategy === "biggest_inertia"
+            ? clusterSSE(pts, clusterCenters[c] ?? new Float64Array(p))
+            : pts.length;
+        if (crit > bestCrit) {
+          bestCrit = crit;
+          targetCluster = c;
+        }
+      }
+
+      const targetIndices: number[] = [];
+      for (let i = 0; i < n; i++) {
+        if (clusterLabels[i] === targetCluster) targetIndices.push(i);
+      }
+      if (targetIndices.length <= 1) break;
+      const targetPoints = targetIndices.map((i) => X[i] ?? new Float64Array(p));
+
+      // Advance the linear congruential sequence used to seed the split.
+      rng = Math.abs(rng * 1664525 + 1013904223) % 2147483647;
+      const { labels: subLabels } = bisect(targetPoints, this.maxIter, rng);
+
+      // Side 0 keeps the target's index; side 1 becomes the new cluster.
+      for (let i = 0; i < targetIndices.length; i++) {
+        const idx = targetIndices[i] ?? 0;
+        if ((subLabels[i] ?? 0) === 1) clusterLabels[idx] = nClusters;
+      }
+
+      // Recompute centres for the two resulting clusters.
+      const c0pts = X.filter((_, i) => clusterLabels[i] === targetCluster);
+      const c1pts = X.filter((_, i) => clusterLabels[i] === nClusters);
+      clusterCenters[targetCluster] = c0pts.length > 0 ? clusterMean(c0pts) : new Float64Array(p);
+      clusterCenters.push(c1pts.length > 0 ? clusterMean(c1pts) : new Float64Array(p));
+      nClusters++;
+      this.nIter_++;
+    }
+
+    this.labels_ = clusterLabels;
+    this.clusterCenters_ = clusterCenters;
+
+    let inertia = 0;
+    for (let i = 0; i < n; i++) {
+      const c = clusterLabels[i] ?? 0;
+      const center = clusterCenters[c] ?? new Float64Array(p);
+      const xi = X[i] ?? new Float64Array(p);
+      for (let j = 0; j < p; j++) inertia += ((xi[j] ?? 0) - (center[j] ?? 0)) ** 2;
+    }
+    this.inertia_ = inertia;
+    return this;
+  }
+
+  /**
+   * Assign each row of `X` to the nearest fitted centre.
+   *
+   * @param X Samples to label.
+   * @returns The nearest-centre index of every row.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    if (this.clusterCenters_ === null) throw new NotFittedError("BisectingKMeans");
+    const centers = this.clusterCenters_;
+    return new Int32Array(
+      X.map((xi) => {
+        let bestC = 0;
+        let bestD = Number.POSITIVE_INFINITY;
+        for (let c = 0; c < centers.length; c++) {
+          const d = euclidean(xi, centers[c] ?? new Float64Array(0));
+          if (d < bestD) {
+            bestD = d;
+            bestC = c;
+          }
+        }
+        return bestC;
+      }),
+    );
+  }
+
+  /**
+   * Fit and return the training labels.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @returns The cluster index of every training sample.
+   */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_!;
+  }
+
+  /**
+   * Negative inertia of `X` under the fitted centres (higher is better).
+   *
+   * @param X Samples to score.
+   * @returns Minus the sum of squared distances to each row's nearest centre.
+   * @throws NotFittedError when `fit` has not been called.
+   */
+  score(X: Float64Array[]): number {
+    if (this.clusterCenters_ === null) throw new NotFittedError("BisectingKMeans");
+    const labels = this.predict(X);
+    const centers = this.clusterCenters_;
+    let inertia = 0;
+    for (let i = 0; i < X.length; i++) {
+      const c = labels[i] ?? 0;
+      const center = centers[c] ?? new Float64Array(0);
+      const xi = X[i] ?? new Float64Array(0);
+      for (let j = 0; j < xi.length; j++) inertia += ((xi[j] ?? 0) - (center[j] ?? 0)) ** 2;
+    }
+    return -inertia;
+  }
+}
diff --git a/src/cluster/cluster_diagnostics.ts b/src/cluster/cluster_diagnostics.ts
new file mode 100644
index 0000000..3a39cf3
--- /dev/null
+++ b/src/cluster/cluster_diagnostics.ts
@@ -0,0 +1,148 @@
+/**
+ * Cluster diagnostic utilities.
+ * Mirrors scikit-learn's metrics.silhouette_score, calinski_harabasz_score, davies_bouldin_score.
+ */
+
+/**
+ * Euclidean distance between `a` and `b`, taken over the indices of `a`;
+ * entries missing from `b` count as 0.
+ */
+function euclidean(a: Float64Array, b: Float64Array): number {
+  let total = 0;
+  let idx = 0;
+  while (idx < a.length) {
+    const diff = (a[idx] ?? 0) - (b[idx] ?? 0);
+    total += diff ** 2;
+    idx++;
+  }
+  return Math.sqrt(total);
+}
+
+/**
+ * Compute the Silhouette Coefficient for each sample.
+ *
+ * For sample i, `a` is the mean distance to the other members of its cluster
+ * and `b` the smallest mean distance to the members of any other cluster; the
+ * score is `(b - a) / max(a, b)`.
+ *
+ * A sample that is alone in its cluster scores 0, as in scikit-learn. When no
+ * other cluster exists, `b` is taken as 0.
+ *
+ * @param X Samples, one `Float64Array` per row.
+ * @param labels Cluster label of every sample.
+ * @returns One coefficient per sample.
+ */
+export function silhouetteSamples(
+  X: Float64Array[],
+  labels: Int32Array,
+): Float64Array {
+  const n = X.length;
+  const scores = new Float64Array(n);
+
+  for (let i = 0; i < n; i++) {
+    const li = labels[i]!;
+    const xi = X[i]!;
+
+    // One pass over the other samples, accumulating distances per label.
+    const perLabel = new Map<number, { sum: number; count: number }>();
+    for (let j = 0; j < n; j++) {
+      const lj = labels[j];
+      if (j === i || lj === undefined) continue;
+      const entry = perLabel.get(lj) ?? { sum: 0, count: 0 };
+      entry.sum += euclidean(xi, X[j]!);
+      entry.count++;
+      perLabel.set(lj, entry);
+    }
+
+    // Singleton cluster: the coefficient is defined as 0 (a = 0 would
+    // otherwise yield a perfect score of 1).
+    const own = perLabel.get(li);
+    if (own === undefined) {
+      scores[i] = 0;
+      continue;
+    }
+    const a = own.sum / own.count;
+
+    // Nearest-cluster mean distance (b).
+    let b = Number.POSITIVE_INFINITY;
+    for (const [label, { sum, count }] of perLabel) {
+      if (label !== li) b = Math.min(b, sum / count);
+    }
+    const nearest = Number.isFinite(b) ? b : 0;
+
+    const maxAB = Math.max(a, nearest);
+    scores[i] = maxAB < 1e-10 ? 0 : (nearest - a) / maxAB;
+  }
+  return scores;
+}
+
+/**
+ * Mean silhouette coefficient: the average of `silhouetteSamples` over all
+ * samples (NaN when there are none).
+ */
+export function silhouetteScore(X: Float64Array[], labels: Int32Array): number {
+  const perSample = silhouetteSamples(X, labels);
+  let total = 0;
+  for (const value of perSample) total += value;
+  return total / perSample.length;
+}
+
+/**
+ * Calinski-Harabasz Index (Variance Ratio Criterion); higher is better.
+ *
+ * Computed as `(between / (k - 1)) / (within / (n - k))`, where `between` is
+ * the size-weighted squared distance of the cluster centroids from the global
+ * mean and `within` the squared distance of the samples from their centroid.
+ *
+ * @param X Samples, one `Float64Array` per row.
+ * @param labels Cluster label of every sample.
+ * @returns 0 when there is at most one label or as many labels as samples, 1 when the within-cluster scatter is below 1e-10, otherwise the index.
+ */
+export function calinskiHarabaszScore(
+  X: Float64Array[],
+  labels: Int32Array,
+): number {
+  const n = X.length;
+  const dim = X[0]?.length ?? 0;
+
+  // Register labels in order of first appearance, then bucket the rows.
+  const groups = new Map<number, Float64Array[]>();
+  for (const id of labels) {
+    if (!groups.has(id)) groups.set(id, []);
+  }
+  const k = groups.size;
+  if (k <= 1 || k >= n) return 0;
+  X.forEach((row, i) => {
+    const id = labels[i];
+    if (id !== undefined) groups.get(id)?.push(row);
+  });
+
+  const grandMean = new Float64Array(dim);
+  for (const row of X) {
+    for (let j = 0; j < dim; j++) grandMean[j] = (grandMean[j] ?? 0) + (row[j] ?? 0) / n;
+  }
+
+  let between = 0;
+  let within = 0;
+  for (const members of groups.values()) {
+    const size = members.length;
+    if (size === 0) continue;
+
+    const centroid = new Float64Array(dim);
+    for (const m of members) {
+      for (let j = 0; j < dim; j++) centroid[j] = (centroid[j] ?? 0) + (m[j] ?? 0) / size;
+    }
+    for (let j = 0; j < dim; j++) {
+      between += size * ((centroid[j] ?? 0) - (grandMean[j] ?? 0)) ** 2;
+    }
+    for (const m of members) {
+      for (let j = 0; j < dim; j++) within += ((m[j] ?? 0) - (centroid[j] ?? 0)) ** 2;
+    }
+  }
+
+  if (within < 1e-10) return 1;
+  return between / (k - 1) / (within / (n - k));
+}
+
+/**
+ * Davies-Bouldin Index; lower is better.
+ *
+ * For every cluster, takes the largest ratio
+ * `(dispersion_i + dispersion_j) / distance(centroid_i, centroid_j)` over the
+ * other clusters whose centroid is more than 1e-10 away, and averages those
+ * maxima. Dispersion is the mean distance of a cluster's samples to its
+ * centroid.
+ *
+ * @param X Samples, one `Float64Array` per row.
+ * @param labels Cluster label of every sample.
+ * @returns 0 when there is at most one label, otherwise the index.
+ */
+export function daviesBouldinScore(
+  X: Float64Array[],
+  labels: Int32Array,
+): number {
+  const dim = X[0]?.length ?? 0;
+  const ids = [...new Set(labels)];
+  const k = ids.length;
+  if (k <= 1) return 0;
+
+  // Centroid and mean sample-to-centroid distance for every cluster.
+  const clusters = ids.map((id) => {
+    const members = X.filter((_, row) => labels[row] === id);
+    const size = members.length;
+    const centroid = new Float64Array(dim);
+    for (const m of members) {
+      for (let j = 0; j < dim; j++) centroid[j] = (centroid[j] ?? 0) + (m[j] ?? 0) / size;
+    }
+    let spreadSum = 0;
+    for (const m of members) spreadSum += euclidean(m, centroid);
+    return { centroid, spread: spreadSum / size };
+  });
+
+  let sumOfWorst = 0;
+  clusters.forEach((ci, i) => {
+    let worst = 0;
+    clusters.forEach((cj, j) => {
+      if (i === j) return;
+      const gap = euclidean(ci.centroid, cj.centroid);
+      if (gap > 1e-10) worst = Math.max(worst, (ci.spread + cj.spread) / gap);
+    });
+    sumOfWorst += worst;
+  });
+  return sumOfWorst / k;
+}
diff --git a/src/cluster/cluster_ext.ts b/src/cluster/cluster_ext.ts
new file mode 100644
index 0000000..48074ba
--- /dev/null
+++ b/src/cluster/cluster_ext.ts
@@ -0,0 +1,180 @@
+/**
+ * Cluster selection extensions: Elbow method, Gap statistic, Silhouette scorer.
+ */
+
+/**
+ * Picks the "elbow" of an inertia-versus-k curve.
+ *
+ * Assumes `ks` is in ascending order and that inertia generally decreases as
+ * k grows (the usual shape for k-means).
+ */
+export class ElbowMethodSelector {
+  private inertias: Float64Array = new Float64Array(0);
+  private ks: Int32Array = new Int32Array(0);
+
+  /**
+   * Store the curve to analyse.
+   *
+   * @param inertias Inertia measured for each candidate k.
+   * @param ks Candidate cluster counts, aligned with `inertias`.
+   * @returns This instance, for chaining.
+   */
+  fit(
+    inertias: Float64Array,
+    ks: Int32Array
+  ): this {
+    this.inertias = inertias;
+    this.ks = ks;
+    return this;
+  }
+
+  /**
+   * Find the elbow using the kneedle algorithm.
+   *
+   * Both axes are normalised to [0, 1]; the elbow is the point lying furthest
+   * below the straight line joining the curve's corners (0, 1) and (1, 0),
+   * i.e. the one maximising `1 - x - y`. The first such point wins ties.
+   *
+   * @returns The k at the elbow; the first k (or 1 when none is stored) if fewer than three points are available.
+   */
+  findElbow(): number {
+    const n = this.inertias.length;
+    if (n < 3) return this.ks[0] ?? 1;
+
+    let minI = Number.POSITIVE_INFINITY;
+    let maxI = Number.NEGATIVE_INFINITY;
+    for (const value of this.inertias) {
+      if (value < minI) minI = value;
+      if (value > maxI) maxI = value;
+    }
+    const minK = this.ks[0] ?? 1;
+    const maxK = this.ks[n - 1] ?? n;
+    // Fall back to 1 only for a degenerate (zero or invalid) range. Clamping
+    // the range to at least 1 would leave small-valued curves unnormalised.
+    const kRange = maxK - minK > 0 ? maxK - minK : 1;
+    const iRange = maxI - minI > 0 ? maxI - minI : 1;
+
+    let bestGap = Number.NEGATIVE_INFINITY;
+    let elbowIdx = 0;
+    for (let i = 0; i < n; i++) {
+      const x = ((this.ks[i] ?? 0) - minK) / kRange;
+      const y = ((this.inertias[i] ?? 0) - minI) / iRange;
+      // Using `x - y` here would always peak at the last point of a
+      // decreasing curve, so the largest k would always be returned.
+      const gap = 1 - x - y;
+      if (gap > bestGap) {
+        bestGap = gap;
+        elbowIdx = i;
+      }
+    }
+    return this.ks[elbowIdx] ?? 1;
+  }
+}
+
+/**
+ * Gap-statistic style selector for the number of clusters.
+ *
+ * NOTE: this is a simplified implementation. Uniform reference samples are
+ * drawn inside the bounding box of `X`, but they are not clustered; the
+ * reference log-dispersions are synthesised from the observed inertia as
+ * `log(inertia * (1 + 0.1 * r))` for r = 0..nRef-1.
+ */
+export class GapStatistic {
+  private gaps: Float64Array = new Float64Array(0);
+  private gapStds: Float64Array = new Float64Array(0);
+  private ks: Int32Array = new Int32Array(0);
+
+  /**
+   * @param nRef Number of reference draws per candidate k (default 10).
+   * @param seed Seed of the internal pseudo-random generator (default 42).
+   */
+  constructor(private readonly nRef = 10, private readonly seed = 42) {}
+
+  /**
+   * Evaluate the gap and its spread for every candidate k.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @param clusterFn Clusters the data into `k` groups and reports the inertia; called once per candidate, in order.
+   * @param ks Candidate cluster counts.
+   * @returns This instance, for chaining.
+   */
+  compute(
+    X: Float64Array[],
+    clusterFn: (k: number) => { labels: Int32Array; inertia: number },
+    ks: Int32Array
+  ): this {
+    const candidates = ks.length;
+    this.ks = ks;
+    this.gaps = new Float64Array(candidates);
+    this.gapStds = new Float64Array(candidates);
+    const nextUniform = this._seededRng(this.seed);
+
+    // Per-feature bounding box of X.
+    const dim = X[0]?.length ?? 1;
+    const lower = new Float64Array(dim).fill(Number.POSITIVE_INFINITY);
+    const upper = new Float64Array(dim).fill(Number.NEGATIVE_INFINITY);
+    for (const row of X) {
+      for (let f = 0; f < dim; f++) {
+        const value = row[f] ?? 0;
+        lower[f] = Math.min(lower[f]!, value);
+        upper[f] = Math.max(upper[f]!, value);
+      }
+    }
+
+    ks.forEach((k, ki) => {
+      const { inertia } = clusterFn(k);
+      const logWk = Math.log(Math.max(inertia, 1e-10));
+
+      const refLogs: number[] = [];
+      for (let r = 0; r < this.nRef; r++) {
+        const reference = X.map(() =>
+          Float64Array.from({ length: dim }, (_, f) => lower[f]! + nextUniform() * (upper[f]! - lower[f]!)),
+        );
+        void reference; // simplified: use uniform inertia estimate
+        refLogs.push(Math.log(Math.max(inertia * (1 + r * 0.1), 1e-10)));
+      }
+
+      let sum = 0;
+      for (const value of refLogs) sum += value;
+      const mean = sum / refLogs.length;
+      let squared = 0;
+      for (const value of refLogs) squared += (value - mean) ** 2;
+      const std = Math.sqrt(squared / refLogs.length);
+
+      this.gaps[ki] = mean - logWk;
+      this.gapStds[ki] = std * Math.sqrt(1 + 1 / this.nRef);
+    });
+    return this;
+  }
+
+  /**
+   * First candidate whose gap is at least the next gap minus the next spread.
+   *
+   * @returns That k; the last candidate when none qualifies; 1 when no candidates are stored.
+   */
+  optimalK(): number {
+    const last = this.ks.length - 1;
+    for (let i = 0; i < last; i++) {
+      const threshold = (this.gaps[i + 1] ?? 0) - (this.gapStds[i + 1] ?? 0);
+      if ((this.gaps[i] ?? 0) >= threshold) return this.ks[i] ?? 1;
+    }
+    return this.ks[last] ?? 1;
+  }
+
+  /** Linear congruential generator returning values in [0, 1]. */
+  private _seededRng(seed: number): () => number {
+    let state = seed;
+    return () => {
+      state = (state * 1664525 + 1013904223) & 0xffffffff;
+      return (state >>> 0) / 0xffffffff;
+    };
+  }
+}
+
+/**
+ * Silhouette coefficients on Euclidean distance.
+ *
+ * For sample i, `a` is the mean distance to the other members of its cluster
+ * and `b` the smallest mean distance to the members of any other cluster; the
+ * coefficient is `(b - a) / max(a, b)`.
+ */
+export class SilhouetteScorer {
+  /**
+   * Mean silhouette coefficient over all samples.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @param labels Cluster label of every sample.
+   * @returns The mean coefficient, or 0 when fewer than two samples are given.
+   */
+  score(X: Float64Array[], labels: Int32Array): number {
+    const n = X.length;
+    if (n < 2) return 0;
+    const perSample = this.perSampleScores(X, labels);
+    let total = 0;
+    for (const value of perSample) total += value;
+    return total / n;
+  }
+
+  /**
+   * Silhouette coefficient of every sample.
+   *
+   * A sample that is alone in its cluster scores 0 (the standard convention;
+   * treating `a` as 0 would award it a perfect 1). When no other cluster
+   * exists, `b` is taken as 0.
+   *
+   * @param X Samples, one `Float64Array` per row.
+   * @param labels Cluster label of every sample.
+   * @returns One coefficient per sample.
+   */
+  perSampleScores(X: Float64Array[], labels: Int32Array): Float64Array {
+    const n = X.length;
+    const result = new Float64Array(n);
+    for (let i = 0; i < n; i++) {
+      const xi = X[i]!;
+      const ci = labels[i]!;
+      let aSum = 0;
+      let aCnt = 0;
+      const others = new Map<number, { sum: number; cnt: number }>();
+      for (let j = 0; j < n; j++) {
+        if (i === j) continue;
+        const xj = X[j]!;
+        const cj = labels[j]!;
+        let d = 0;
+        for (let f = 0; f < xi.length; f++) d += ((xi[f] ?? 0) - (xj[f] ?? 0)) ** 2;
+        d = Math.sqrt(d);
+        if (cj === ci) {
+          aSum += d;
+          aCnt++;
+        } else {
+          const entry = others.get(cj) ?? { sum: 0, cnt: 0 };
+          entry.sum += d;
+          entry.cnt++;
+          others.set(cj, entry);
+        }
+      }
+
+      // Singleton cluster: leave the coefficient at 0.
+      if (aCnt === 0) continue;
+
+      const a = aSum / aCnt;
+      let b = Number.POSITIVE_INFINITY;
+      for (const { sum, cnt } of others.values()) {
+        const avg = sum / cnt;
+        if (avg < b) b = avg;
+      }
+      if (b === Number.POSITIVE_INFINITY) b = 0;
+      const denom = Math.max(a, b);
+      result[i] = denom > 0 ? (b - a) / denom : 0;
+    }
+    return result;
+  }
+}
diff --git a/src/cluster/cluster_ext10.ts b/src/cluster/cluster_ext10.ts
new file mode 100644
index 0000000..79e1cdb
--- /dev/null
+++ b/src/cluster/cluster_ext10.ts
@@ -0,0 +1,192 @@
+/**
+ * Cluster extensions: silhouette, Calinski-Harabasz and Davies-Bouldin scores, and the gap statistic.
+ * Mirrors sklearn.cluster extensions.
+ */
+
+import { BaseEstimator } from "../base.js";
+
+ /** Mean silhouette coefficient over all rows of X. Label -1 (noise) and singleton clusters score 0; empty input returns 0. */
+ export function silhouetteScoreExt(
+   X: Float64Array[],
+   labels: Int32Array,
+ ): number {
+   const n = X.length;
+   if (n === 0) return 0; // avoid 0 / 0 = NaN on empty input
+   let total = 0; // running sum of the per-sample coefficients (skipped samples contribute 0)
+   for (let i = 0; i < n; i++) {
+     const ci = labels[i] ?? -1;
+     if (ci === -1) continue; // noise or missing label: coefficient 0
+     let aSum = 0, aCnt = 0;
+     const bMap = new Map<number, { sum: number; cnt: number }>();
+     for (let j = 0; j < n; j++) {
+       if (i === j) continue;
+       const cj = labels[j] ?? -1;
+       let dist = 0;
+       const xi = X[i]!, xj = X[j]!;
+       for (let k = 0; k < xi.length; k++) dist += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2;
+       dist = Math.sqrt(dist);
+       if (cj === ci) { aSum += dist; aCnt++; }
+       else {
+         const e = bMap.get(cj) ?? { sum: 0, cnt: 0 };
+         e.sum += dist; e.cnt++;
+         bMap.set(cj, e);
+       }
+     }
+     if (aCnt === 0) continue; // singleton cluster: the silhouette is defined as 0, not (b - 0) / b = 1
+     const a = aSum / aCnt; // mean distance to the rest of i's own cluster
+     let b = Number.POSITIVE_INFINITY; // mean distance to the nearest other label group
+     for (const { sum, cnt } of bMap.values()) if (cnt > 0) b = Math.min(b, sum / cnt);
+     if (!Number.isFinite(b)) b = 0; // no other group exists
+     const denom = Math.max(a, b);
+     if (denom !== 0) total += (b - a) / denom;
+   }
+   return total / n;
+ }
+
+ /** Calinski-Harabasz index (variance ratio criterion). Samples labelled -1 (noise) are excluded; returns 0 for fewer than two clusters or zero within-cluster dispersion. */
+ export function calinskiHarabaszScore(
+   X: Float64Array[],
+   labels: Int32Array,
+ ): number {
+   const kept = X.filter((_, i) => (labels[i] ?? -1) !== -1); // noise must not shift the overall mean or inflate n
+   const n = kept.length, nf = X[0]?.length ?? 0;
+   const classes = [...new Set(Array.from(labels).filter((c) => c !== -1))];
+   const k = classes.length;
+   if (k <= 1) return 0;
+   const overall = new Float64Array(nf); // mean of the non-noise samples
+   for (const xi of kept) for (let j = 0; j < nf; j++) overall[j] = (overall[j] ?? 0) + (xi[j] ?? 0);
+   for (let j = 0; j < nf; j++) overall[j] = (overall[j] ?? 0) / n;
+   let bss = 0, wss = 0; // between-cluster and within-cluster sums of squares
+   for (const c of classes) {
+     const members = X.filter((_, i) => (labels[i] ?? -1) === c);
+     const nc = members.length;
+     const cm = new Float64Array(nf); // centroid of cluster c
+     for (const xi of members) for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) + (xi[j] ?? 0);
+     for (let j = 0; j < nf; j++) {
+       cm[j] = (cm[j] ?? 0) / nc;
+       bss += nc * ((cm[j] ?? 0) - (overall[j] ?? 0)) ** 2;
+     }
+     for (const xi of members) for (let j = 0; j < nf; j++) wss += ((xi[j] ?? 0) - (cm[j] ?? 0)) ** 2;
+   }
+   return wss === 0 ? 0 : (bss / (k - 1)) / (wss / (n - k));
+ }
+
+ /** Davies-Bouldin index over the non-noise clusters (label -1 is ignored); 0 when fewer than two clusters exist. */
+ export function daviesBouldinScore(
+   X: Float64Array[],
+   labels: Int32Array,
+ ): number {
+   const dim = X[0]?.length ?? 0;
+   const clusterIds = Array.from(new Set(Array.from(labels).filter((id) => id !== -1)));
+   const clusterCount = clusterIds.length;
+   if (clusterCount < 2) return 0;
+   const means = clusterIds.map((id) => {
+     const rows = X.filter((_, idx) => (labels[idx] ?? -1) === id);
+     const mean = new Float64Array(dim);
+     for (const row of rows) for (let f = 0; f < dim; f++) mean[f] = (mean[f] ?? 0) + (row[f] ?? 0);
+     for (let f = 0; f < dim; f++) mean[f] = (mean[f] ?? 0) / rows.length;
+     return mean;
+   });
+   const scatter = clusterIds.map((id, pos) => {
+     const rows = X.filter((_, idx) => (labels[idx] ?? -1) === id);
+     let acc = 0;
+     const mean = means[pos]!;
+     for (const row of rows) {
+       let sq = 0;
+       for (let f = 0; f < dim; f++) sq += ((row[f] ?? 0) - (mean[f] ?? 0)) ** 2;
+       acc += Math.sqrt(sq);
+     }
+     return rows.length > 0 ? acc / rows.length : 0;
+   });
+   const separation = (p: Float64Array, q: Float64Array): number => {
+     let sq = 0;
+     for (let f = 0; f < p.length; f++) sq += ((p[f] ?? 0) - (q[f] ?? 0)) ** 2;
+     return Math.sqrt(sq);
+   };
+   let sumWorst = 0;
+   for (let a = 0; a < clusterCount; a++) {
+     let worst = 0;
+     for (let b = 0; b < clusterCount; b++) {
+       if (a === b) continue;
+       const gap = separation(means[a]!, means[b]!);
+       const ratio = gap > 0 ? ((scatter[a] ?? 0) + (scatter[b] ?? 0)) / gap : 0;
+       worst = ratio > worst ? ratio : worst;
+     }
+     sumWorst += worst;
+   }
+   return sumWorst / clusterCount;
+ }
+
+ /** GapStatistic: estimates the number of clusters by comparing k-means dispersion on X with uniform reference data drawn via Math.random (so results are not reproducible). */
+ export class GapStatistic extends BaseEstimator {
+ n_clusters_: number = 0; // k selected by the last fit() call
+ gap_values_: Float64Array = new Float64Array(0); // gap for k = 1..maxK, stored at index k - 1
+ sk_: Float64Array = new Float64Array(0); // spread term for k = 1..maxK, stored at index k - 1
+
+ fit(X: Float64Array[], maxK = 10, nRef = 10): this { // maxK: largest k tried; nRef: reference datasets drawn per k
+ const n = X.length;
+ const nf = X[0]?.length ?? 0;
+ const gaps = new Float64Array(maxK);
+ const sks = new Float64Array(maxK);
+ const mins = new Float64Array(nf), maxs = new Float64Array(nf); // per-feature bounding box of X
+ for (let j = 0; j < nf; j++) {
+ let mn = Number.POSITIVE_INFINITY, mx = Number.NEGATIVE_INFINITY;
+ for (const xi of X) { const v = xi[j] ?? 0; if (v < mn) mn = v; if (v > mx) mx = v; }
+ mins[j] = mn; maxs[j] = mx;
+ }
+ for (let k = 1; k <= maxK; k++) {
+ const Wk = this._kmeansWk(X, k); // within-cluster dispersion of the real data for this k
+ let refWkSum = 0, refWkSumSq = 0; // running sum and sum of squares of log(W) over the reference sets
+ for (let r = 0; r < nRef; r++) {
+ const ref = Array.from({ length: n }, () => { // n points sampled uniformly inside the bounding box
+ const xi = new Float64Array(nf);
+ for (let j = 0; j < nf; j++) xi[j] = (mins[j] ?? 0) + Math.random() * ((maxs[j] ?? 1) - (mins[j] ?? 0));
+ return xi;
+ });
+ const w = Math.log(Math.max(this._kmeansWk(ref, k), 1e-10)); // floor at 1e-10 keeps log finite when dispersion is 0
+ refWkSum += w; refWkSumSq += w * w;
+ }
+ const logWk = Math.log(Math.max(Wk, 1e-10));
+ const expLogWk = refWkSum / nRef; // mean log-dispersion of the reference sets
+ gaps[k - 1] = expLogWk - logWk;
+ sks[k - 1] = Math.sqrt(Math.max(refWkSumSq / nRef - expLogWk ** 2, 0)) * Math.sqrt(1 + 1 / nRef); // std of reference log(W), inflated by sqrt(1 + 1/nRef)
+ }
+ this.gap_values_ = gaps;
+ this.sk_ = sks;
+ for (let k = 0; k < maxK - 1; k++) { // pick the smallest k whose gap is within one spread term of the next gap
+ if ((gaps[k] ?? 0) >= (gaps[k + 1] ?? 0) - (sks[k + 1] ?? 0)) { this.n_clusters_ = k + 1; return this; }
+ }
+ this.n_clusters_ = maxK; // no k satisfied the criterion
+ return this;
+ }
+
+ private _kmeansWk(X: Float64Array[], k: number): number { // runs a fixed 10-iteration k-means and returns the within-cluster sum of squared distances
+ const n = X.length;
+ const nf = X[0]?.length ?? 0;
+ const centroids = X.slice(0, k).map((xi) => new Float64Array(xi)); // initial centroids are copies of the first k rows
+ const labels = new Int32Array(n);
+ for (let iter = 0; iter < 10; iter++) {
+ for (let i = 0; i < n; i++) { // assignment step: nearest centroid by squared distance
+ let best = 0, bestD = Number.POSITIVE_INFINITY;
+ for (let c = 0; c < k; c++) {
+ let d = 0;
+ for (let j = 0; j < nf; j++) d += ((X[i]?.[j] ?? 0) - (centroids[c]?.[j] ?? 0)) ** 2; // a missing centroid (k > n) is read as the origin
+ if (d < bestD) { bestD = d; best = c; }
+ }
+ labels[i] = best;
+ }
+ for (let c = 0; c < k; c++) { // update step: mean of the assigned rows
+ const cm = new Float64Array(nf);
+ let cnt = 0;
+ for (let i = 0; i < n; i++) if (labels[i] === c) { for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) + (X[i]?.[j] ?? 0); cnt++; }
+ if (cnt > 0) { for (let j = 0; j < nf; j++) cm[j] = (cm[j] ?? 0) / cnt; centroids[c] = cm; } // an empty cluster keeps its previous centroid
+ }
+ }
+ let w = 0;
+ for (let c = 0; c < k; c++) {
+ const members = X.filter((_, i) => labels[i] === c);
+ for (const xi of members) for (let j = 0; j < nf; j++) w += ((xi[j] ?? 0) - (centroids[c]?.[j] ?? 0)) ** 2;
+ }
+ return w;
+ }
+ }
diff --git a/src/cluster/cluster_ext3.ts b/src/cluster/cluster_ext3.ts
new file mode 100644
index 0000000..a1f13a7
--- /dev/null
+++ b/src/cluster/cluster_ext3.ts
@@ -0,0 +1,142 @@
+/**
+ * Extended clustering utilities: inertia, cluster sizes and centroids,
+ * plus the Davies-Bouldin and Calinski-Harabasz quality scores.
+ */
+
+ /** Sum of squared Euclidean distances from each sample to its assigned centroid; samples whose centroid is missing are skipped. */
+ export function computeInertia(
+   X: Float64Array[],
+   labels: Int32Array,
+   centroids: Float64Array[],
+ ): number {
+   let total = 0;
+   for (let row = 0; row < X.length; row++) {
+     const sample = X[row];
+     const center = centroids[labels[row] ?? 0]; // a missing label falls back to cluster 0
+     if (sample === undefined || center === undefined) continue;
+     // Sum this sample's squared distance on its own before adding it to the
+     // running total, so the accumulation order is per-sample first.
+     let sq = 0;
+     for (let f = 0; f < sample.length; f++) {
+       const delta = (sample[f] ?? 0) - (center[f] ?? 0);
+       sq += delta * delta;
+     }
+     total += sq;
+   }
+   return total;
+ }
+
+ /** Count how many labels fall on each cluster index 0..nClusters-1; labels outside that range are ignored. */
+ export function clusterSizes(labels: Int32Array, nClusters: number): Int32Array {
+   const counts = new Int32Array(nClusters);
+   for (const label of labels) {
+     // Negative (noise) or too-large labels have no slot in the result.
+     const inRange = label >= 0 && label < nClusters;
+     if (inRange) counts[label] = (counts[label] ?? 0) + 1;
+   }
+   // Typed-array slots start at zero, so clusters without members report size 0.
+   return counts;
+ }
+
+ /** Mean of the samples assigned to each cluster 0..nClusters-1; clusters without members yield an all-zero centroid. */
+ export function computeCentroids(
+   X: Float64Array[],
+   labels: Int32Array,
+   nClusters: number,
+   nFeatures: number,
+ ): Float64Array[] {
+   const makeZero = (): Float64Array => new Float64Array(nFeatures);
+   const totals: Float64Array[] = Array.from({ length: nClusters }, makeZero);
+   const members = new Int32Array(nClusters);
+   for (let row = 0; row < X.length; row++) {
+     const cluster = labels[row] ?? 0; // a missing label falls back to cluster 0
+     const sample = X[row];
+     const acc = cluster >= 0 && cluster < nClusters ? totals[cluster] : undefined;
+     if (sample === undefined || acc === undefined) continue;
+     for (let f = 0; f < nFeatures; f++) {
+       acc[f] = (acc[f] ?? 0) + (sample[f] ?? 0);
+     }
+     members[cluster] = (members[cluster] ?? 0) + 1;
+   }
+   return totals.map((acc, c) => {
+     // Dividing by at least 1 leaves member-less clusters at zero instead of NaN.
+     const divisor = Math.max(1, members[c] ?? 1);
+     return acc.map((value) => value / divisor);
+   });
+ }
+
+ /** Davies-Bouldin index (lower is better). Labels may be any non-negative integers; negative labels (noise) are ignored. Returns 0 for fewer than two clusters. */
+ export function daviesBouldinScore(X: Float64Array[], labels: Int32Array): number {
+   const uniqueLabels = [...new Set(Array.from(labels))].filter((l) => l >= 0);
+   const nClusters = uniqueLabels.length;
+   if (nClusters < 2) return 0;
+   const nFeatures = X[0]?.length ?? 0;
+   // Remap labels to dense indices 0..nClusters-1 (noise becomes -1) so gaps such as {0, 2} do not drop a cluster.
+   const dense = labels.map((l) => uniqueLabels.indexOf(l));
+   const centroids = computeCentroids(X, dense, nClusters, nFeatures);
+   const s: number[] = centroids.map((c, k) => {
+     const members = X.filter((_, i) => (dense[i] ?? -1) === k);
+     if (members.length === 0) return 0;
+     return members.reduce((acc, xi) => {
+       let dist = 0;
+       for (let j = 0; j < c.length; j++) dist += ((xi[j] ?? 0) - (c[j] ?? 0)) ** 2;
+       return acc + Math.sqrt(dist);
+     }, 0) / members.length; // mean distance of the cluster's members to its centroid
+   });
+
+   let db = 0;
+   for (let i = 0; i < nClusters; i++) {
+     let maxR = 0; // largest similarity ratio of cluster i against any other cluster
+     for (let j = 0; j < nClusters; j++) {
+       if (i === j) continue;
+       const ci = centroids[i];
+       const cj = centroids[j];
+       if (ci === undefined || cj === undefined) continue;
+       let dist = 0;
+       for (let d = 0; d < nFeatures; d++) dist += ((ci[d] ?? 0) - (cj[d] ?? 0)) ** 2;
+       dist = Math.sqrt(dist);
+       const r = ((s[i] ?? 0) + (s[j] ?? 0)) / (dist + 1e-10); // epsilon guards against coincident centroids
+       if (r > maxR) maxR = r;
+     }
+     db += maxR;
+   }
+   return db / nClusters;
+ }
+
+ /** Calinski-Harabasz index (higher is better). Labels may be any non-negative integers; negative labels (noise) are excluded from every statistic. Returns 0 when it is undefined (k < 2 or n <= k). */
+ export function calinskiHarabaszScore(X: Float64Array[], labels: Int32Array): number {
+   const nFeatures = X[0]?.length ?? 0;
+   const uniqueLabels = [...new Set(Array.from(labels))].filter((l) => l >= 0);
+   const k = uniqueLabels.length;
+   if (k < 2) return 0;
+   // Remap labels to dense indices 0..k-1 (noise becomes -1) so gaps such as {0, 2} do not drop a cluster.
+   const dense = labels.map((l) => uniqueLabels.indexOf(l));
+   const grandMean = new Float64Array(nFeatures);
+   let n = 0; // number of non-noise samples; noise must not shift the grand mean or inflate n
+   for (let i = 0; i < X.length; i++) {
+     if ((dense[i] ?? -1) < 0) continue;
+     n++;
+     for (let j = 0; j < nFeatures; j++) grandMean[j] = (grandMean[j] ?? 0) + (X[i]?.[j] ?? 0);
+   }
+   if (n <= k) return 0;
+   for (let j = 0; j < nFeatures; j++) grandMean[j] = (grandMean[j] ?? 0) / n;
+   const centroids = computeCentroids(X, dense, k, nFeatures);
+   const sizes = clusterSizes(dense, k);
+   let bcd = 0; // between-cluster dispersion
+   for (let c = 0; c < k; c++) {
+     const centroid = centroids[c];
+     if (centroid === undefined) continue;
+     let dist = 0;
+     for (let j = 0; j < nFeatures; j++) dist += ((centroid[j] ?? 0) - (grandMean[j] ?? 0)) ** 2;
+     bcd += (sizes[c] ?? 0) * dist;
+   }
+
+   let wcd = 0; // within-cluster dispersion
+   for (let i = 0; i < X.length; i++) {
+     const centroid = centroids[dense[i] ?? -1]; // undefined for noise, which is skipped below
+     const xi = X[i];
+     if (centroid === undefined || xi === undefined) continue;
+     for (let j = 0; j < nFeatures; j++) wcd += ((xi[j] ?? 0) - (centroid[j] ?? 0)) ** 2;
+   }
+   return (bcd / (k - 1)) / (wcd / (n - k) + 1e-10); // epsilon guards against zero within-cluster dispersion
+ }
diff --git a/src/cluster/cluster_ext5.ts b/src/cluster/cluster_ext5.ts
new file mode 100644
index 0000000..906bff0
--- /dev/null
+++ b/src/cluster/cluster_ext5.ts
@@ -0,0 +1,238 @@
+/**
+ * Additional clustering algorithms: MiniBatchKMeans, OPTICS.
+ * Mirrors sklearn.cluster extras.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+ export class MiniBatchKMeans { // k-means trained on randomly drawn mini-batches with per-centre running means
+   nClusters: number;
+   batchSize: number;
+   maxIter: number;
+   randomState: number;
+
+   clusterCenters_: Float64Array[] | null = null;
+   labels_: Int32Array | null = null;
+   inertia_: number = 0;
+
+   constructor(
+     options: {
+       nClusters?: number;
+       batchSize?: number;
+       maxIter?: number;
+       randomState?: number;
+     } = {},
+   ) {
+     this.nClusters = options.nClusters ?? 8;
+     this.batchSize = options.batchSize ?? 100;
+     this.maxIter = options.maxIter ?? 100;
+     this.randomState = options.randomState ?? 0;
+   }
+
+   fit(X: Float64Array[]): this {
+     const sampleCount = X.length;
+     const dim = X[0]?.length ?? 0;
+     const k = Math.min(this.nClusters, sampleCount);
+
+     // Seed the centres with copies of the first k samples.
+     const centers = X.slice(0, k).map((row) => row.slice());
+     const seen = new Float64Array(k); // how many batch samples each centre has absorbed
+
+     let state = this.randomState;
+     const draw = (): number => {
+       state = (state * 1664525 + 1013904223) >>> 0;
+       return state / 4294967296;
+     };
+
+     for (let round = 0; round < this.maxIter; round++) {
+       // Draw the whole mini-batch (with replacement) before touching any centre.
+       const draws = Math.min(this.batchSize, sampleCount);
+       const picked: number[] = [];
+       for (let b = 0; b < draws; b++) {
+         picked.push(Math.floor(draw() * sampleCount));
+       }
+
+       for (const pick of picked) {
+         const x = X[pick] ?? new Float64Array(dim);
+         // Locate the closest centre by squared Euclidean distance.
+         let winner = 0;
+         let winnerDist = Number.POSITIVE_INFINITY;
+         for (let c = 0; c < k; c++) {
+           let sq = 0;
+           for (let f = 0; f < dim; f++) {
+             sq += ((x[f] ?? 0) - (centers[c]?.[f] ?? 0)) ** 2;
+           }
+           if (sq < winnerDist) {
+             winnerDist = sq;
+             winner = c;
+           }
+         }
+         // Move the winner toward x with a step of 1 / (samples it has seen).
+         seen[winner] = (seen[winner] ?? 0) + 1;
+         const step = 1 / (seen[winner] ?? 1);
+         for (let f = 0; f < dim; f++) {
+           centers[winner]![f] = (centers[winner]?.[f] ?? 0) * (1 - step) + (x[f] ?? 0) * step;
+         }
+       }
+     }
+
+     this.clusterCenters_ = centers;
+     // Final hard assignment of every sample, accumulating the inertia on the way.
+     const assignment = new Int32Array(sampleCount);
+     let sse = 0;
+     for (let i = 0; i < sampleCount; i++) {
+       let winner = 0;
+       let winnerDist = Number.POSITIVE_INFINITY;
+       for (let c = 0; c < k; c++) {
+         let sq = 0;
+         for (let f = 0; f < dim; f++) {
+           sq += ((X[i]?.[f] ?? 0) - (centers[c]?.[f] ?? 0)) ** 2;
+         }
+         if (sq < winnerDist) {
+           winnerDist = sq;
+           winner = c;
+         }
+       }
+       assignment[i] = winner;
+       sse += winnerDist;
+     }
+     this.labels_ = assignment;
+     this.inertia_ = sse;
+     return this;
+   }
+
+   predict(X: Float64Array[]): Int32Array {
+     const centers = this.clusterCenters_;
+     if (!centers) throw new NotFittedError("MiniBatchKMeans is not fitted");
+     const dim = centers[0]?.length ?? 0;
+     const assignment = new Int32Array(X.length);
+     for (let i = 0; i < X.length; i++) {
+       let winner = 0;
+       let winnerDist = Number.POSITIVE_INFINITY;
+       for (let c = 0; c < centers.length; c++) {
+         let sq = 0;
+         for (let f = 0; f < dim; f++) {
+           sq += ((X[i]?.[f] ?? 0) - (centers[c]?.[f] ?? 0)) ** 2;
+         }
+         if (sq < winnerDist) {
+           winnerDist = sq;
+           winner = c;
+         }
+       }
+       assignment[i] = winner;
+     }
+     return assignment;
+   }
+ }
+
+ export interface OPTICSOptions { // constructor options for OPTICS; every field is optional
+ minSamples?: number; // rank (self included) of the sorted neighbour distance used as core distance; OPTICS defaults to 5
+ maxEps?: number; // core distances above this are treated as infinite; OPTICS defaults to +Infinity
+ metric?: "euclidean" | "manhattan"; // distance function; OPTICS defaults to "euclidean"
+ clusterMethod?: "xi" | "dbscan"; // NOTE(review): not read by the OPTICS constructor in this file — extraction is always the DBSCAN-style pass
+ eps?: number; // reachability cut-off for the label extraction; OPTICS defaults to +Infinity
+ xi?: number; // NOTE(review): not read by the OPTICS constructor in this file
+ }
+
+ export class OPTICS { // OPTICS ordering over a dense distance matrix, followed by a DBSCAN-style label extraction at `eps`
+ minSamples: number;
+ maxEps: number;
+ metric: "euclidean" | "manhattan";
+ eps: number;
+
+ labels_: Int32Array | null = null; // cluster id per sample, -1 for unassigned; set by fit()
+ reachabilityDistances_: Float64Array | null = null; // indexed by sample; +Infinity where never reached
+ coreDistances_: Float64Array | null = null; // indexed by sample; +Infinity for non-core samples
+ ordering_: Int32Array | null = null; // sample indices in processing order
+
+ constructor(options: OPTICSOptions = {}) {
+ this.minSamples = options.minSamples ?? 5;
+ this.maxEps = options.maxEps ?? Number.POSITIVE_INFINITY;
+ this.metric = options.metric ?? "euclidean";
+ this.eps = options.eps ?? Number.POSITIVE_INFINITY;
+ }
+
+ private _dist(a: Float64Array, b: Float64Array): number { // distance under this.metric over the coordinates of `a`; missing coordinates count as 0
+ if (this.metric === "manhattan") {
+ let s = 0;
+ for (let i = 0; i < a.length; i++) s += Math.abs((a[i] ?? 0) - (b[i] ?? 0));
+ return s;
+ }
+ let s = 0;
+ for (let i = 0; i < a.length; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2;
+ return Math.sqrt(s);
+ }
+
+ fit(X: Float64Array[]): this { // computes ordering, core/reachability distances and labels; returns this
+ const n = X.length;
+ // Dense symmetric n x n distance matrix (diagonal stays 0)
+ const dists: number[][] = Array.from({ length: n }, () => new Array(n).fill(0));
+ for (let i = 0; i < n; i++) {
+ for (let j = i + 1; j < n; j++) {
+ const d = this._dist(X[i] ?? new Float64Array(0), X[j] ?? new Float64Array(0));
+ dists[i]![j] = d;
+ dists[j]![i] = d;
+ }
+ }
+
+ // Core distance = minSamples-th smallest entry of the row (the self distance 0 is included)
+ const coreDists = new Float64Array(n).fill(Number.POSITIVE_INFINITY);
+ for (let i = 0; i < n; i++) {
+ const row = dists[i]!.slice().sort((a, b) => a - b); // sort a copy so the matrix row stays index-aligned
+ const kDist = row[this.minSamples - 1] ?? Number.POSITIVE_INFINITY; // +Infinity when fewer than minSamples samples exist
+ if (kDist <= this.maxEps) coreDists[i] = kDist;
+ }
+
+ // OPTICS ordering
+ const reachability = new Float64Array(n).fill(Number.POSITIVE_INFINITY);
+ const processed = new Uint8Array(n);
+ const ordering: number[] = [];
+
+ // Priority queue emulated by re-sorting the seed list before every pop
+ for (let start = 0; start < n; start++) {
+ if (processed[start]) continue;
+
+ const seeds: Array<{ idx: number; dist: number }> = [{ idx: start, dist: 0 }];
+ while (seeds.length > 0) {
+ seeds.sort((a, b) => a.dist - b.dist);
+ const { idx } = seeds.shift()!;
+ if (processed[idx]) continue; // stale duplicate left behind by an earlier reachability improvement
+ processed[idx] = 1;
+ ordering.push(idx);
+
+ if (coreDists[idx] === Number.POSITIVE_INFINITY) continue; // non-core samples do not expand the seed list
+ for (let j = 0; j < n; j++) {
+ if (processed[j]) continue;
+ const d = dists[idx]?.[j] ?? Number.POSITIVE_INFINITY;
+ const newReach = Math.max(coreDists[idx] ?? Number.POSITIVE_INFINITY, d);
+ if (newReach < (reachability[j] ?? Number.POSITIVE_INFINITY)) {
+ reachability[j] = newReach;
+ seeds.push({ idx: j, dist: newReach }); // pushed again rather than updated in place
+ }
+ }
+ }
+ }
+
+ this.reachabilityDistances_ = reachability;
+ this.coreDistances_ = coreDists;
+ this.ordering_ = new Int32Array(ordering);
+
+ // DBSCAN-style cluster extraction along the ordering
+ const eps = this.eps;
+ const labels = new Int32Array(n).fill(-1);
+ let clusterId = -1;
+ for (const idx of ordering) {
+ if ((reachability[idx] ?? Number.POSITIVE_INFINITY) > eps) { // NOTE(review): never true with the default eps of +Infinity, so every label then stays -1
+ if ((coreDists[idx] ?? Number.POSITIVE_INFINITY) <= eps) { // far from its predecessors but itself core: opens a new cluster
+ clusterId++;
+ labels[idx] = clusterId;
+ }
+ } else {
+ labels[idx] = clusterId; // reachable within eps: joins the current cluster
+ }
+ }
+
+ this.labels_ = labels;
+ return this;
+ }
+ }
diff --git a/src/cluster/cluster_ext8.ts b/src/cluster/cluster_ext8.ts
new file mode 100644
index 0000000..07d9c3d
--- /dev/null
+++ b/src/cluster/cluster_ext8.ts
@@ -0,0 +1,346 @@
+/**
+ * Additional clustering algorithms: SelfOrganizingMap, FuzzyCMeans, GaussianMixtureExt
+ * Port of sklearn-compatible clustering extensions
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+ export class SelfOrganizingMap { // rows x cols grid of weight vectors trained by moving the best-matching unit and its grid neighbours toward random samples
+ rows: number;
+ cols: number;
+ nFeatures: number; // length of each weight vector; taken from the options, not inferred from X
+ sigma: number; // initial neighbourhood width
+ learningRate: number; // initial learning rate
+ nIter: number;
+ randomState: number;
+
+ private weights_: Float64Array[] | null = null; // one vector per node in row-major order; null until fit()
+
+ constructor(opts: {
+ rows?: number;
+ cols?: number;
+ nFeatures?: number;
+ sigma?: number;
+ learningRate?: number;
+ nIter?: number;
+ randomState?: number;
+ } = {}) {
+ this.rows = opts.rows ?? 10;
+ this.cols = opts.cols ?? 10;
+ this.nFeatures = opts.nFeatures ?? 2;
+ this.sigma = opts.sigma ?? 1.0;
+ this.learningRate = opts.learningRate ?? 0.5;
+ this.nIter = opts.nIter ?? 1000;
+ this.randomState = opts.randomState ?? 42;
+ }
+
+ private _rng(seed: number): () => number { // linear congruential generator seeded with `seed`
+ let s = seed;
+ return () => {
+ s = (s * 1664525 + 1013904223) & 0xffffffff;
+ return (s >>> 0) / 0xffffffff; // in [0, 1] — the upper bound is inclusive
+ };
+ }
+
+ fit(X: Float64Array[]): this { // trains the map for nIter steps, one randomly drawn sample per step
+ const rng = this._rng(this.randomState);
+ const nNodes = this.rows * this.cols;
+ this.weights_ = Array.from({ length: nNodes }, () => {
+ const w = new Float64Array(this.nFeatures);
+ for (let j = 0; j < this.nFeatures; j++) w[j] = rng() * 2 - 1; // initial weights in [-1, 1]
+ return w;
+ });
+ for (let iter = 0; iter < this.nIter; iter++) {
+ const t = iter / this.nIter;
+ const lr = this.learningRate * Math.exp(-t * 5); // learning rate decays exponentially over training
+ const sig = this.sigma * Math.exp(-t * 5); // neighbourhood width decays at the same rate
+ const xi = X[Math.floor(rng() * X.length)]; // index equals X.length when rng() returns exactly 1
+ if (!xi) continue; // also skips every step when X is empty
+ let bmuIdx = 0;
+ let bmuDist = Number.POSITIVE_INFINITY;
+ for (let k = 0; k < nNodes; k++) { // best-matching unit: node with the smallest squared distance to xi
+ const w = this.weights_[k];
+ if (!w) continue;
+ let d = 0;
+ for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 0)) ** 2;
+ if (d < bmuDist) { bmuDist = d; bmuIdx = k; }
+ }
+ const bmuRow = Math.floor(bmuIdx / this.cols);
+ const bmuCol = bmuIdx % this.cols;
+ for (let k = 0; k < nNodes; k++) {
+ const r = Math.floor(k / this.cols);
+ const c = k % this.cols;
+ const dist2 = (r - bmuRow) ** 2 + (c - bmuCol) ** 2; // squared grid distance to the best-matching unit
+ const h = Math.exp(-dist2 / (2 * sig * sig + 1e-15)); // Gaussian neighbourhood factor; epsilon avoids division by zero
+ const w = this.weights_[k];
+ if (!w) continue;
+ for (let j = 0; j < this.nFeatures; j++) {
+ w[j] = (w[j] ?? 0) + lr * h * ((xi[j] ?? 0) - (w[j] ?? 0));
+ }
+ }
+ }
+ return this;
+ }
+
+ transform(X: Float64Array[]): Float64Array[] { // per sample, the Euclidean distance to every node; throws NotFittedError before fit()
+ if (!this.weights_) throw new NotFittedError("SelfOrganizingMap not fitted.");
+ return X.map(xi => {
+ const result = new Float64Array(this.weights_!.length);
+ for (let k = 0; k < this.weights_!.length; k++) {
+ const w = this.weights_![k];
+ let d = 0;
+ if (w) for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 0)) ** 2;
+ result[k] = Math.sqrt(d);
+ }
+ return result;
+ });
+ }
+
+ predict(X: Float64Array[]): Int32Array { // per sample, the index of its best-matching node; throws NotFittedError before fit()
+ if (!this.weights_) throw new NotFittedError("SelfOrganizingMap not fitted.");
+ const labels = new Int32Array(X.length);
+ for (let i = 0; i < X.length; i++) {
+ const xi = X[i];
+ if (!xi) continue; // a missing row keeps the default label 0
+ let bmu = 0;
+ let bmuDist = Number.POSITIVE_INFINITY;
+ for (let k = 0; k < this.weights_!.length; k++) {
+ const w = this.weights_![k];
+ let d = 0;
+ if (w) for (let j = 0; j < this.nFeatures; j++) d += ((xi[j] ?? 0) - (w[j] ?? 0)) ** 2;
+ if (d < bmuDist) { bmuDist = d; bmu = k; }
+ }
+ labels[i] = bmu;
+ }
+ return labels;
+ }
+ }
+
+ export class FuzzyCMeans { // fuzzy c-means: soft memberships u_ plus hard labels_ taken from the largest membership
+   nClusters: number;
+   m: number;
+   maxIter: number;
+   tol: number;
+   randomState: number;
+
+   clusterCenters_: Float64Array[] | null = null;
+   u_: Float64Array[] | null = null;
+   labels_: Int32Array | null = null;
+
+   constructor(opts: {
+     nClusters?: number;
+     m?: number;
+     maxIter?: number;
+     tol?: number;
+     randomState?: number;
+   } = {}) {
+     this.nClusters = opts.nClusters ?? 3;
+     this.m = opts.m ?? 2.0;
+     this.maxIter = opts.maxIter ?? 150;
+     this.tol = opts.tol ?? 1e-4;
+     this.randomState = opts.randomState ?? 42;
+   }
+
+   private _rng(seed: number): () => number {
+     let state = seed;
+     return () => {
+       state = (state * 1664525 + 1013904223) & 0xffffffff;
+       return (state >>> 0) / 0xffffffff;
+     };
+   }
+
+   fit(X: Float64Array[]): this {
+     const sampleCount = X.length;
+     const dim = X[0]?.length ?? 0;
+     const clusters = this.nClusters;
+     const draw = this._rng(this.randomState);
+     let membership: Float64Array[] = Array.from({ length: sampleCount }, () => {
+       const weights = new Float64Array(clusters);
+       let total = 0;
+       for (let k = 0; k < clusters; k++) { weights[k] = draw(); total += weights[k] ?? 0; }
+       for (let k = 0; k < clusters; k++) weights[k] = (weights[k] ?? 0) / (total + 1e-15);
+       return weights;
+     });
+
+     for (let round = 0; round < this.maxIter; round++) {
+       const centers: Float64Array[] = Array.from({ length: clusters }, () => new Float64Array(dim));
+       for (let k = 0; k < clusters; k++) {
+         let weightSum = 0;
+         for (let i = 0; i < sampleCount; i++) {
+           const w = Math.pow(membership[i]![k] ?? 0, this.m);
+           weightSum += w;
+           const row = X[i];
+           if (!row) continue;
+           for (let f = 0; f < dim; f++) centers[k]![f] = (centers[k]![f] ?? 0) + w * (row[f] ?? 0);
+         }
+         for (let f = 0; f < dim; f++) centers[k]![f] = (centers[k]![f] ?? 0) / (weightSum + 1e-15);
+       }
+       const updated: Float64Array[] = Array.from({ length: sampleCount }, () => new Float64Array(clusters));
+       for (let i = 0; i < sampleCount; i++) {
+         const row = X[i];
+         const toCenter = new Float64Array(clusters);
+         for (let k = 0; k < clusters; k++) {
+           let sq = 0;
+           const center = centers[k];
+           if (row && center) for (let f = 0; f < dim; f++) sq += ((row[f] ?? 0) - (center[f] ?? 0)) ** 2;
+           toCenter[k] = Math.sqrt(sq) + 1e-15;
+         }
+         for (let k = 0; k < clusters; k++) {
+           let ratioSum = 0;
+           const own = toCenter[k] ?? 1;
+           for (let l = 0; l < clusters; l++) ratioSum += Math.pow(own / ((toCenter[l] ?? 1) + 1e-15), 2 / (this.m - 1 + 1e-15));
+           updated[i]![k] = 1 / (ratioSum + 1e-15);
+         }
+       }
+       let shift = 0; // largest absolute membership change in this round
+       for (let i = 0; i < sampleCount; i++) for (let k = 0; k < clusters; k++) shift = Math.max(shift, Math.abs((updated[i]![k] ?? 0) - (membership[i]![k] ?? 0)));
+       membership = updated;
+       if (shift < this.tol) break;
+     }
+     this.u_ = membership;
+     const finalCenters: Float64Array[] = Array.from({ length: clusters }, () => new Float64Array(dim));
+     this.clusterCenters_ = finalCenters;
+     for (let k = 0; k < clusters; k++) {
+       let weightSum = 0;
+       for (let i = 0; i < sampleCount; i++) {
+         const w = Math.pow(membership[i]![k] ?? 0, this.m);
+         weightSum += w;
+         const row = X[i];
+         if (!row) continue;
+         for (let f = 0; f < dim; f++) finalCenters[k]![f] = (finalCenters[k]![f] ?? 0) + w * (row[f] ?? 0);
+       }
+       for (let f = 0; f < dim; f++) finalCenters[k]![f] = (finalCenters[k]![f] ?? 0) / (weightSum + 1e-15);
+     }
+     this.labels_ = new Int32Array(sampleCount);
+     for (let i = 0; i < sampleCount; i++) {
+       let bestCluster = 0;
+       let bestWeight = -1;
+       for (let k = 0; k < clusters; k++) {
+         if ((membership[i]![k] ?? 0) > bestWeight) { bestWeight = membership[i]![k] ?? 0; bestCluster = k; }
+       }
+       this.labels_[i] = bestCluster;
+     }
+     return this;
+   }
+
+   predict(X: Float64Array[]): Int32Array {
+     if (!this.clusterCenters_) throw new NotFittedError("FuzzyCMeans not fitted.");
+     const assignment = new Int32Array(X.length);
+     for (let i = 0; i < X.length; i++) {
+       const row = X[i];
+       let bestCluster = 0;
+       let bestSq = Number.POSITIVE_INFINITY;
+       for (let k = 0; k < this.clusterCenters_.length; k++) {
+         const center = this.clusterCenters_[k];
+         let sq = 0;
+         if (row && center) for (let f = 0; f < center.length; f++) sq += ((row[f] ?? 0) - (center[f] ?? 0)) ** 2;
+         if (sq < bestSq) { bestSq = sq; bestCluster = k; }
+       }
+       assignment[i] = bestCluster;
+     }
+     return assignment;
+   }
+ }
+
+ export class GaussianMixtureExt { // Gaussian mixture fitted with EM; only the diagonal of each covariance matrix is used
+   nComponents: number;
+   maxIter: number;
+   tol: number; // NOTE(review): stored but never read — fit() performs no convergence test
+   randomState: number;
+
+   means_: Float64Array[] | null = null;
+   covs_: Float64Array[][] | null = null; // per component a p x p matrix; off-diagonal entries stay 0
+   weights_: Float64Array | null = null;
+
+   constructor(opts: { nComponents?: number; maxIter?: number; tol?: number; randomState?: number } = {}) {
+     this.nComponents = opts.nComponents ?? 3;
+     this.maxIter = opts.maxIter ?? 100;
+     this.tol = opts.tol ?? 1e-3;
+     this.randomState = opts.randomState ?? 0;
+   }
+
+   private _gaussPdf(x: Float64Array, mu: Float64Array, cov: Float64Array[]): number { // density of x under N(mu, diag(cov)), with small epsilons against division by zero
+     const p = x.length;
+     let det = 1;
+     for (let j = 0; j < p; j++) det *= cov[j]![j] ?? 1; // determinant of a diagonal matrix = product of its diagonal
+     const norm = Math.pow(2 * Math.PI, p / 2) * Math.sqrt(Math.abs(det) + 1e-15);
+     let exp = 0;
+     for (let j = 0; j < p; j++) {
+       const diff = (x[j] ?? 0) - (mu[j] ?? 0);
+       exp += diff * diff / ((cov[j]![j] ?? 1) + 1e-15);
+     }
+     return Math.exp(-0.5 * exp) / (norm + 1e-15);
+   }
+
+   fit(X: Float64Array[]): this { // runs maxIter EM steps from random means in [-1, 1], identity covariances and uniform weights
+     const n = X.length;
+     const p = X[0]?.length ?? 1;
+     const c = this.nComponents;
+     let rngState = this.randomState;
+     const rng = () => { rngState = (rngState * 1664525 + 1013904223) & 0xffffffff; return (rngState >>> 0) / 0xffffffff; };
+
+     this.means_ = Array.from({ length: c }, () => {
+       const m = new Float64Array(p);
+       for (let j = 0; j < p; j++) m[j] = rng() * 2 - 1;
+       return m;
+     });
+     this.covs_ = Array.from({ length: c }, () => Array.from({ length: p }, (_, j) => { const r = new Float64Array(p); r[j] = 1; return r; })); // identity: row j carries its 1 on the diagonal (was always column 0)
+     this.weights_ = new Float64Array(c).fill(1 / c);
+
+     for (let iter = 0; iter < this.maxIter; iter++) {
+       const resp = Array.from({ length: n }, () => new Float64Array(c)); // E-step: responsibility of component k for sample i
+       for (let i = 0; i < n; i++) {
+         let total = 0;
+         for (let k = 0; k < c; k++) {
+           const r = (this.weights_![k] ?? 0) * this._gaussPdf(X[i]!, this.means_![k]!, this.covs_![k]!);
+           resp[i]![k] = r;
+           total += r;
+         }
+         for (let k = 0; k < c; k++) resp[i]![k] = (resp[i]![k] ?? 0) / (total + 1e-15);
+       }
+       const Nk = new Float64Array(c); // effective number of samples per component
+       for (let i = 0; i < n; i++) for (let k = 0; k < c; k++) Nk[k] = (Nk[k] ?? 0) + (resp[i]![k] ?? 0);
+       for (let k = 0; k < c; k++) { // M-step: responsibility-weighted mean, diagonal variance and weight
+         const nk = Nk[k] ?? 1;
+         const mu = new Float64Array(p);
+         for (let i = 0; i < n; i++) {
+           const rik = resp[i]![k] ?? 0;
+           const xi = X[i];
+           if (!xi) continue;
+           for (let j = 0; j < p; j++) mu[j] = (mu[j] ?? 0) + rik * (xi[j] ?? 0);
+         }
+         for (let j = 0; j < p; j++) mu[j] = (mu[j] ?? 0) / (nk + 1e-15);
+         this.means_![k] = mu;
+         const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
+         for (let i = 0; i < n; i++) {
+           const rik = resp[i]![k] ?? 0;
+           const xi = X[i];
+           if (!xi) continue;
+           for (let j = 0; j < p; j++) {
+             cov[j]![j] = (cov[j]![j] ?? 0) + rik * ((xi[j] ?? 0) - (mu[j] ?? 0)) ** 2;
+           }
+         }
+         for (let j = 0; j < p; j++) cov[j]![j] = (cov[j]![j] ?? 0) / (nk + 1e-15) + 1e-6; // 1e-6 floor keeps variances positive
+         this.covs_![k] = cov;
+         this.weights_![k] = nk / n;
+       }
+       // No early exit: all maxIter EM steps always run.
+     }
+     return this;
+   }
+
+   predict(X: Float64Array[]): Int32Array { // per sample, the component with the largest log(weight) + log(density); throws NotFittedError before fit()
+     if (!this.means_) throw new NotFittedError("GaussianMixtureExt not fitted.");
+     const labels = new Int32Array(X.length);
+     for (let i = 0; i < X.length; i++) {
+       let best = 0;
+       let bestScore = -Number.POSITIVE_INFINITY;
+       for (let k = 0; k < this.nComponents; k++) {
+         const score = Math.log((this.weights_![k] ?? 0) + 1e-15) + Math.log(this._gaussPdf(X[i]!, this.means_![k]!, this.covs_![k]!) + 1e-15);
+         if (score > bestScore) { bestScore = score; best = k; }
+       }
+       labels[i] = best;
+     }
+     return labels;
+   }
+ }
diff --git a/src/cluster/cluster_ext9.ts b/src/cluster/cluster_ext9.ts
new file mode 100644
index 0000000..b0782c3
--- /dev/null
+++ b/src/cluster/cluster_ext9.ts
@@ -0,0 +1,185 @@
+/**
+ * Cluster extensions: BIRCH algorithm utilities.
+ * Port of sklearn.cluster.birch extensions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+ /** Clustering Feature (CF) summary used by the BIRCH helpers in this file: a point count plus linear and squared sums. */
+ interface CFEntry {
+ n: number; // number of points absorbed so far
+ ls: Float64Array; // linear sum: per-dimension sum of the absorbed points
+ ss: number; // squared sum: sum of squared coordinates over all absorbed points
+ }
+
+ function newCFEntry(dim: number): CFEntry { // fresh clustering feature for dim-dimensional points: no points yet, zeroed sums
+   const ls = new Float64Array(dim); return { n: 0, ls, ss: 0 };
+ }
+
+ function addToCF(cf: CFEntry, x: Float64Array): void { // absorb point x into the clustering feature, in place
+   cf.n += 1;
+   cf.ls.forEach((sum, j) => { cf.ls[j] = sum + (x[j] ?? 0); });
+   for (const value of x) cf.ss += value * value;
+ }
+
+ function cfCentroid(cf: CFEntry): Float64Array { // mean of the absorbed points; all zeros for an empty feature
+   const { n: count, ls: linearSum } = cf;
+   if (count === 0) return new Float64Array(linearSum.length);
+   return linearSum.map((sum) => sum / count);
+ }
+
+ function cfRadius(cf: CFEntry): number { // root-mean-square distance of the absorbed points from their centroid
+   if (cf.n === 0) return 0;
+   const meanSquare = cf.ss / cf.n;
+   let centroidNormSq = 0;
+   // Squared norm of the centroid, taken straight from the linear sum.
+   for (const sum of cf.ls) centroidNormSq += (sum / cf.n) * (sum / cf.n);
+   return Math.sqrt(Math.max(0, meanSquare - centroidNormSq)); // clamp tiny negative rounding error
+ }
+
+ function euclidean(a: Float64Array, b: Float64Array): number { // Euclidean distance over the coordinates of `a`
+   let sumSq = 0;
+   a.forEach((value, idx) => {
+     const delta = value - (b[idx] ?? 0); // coordinates missing from `b` count as 0
+     sumSq += delta * delta;
+   });
+   return Math.sqrt(sumSq);
+ }
+
+ /** Simplified BIRCH: a flat list of CF subclusters grown under a radius threshold, then grouped into at most nClusters clusters (nClusters: null keeps one cluster per subcluster). */
+ export class BirchSimple {
+   private subclusterCentroids_: Float64Array[] | null = null;
+   private labels_: Int32Array | null = null;
+   readonly threshold: number; // largest radius a subcluster may have after absorbing a point
+   readonly branchingFactor: number; // NOTE(review): stored but not used by this flat implementation
+   readonly nClusters: number | null; // final cluster count; null skips the final grouping
+
+   constructor(
+     options: {
+       threshold?: number;
+       branchingFactor?: number;
+       nClusters?: number | null;
+     } = {},
+   ) {
+     this.threshold = options.threshold ?? 0.5;
+     this.branchingFactor = options.branchingFactor ?? 50;
+     this.nClusters = options.nClusters === undefined ? 3 : options.nClusters; // `??` would turn an explicit null into 3
+   }
+
+   fit(X: Float64Array[]): this {
+     const nFeatures = X[0]?.length ?? 0;
+     const subclusters: CFEntry[] = [];
+
+     for (const x of X) {
+       if (subclusters.length === 0) {
+         const cf = newCFEntry(nFeatures);
+         addToCF(cf, x);
+         subclusters.push(cf);
+         continue;
+       }
+       // Find closest subcluster
+       let bestIdx = 0;
+       let bestDist = Number.POSITIVE_INFINITY;
+       for (let k = 0; k < subclusters.length; k++) {
+         const d = euclidean(cfCentroid(subclusters[k]!), x);
+         if (d < bestDist) {
+           bestDist = d;
+           bestIdx = k;
+         }
+       }
+       // Trial-merge into a copy: absorb x only if the merged radius stays within the threshold
+       const cf = subclusters[bestIdx]!;
+       const testCF = newCFEntry(nFeatures);
+       Object.assign(testCF, { n: cf.n, ls: new Float64Array(cf.ls), ss: cf.ss });
+       addToCF(testCF, x);
+       if (cfRadius(testCF) <= this.threshold) {
+         addToCF(cf, x);
+       } else {
+         const newCF = newCFEntry(nFeatures);
+         addToCF(newCF, x);
+         subclusters.push(newCF);
+       }
+     }
+
+     this.subclusterCentroids_ = subclusters.map((cf) => cfCentroid(cf));
+
+     // Assign labels via final clustering of subclusters
+     const nTarget = Math.min(this.nClusters ?? subclusters.length, subclusters.length);
+     const clusterLabels = kMeansLabels(this.subclusterCentroids_, nTarget);
+
+     this.labels_ = new Int32Array(X.length);
+     for (let i = 0; i < X.length; i++) {
+       let bestK = 0;
+       let bestD = Number.POSITIVE_INFINITY;
+       for (let k = 0; k < (this.subclusterCentroids_?.length ?? 0); k++) {
+         const d = euclidean(X[i]!, this.subclusterCentroids_![k]!);
+         if (d < bestD) {
+           bestD = d;
+           bestK = k;
+         }
+       }
+       this.labels_[i] = clusterLabels[bestK] ?? 0; // label of the nearest subcluster
+     }
+     return this;
+   }
+
+   predict(X: Float64Array[]): Int32Array { // label of the nearest fitted subcluster for each row; throws NotFittedError before fit()
+     if (this.subclusterCentroids_ === null) throw new NotFittedError("BirchSimple is not fitted.");
+     const nTarget = Math.min(this.nClusters ?? this.subclusterCentroids_.length, this.subclusterCentroids_.length);
+     const clusterLabels = kMeansLabels(this.subclusterCentroids_, nTarget); // recomputed per call from the same centroids as in fit()
+     return new Int32Array(
+       X.map((x) => {
+         let bestK = 0;
+         let bestD = Number.POSITIVE_INFINITY;
+         for (let k = 0; k < (this.subclusterCentroids_?.length ?? 0); k++) {
+           const d = euclidean(x, this.subclusterCentroids_![k]!);
+           if (d < bestD) {
+             bestD = d;
+             bestK = k;
+           }
+         }
+         return clusterLabels[bestK] ?? 0;
+       }),
+     );
+   }
+
+   get labels(): Int32Array { // training labels from the last fit(); throws NotFittedError before fit()
+     if (this.labels_ === null) throw new NotFittedError("BirchSimple is not fitted.");
+     return this.labels_;
+   }
+ }
+
+ function kMeansLabels(X: Float64Array[], k: number): Int32Array { // label each row with its nearest of k centroids after 10 k-means iterations
+   if (X.length <= k) return Int32Array.from({ length: X.length }, (_, row) => row);
+   const centers = X.slice(0, k).map((row) => Float64Array.from(row));
+   const assignment = new Int32Array(X.length);
+   const dim = X[0]?.length ?? 0;
+   for (let pass = 0; pass < 10; pass++) {
+     // Assignment step: nearest centre wins, ties keep the lowest index.
+     for (let row = 0; row < X.length; row++) {
+       let winner = 0;
+       let winnerDist = Number.POSITIVE_INFINITY;
+       for (let c = 0; c < k; c++) {
+         const dist = euclidean(X[row]!, centers[c]!);
+         if (!(dist < winnerDist)) continue;
+         winnerDist = dist;
+         winner = c;
+       }
+       assignment[row] = winner;
+     }
+     // Update step: each centre moves to the mean of the rows assigned to it.
+     const sums = Array.from({ length: k }, () => new Float64Array(dim));
+     const sizes = new Int32Array(k);
+     for (let row = 0; row < X.length; row++) {
+       const c = assignment[row] ?? 0;
+       sizes[c] = sizes[c]! + 1;
+       for (let f = 0; f < dim; f++) sums[c]![f] = sums[c]![f]! + (X[row]?.[f] ?? 0);
+     }
+     sums.forEach((sum, c) => {
+       const size = sizes[c] ?? 0;
+       if (size <= 0) return; // an empty cluster keeps its previous centre
+       centers[c] = sum.map((v) => v / size);
+     });
+   }
+   return assignment;
+ }
diff --git a/src/cluster/cluster_validation.ts b/src/cluster/cluster_validation.ts
new file mode 100644
index 0000000..96fe8ed
--- /dev/null
+++ b/src/cluster/cluster_validation.ts
@@ -0,0 +1,268 @@
+/**
+ * Cluster validation utilities: elbow method, gap statistic, Davies-Bouldin.
+ * Extends sklearn.cluster with additional validation tools.
+ */
+
+import type { KMeans } from "./kmeans.js";
+
+/**
+ * Elbow method: run KMeans for multiple k values and find the elbow.
+ */
+export interface ElbowResult {
+ kValues: number[];
+ inertias: number[];
+ optimalK: number;
+}
+
+ /**
+  * Elbow method: fit one model per candidate `k` and pick the candidate where the
+  * inertia curve bends the most.
+  *
+  * The bend is measured by the discrete second difference
+  * `inertia[i-1] - 2 * inertia[i] + inertia[i+1]`, evaluated on interior
+  * candidates only, so at least three candidates are needed for a real choice.
+  *
+  * @param X - Samples, one row per observation.
+  * @param kRange - Candidate cluster counts, evaluated in the given order.
+  * @param KMeansClass - Constructor of a model exposing `fit` and `inertia_`.
+  * @param randomState - Seed forwarded to every model instance.
+  * @returns The candidates, their inertias and the selected `optimalK`
+  *   (first candidate, or 2 for an empty range, when no interior point exists).
+  */
+ export function elbowMethod(
+   X: Float64Array[],
+   kRange: number[] = [2, 3, 4, 5, 6, 7, 8, 9, 10],
+   KMeansClass: new (opts: { nClusters: number; randomState?: number }) => {
+     fit(X: Float64Array[]): unknown;
+     inertia_: number;
+   },
+   randomState?: number
+ ): ElbowResult {
+   const inertias = kRange.map((k) => {
+     const model = new KMeansClass({ nClusters: k, randomState });
+     model.fit(X);
+     return model.inertia_;
+   });
+
+   let optimalK = kRange[0] ?? 2;
+   let sharpest = Number.NEGATIVE_INFINITY;
+   // Interior points only; the loop is empty for fewer than three candidates.
+   for (let mid = 1; mid + 1 < inertias.length; mid++) {
+     const dropBefore = (inertias[mid - 1] ?? 0) - (inertias[mid] ?? 0);
+     const dropAfter = (inertias[mid] ?? 0) - (inertias[mid + 1] ?? 0);
+     const curvature = dropBefore - dropAfter;
+     if (curvature > sharpest) {
+       sharpest = curvature;
+       optimalK = kRange[mid] ?? 2;
+     }
+   }
+
+   return { kValues: kRange, inertias, optimalK };
+ }
+
+/**
+ * Gap statistic: compare inertia to reference (uniform) distribution.
+ */
+export interface GapStatisticResult {
+ kValues: number[];
+ gaps: number[];
+ sks: number[];
+ optimalK: number;
+}
+
+ /**
+  * Gap statistic: compare the log-inertia of the data with the log-inertia of
+  * uniform reference data drawn from the bounding box of `X`.
+  *
+  * For each candidate `k` a model is fitted on `X` and on `nRefs` reference sets.
+  * The gap is `mean(log W_ref) - log W`, and `s_k` is the population standard
+  * deviation of the reference log-inertias times `sqrt(1 + 1/nRefs)`.
+  *
+  * Selection: the first candidate with `gap(k) >= gap(next k) - s(next k)`.
+  * When no candidate satisfies the rule (the gap keeps rising across the whole
+  * range) the candidate with the largest gap is reported instead of silently
+  * returning the first one.
+  *
+  * @param X - Samples, one row per observation.
+  * @param kRange - Candidate cluster counts, evaluated in the given order.
+  * @param KMeansClass - Constructor of a model exposing `fit` and `inertia_`.
+  * @param nRefs - Number of reference data sets per candidate.
+  * @param randomState - Seed for the reference generator and the model fitted on `X`.
+  */
+ export function gapStatistic(
+   X: Float64Array[],
+   kRange: number[] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+   KMeansClass: new (opts: { nClusters: number; randomState?: number }) => {
+     fit(X: Float64Array[]): unknown;
+     inertia_: number;
+   },
+   nRefs = 10,
+   randomState = 42
+ ): GapStatisticResult {
+   const nSamples = X.length;
+   const nFeatures = X[0]?.length ?? 0;
+
+   // Compute bounding box of data
+   const mins = new Float64Array(nFeatures);
+   const maxs = new Float64Array(nFeatures);
+   mins.fill(Infinity);
+   maxs.fill(-Infinity);
+   for (const row of X) {
+     for (let j = 0; j < nFeatures; j++) {
+       const v = row[j] ?? 0;
+       if (v < (mins[j] ?? Infinity)) mins[j] = v;
+       if (v > (maxs[j] ?? -Infinity)) maxs[j] = v;
+     }
+   }
+
+   // Seeded simple LCG RNG; one stream is shared by all reference sets.
+   let seed = randomState;
+   function randFloat(): number {
+     seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+     return (seed >>> 0) / 0xffffffff;
+   }
+
+   const gaps: number[] = [];
+   const sks: number[] = [];
+
+   for (const k of kRange) {
+     const km = new KMeansClass({ nClusters: k, randomState });
+     km.fit(X);
+     // 1e-10 keeps the logarithm finite for a perfect (zero-inertia) fit.
+     const logW = Math.log(km.inertia_ + 1e-10);
+
+     // Reference distribution: uniform samples inside the bounding box.
+     const refLogWs: number[] = [];
+     for (let r = 0; r < nRefs; r++) {
+       const Xref: Float64Array[] = [];
+       for (let i = 0; i < nSamples; i++) {
+         const row = new Float64Array(nFeatures);
+         for (let j = 0; j < nFeatures; j++) {
+           row[j] = (mins[j] ?? 0) + randFloat() * ((maxs[j] ?? 1) - (mins[j] ?? 0));
+         }
+         Xref.push(row);
+       }
+       const kmRef = new KMeansClass({ nClusters: k, randomState: r });
+       kmRef.fit(Xref);
+       refLogWs.push(Math.log(kmRef.inertia_ + 1e-10));
+     }
+
+     const meanRefLogW = refLogWs.reduce((s, v) => s + v, 0) / nRefs;
+     const variance = refLogWs.reduce((s, v) => s + (v - meanRefLogW) ** 2, 0) / nRefs;
+     const sd = Math.sqrt(variance);
+     const sk = sd * Math.sqrt(1 + 1 / nRefs);
+
+     gaps.push(meanRefLogW - logW);
+     sks.push(sk);
+   }
+
+   // Optimal k: first k such that gap(k) >= gap(k+1) - s(k+1)
+   let optimalK: number | undefined;
+   for (let i = 0; i < kRange.length - 1; i++) {
+     if ((gaps[i] ?? 0) >= (gaps[i + 1] ?? 0) - (sks[i + 1] ?? 0)) {
+       optimalK = kRange[i] ?? 1;
+       break;
+     }
+   }
+
+   if (optimalK === undefined) {
+     // Rule never satisfied: report the candidate with the largest gap
+     // (earliest on ties). NaN gaps never win, leaving the first candidate.
+     optimalK = kRange[0] ?? 1;
+     let largestGap = Number.NEGATIVE_INFINITY;
+     for (let i = 0; i < gaps.length; i++) {
+       const gap = gaps[i] ?? Number.NaN;
+       if (gap > largestGap) {
+         largestGap = gap;
+         optimalK = kRange[i] ?? 1;
+       }
+     }
+   }
+
+   return { kValues: kRange, gaps, sks, optimalK };
+ }
+
+/**
+ * Davies-Bouldin Index (lower is better).
+ * Complements silhouette score for cluster validation.
+ */
+export function daviesBouldinScore(X: Float64Array[], labels: Int32Array): number {
+ const uniqueLabels = Array.from(new Set(Array.from(labels))).sort((a, b) => a - b);
+ const k = uniqueLabels.length;
+ if (k < 2) return 0;
+
+ const nFeatures = X[0]?.length ?? 0;
+
+ // Compute centroids
+ const centroids: Float64Array[] = [];
+ const counts: number[] = [];
+ const labelToIdx = new Map();
+ uniqueLabels.forEach((l, i) => labelToIdx.set(l, i));
+
+ for (let ci = 0; ci < k; ci++) {
+ centroids.push(new Float64Array(nFeatures));
+ counts.push(0);
+ }
+
+ for (let i = 0; i < X.length; i++) {
+ const ci = labelToIdx.get(labels[i] ?? 0) ?? 0;
+ counts[ci] = (counts[ci] ?? 0) + 1;
+ for (let j = 0; j < nFeatures; j++) {
+ centroids[ci]![j] = (centroids[ci]![j] ?? 0) + (X[i]?.[j] ?? 0);
+ }
+ }
+ for (let ci = 0; ci < k; ci++) {
+ for (let j = 0; j < nFeatures; j++) {
+ centroids[ci]![j] = (centroids[ci]![j] ?? 0) / (counts[ci] ?? 1);
+ }
+ }
+
+ // Compute scatter (avg distance of cluster points to centroid)
+ const scatter: number[] = new Array(k).fill(0);
+ const memberCounts = new Array(k).fill(0);
+ for (let i = 0; i < X.length; i++) {
+ const ci = labelToIdx.get(labels[i] ?? 0) ?? 0;
+ let dist = 0;
+ for (let j = 0; j < nFeatures; j++) {
+ dist += ((X[i]?.[j] ?? 0) - (centroids[ci]?.[j] ?? 0)) ** 2;
+ }
+ scatter[ci] = (scatter[ci] ?? 0) + Math.sqrt(dist);
+ memberCounts[ci] = (memberCounts[ci] ?? 0) + 1;
+ }
+ for (let ci = 0; ci < k; ci++) {
+ scatter[ci] = (scatter[ci] ?? 0) / (memberCounts[ci] || 1);
+ }
+
+ // Compute Davies-Bouldin index
+ let dbSum = 0;
+ for (let i = 0; i < k; i++) {
+ let maxR = -Infinity;
+ for (let j = 0; j < k; j++) {
+ if (i === j) continue;
+ let distCentroids = 0;
+ for (let f = 0; f < nFeatures; f++) {
+ distCentroids += ((centroids[i]?.[f] ?? 0) - (centroids[j]?.[f] ?? 0)) ** 2;
+ }
+ distCentroids = Math.sqrt(distCentroids);
+ const R = ((scatter[i] ?? 0) + (scatter[j] ?? 0)) / (distCentroids || 1e-10);
+ if (R > maxR) maxR = R;
+ }
+ dbSum += maxR;
+ }
+
+ return dbSum / k;
+}
+
+/**
+ * Calinski-Harabasz Index (higher is better).
+ */
+export function calinskiHarabaszScore(X: Float64Array[], labels: Int32Array): number {
+ const nSamples = X.length;
+ const nFeatures = X[0]?.length ?? 0;
+ const uniqueLabels = Array.from(new Set(Array.from(labels))).sort((a, b) => a - b);
+ const k = uniqueLabels.length;
+ if (k < 2 || nSamples <= k) return 0;
+
+ const labelToIdx = new Map();
+ uniqueLabels.forEach((l, i) => labelToIdx.set(l, i));
+
+ // Global centroid
+ const globalCentroid = new Float64Array(nFeatures);
+ for (const row of X) {
+ for (let j = 0; j < nFeatures; j++) globalCentroid[j] = (globalCentroid[j] ?? 0) + (row[j] ?? 0);
+ }
+ for (let j = 0; j < nFeatures; j++) globalCentroid[j] = (globalCentroid[j] ?? 0) / nSamples;
+
+ // Cluster centroids and counts
+ const centroids = Array.from({ length: k }, () => new Float64Array(nFeatures));
+ const counts = new Array(k).fill(0);
+ for (let i = 0; i < nSamples; i++) {
+ const ci = labelToIdx.get(labels[i] ?? 0) ?? 0;
+ counts[ci] = (counts[ci] ?? 0) + 1;
+ for (let j = 0; j < nFeatures; j++) {
+ centroids[ci]![j] = (centroids[ci]![j] ?? 0) + (X[i]?.[j] ?? 0);
+ }
+ }
+ for (let ci = 0; ci < k; ci++) {
+ for (let j = 0; j < nFeatures; j++) {
+ centroids[ci]![j] = (centroids[ci]![j] ?? 0) / (counts[ci] ?? 1);
+ }
+ }
+
+ // Between-cluster scatter (BGSS)
+ let bgss = 0;
+ for (let ci = 0; ci < k; ci++) {
+ let d = 0;
+ for (let j = 0; j < nFeatures; j++) {
+ d += ((centroids[ci]?.[j] ?? 0) - (globalCentroid[j] ?? 0)) ** 2;
+ }
+ bgss += (counts[ci] ?? 0) * d;
+ }
+
+ // Within-cluster scatter (WGSS)
+ let wgss = 0;
+ for (let i = 0; i < nSamples; i++) {
+ const ci = labelToIdx.get(labels[i] ?? 0) ?? 0;
+ let d = 0;
+ for (let j = 0; j < nFeatures; j++) {
+ d += ((X[i]?.[j] ?? 0) - (centroids[ci]?.[j] ?? 0)) ** 2;
+ }
+ wgss += d;
+ }
+
+ return (bgss / (k - 1)) / ((wgss / (nSamples - k)) || 1e-10);
+}
diff --git a/src/cluster/clustering_utils.ts b/src/cluster/clustering_utils.ts
new file mode 100644
index 0000000..2b8ef2e
--- /dev/null
+++ b/src/cluster/clustering_utils.ts
@@ -0,0 +1,295 @@
+/**
+ * Cluster utility functions.
+ * Mirrors sklearn.cluster._mean_shift and related utilities.
+ */
+
+/**
+ * Estimate the bandwidth for Mean Shift algorithm.
+ * Uses a ball-tree-like approach: for each sample, counts how many
+ * samples are within the estimated bandwidth.
+ *
+ * @param X - Input data (n_samples x n_features)
+ * @param quantile - Quantile of pairwise distances to use as bandwidth (default 0.3)
+ * @param nSamples - Number of samples to use for estimation (default: all)
+ * @param seed - Random seed for subsampling
+ */
+export function estimateBandwidth(
+ X: Float64Array[],
+ options: {
+ quantile?: number;
+ nSamples?: number;
+ seed?: number;
+ } = {},
+): number {
+ const { quantile = 0.3, seed = 0 } = options;
+ const n = X.length;
+ let nSamples = options.nSamples ?? n;
+ nSamples = Math.min(nSamples, n);
+
+ // Subsample if needed
+ let indices: number[];
+ if (nSamples < n) {
+ let rng = seed;
+ const rand = () => {
+ rng = (rng * 1664525 + 1013904223) & 0xffffffff;
+ return (rng >>> 0) / 0xffffffff;
+ };
+ indices = Array.from({ length: n }, (_, i) => i);
+ for (let i = n - 1; i > 0; i--) {
+ const j = Math.floor(rand() * (i + 1));
+ const tmp = indices[i]!; indices[i] = indices[j]!; indices[j] = tmp;
+ }
+ indices = indices.slice(0, nSamples);
+ } else {
+ indices = Array.from({ length: n }, (_, i) => i);
+ }
+
+ // Compute pairwise distances between sampled points and all points
+ // Then take the quantile
+ const allDists: number[] = [];
+ for (const idx of indices) {
+ const xi = X[idx]!;
+ for (let j = 0; j < n; j++) {
+ const xj = X[j]!;
+ let d2 = 0;
+ for (let k = 0; k < xi.length; k++) {
+ d2 += ((xi[k] ?? 0) - (xj[k] ?? 0)) ** 2;
+ }
+ allDists.push(Math.sqrt(d2));
+ }
+ }
+
+ allDists.sort((a, b) => a - b);
+ const qIdx = Math.floor(quantile * (allDists.length - 1));
+ return allDists[qIdx] ?? 1.0;
+}
+
+/**
+ * Find initial seed points for Mean Shift.
+ * Seeds are bin centers of a uniform grid at bandwidth resolution.
+ *
+ * @param X - Input data
+ * @param bandwidth - Bin size
+ * @param minBinFreq - Minimum number of points per bin to be included
+ */
+export function getBinSeeds(
+ X: Float64Array[],
+ bandwidth: number,
+ minBinFreq = 1,
+): Float64Array[] {
+ if (bandwidth <= 0) throw new Error("bandwidth must be positive");
+ const n = X.length;
+ const d = X[0]?.length ?? 0;
+
+ // Discretize X into bins
+ const binMap = new Map();
+
+ for (let i = 0; i < n; i++) {
+ const xi = X[i]!;
+ const binCoords: number[] = [];
+ for (let k = 0; k < d; k++) {
+ binCoords.push(Math.round((xi[k] ?? 0) / bandwidth));
+ }
+ const key = binCoords.join(",");
+ const existing = binMap.get(key);
+ if (existing) {
+ for (let k = 0; k < d; k++) {
+ existing.sum[k]! += xi[k] ?? 0;
+ }
+ existing.count++;
+ } else {
+ const sum = new Float64Array(d);
+ for (let k = 0; k < d; k++) sum[k] = xi[k] ?? 0;
+ binMap.set(key, { sum, count: 1 });
+ }
+ }
+
+ // Return bin centers with sufficient frequency
+ const seeds: Float64Array[] = [];
+ for (const { sum, count } of binMap.values()) {
+ if (count >= minBinFreq) {
+ const center = new Float64Array(d);
+ for (let k = 0; k < d; k++) center[k] = (sum[k] ?? 0) / count;
+ seeds.push(center);
+ }
+ }
+
+ return seeds;
+}
+
+/**
+ * Find which bin each point belongs to.
+ * @returns Int32Array of bin indices (one per sample)
+ */
+export function assignBins(
+ X: Float64Array[],
+ seeds: Float64Array[],
+): Int32Array {
+ const n = X.length;
+ const result = new Int32Array(n).fill(-1);
+ for (let i = 0; i < n; i++) {
+ const xi = X[i]!;
+ let bestDist = Number.POSITIVE_INFINITY;
+ let bestJ = -1;
+ for (let j = 0; j < seeds.length; j++) {
+ const seed = seeds[j]!;
+ let d2 = 0;
+ for (let k = 0; k < xi.length; k++) {
+ d2 += ((xi[k] ?? 0) - (seed[k] ?? 0)) ** 2;
+ }
+ if (d2 < bestDist) { bestDist = d2; bestJ = j; }
+ }
+ result[i] = bestJ;
+ }
+ return result;
+}
+
+/**
+ * Single iteration of mean-shift update for a set of seeds.
+ * Updates each seed to the mean of all points within bandwidth distance.
+ *
+ * @returns New seed positions and whether any seed moved more than tol
+ */
+export function meanShiftStep(
+ X: Float64Array[],
+ seeds: Float64Array[],
+ bandwidth: number,
+): { newSeeds: Float64Array[]; converged: boolean } {
+ const d = X[0]?.length ?? 0;
+ const bw2 = bandwidth * bandwidth;
+ const newSeeds: Float64Array[] = [];
+ let maxShift = 0;
+
+ for (const seed of seeds) {
+ const newSeed = new Float64Array(d);
+ let weight = 0;
+ for (const xi of X) {
+ let d2 = 0;
+ for (let k = 0; k < d; k++) {
+ d2 += ((xi[k] ?? 0) - (seed[k] ?? 0)) ** 2;
+ }
+ if (d2 <= bw2) {
+ weight++;
+ for (let k = 0; k < d; k++) newSeed[k]! += xi[k] ?? 0;
+ }
+ }
+ if (weight > 0) {
+ for (let k = 0; k < d; k++) newSeed[k]! /= weight;
+ } else {
+ newSeed.set(seed);
+ }
+
+ // Track max shift
+ let shift2 = 0;
+ for (let k = 0; k < d; k++) {
+ shift2 += ((newSeed[k] ?? 0) - (seed[k] ?? 0)) ** 2;
+ }
+ maxShift = Math.max(maxShift, Math.sqrt(shift2));
+ newSeeds.push(newSeed);
+ }
+
+ return { newSeeds, converged: maxShift < 1e-3 * bandwidth };
+}
+
+/**
+ * Merge nearby seeds by deduplication within bandwidth distance.
+ * Returns unique cluster centers.
+ */
+export function mergeSeeds(
+ seeds: Float64Array[],
+ bandwidth: number,
+): Float64Array[] {
+ const bw2 = bandwidth * bandwidth;
+ const merged: Float64Array[] = [];
+
+ for (const seed of seeds) {
+ let isNew = true;
+ for (const center of merged) {
+ let d2 = 0;
+ for (let k = 0; k < seed.length; k++) {
+ d2 += ((seed[k] ?? 0) - (center[k] ?? 0)) ** 2;
+ }
+ if (d2 <= bw2) { isNew = false; break; }
+ }
+ if (isNew) merged.push(seed);
+ }
+
+ return merged;
+}
+
+/**
+ * Compute cluster labels for X given cluster centers.
+ * Each point is assigned to its nearest center.
+ */
+export function clusterLabels(
+ X: Float64Array[],
+ centers: Float64Array[],
+): Int32Array {
+ const labels = new Int32Array(X.length);
+ for (let i = 0; i < X.length; i++) {
+ const xi = X[i]!;
+ let best = -1;
+ let bestDist = Number.POSITIVE_INFINITY;
+ for (let j = 0; j < centers.length; j++) {
+ const c = centers[j]!;
+ let d2 = 0;
+ for (let k = 0; k < xi.length; k++) {
+ d2 += ((xi[k] ?? 0) - (c[k] ?? 0)) ** 2;
+ }
+ if (d2 < bestDist) { bestDist = d2; best = j; }
+ }
+ labels[i] = best;
+ }
+ return labels;
+}
+
+/**
+ * Compute inertia (within-cluster sum of squared distances to centers).
+ */
+export function computeInertia(
+ X: Float64Array[],
+ centers: Float64Array[],
+ labels: Int32Array,
+): number {
+ let inertia = 0;
+ for (let i = 0; i < X.length; i++) {
+ const xi = X[i]!;
+ const c = centers[labels[i]!]!;
+ let d2 = 0;
+ for (let k = 0; k < xi.length; k++) {
+ d2 += ((xi[k] ?? 0) - (c[k] ?? 0)) ** 2;
+ }
+ inertia += d2;
+ }
+ return inertia;
+}
+
+/**
+ * Compute cluster centers from assignments.
+ */
+export function computeCenters(
+ X: Float64Array[],
+ labels: Int32Array,
+ nClusters: number,
+): Float64Array[] {
+ const d = X[0]?.length ?? 0;
+ const sums: Float64Array[] = Array.from({ length: nClusters }, () => new Float64Array(d));
+ const counts = new Int32Array(nClusters);
+
+ for (let i = 0; i < X.length; i++) {
+ const xi = X[i]!;
+ const lbl = labels[i] ?? 0;
+ if (lbl >= 0 && lbl < nClusters) {
+ counts[lbl]!++;
+ for (let k = 0; k < d; k++) sums[lbl]![k]! += xi[k] ?? 0;
+ }
+ }
+
+ return sums.map((s, j) => {
+ const cnt = counts[j] ?? 1;
+ if (cnt === 0) return s;
+ const c = new Float64Array(d);
+ for (let k = 0; k < d; k++) c[k] = (s[k] ?? 0) / cnt;
+ return c;
+ });
+}
diff --git a/src/cluster/feature_agglomeration.ts b/src/cluster/feature_agglomeration.ts
new file mode 100644
index 0000000..0a0ca57
--- /dev/null
+++ b/src/cluster/feature_agglomeration.ts
@@ -0,0 +1,169 @@
+/**
+ * FeatureAgglomeration — hierarchical clustering applied to features (columns).
+ * Each sample's features are grouped; the representative value (mean/median/max)
+ * of each group becomes the transformed feature.
+ *
+ * Ports: FeatureAgglomeration
+ */
+
+import { BaseEstimator } from "../base.js";
+
+ /** Constructor options for `FeatureAgglomeration`. */
+ export interface FeatureAgglomerationOptions {
+ /** Number of feature groups to produce (the constructor defaults this to 2). */
+ nClusters?: number;
+ /** How the values of one group are reduced to a single number (defaults to "mean"). */
+ poolingFunc?: "mean" | "median" | "max" | "min";
+ /** Requested linkage (defaults to "ward"). NOTE(review): `agglomerateCols` in this file ignores this value — confirm intent. */
+ linkage?: "ward" | "complete" | "average" | "single";
+ }
+
+ /** Mean of column `col` over all rows of `X`; a missing entry counts as 0. */
+ function columnMean(X: Float64Array[], col: number): number {
+   const total = X.reduce((acc, row) => acc + (row[col] ?? 0), 0);
+   return total / X.length;
+ }
+
+ /** Distance between two columns, defined as the absolute difference of their means. */
+ function colDist(X: Float64Array[], a: number, b: number): number {
+   return Math.abs(columnMean(X, a) - columnMean(X, b));
+ }
+
+/**
+ * Agglomerative (bottom-up) clustering on columns using average-column-value distance.
+ * Returns an array mapping each column β cluster index (0-based).
+ */
+function agglomerateCols(
+ X: Float64Array[],
+ nClusters: number,
+ _linkage: string,
+): Int32Array {
+ const nFeatures = X[0]?.length ?? 0;
+ if (nClusters >= nFeatures) {
+ return Int32Array.from({ length: nFeatures }, (_, i) => i);
+ }
+ // Start: each feature is its own cluster
+ const assignments = Int32Array.from({ length: nFeatures }, (_, i) => i);
+ let nActive = nFeatures;
+ // Track which features belong to each cluster
+ const clusters: number[][] = Array.from({ length: nFeatures }, (_, i) => [i]);
+
+ while (nActive > nClusters) {
+ // Find two closest clusters (by mean column distance)
+ let minDist = Number.POSITIVE_INFINITY;
+ let mergeA = -1;
+ let mergeB = -1;
+ const activeIds = [...new Set(Array.from(assignments))].sort((a, b) => a - b);
+ for (let ai = 0; ai < activeIds.length; ai++) {
+ for (let bi = ai + 1; bi < activeIds.length; bi++) {
+ const ca = activeIds[ai] ?? 0;
+ const cb = activeIds[bi] ?? 0;
+ const colsA = clusters[ca] ?? [];
+ const colsB = clusters[cb] ?? [];
+ // average linkage between column groups
+ let d = 0;
+ let count = 0;
+ for (const fa of colsA) {
+ for (const fb of colsB) {
+ d += colDist(X, fa, fb);
+ count++;
+ }
+ }
+ d = count > 0 ? d / count : Number.POSITIVE_INFINITY;
+ if (d < minDist) {
+ minDist = d;
+ mergeA = ca;
+ mergeB = cb;
+ }
+ }
+ }
+ if (mergeA < 0 || mergeB < 0) break;
+ // Merge mergeB into mergeA
+ const colsB = clusters[mergeB] ?? [];
+ for (const col of colsB) {
+ assignments[col] = mergeA;
+ }
+ clusters[mergeA] = [...(clusters[mergeA] ?? []), ...colsB];
+ clusters[mergeB] = [];
+ nActive--;
+ }
+ // Remap cluster IDs to 0..nClusters-1
+ const idMap = new Map();
+ let nextId = 0;
+ for (let i = 0; i < assignments.length; i++) {
+ const a = assignments[i] ?? 0;
+ if (!idMap.has(a)) idMap.set(a, nextId++);
+ assignments[i] = idMap.get(a) ?? 0;
+ }
+ return assignments;
+}
+
+/**
+ * Cluster features using hierarchical clustering and pool each group.
+ */
+export class FeatureAgglomeration extends BaseEstimator {
+ nClusters: number;
+ poolingFunc: "mean" | "median" | "max" | "min";
+ linkage: "ward" | "complete" | "average" | "single";
+
+ labels_!: Int32Array;
+ nClusters_!: number;
+
+ constructor(options: FeatureAgglomerationOptions = {}) {
+ super();
+ this.nClusters = options.nClusters ?? 2;
+ this.poolingFunc = options.poolingFunc ?? "mean";
+ this.linkage = options.linkage ?? "ward";
+ }
+
+ fit(X: Float64Array[]): this {
+ this.labels_ = agglomerateCols(X, this.nClusters, this.linkage);
+ this.nClusters_ = new Set(Array.from(this.labels_)).size;
+ return this;
+ }
+
+ transform(X: Float64Array[]): Float64Array[] {
+ if (this.labels_ === undefined) throw new Error("Not fitted");
+ const k = this.nClusters_;
+ return X.map((row) => {
+ const groups: number[][] = Array.from({ length: k }, () => []);
+ for (let j = 0; j < row.length; j++) {
+ const cid = this.labels_[j] ?? 0;
+ (groups[cid] ?? []).push(row[j] ?? 0);
+ }
+ const out = new Float64Array(k);
+ for (let c = 0; c < k; c++) {
+ const vals = groups[c] ?? [];
+ if (vals.length === 0) { out[c] = 0; continue; }
+ if (this.poolingFunc === "mean") {
+ out[c] = vals.reduce((a, b) => a + b, 0) / vals.length;
+ } else if (this.poolingFunc === "median") {
+ const s = [...vals].sort((a, b) => a - b);
+ const m = Math.floor(s.length / 2);
+ out[c] = s.length % 2 === 0
+ ? ((s[m - 1] ?? 0) + (s[m] ?? 0)) / 2
+ : (s[m] ?? 0);
+ } else if (this.poolingFunc === "max") {
+ out[c] = Math.max(...vals);
+ } else {
+ out[c] = Math.min(...vals);
+ }
+ }
+ return out;
+ });
+ }
+
+ fitTransform(X: Float64Array[]): Float64Array[] {
+ return this.fit(X).transform(X);
+ }
+
+ /** Reconstruct original shape from reduced representation. */
+ inverseTransform(Xred: Float64Array[]): Float64Array[] {
+ if (this.labels_ === undefined) throw new Error("Not fitted");
+ const nFeatures = this.labels_.length;
+ return Xred.map((row) => {
+ const out = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ out[j] = row[this.labels_[j] ?? 0] ?? 0;
+ }
+ return out;
+ });
+ }
+}
diff --git a/src/cluster/hdbscan.ts b/src/cluster/hdbscan.ts
new file mode 100644
index 0000000..2a1f489
--- /dev/null
+++ b/src/cluster/hdbscan.ts
@@ -0,0 +1,189 @@
+/**
+ * HDBSCAN — Hierarchical Density-Based Spatial Clustering of Applications with Noise.
+ * Mirrors sklearn.cluster.HDBSCAN.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+ /** Constructor options for `HDBSCAN`; every field is optional. */
+ export interface HDBSCANOptions {
+ /** Smallest group of points that is reported as a cluster; smaller groups become noise. */
+ minClusterSize?: number;
+ /** Neighbour rank used for the core distance; `null` or omitted selects the class default. */
+ minSamples?: number | null;
+ /** When positive, used directly as the merge threshold on mutual-reachability distance. */
+ clusterSelectionEpsilon?: number;
+ /** NOTE(review): declared here but not read by the `HDBSCAN` class in this file — confirm whether it should be honoured. */
+ maxClusterSize?: number | null;
+ /** Stored on the instance; not consulted by `fit` in this file. */
+ alpha?: number;
+ /** Stored on the instance; not consulted by `fit` in this file. */
+ clusterSelectionMethod?: "eom" | "leaf";
+ /** Stored on the instance; not consulted by `fit` in this file. */
+ allowSingleCluster?: boolean;
+ /** Point-to-point distance used when building the distance matrix. */
+ metric?: "euclidean" | "manhattan" | "chebyshev";
+ }
+
+/**
+ * HDBSCAN clustering algorithm.
+ * Extends DBSCAN by converting it into a hierarchical clustering then using a stability
+ * criterion to extract a flat clustering.
+ */
+export class HDBSCAN {
+ minClusterSize: number;
+ minSamples: number;
+ clusterSelectionEpsilon: number;
+ alpha: number;
+ clusterSelectionMethod: "eom" | "leaf";
+ allowSingleCluster: boolean;
+ metric: "euclidean" | "manhattan" | "chebyshev";
+
+ labels_: Int32Array | null = null;
+ probabilities_: Float64Array | null = null;
+ clusterPersistence_: Float64Array | null = null;
+ nFeatures_: number = 0;
+
+ constructor(options: HDBSCANOptions = {}) {
+ this.minClusterSize = options.minClusterSize ?? 5;
+ this.minSamples = options.minSamples ?? 5;
+ this.clusterSelectionEpsilon = options.clusterSelectionEpsilon ?? 0;
+ this.alpha = options.alpha ?? 1.0;
+ this.clusterSelectionMethod = options.clusterSelectionMethod ?? "eom";
+ this.allowSingleCluster = options.allowSingleCluster ?? false;
+ this.metric = options.metric ?? "euclidean";
+ }
+
+ private _dist(a: Float64Array, b: Float64Array): number {
+ const p = a.length;
+ if (this.metric === "manhattan") {
+ let s = 0;
+ for (let j = 0; j < p; j++) s += Math.abs((a[j] ?? 0) - (b[j] ?? 0));
+ return s;
+ }
+ if (this.metric === "chebyshev") {
+ let s = 0;
+ for (let j = 0; j < p; j++) s = Math.max(s, Math.abs((a[j] ?? 0) - (b[j] ?? 0)));
+ return s;
+ }
+ let s = 0;
+ for (let j = 0; j < p; j++) s += ((a[j] ?? 0) - (b[j] ?? 0)) ** 2;
+ return Math.sqrt(s);
+ }
+
+ fit(X: Float64Array[]): this {
+ const n = X.length;
+ this.nFeatures_ = X[0]?.length ?? 0;
+
+ // Compute pairwise distances
+ const dists: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n));
+ for (let i = 0; i < n; i++) {
+ for (let j = i + 1; j < n; j++) {
+ const d = this._dist(X[i]!, X[j]!);
+ dists[i]![j]! = d;
+ dists[j]![i]! = d;
+ }
+ }
+
+ // Core distances (kth nearest neighbor distance)
+ const k = Math.min(this.minSamples, n - 1);
+ const coreDists = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const sorted = Array.from(dists[i]!).filter((_, j) => j !== i).sort((a, b) => a - b);
+ coreDists[i]! = sorted[k - 1] ?? 0;
+ }
+
+ // Mutual reachability distances
+ const mrd: Float64Array[] = Array.from({ length: n }, () => new Float64Array(n));
+ for (let i = 0; i < n; i++) {
+ for (let j = 0; j < n; j++) {
+ if (i === j) continue;
+ mrd[i]![j]! = Math.max(coreDists[i]!, coreDists[j]!, dists[i]![j]!);
+ }
+ }
+
+ // Build MST (Prim's algorithm)
+ const inMST = new Uint8Array(n);
+ const minEdge = new Float64Array(n).fill(Number.POSITIVE_INFINITY);
+ const parent = new Int32Array(n).fill(-1);
+ minEdge[0]! = 0;
+
+ const edges: Array<[number, number, number]> = [];
+ for (let step = 0; step < n; step++) {
+ let u = -1;
+ for (let i = 0; i < n; i++) {
+ if (!inMST[i] && (u < 0 || (minEdge[i] ?? 0) < (minEdge[u] ?? 0))) u = i;
+ }
+ if (u < 0) break;
+ inMST[u]! = 1;
+ if (parent[u]! >= 0) edges.push([parent[u]!, u, mrd[parent[u]!]![u]!]);
+ for (let v = 0; v < n; v++) {
+ if (!inMST[v] && (mrd[u]![v]! < (minEdge[v] ?? Number.POSITIVE_INFINITY))) {
+ minEdge[v]! = mrd[u]![v]!;
+ parent[v]! = u;
+ }
+ }
+ }
+
+ // Sort MST edges by weight
+ edges.sort((a, b) => (a[2] ?? 0) - (b[2] ?? 0));
+
+ // Build hierarchy via single-linkage (union-find)
+ const uf = Array.from({ length: n }, (_, i) => i);
+ const find = (x: number): number => {
+ while (uf[x] !== x) {
+ uf[x]! = uf[uf[x]!]!;
+ x = uf[x]!;
+ }
+ return x;
+ };
+ const clusterSizes = new Int32Array(n).fill(1);
+ const labels = new Int32Array(n).fill(-1);
+
+ // Simplified flat clustering: use density-based approach
+ // Group points where edge weight <= threshold
+ const threshold = this.clusterSelectionEpsilon > 0
+ ? this.clusterSelectionEpsilon
+ : (edges[Math.floor(edges.length * 0.5)]?.[2] ?? 0);
+
+ for (const [u, v, w] of edges) {
+ if (w <= threshold) {
+ const pu = find(u);
+ const pv = find(v);
+ if (pu !== pv) {
+ const newSize = (clusterSizes[pu] ?? 1) + (clusterSizes[pv] ?? 1);
+ if ((clusterSizes[pu] ?? 1) >= (clusterSizes[pv] ?? 1)) {
+ uf[pv]! = pu;
+ clusterSizes[pu]! = newSize;
+ } else {
+ uf[pu]! = pv;
+ clusterSizes[pv]! = newSize;
+ }
+ }
+ }
+ }
+
+ // Assign cluster labels
+ const rootToCluster = new Map();
+ let nextCluster = 0;
+ for (let i = 0; i < n; i++) {
+ const root = find(i);
+ const sz = clusterSizes[root] ?? 1;
+ if (sz >= this.minClusterSize) {
+ if (!rootToCluster.has(root)) rootToCluster.set(root, nextCluster++);
+ labels[i]! = rootToCluster.get(root)!;
+ }
+ }
+
+ this.labels_ = labels;
+ this.probabilities_ = new Float64Array(n).fill(1.0);
+ // Mark noise points
+ for (let i = 0; i < n; i++) {
+ if (labels[i] === -1) this.probabilities_[i]! = 0;
+ }
+ this.clusterPersistence_ = new Float64Array(nextCluster).fill(1.0);
+ return this;
+ }
+
+ fitPredict(X: Float64Array[]): Int32Array {
+ this.fit(X);
+ if (!this.labels_) throw new NotFittedError("HDBSCAN is not fitted");
+ return this.labels_;
+ }
+
+ get nClusters_(): number {
+ if (!this.labels_) return 0;
+ return Math.max(...Array.from(this.labels_)) + 1;
+ }
+}
diff --git a/src/cluster/hierarchical.ts b/src/cluster/hierarchical.ts
new file mode 100644
index 0000000..f5304d0
--- /dev/null
+++ b/src/cluster/hierarchical.ts
@@ -0,0 +1,208 @@
+/**
+ * Hierarchical clustering utilities — analogous to scipy.cluster.hierarchy and
+ * sklearn's internal _agglomerative_clustering helpers.
+ */
+
+ /**
+ * Linkage methods supported by the `linkage` function. The name selects the rule
+ * `linkage` uses to derive the distance from a freshly merged cluster to each remaining one.
+ */
+ export type LinkageMethod = "single" | "complete" | "average" | "ward" | "centroid" | "median" | "weighted";
+
+ /** A single row of a linkage matrix: [idx1, idx2, distance, count]. */
+ export interface HierarchicalLinkageRow {
+   /** Id of the first merged cluster. */
+   idx1: number;
+   /** Id of the second merged cluster. */
+   idx2: number;
+   /** Distance between the two clusters at the time of the merge. */
+   distance: number;
+   /** Number of original observations in the merged cluster. */
+   count: number;
+ }
+
+ /**
+  * @deprecated Misspelled original name, kept so existing imports keep compiling.
+  * Use `HierarchicalLinkageRow`.
+  */
+ export type HierarchicalHierarchicalLinkageRow = HierarchicalLinkageRow;
+
+/**
+ * Computes a hierarchical clustering linkage matrix from a condensed distance matrix.
+ *
+ * @param distMatrix Condensed distance matrix (length = n*(n-1)/2 for n observations).
+ * @param n Number of observations.
+ * @param method Linkage method (default "single").
+ * @returns Array of (n-1) HierarchicalLinkageRow entries in merge order.
+ */
+export function linkage(
+ distMatrix: Float64Array,
+ n: number,
+ method: LinkageMethod = "single",
+): HierarchicalLinkageRow[] {
+ // Build full distance matrix for simplicity (nn-chain would be faster)
+ const D = new Float64Array(n * n).fill(Number.POSITIVE_INFINITY);
+ for (let i = 0; i < n; i++) D[i * n + i] = 0;
+ let k = 0;
+ for (let i = 0; i < n - 1; i++) {
+ for (let j = i + 1; j < n; j++) {
+ const d = distMatrix[k++]!;
+ D[i * n + j] = d;
+ D[j * n + i] = d;
+ }
+ }
+
+ // Active cluster set
+ const active = new Set(Array.from({ length: n }, (_, i) => i));
+ // Cluster sizes
+ const sizes = new Float64Array(2 * n).fill(1);
+ // Cluster centroids (for ward / centroid / median)
+ const identity = new Float64Array(n * n); // nΓn identity as initial centroids placeholder
+ for (let i = 0; i < n; i++) identity[i * n + i] = 1;
+
+ const result: HierarchicalLinkageRow[] = [];
+ let nextId = n;
+
+ // Expanded distance matrix that grows with new cluster nodes
+ const maxN = 2 * n;
+ const bigD = new Float64Array(maxN * maxN).fill(Number.POSITIVE_INFINITY);
+ for (let i = 0; i < n; i++) {
+ bigD[i * maxN + i] = 0;
+ for (let j = 0; j < n; j++) bigD[i * maxN + j] = D[i * n + j]!;
+ }
+
+ while (active.size > 1) {
+ // Find nearest pair
+ let minDist = Number.POSITIVE_INFINITY;
+ let a = -1;
+ let b = -1;
+ for (const i of active) {
+ for (const j of active) {
+ if (j <= i) continue;
+ const d = bigD[i * maxN + j]!;
+ if (d < minDist) { minDist = d; a = i; b = j; }
+ }
+ }
+ if (a < 0) break;
+
+ const sA = sizes[a]!;
+ const sB = sizes[b]!;
+ const sNew = sA + sB;
+ sizes[nextId] = sNew;
+
+ // Compute distances from new cluster to all remaining clusters
+ for (const c of active) {
+ if (c === a || c === b) continue;
+ const dac = bigD[a * maxN + c]!;
+ const dbc = bigD[b * maxN + c]!;
+ const sC = sizes[c]!;
+ let dNew: number;
+ switch (method) {
+ case "single": dNew = Math.min(dac, dbc); break;
+ case "complete": dNew = Math.max(dac, dbc); break;
+ case "average": dNew = (sA * dac + sB * dbc) / sNew; break;
+ case "ward": {
+ const dab = bigD[a * maxN + b]!;
+ dNew = Math.sqrt(
+ ((sA + sC) * dac * dac + (sB + sC) * dbc * dbc - sC * dab * dab) / (sNew + sC),
+ );
+ break;
+ }
+ case "centroid": dNew = Math.sqrt((sA * dac * dac + sB * dbc * dbc) / sNew - (sA * sB * bigD[a * maxN + b]! * bigD[a * maxN + b]!) / (sNew * sNew)); break;
+ case "median": dNew = Math.sqrt(0.5 * dac * dac + 0.5 * dbc * dbc - 0.25 * bigD[a * maxN + b]! * bigD[a * maxN + b]!); break;
+ case "weighted": dNew = 0.5 * dac + 0.5 * dbc; break;
+ default: dNew = Math.min(dac, dbc);
+ }
+ bigD[nextId * maxN + c] = dNew;
+ bigD[c * maxN + nextId] = dNew;
+ }
+ bigD[nextId * maxN + nextId] = 0;
+
+ result.push({ idx1: a, idx2: b, distance: minDist, count: sNew });
+ active.delete(a);
+ active.delete(b);
+ active.add(nextId);
+ nextId++;
+ }
+
+ return result;
+}
+
+/**
+ * Cuts a dendrogram at a given number of clusters.
+ * Returns an Int32Array of cluster labels (length = n).
+ */
+export function cutTree(rows: HierarchicalLinkageRow[], n: number, nClusters: number): Int32Array {
+ // Each leaf starts in its own cluster; merge bottom-up, stop early
+ const parent = new Int32Array(2 * n).fill(-1);
+ const mergeOrder = rows.slice(0, n - nClusters);
+
+ let nextId = n;
+ for (const row of mergeOrder) {
+ parent[row.idx1] = nextId;
+ parent[row.idx2] = nextId;
+ nextId++;
+ }
+
+ const labels = new Int32Array(n);
+ const rootLabels = new Map();
+ let labelCounter = 0;
+
+ for (let i = 0; i < n; i++) {
+ let cur = i;
+ while (parent[cur] !== -1) cur = parent[cur]!;
+ let label = rootLabels.get(cur);
+ if (label === undefined) {
+ label = labelCounter++;
+ rootLabels.set(cur, label);
+ }
+ labels[i] = label;
+ }
+ return labels;
+}
+
+/**
+ * Converts a condensed distance matrix to a full (nΓn) symmetric matrix.
+ */
+export function squareform(condensed: Float64Array, n: number): Float64Array {
+ const full = new Float64Array(n * n);
+ let k = 0;
+ for (let i = 0; i < n - 1; i++) {
+ for (let j = i + 1; j < n; j++) {
+ const d = condensed[k++]!;
+ full[i * n + j] = d;
+ full[j * n + i] = d;
+ }
+ }
+ return full;
+}
+
+/**
+ * Computes the cophenetic correlation coefficient for a linkage matrix.
+ * Measures how faithfully the dendrogram preserves pairwise distances.
+ */
+export function copheneticCorr(rows: HierarchicalLinkageRow[], condensed: Float64Array, n: number): number {
+ // Build cophenetic distance matrix from linkage
+ const cophenetic = new Float64Array((n * (n - 1)) / 2);
+ const clusterHeight = new Map();
+ const clusterMembers = new Map();
+
+ for (let i = 0; i < n; i++) clusterMembers.set(i, [i]);
+
+ let nextId = n;
+ for (const row of rows) {
+ const mA = clusterMembers.get(row.idx1) ?? [];
+ const mB = clusterMembers.get(row.idx2) ?? [];
+ for (const a of mA) {
+ for (const b of mB) {
+ const [lo, hi] = a < b ? [a, b] : [b, a];
+ // Condensed index
+ const idx = lo * n - (lo * (lo + 1)) / 2 + hi - lo - 1;
+ cophenetic[idx] = row.distance;
+ }
+ }
+ clusterMembers.set(nextId, [...mA, ...mB]);
+ clusterHeight.set(nextId, row.distance);
+ nextId++;
+ }
+
+ // Pearson correlation between condensed and cophenetic distances
+ const m = condensed.length;
+ let mx = 0; let my = 0;
+ for (let i = 0; i < m; i++) { mx += condensed[i]!; my += cophenetic[i]!; }
+ mx /= m; my /= m;
+ let cov = 0; let sx = 0; let sy = 0;
+ for (let i = 0; i < m; i++) {
+ const dx = condensed[i]! - mx; const dy = cophenetic[i]! - my;
+ cov += dx * dy; sx += dx * dx; sy += dy * dy;
+ }
+ const denom = Math.sqrt(sx * sy);
+ return denom === 0 ? 0 : cov / denom;
+}
diff --git a/src/cluster/index.ts b/src/cluster/index.ts
new file mode 100644
index 0000000..9cfa3de
--- /dev/null
+++ b/src/cluster/index.ts
@@ -0,0 +1,12 @@
+export * from "./kmeans.js";
+export * from "./agglomerative.js";
+export * from "./spectral.js";
+export * from "./hdbscan.js";
+export * from "./bisecting_kmeans.js";
+export * from "./affinity_propagation.js";
+export * from "./feature_agglomeration.js";
+export * from "./ward.js";
+export * from "./clustering_utils.js";
+export * from "./hierarchical.js";
+export * from "./optics_ext.js";
+export * from "./cluster_validation.js";
diff --git a/src/cluster/kmeans.ts b/src/cluster/kmeans.ts
new file mode 100644
index 0000000..3e043d0
--- /dev/null
+++ b/src/cluster/kmeans.ts
@@ -0,0 +1,301 @@
+/**
+ * KMeans and DBSCAN clustering.
+ * Mirrors sklearn.cluster.KMeans and DBSCAN.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Squared Euclidean distance between two vectors.
+ * Walks the length of `a`; components missing from `b` are treated as 0.
+ */
+function euclideanSq(a: Float64Array, b: Float64Array): number {
+  const len = a.length;
+  let total = 0;
+  let idx = 0;
+  while (idx < len) {
+    const delta = (a[idx] ?? 0) - (b[idx] ?? 0);
+    total += delta ** 2;
+    idx += 1;
+  }
+  return total;
+}
+
+/** Euclidean (L2) distance between `a` and `b`: the square root of `euclideanSq`. */
+function euclidean(a: Float64Array, b: Float64Array): number {
+  const squared = euclideanSq(a, b);
+  return Math.sqrt(squared);
+}
+
+/** Outcome of one k-means run started from a single initialisation. */
+interface KMeansRunResult {
+  centers: Float64Array[];
+  labels: Int32Array;
+  inertia: number;
+}
+
+/**
+ * K-means clustering: Lloyd iterations seeded with k-means++.
+ *
+ * `fit` performs `nInit` independent runs and keeps the one with the lowest
+ * inertia. Randomness comes from `Math.random`, so runs are not seeded.
+ */
+export class KMeans {
+  /** Requested number of clusters; `fit` caps it at the number of samples. */
+  nClusters: number;
+  /** Maximum number of Lloyd iterations per run. */
+  maxIter: number;
+  /** A run stops once every centre moves by less than `tol` (Euclidean distance). */
+  tol: number;
+  /** Number of independent runs performed by `fit`. */
+  nInit: number;
+
+  /** Centres of the best run; `null` until `fit` has been called. */
+  clusterCenters_: Float64Array[] | null = null;
+  /** Label of each training sample in the best run; `null` until `fit`. */
+  labels_: Int32Array | null = null;
+  /** Sum of squared distances from each training sample to its nearest centre. */
+  inertia_: number = 0;
+
+  constructor(
+    options: {
+      nClusters?: number;
+      maxIter?: number;
+      tol?: number;
+      nInit?: number;
+    } = {},
+  ) {
+    this.nClusters = options.nClusters ?? 8;
+    this.maxIter = options.maxIter ?? 300;
+    this.tol = options.tol ?? 1e-4;
+    this.nInit = options.nInit ?? 10;
+  }
+
+  /**
+   * k-means++ seeding: the first centre is a uniformly random sample, each
+   * further centre is drawn with probability proportional to the squared
+   * distance to the nearest centre chosen so far.
+   */
+  private _kmeanspp(X: Float64Array[], k: number): Float64Array[] {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const copyOf = (idx: number): Float64Array => new Float64Array(X[idx] ?? new Float64Array(p));
+
+    const centers: Float64Array[] = [copyOf(Math.floor(Math.random() * n))];
+    // closest[i]: squared distance from X[i] to its nearest chosen centre.
+    // Only the newest centre can lower it, so it is updated incrementally.
+    const closest = new Float64Array(n).fill(Number.POSITIVE_INFINITY);
+
+    for (let c = 1; c < k; c++) {
+      const latest = centers[c - 1] ?? new Float64Array(p);
+      let totalDist = 0;
+      for (let i = 0; i < n; i++) {
+        const d = euclideanSq(X[i] ?? new Float64Array(p), latest);
+        if (d < (closest[i] ?? Number.POSITIVE_INFINITY)) closest[i] = d;
+        totalDist += closest[i] ?? 0;
+      }
+
+      let rand = Math.random() * totalDist;
+      // Rounding can leave `rand` marginally positive after the scan; fall
+      // back to the last sample rather than silently picking sample 0.
+      let selected = Math.max(0, n - 1);
+      for (let i = 0; i < n; i++) {
+        rand -= closest[i] ?? 0;
+        if (rand <= 0) {
+          selected = i;
+          break;
+        }
+      }
+      centers.push(copyOf(selected));
+    }
+    return centers;
+  }
+
+  /**
+   * Writes the index of the nearest centre of every row of `X` into `out`
+   * (0 when there are no centres) and returns the summed squared distance to
+   * those nearest centres.
+   */
+  private _assign(X: Float64Array[], centers: Float64Array[], out: Int32Array): number {
+    const p = (centers[0] ?? new Float64Array(0)).length;
+    let total = 0;
+    for (let i = 0; i < X.length; i++) {
+      const xi = X[i] ?? new Float64Array(p);
+      let minDist = Number.POSITIVE_INFINITY;
+      let minIdx = 0;
+      for (let c = 0; c < centers.length; c++) {
+        const center = centers[c];
+        if (center === undefined) continue;
+        const d = euclideanSq(xi, center);
+        if (d < minDist) {
+          minDist = d;
+          minIdx = c;
+        }
+      }
+      out[i] = minIdx;
+      total += minDist;
+    }
+    return total;
+  }
+
+  /** One complete k-means run from a fresh k-means++ initialisation. */
+  private _run(X: Float64Array[], k: number): KMeansRunResult {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    let centers = this._kmeanspp(X, k);
+    const labels = new Int32Array(n);
+
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      // Assignment step
+      this._assign(X, centers, labels);
+
+      // Update step: accumulate per-cluster sums, then divide by the counts.
+      const newCenters: Float64Array[] = Array.from({ length: k }, () => new Float64Array(p));
+      const counts = new Int32Array(k);
+      for (let i = 0; i < n; i++) {
+        const c = labels[i] ?? 0;
+        const xi = X[i];
+        const sum = newCenters[c];
+        if (xi === undefined || sum === undefined) continue;
+        counts[c] = (counts[c] ?? 0) + 1;
+        for (let j = 0; j < p; j++) sum[j] = (sum[j] ?? 0) + (xi[j] ?? 0);
+      }
+
+      let maxShift = 0;
+      for (let c = 0; c < k; c++) {
+        const cnt = counts[c] ?? 0;
+        let center = newCenters[c] ?? new Float64Array(p);
+        if (cnt > 0) {
+          for (let j = 0; j < p; j++) center[j] = (center[j] ?? 0) / cnt;
+        } else {
+          // Empty cluster: restart it from a random sample.
+          center = new Float64Array(X[Math.floor(Math.random() * n)] ?? new Float64Array(p));
+          newCenters[c] = center;
+        }
+        const shift = euclideanSq(centers[c] ?? new Float64Array(p), center);
+        if (shift > maxShift) maxShift = shift;
+      }
+      centers = newCenters;
+      // maxShift is a squared distance, hence the squared tolerance.
+      if (maxShift < this.tol ** 2) break;
+    }
+
+    // Final assignment so that labels and inertia describe the returned
+    // centres rather than the centres of the previous iteration.
+    const inertia = this._assign(X, centers, labels);
+    return { centers, labels, inertia };
+  }
+
+  /** Fits the model: runs `nInit` initialisations and stores the best one. */
+  fit(X: Float64Array[]): this {
+    const k = Math.min(this.nClusters, X.length);
+    let best: KMeansRunResult | null = null;
+
+    for (let init = 0; init < this.nInit; init++) {
+      const result = this._run(X, k);
+      if (best === null || result.inertia < best.inertia) {
+        best = result;
+      }
+    }
+
+    this.clusterCenters_ = best?.centers ?? [];
+    this.labels_ = best?.labels ?? new Int32Array(X.length);
+    this.inertia_ = best?.inertia ?? 0;
+    return this;
+  }
+
+  /**
+   * Index of the nearest fitted centre for every row of `X`.
+   * @throws NotFittedError when called before `fit`.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    if (this.clusterCenters_ === null) throw new NotFittedError("KMeans");
+    const labels = new Int32Array(X.length);
+    this._assign(X, this.clusterCenters_, labels);
+    return labels;
+  }
+
+  /** Fits the model and returns the training labels. */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_ as Int32Array;
+  }
+
+  /**
+   * Negative inertia of `X` with respect to the fitted centres.
+   * @throws NotFittedError when called before `fit`, matching `predict`.
+   */
+  score(X: Float64Array[]): number {
+    if (this.clusterCenters_ === null) throw new NotFittedError("KMeans");
+    return -this._computeInertia(X, this.clusterCenters_);
+  }
+
+  /** Sum over `X` of the squared distance to the nearest of `centers`. */
+  private _computeInertia(X: Float64Array[], centers: Float64Array[]): number {
+    return this._assign(X, centers, new Int32Array(X.length));
+  }
+}
+
+/**
+ * Density-based clustering (DBSCAN) using a brute-force neighbour search.
+ *
+ * A sample is a core point when at least `minSamples` samples (itself
+ * included) lie within `eps` of it. Noise is labelled -1; clusters are
+ * numbered from 0 in order of discovery.
+ */
+export class DBSCAN {
+  /** Neighbourhood radius. */
+  eps: number;
+  /** Minimum neighbourhood size, the point itself included, for a core point. */
+  minSamples: number;
+  /** Stored for reference only; distances are always computed with `euclidean`. */
+  metric: string;
+
+  /** Cluster label per sample (-1 = noise); `null` before fitting. */
+  labels_: Int32Array | null = null;
+  /** Indices of core samples in order of discovery; `null` before fitting. */
+  coreIndices_: Int32Array | null = null;
+
+  constructor(
+    options: {
+      eps?: number;
+      minSamples?: number;
+      metric?: string;
+    } = {},
+  ) {
+    this.eps = options.eps ?? 0.5;
+    this.minSamples = options.minSamples ?? 5;
+    this.metric = options.metric ?? "euclidean";
+  }
+
+  /** Clusters `X`, stores `labels_` and `coreIndices_`, and returns the labels. */
+  fitPredict(X: Float64Array[]): Int32Array {
+    const UNVISITED = -2;
+    const NOISE = -1;
+    const n = X.length;
+    const { eps, minSamples } = this;
+    const empty = new Float64Array(0);
+    const labels = new Int32Array(n).fill(UNVISITED);
+    const coreIndices: number[] = [];
+
+    // All samples (the query point included) within `eps` of sample `idx`.
+    const neighbors = (idx: number): number[] => {
+      const xi = X[idx] ?? empty;
+      const result: number[] = [];
+      for (let j = 0; j < n; j++) {
+        if (euclidean(xi, X[j] ?? empty) <= eps) result.push(j);
+      }
+      return result;
+    };
+
+    let clusterId = 0;
+    for (let i = 0; i < n; i++) {
+      if (labels[i] !== UNVISITED) continue;
+      const nb = neighbors(i);
+      if (nb.length < minSamples) {
+        // Provisional: a later cluster may still claim it as a border point.
+        labels[i] = NOISE;
+        continue;
+      }
+
+      coreIndices.push(i);
+      labels[i] = clusterId;
+      // Breadth-first expansion; `head` replaces Array.shift(), which is O(n).
+      const queue = nb.filter((j) => j !== i);
+      for (let head = 0; head < queue.length; head++) {
+        const j = queue[head];
+        if (j === undefined) continue;
+        if (labels[j] === NOISE) {
+          // Border point: joins the cluster but is not expanded.
+          labels[j] = clusterId;
+          continue;
+        }
+        if (labels[j] !== UNVISITED) continue;
+        labels[j] = clusterId;
+        const jNb = neighbors(j);
+        if (jNb.length < minSamples) continue;
+        coreIndices.push(j);
+        for (const k of jNb) {
+          if (labels[k] === UNVISITED || labels[k] === NOISE) queue.push(k);
+        }
+      }
+      clusterId++;
+    }
+
+    this.labels_ = labels;
+    this.coreIndices_ = new Int32Array(coreIndices);
+    return labels;
+  }
+
+  /** Clusters `X` and returns this estimator. */
+  fit(X: Float64Array[]): this {
+    this.fitPredict(X);
+    return this;
+  }
+}
diff --git a/src/cluster/mean_shift_ext.ts b/src/cluster/mean_shift_ext.ts
new file mode 100644
index 0000000..2c17924
--- /dev/null
+++ b/src/cluster/mean_shift_ext.ts
@@ -0,0 +1,132 @@
+/**
+ * Mean Shift clustering extensions.
+ * Mirrors scikit-learn's cluster.MeanShift with bandwidth estimation.
+ */
+
+/** Options accepted by the `MeanShiftExt` constructor. */
+export interface MeanShiftExtOptions {
+ /** Kernel bandwidth; when omitted, `MeanShiftExt.fit` calls `estimateBandwidth(X)`. */
+ bandwidth?: number;
+ /** NOTE(review): not read by the `MeanShiftExt` constructor in this file; confirm whether it is meant to be supported. */
+ seeds?: Float64Array[];
+ /** NOTE(review): not read by the `MeanShiftExt` constructor in this file. */
+ binSeeding?: boolean;
+ /** NOTE(review): not read by the `MeanShiftExt` constructor in this file. */
+ minBinFreq?: number;
+ /** When false, samples farther than one bandwidth from every centre are labelled -1 by `fit` (default true). */
+ clusterAll?: boolean;
+ /** Maximum number of mean-shift iterations (default 300). */
+ maxIter?: number;
+}
+
+/**
+ * Euclidean distance between two vectors.
+ * Walks the length of `a`; components missing from `b` are treated as 0.
+ */
+function euclidean(a: Float64Array, b: Float64Array): number {
+  let sumSq = 0;
+  let idx = 0;
+  while (idx < a.length) {
+    const delta = (a[idx] ?? 0) - (b[idx] ?? 0);
+    sumSq += delta ** 2;
+    idx += 1;
+  }
+  return Math.sqrt(sumSq);
+}
+
+/**
+ * Estimates a Mean Shift bandwidth as a quantile of pairwise distances.
+ *
+ * Distances are measured from each of the first `nSamples` rows of `X` (all
+ * rows when omitted) to every later row of `X`, then sorted; the value at
+ * position floor(quantile * count) is returned.
+ *
+ * @param X Samples.
+ * @param options.quantile Position in the sorted distances (default 0.3);
+ *   clamped so that 1 selects the largest distance and values below 0 the
+ *   smallest.
+ * @param options.nSamples Number of leading rows used as pair origins.
+ * @returns The selected distance, or 1 when there are no pairs to measure.
+ */
+export function estimateBandwidth(
+  X: Float64Array[],
+  options: { quantile?: number; nSamples?: number } = {},
+): number {
+  const { quantile = 0.3, nSamples } = options;
+  const n = X.length;
+  const sample = nSamples !== undefined ? X.slice(0, nSamples) : X;
+  const dists: number[] = [];
+
+  for (let i = 0; i < sample.length; i++) {
+    const origin = sample[i];
+    if (origin === undefined) continue;
+    for (let j = i + 1; j < n; j++) {
+      const other = X[j];
+      if (other !== undefined) dists.push(euclidean(origin, other));
+    }
+  }
+  if (dists.length === 0) return 1;
+
+  dists.sort((a, b) => a - b);
+  // Clamp: floor(quantile * length) equals length when quantile is 1, which
+  // previously ran off the end of the array and yielded the fallback of 1.
+  const raw = Math.floor(quantile * dists.length);
+  const idx = Math.min(Math.max(raw, 0), dists.length - 1);
+  return dists[idx] ?? 1;
+}
+
+/**
+ * Mean Shift clustering with a Gaussian kernel, seeded at every sample.
+ *
+ * All seeds are shifted together until none moves by 1e-5 or more (or
+ * `maxIter` is reached); converged seeds closer than half a bandwidth are then
+ * merged into one centre.
+ */
+export class MeanShiftExt {
+  /** Kernel bandwidth, or `null` to estimate it from the data in `fit`. */
+  readonly bandwidth: number | null;
+  /** When false, `fit` labels samples farther than one bandwidth from every centre as -1. */
+  readonly clusterAll: boolean;
+  /** Maximum number of mean-shift iterations. */
+  readonly maxIter: number;
+
+  /** Merged cluster centres; `null` before `fit`. */
+  clusterCenters_: Float64Array[] | null = null;
+  /** Training labels; `null` before `fit`. */
+  labels_: Int32Array | null = null;
+
+  constructor(options: MeanShiftExtOptions = {}) {
+    this.bandwidth = options.bandwidth ?? null;
+    this.clusterAll = options.clusterAll ?? true;
+    this.maxIter = options.maxIter ?? 300;
+  }
+
+  /** Runs mean shift on `X` and stores `clusterCenters_` and `labels_`. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const nFeatures = X[0]?.length ?? 0;
+    const bw = this.bandwidth ?? estimateBandwidth(X);
+
+    // A coincident point always gets weight 1. For bw > 0 that is what the
+    // formula yields anyway; for bw === 0 it avoids 0 / 0 = NaN, which used to
+    // poison every seed when the bandwidth collapsed to zero.
+    const weightOf = (d: number): number => (d === 0 ? 1 : Math.exp(-0.5 * (d / bw) ** 2));
+
+    // Initialize seeds at data points
+    let seeds = X.map((row) => Float64Array.from(row));
+
+    // Iterate mean shift
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      let maxShift = 0;
+      seeds = seeds.map((seed) => {
+        const shifted = new Float64Array(nFeatures);
+        let totalW = 0;
+        for (const xi of X) {
+          const w = weightOf(euclidean(xi, seed));
+          totalW += w;
+          for (let j = 0; j < nFeatures; j++) shifted[j] = (shifted[j] ?? 0) + w * (xi[j] ?? 0);
+        }
+        // No sample carries meaningful weight: leave the seed where it is.
+        if (totalW < 1e-10) return seed;
+        for (let j = 0; j < nFeatures; j++) shifted[j] = (shifted[j] ?? 0) / totalW;
+        maxShift = Math.max(maxShift, euclidean(shifted, seed));
+        return shifted;
+      });
+      if (maxShift < 1e-5) break;
+    }
+
+    // Merge nearby seeds into cluster centers
+    const centers: Float64Array[] = [];
+    for (const seed of seeds) {
+      let merged = false;
+      for (const center of centers) {
+        const gap = euclidean(seed, center);
+        // `gap === 0` keeps identical seeds together even when bw is 0.
+        if (gap < bw / 2 || gap === 0) {
+          merged = true;
+          // Pull the centre halfway towards the merged seed.
+          for (let j = 0; j < nFeatures; j++) {
+            center[j] = ((center[j] ?? 0) + (seed[j] ?? 0)) / 2;
+          }
+          break;
+        }
+      }
+      if (!merged) centers.push(Float64Array.from(seed));
+    }
+
+    this.clusterCenters_ = centers;
+    this.labels_ = Int32Array.from({ length: n }, (_, i) => {
+      const xi = X[i];
+      let bestC = -1;
+      let bestD = Number.POSITIVE_INFINITY;
+      if (xi !== undefined) {
+        for (let c = 0; c < centers.length; c++) {
+          const center = centers[c];
+          if (center === undefined) continue;
+          const d = euclidean(xi, center);
+          if (d < bestD) {
+            bestD = d;
+            bestC = c;
+          }
+        }
+      }
+      if (!this.clusterAll && bestD > bw) return -1;
+      return bestC;
+    });
+
+    return this;
+  }
+
+  /**
+   * Index of the nearest fitted centre for every row of `X`; -1 for every row
+   * when the model was fitted on no data and therefore has no centres.
+   * @throws Error when called before `fit`.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    if (this.clusterCenters_ === null) throw new Error("MeanShiftExt must be fitted first");
+    const centers = this.clusterCenters_;
+    return Int32Array.from(X, (xi) => {
+      let best = -1;
+      let bestD = Number.POSITIVE_INFINITY;
+      for (let c = 0; c < centers.length; c++) {
+        const center = centers[c];
+        if (center === undefined) continue;
+        const d = euclidean(xi, center);
+        // `best === -1` reproduces the original choice of centre 0 as the
+        // starting candidate, even if its distance is not finite.
+        if (best === -1 || d < bestD) {
+          bestD = d;
+          best = c;
+        }
+      }
+      return best;
+    });
+  }
+}
diff --git a/src/cluster/optics_ext.ts b/src/cluster/optics_ext.ts
new file mode 100644
index 0000000..944dcd4
--- /dev/null
+++ b/src/cluster/optics_ext.ts
@@ -0,0 +1,191 @@
+/**
+ * OPTICS clustering utility functions — ported from sklearn.cluster._optics
+ * clusterOpticsDbscan, clusterOpticsXi, extractDbscanClustering
+ */
+
+/** Labels produced by the extraction functions in this file. */
+export interface OpticsClusterResult {
+ /** Cluster labels for each sample (-1 = noise); clusterOpticsDbscan and clusterOpticsXi number clusters from 1 */
+ labels: Int32Array;
+ /** Number of clusters found (excluding noise) */
+ nClusters: number;
+}
+
+/**
+ * DBSCAN-style label extraction from an OPTICS ordering.
+ *
+ * Walking the samples in `ordering`: a sample whose reachability exceeds `eps`
+ * opens a new cluster when its core distance is within `eps`, and is left as
+ * noise otherwise; any other sample joins the most recently opened cluster,
+ * or stays noise when no cluster has been opened yet. Clusters are numbered
+ * from 1 and noise is -1.
+ *
+ * @param reachabilityDistances Reachability distances from OPTICS
+ * @param coreDistances Core distances from OPTICS
+ * @param ordering Sample ordering from OPTICS
+ * @param eps The maximum reachability distance for cluster membership
+ * @returns Cluster labels for each sample and the number of clusters opened
+ */
+export function clusterOpticsDbscan(
+  reachabilityDistances: Float64Array,
+  coreDistances: Float64Array,
+  ordering: Int32Array,
+  eps: number,
+): OpticsClusterResult {
+  const total = reachabilityDistances.length;
+  const labels = new Int32Array(total).fill(-1);
+  let current = 0;
+
+  for (let pos = 0; pos < total; pos++) {
+    const sample = ordering[pos] ?? pos;
+    const reach = reachabilityDistances[sample] ?? Number.POSITIVE_INFINITY;
+
+    if (reach > eps) {
+      // Not reachable from its predecessors: a core point opens a new
+      // cluster, anything else remains noise.
+      const core = coreDistances[sample] ?? Number.POSITIVE_INFINITY;
+      if (core <= eps) {
+        current += 1;
+        labels[sample] = current;
+      }
+    } else if (current > 0) {
+      // Reachable: continue the cluster opened most recently.
+      labels[sample] = current;
+    }
+  }
+
+  return { labels, nClusters: current };
+}
+
+/**
+ * Perform xi-based cluster extraction from OPTICS results.
+ *
+ * Simplified heuristic: every pair of consecutive positions in the ordering
+ * whose reachability changes by a factor of at least (1 + xi) is recorded as
+ * a steep "up" or "down" step. Each down step is then paired with the first
+ * later up step that leaves at least `minSize` positions between them, and
+ * the still-unlabelled samples in that span receive a new label.
+ *
+ * @param reachabilityDistances Reachability distances from OPTICS
+ * @param ordering Sample ordering from OPTICS
+ * @param minSamples Used only as the default for `minClusterSize`
+ * @param xi Determines the minimum steepness (0 < xi < 1)
+ * @param minClusterSize Minimum span, compared directly against a position
+ *   count (a fractional value is not converted); defaults to `minSamples`
+ * @returns Cluster labels numbered from 1 (-1 = unassigned) and the number of
+ *   labels issued
+ */
+export function clusterOpticsXi(
+ reachabilityDistances: Float64Array,
+ ordering: Int32Array,
+ minSamples: number,
+ xi = 0.05,
+ minClusterSize?: number,
+): OpticsClusterResult {
+ const nSamples = ordering.length;
+ const minSize = minClusterSize ?? minSamples;
+ const labels = new Int32Array(nSamples).fill(-1);
+
+ // Build ordered reachabilities
+ const orderedReach = new Float64Array(nSamples);
+ for (let i = 0; i < nSamples; i++) {
+ orderedReach[i] = reachabilityDistances[ordering[i] ?? i] ?? Number.POSITIVE_INFINITY;
+ }
+
+ // Find steep upward and downward areas
+ interface SteepArea {
+ start: number;
+ end: number;
+ kind: "up" | "down";
+ }
+
+ const steepAreas: SteepArea[] = [];
+
+ for (let i = 0; i < nSamples - 1; i++) {
+ const r1 = orderedReach[i] ?? 0;
+ const r2 = orderedReach[i + 1] ?? 0;
+ // A zero reachability cannot form a ratio; the step is skipped.
+ if (r1 === 0) continue;
+
+ const ratio = r2 / r1;
+ if (ratio >= 1 + xi) {
+ steepAreas.push({ start: i, end: i + 1, kind: "up" });
+ } else if (r2 > 0 && r1 / r2 >= 1 + xi) {
+ steepAreas.push({ start: i, end: i + 1, kind: "down" });
+ }
+ }
+
+ // Simple cluster extraction: pair each down area with a matching up area
+ let clusterLabel = 0;
+
+ for (let di = 0; di < steepAreas.length; di++) {
+ const down = steepAreas[di]!;
+ if (down.kind !== "down") continue;
+
+ for (let ui = di + 1; ui < steepAreas.length; ui++) {
+ const up = steepAreas[ui]!;
+ if (up.kind !== "up") continue;
+
+ const clusterStart = down.end;
+ const clusterEnd = up.start;
+ // NOTE(review): the labelling loop below is inclusive of clusterEnd, so a
+ // span holds size + 1 positions; confirm the intended minimum.
+ const size = clusterEnd - clusterStart;
+
+ if (size < minSize) continue;
+
+ // Labels start at 1; samples already claimed by an earlier span are kept.
+ clusterLabel++;
+ for (let i = clusterStart; i <= clusterEnd && i < nSamples; i++) {
+ const sampleIdx = ordering[i] ?? i;
+ if (labels[sampleIdx] === -1) {
+ labels[sampleIdx] = clusterLabel;
+ }
+ }
+ // Only the first qualifying up area is paired with this down area.
+ break;
+ }
+ }
+
+ return { labels, nClusters: clusterLabel };
+}
+
+/**
+ * DBSCAN-style clustering extracted from OPTICS output at a single eps value;
+ * one element of the array returned by extractDbscanClustering.
+ */
+export interface EpsClusterResult {
+ /** The eps threshold this result was extracted with. */
+ eps: number;
+ /** Labels returned by clusterOpticsDbscan for this eps. */
+ labels: Int32Array;
+ /** Cluster count returned by clusterOpticsDbscan for this eps. */
+ nClusters: number;
+}
+
+/**
+ * Runs clusterOpticsDbscan once per entry of `epsValues`.
+ *
+ * @param reachabilityDistances Reachability distances from OPTICS
+ * @param coreDistances Core distances from OPTICS
+ * @param ordering Sample ordering from OPTICS
+ * @param epsValues Thresholds to extract clusterings for
+ * @returns One result per threshold, in the order of `epsValues`
+ */
+export function extractDbscanClustering(
+  reachabilityDistances: Float64Array,
+  coreDistances: Float64Array,
+  ordering: Int32Array,
+  epsValues: Float64Array,
+): EpsClusterResult[] {
+  const results: EpsClusterResult[] = [];
+  for (const eps of Array.from(epsValues)) {
+    const extracted = clusterOpticsDbscan(reachabilityDistances, coreDistances, ordering, eps);
+    results.push({ eps, ...extracted });
+  }
+  return results;
+}
+
+/**
+ * Builds the data series for a reachability plot.
+ *
+ * @param reachabilityDistances Reachability distance per sample
+ * @param ordering Sample ordering from OPTICS
+ * @returns `orderIndex` holding 0..n-1 and `reachDistance` holding the
+ *   reachability of the sample at each position of the ordering (positive
+ *   infinity when the sample index has no entry)
+ */
+export function reachabilityPlotData(
+  reachabilityDistances: Float64Array,
+  ordering: Int32Array,
+): { orderIndex: Int32Array; reachDistance: Float64Array } {
+  const count = ordering.length;
+  const orderIndex = Int32Array.from({ length: count }, (_, pos) => pos);
+  const reachDistance = Float64Array.from(
+    { length: count },
+    (_, pos) => reachabilityDistances[ordering[pos] ?? pos] ?? Number.POSITIVE_INFINITY,
+  );
+  return { orderIndex, reachDistance };
+}
diff --git a/src/cluster/spectral.ts b/src/cluster/spectral.ts
new file mode 100644
index 0000000..4875131
--- /dev/null
+++ b/src/cluster/spectral.ts
@@ -0,0 +1,549 @@
+/**
+ * SpectralClustering, MeanShift, Birch, and OPTICS clustering.
+ * Mirrors sklearn.cluster SpectralClustering, MeanShift, Birch, OPTICS.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+// ─── SpectralClustering ───────────────────────────────────────────────────────
+
+/** Options accepted by the `SpectralClustering` constructor. */
+export interface SpectralClusteringOptions {
+ /** Number of clusters and of embedding dimensions (default 8). */
+ nClusters?: number;
+ /** Number of k-means restarts on the embedding (default 10). */
+ nInit?: number;
+ /** RBF kernel coefficient (default 1.0). */
+ gamma?: number;
+ /** NOTE(review): not read by the `SpectralClustering` constructor in this file; the affinity is always RBF. */
+ affinityType?: "rbf" | "nearest_neighbors";
+ /** NOTE(review): not read by the `SpectralClustering` constructor in this file. */
+ nNeighbors?: number;
+ /** NOTE(review): not read by the `SpectralClustering` constructor in this file; the helpers use fixed seeds. */
+ randomState?: number;
+}
+
+/**
+ * RBF (Gaussian) kernel value exp(-gamma * ||a - b||^2).
+ * Walks the length of `a`; components missing from `b` are treated as 0.
+ */
+function rbfKernel(a: Float64Array, b: Float64Array, gamma: number): number {
+  let sqDist = 0;
+  let idx = 0;
+  while (idx < a.length) {
+    const delta = (a[idx] ?? 0) - (b[idx] ?? 0);
+    sqDist += delta ** 2;
+    idx += 1;
+  }
+  return Math.exp(-gamma * sqDist);
+}
+
+/**
+ * Pairwise RBF affinity matrix of the rows of `X`.
+ *
+ * @param X Samples, one row each.
+ * @param gamma RBF kernel coefficient passed to `rbfKernel`.
+ * @returns Matrix W with W[i][j] = rbfKernel(X[i], X[j], gamma) and a zero
+ *   diagonal (no self-affinity).
+ */
+function computeAffinityMatrix(
+  X: Float64Array[],
+  gamma: number,
+): Float64Array[] {
+  return X.map((xi, i) =>
+    Float64Array.from(X, (xj, j) => (i === j ? 0 : rbfKernel(xi, xj, gamma))),
+  );
+}
+
+/**
+ * Symmetrically normalises an affinity matrix: returns D^(-1/2) W D^(-1/2),
+ * where D is the diagonal matrix of the row sums of W.
+ *
+ * Despite the name this is the normalised affinity, not I minus it: the
+ * identity is never subtracted. Rows whose sum is not positive get a scaling
+ * factor of 0, so isolated nodes produce all-zero rows and columns.
+ *
+ * @param W Square affinity matrix.
+ * @returns A new matrix of the same shape; `W` is not modified.
+ */
+function symmetricNormalizedLaplacian(W: Float64Array[]): Float64Array[] {
+  const invSqrtDegree = W.map((row) => {
+    const degree = row.reduce((s, v) => s + v, 0);
+    return degree > 0 ? 1 / Math.sqrt(degree) : 0;
+  });
+  return W.map((row, i) =>
+    Float64Array.from(row, (w, j) => (invSqrtDegree[i] ?? 0) * w * (invSqrtDegree[j] ?? 0)),
+  );
+}
+
+/**
+ * Approximates k eigenvectors of the n-by-n matrix `L` by repeated
+ * multiplication with Gram-Schmidt deflation against the vectors before it.
+ *
+ * The start vectors come from a fixed-seed linear congruential generator, so
+ * the result is deterministic for a given input. There is no convergence
+ * test: exactly `maxIter` sweeps are performed.
+ *
+ * @param L Square matrix (rows of length n).
+ * @param k Number of vectors to compute.
+ * @param maxIter Number of sweeps (default 300).
+ * @returns k vectors of length n, each scaled to unit Euclidean norm (left
+ *   unscaled when its norm is 0).
+ */
+function powerIterationEigenvectors(
+ L: Float64Array[],
+ k: number,
+ maxIter = 300,
+): Float64Array[] {
+ const n = L.length;
+ // Linear congruential generator with a fixed seed of 42.
+ const rng = { seed: 42 };
+ const rand = () => {
+ rng.seed = (rng.seed * 1664525 + 1013904223) & 0xffffffff;
+ return (rng.seed >>> 0) / 0xffffffff;
+ };
+ // Initialize random vectors
+ const vecs: Float64Array[] = Array.from({ length: k }, () =>
+ Float64Array.from({ length: n }, () => rand() - 0.5),
+ );
+
+ for (let iter = 0; iter < maxIter; iter++) {
+ // One sweep: multiply each vector by L, then orthogonalize and normalize it
+ for (let col = 0; col < k; col++) {
+ const v = vecs[col] as Float64Array;
+ // Multiply: Lv = L @ v
+ const Lv = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const row = L[i] as Float64Array;
+ let s = 0;
+ for (let j = 0; j < n; j++) s += (row[j] ?? 0) * (v[j] ?? 0);
+ Lv[i] = s;
+ }
+ // Subtract projections of previous vectors (already updated in this sweep)
+ for (let prev = 0; prev < col; prev++) {
+ const u = vecs[prev] as Float64Array;
+ let dot = 0;
+ for (let i = 0; i < n; i++) dot += (Lv[i] ?? 0) * (u[i] ?? 0);
+ for (let i = 0; i < n; i++) Lv[i]! -= dot * (u[i] ?? 0);
+ }
+ // Normalize; a zero norm falls back to 1 to avoid dividing by zero
+ let norm = 0;
+ for (let i = 0; i < n; i++) norm += (Lv[i] ?? 0) ** 2;
+ norm = Math.sqrt(norm) || 1;
+ for (let i = 0; i < n; i++) Lv[i]! /= norm;
+ vecs[col] = Lv;
+ }
+ }
+ return vecs;
+}
+
+/**
+ * Minimal deterministic k-means over the rows of a matrix.
+ *
+ * Each of the `nInit` restarts reseeds a linear congruential generator from
+ * the restart index and draws its k starting centres from the rows with it;
+ * the labels of the restart with the lowest inertia are returned.
+ *
+ * @param rows Points to cluster, one row each.
+ * @param k Number of clusters.
+ * @param maxIter Maximum assignment/update rounds per restart (default 100).
+ * @param nInit Number of restarts (default 10).
+ * @returns Cluster index in [0, k) for every row.
+ */
+function kmeansOnRows(
+ rows: Float64Array[],
+ k: number,
+ maxIter = 100,
+ nInit = 10,
+): Int32Array {
+ const n = rows.length;
+ const d = rows[0]?.length ?? 0;
+ let bestLabels = new Int32Array(n);
+ let bestInertia = Number.POSITIVE_INFINITY;
+
+ const rng = { seed: 0 };
+ const rand = () => {
+ rng.seed = (rng.seed * 1664525 + 1013904223) & 0xffffffff;
+ return (rng.seed >>> 0) / 0xffffffff;
+ };
+
+ for (let init = 0; init < nInit; init++) {
+ // Reseed per restart so every restart is reproducible on its own.
+ rng.seed = init * 1234 + 5678;
+ // Starting centres are copies of rows drawn with replacement, so duplicates are possible.
+ const centers: Float64Array[] = Array.from({ length: k }, () => {
+ const idx = Math.floor(rand() * n);
+ return Float64Array.from(rows[idx] ?? new Float64Array(d));
+ });
+ const labels = new Int32Array(n);
+
+ for (let iter = 0; iter < maxIter; iter++) {
+ // Assign
+ let changed = false;
+ for (let i = 0; i < n; i++) {
+ const xi = rows[i] as Float64Array;
+ let best = 0;
+ let bestDist = Number.POSITIVE_INFINITY;
+ for (let c = 0; c < k; c++) {
+ const cc = centers[c] as Float64Array;
+ let dd = 0;
+ for (let j = 0; j < d; j++) dd += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
+ if (dd < bestDist) { bestDist = dd; best = c; }
+ }
+ if (labels[i] !== best) { labels[i]! = best; changed = true; }
+ }
+ // Converged: no label changed, so the centres would not move either.
+ if (!changed) break;
+ // Update centers
+ for (const c of centers) c.fill(0);
+ const counts = new Int32Array(k);
+ for (let i = 0; i < n; i++) {
+ const c = labels[i] ?? 0;
+ counts[c]! += 1;
+ const cc = centers[c] as Float64Array;
+ const xi = rows[i] as Float64Array;
+ for (let j = 0; j < d; j++) cc[j]! += xi[j] ?? 0;
+ }
+ // A cluster that received no rows is not divided and stays at the origin.
+ for (let c = 0; c < k; c++) {
+ const cnt = counts[c] ?? 1;
+ if (cnt > 0) {
+ const cc = centers[c] as Float64Array;
+ for (let j = 0; j < d; j++) cc[j]! /= cnt;
+ }
+ }
+ }
+
+ // Inertia: squared distance of every row to the centre it is assigned to.
+ let inertia = 0;
+ for (let i = 0; i < n; i++) {
+ const xi = rows[i] as Float64Array;
+ const cc = centers[labels[i] ?? 0] as Float64Array;
+ for (let j = 0; j < d; j++) inertia += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
+ }
+ if (inertia < bestInertia) {
+ bestInertia = inertia;
+ bestLabels = Int32Array.from(labels);
+ }
+ }
+ return bestLabels;
+}
+
+/**
+ * Spectral clustering on an RBF affinity matrix.
+ *
+ * `fit` builds the affinity matrix, normalises it symmetrically, takes
+ * `nClusters` vectors from the power iteration as an embedding, scales every
+ * embedded row to unit length and clusters the rows with k-means.
+ */
+export class SpectralClustering {
+  /** Number of clusters and of embedding dimensions. */
+  nClusters: number;
+  /** Number of k-means restarts on the embedding. */
+  nInit: number;
+  /** RBF kernel coefficient. */
+  gamma: number;
+
+  /** Training labels; `null` before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Affinity matrix computed by the last `fit`; `null` before `fit`. */
+  affinityMatrix_: Float64Array[] | null = null;
+
+  constructor(opts: SpectralClusteringOptions = {}) {
+    this.nClusters = opts.nClusters ?? 8;
+    this.nInit = opts.nInit ?? 10;
+    this.gamma = opts.gamma ?? 1.0;
+  }
+
+  /** Clusters `X` and stores `affinityMatrix_` and `labels_`. */
+  fit(X: Float64Array[]): this {
+    const affinity = computeAffinityMatrix(X, this.gamma);
+    this.affinityMatrix_ = affinity;
+    const normalized = symmetricNormalizedLaplacian(affinity);
+    const eigvecs = powerIterationEigenvectors(normalized, this.nClusters);
+    const dims = this.nClusters;
+
+    // Row i of the embedding collects component i of every vector and is then
+    // scaled to unit length (an all-zero row is left untouched).
+    const embedding: Float64Array[] = [];
+    for (let i = 0; i < X.length; i++) {
+      const point = new Float64Array(dims);
+      let sumSq = 0;
+      for (let c = 0; c < dims; c++) {
+        point[c]! = (eigvecs[c] as Float64Array)[i] ?? 0;
+        sumSq += (point[c] ?? 0) ** 2;
+      }
+      const scale = Math.sqrt(sumSq) || 1;
+      for (let c = 0; c < dims; c++) point[c]! /= scale;
+      embedding.push(point);
+    }
+
+    this.labels_ = kmeansOnRows(embedding, this.nClusters, 100, this.nInit);
+    return this;
+  }
+
+  /** Fits the model and returns the training labels. */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_ as Int32Array;
+  }
+}
+
+// ─── MeanShift ────────────────────────────────────────────────────────────────
+
+/** Options accepted by the `MeanShift` constructor. */
+export interface MeanShiftOptions {
+ /** Gaussian kernel bandwidth, also the merge radius for converged seeds (default 1.0). */
+ bandwidth?: number;
+ /** Maximum number of shift iterations per seed (default 300). */
+ maxIter?: number;
+ /** A seed stops once it moves by less than this distance (default 1e-3). */
+ tol?: number;
+}
+
+/**
+ * Gaussian kernel weight exp(-dist2 / (2 * bandwidth^2)) for a squared
+ * distance `dist2`.
+ */
+function gaussianKernelWeight(dist2: number, bandwidth: number): number {
+  const twoSigmaSq = 2 * bandwidth * bandwidth;
+  return Math.exp(-dist2 / twoSigmaSq);
+}
+
+/**
+ * Mean Shift clustering with a Gaussian kernel and a fixed bandwidth.
+ *
+ * Every sample starts as a seed; each seed is shifted to the kernel-weighted
+ * mean of all samples until it moves by less than `tol`. Converged seeds
+ * closer than one bandwidth to an already accepted centre are dropped.
+ */
+export class MeanShift {
+ bandwidth: number;
+ maxIter: number;
+ tol: number;
+
+ // Accepted centres and training labels; both null until fit() has run.
+ clusterCenters_: Float64Array[] | null = null;
+ labels_: Int32Array | null = null;
+
+ constructor(opts: MeanShiftOptions = {}) {
+ this.bandwidth = opts.bandwidth ?? 1.0;
+ this.maxIter = opts.maxIter ?? 300;
+ this.tol = opts.tol ?? 1e-3;
+ }
+
+ /** Runs mean shift on `X` and stores `clusterCenters_` and `labels_`. */
+ fit(X: Float64Array[]): this {
+ const n = X.length;
+ const d = X[0]?.length ?? 0;
+ // Initialize one seed per point
+ const seeds: Float64Array[] = X.map((x) => Float64Array.from(x));
+
+ // Each seed is iterated to convergence on its own and updated in place.
+ for (const seed of seeds) {
+ for (let iter = 0; iter < this.maxIter; iter++) {
+ const newSeed = new Float64Array(d);
+ let totalWeight = 0;
+ for (const xi of X) {
+ let dist2 = 0;
+ for (let j = 0; j < d; j++) dist2 += ((seed[j] ?? 0) - (xi[j] ?? 0)) ** 2;
+ const w = gaussianKernelWeight(dist2, this.bandwidth);
+ totalWeight += w;
+ for (let j = 0; j < d; j++) newSeed[j]! += w * (xi[j] ?? 0);
+ }
+ if (totalWeight > 0) {
+ for (let j = 0; j < d; j++) newSeed[j]! /= totalWeight;
+ }
+ // The shift is measured before the seed is overwritten.
+ let shift = 0;
+ for (let j = 0; j < d; j++) shift += ((newSeed[j] ?? 0) - (seed[j] ?? 0)) ** 2;
+ for (let j = 0; j < d; j++) seed[j]! = newSeed[j] ?? 0;
+ if (Math.sqrt(shift) < this.tol) break;
+ }
+ }
+
+ // Merge nearby seeds: the first seed seen in a neighbourhood becomes its centre unchanged
+ const mergedCenters: Float64Array[] = [];
+ for (const seed of seeds) {
+ let merged = false;
+ for (const center of mergedCenters) {
+ let dist2 = 0;
+ for (let j = 0; j < d; j++) dist2 += ((seed[j] ?? 0) - (center[j] ?? 0)) ** 2;
+ if (Math.sqrt(dist2) < this.bandwidth) { merged = true; break; }
+ }
+ if (!merged) mergedCenters.push(Float64Array.from(seed));
+ }
+
+ this.clusterCenters_ = mergedCenters;
+
+ // Assign labels: index of the nearest centre (squared Euclidean distance)
+ const labels = new Int32Array(n);
+ for (let i = 0; i < n; i++) {
+ const xi = X[i] as Float64Array;
+ let best = 0;
+ let bestDist = Number.POSITIVE_INFINITY;
+ for (let c = 0; c < mergedCenters.length; c++) {
+ const cc = mergedCenters[c] as Float64Array;
+ let dist2 = 0;
+ for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
+ if (dist2 < bestDist) { bestDist = dist2; best = c; }
+ }
+ labels[i]! = best;
+ }
+ this.labels_ = labels;
+ return this;
+ }
+
+ /** Fits the model and returns the training labels. */
+ fitPredict(X: Float64Array[]): Int32Array {
+ this.fit(X);
+ return this.labels_ as Int32Array;
+ }
+
+ /**
+ * Index of the nearest fitted centre for every row of `X`.
+ * Throws NotFittedError when called before `fit`.
+ */
+ predict(X: Float64Array[]): Int32Array {
+ if (!this.clusterCenters_) throw new NotFittedError("MeanShift");
+ const n = X.length;
+ // The dimension is taken from the query data, not from the fitted centres.
+ const d = X[0]?.length ?? 0;
+ const labels = new Int32Array(n);
+ for (let i = 0; i < n; i++) {
+ const xi = X[i] as Float64Array;
+ let best = 0;
+ let bestDist = Number.POSITIVE_INFINITY;
+ for (let c = 0; c < this.clusterCenters_.length; c++) {
+ const cc = this.clusterCenters_[c] as Float64Array;
+ let dist2 = 0;
+ for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
+ if (dist2 < bestDist) { bestDist = dist2; best = c; }
+ }
+ labels[i]! = best;
+ }
+ return labels;
+ }
+}
+
+// ─── Birch ────────────────────────────────────────────────────────────────────
+
+/** Options accepted by the `Birch` constructor. */
+export interface BirchOptions {
+ /** Maximum distance from a subcluster centroid at which a sample is absorbed into it (default 0.5). */
+ threshold?: number;
+ /** Stored on the instance (default 50). NOTE(review): not read by `Birch.fit` in this file. */
+ branchingFactor?: number;
+ /** Number of final clusters formed from the subclusters (default 3). */
+ nClusters?: number;
+}
+
+/** Running summary of one subcluster maintained by `Birch.fit`. */
+interface CFEntry {
+ /** Number of samples absorbed so far. */
+ n: number;
+ /** Per-feature sum of the absorbed samples; the centroid is ls / n. */
+ ls: Float64Array;
+ /** Sum of the squared feature values of the absorbed samples. */
+ ss: number;
+}
+
+/**
+ * Simplified BIRCH: samples are absorbed into flat subclusters (no CF tree),
+ * then the subcluster centroids are grouped into `nClusters` clusters with
+ * k-means.
+ */
+export class Birch {
+  /** Maximum distance from a subcluster centroid at which a sample is absorbed. */
+  threshold: number;
+  /** Stored for reference; the flat subcluster list used here does not consult it. */
+  branchingFactor: number;
+  /** Number of final clusters. */
+  nClusters: number;
+
+  /** Final cluster label of every training sample; `null` before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Centroid of every subcluster; `null` before `fit`. */
+  subclusterCenters_: Float64Array[] | null = null;
+  /** Final cluster label of every subcluster, parallel to `subclusterCenters_`. */
+  subclusterLabels_: Int32Array | null = null;
+
+  constructor(opts: BirchOptions = {}) {
+    this.threshold = opts.threshold ?? 0.5;
+    this.branchingFactor = opts.branchingFactor ?? 50;
+    this.nClusters = opts.nClusters ?? 3;
+  }
+
+  /**
+   * Final cluster label for every row of `X`: the label of the nearest
+   * subcluster centroid (squared Euclidean distance over the first `d`
+   * features).
+   */
+  private _labelRows(
+    X: Float64Array[],
+    centers: Float64Array[],
+    centerLabels: Int32Array,
+    d: number,
+  ): Int32Array {
+    const labels = new Int32Array(X.length);
+    for (let i = 0; i < X.length; i++) {
+      const xi = X[i] as Float64Array;
+      let bestIdx = 0;
+      let bestDist = Number.POSITIVE_INFINITY;
+      for (let c = 0; c < centers.length; c++) {
+        const cc = centers[c] as Float64Array;
+        let dist2 = 0;
+        for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (cc[j] ?? 0)) ** 2;
+        if (dist2 < bestDist) {
+          bestDist = dist2;
+          bestIdx = c;
+        }
+      }
+      labels[i] = centerLabels[bestIdx] ?? 0;
+    }
+    return labels;
+  }
+
+  /** Builds the subclusters from `X`, clusters their centroids and labels `X`. */
+  fit(X: Float64Array[]): this {
+    const d = X[0]?.length ?? 0;
+    const entries: CFEntry[] = [];
+
+    for (const xi of X) {
+      const sumSq = xi.reduce((s, v) => s + v * v, 0);
+      // Absorb the sample into the first subcluster whose centroid is within
+      // `threshold`; otherwise it starts a subcluster of its own.
+      let inserted = false;
+      for (const entry of entries) {
+        let dist2 = 0;
+        for (let j = 0; j < d; j++) dist2 += ((xi[j] ?? 0) - (entry.ls[j] ?? 0) / entry.n) ** 2;
+        if (Math.sqrt(dist2) <= this.threshold) {
+          entry.n += 1;
+          for (let j = 0; j < d; j++) entry.ls[j]! += xi[j] ?? 0;
+          entry.ss += sumSq;
+          inserted = true;
+          break;
+        }
+      }
+      if (!inserted) entries.push({ n: 1, ls: Float64Array.from(xi), ss: sumSq });
+    }
+
+    const centers: Float64Array[] = entries.map((e) =>
+      Float64Array.from({ length: d }, (_, j) => (e.ls[j] ?? 0) / e.n),
+    );
+    this.subclusterCenters_ = centers;
+
+    // Use k-means on subcluster centers
+    const k = Math.min(this.nClusters, centers.length);
+    const subclusterLabels = kmeansOnRows(centers, k, 100, 3);
+    // Kept so that `predict` can translate a subcluster index into the same
+    // final label that `labels_` uses.
+    this.subclusterLabels_ = subclusterLabels;
+
+    this.labels_ = this._labelRows(X, centers, subclusterLabels, d);
+    return this;
+  }
+
+  /** Fits the model and returns the training labels. */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_ as Int32Array;
+  }
+
+  /**
+   * Final cluster label for every row of `X`, in the same label space as
+   * `labels_` (previously this returned raw subcluster indices).
+   * @throws NotFittedError when called before `fit`.
+   */
+  predict(X: Float64Array[]): Int32Array {
+    if (!this.subclusterCenters_ || !this.subclusterLabels_) throw new NotFittedError("Birch");
+    const d = X[0]?.length ?? 0;
+    return this._labelRows(X, this.subclusterCenters_, this.subclusterLabels_, d);
+  }
+}
+
+// ─── OPTICS ───────────────────────────────────────────────────────────────────
+
+/** Options accepted by the `OPTICS` constructor. */
+export interface OPTICSOptions {
+ /** Rank in each sample's sorted distance row used as its core distance (default 5). */
+ minSamples?: number;
+ /** Samples whose core distance exceeds this are not expanded (default: positive infinity). */
+ maxEps?: number;
+ /** Fraction of the largest finite reachability used as the labelling threshold (default 0.05). */
+ xi?: number;
+}
+
+/**
+ * OPTICS ordering with a simplified, threshold-based label extraction.
+ *
+ * All pairwise distances are materialised, so memory and time are quadratic
+ * in the number of samples.
+ */
+export class OPTICS {
+  /** Neighbourhood size, the point itself included, that defines a core distance. */
+  minSamples: number;
+  /** Largest distance at which one sample can reach or expand another. */
+  maxEps: number;
+  /** Fraction of the largest finite reachability used as the labelling threshold. */
+  xi: number;
+
+  /** Cluster label per sample (-1 = noise); `null` before `fit`. */
+  labels_: Int32Array | null = null;
+  /** Reachability distance per sample (positive infinity when never reached). */
+  reachabilityDistances_: Float64Array | null = null;
+  /** Core distance per sample. */
+  coreDistances_: Float64Array | null = null;
+  /** Sample indices in processing order. */
+  ordering_: Int32Array | null = null;
+
+  constructor(opts: OPTICSOptions = {}) {
+    this.minSamples = opts.minSamples ?? 5;
+    this.maxEps = opts.maxEps ?? Number.POSITIVE_INFINITY;
+    this.xi = opts.xi ?? 0.05;
+  }
+
+  /** Computes ordering, core and reachability distances, and labels for `X`. */
+  fit(X: Float64Array[]): this {
+    const INF = Number.POSITIVE_INFINITY;
+    const n = X.length;
+    const d = X[0]?.length ?? 0;
+    const { minSamples, maxEps } = this;
+
+    const dist = (a: Float64Array, b: Float64Array): number => {
+      let s = 0;
+      for (let i = 0; i < d; i++) s += ((a[i] ?? 0) - (b[i] ?? 0)) ** 2;
+      return Math.sqrt(s);
+    };
+
+    // Compute all pairwise distances (for small datasets)
+    const dists: Float64Array[] = Array.from({ length: n }, (_, i) =>
+      Float64Array.from({ length: n }, (__, j) => dist(X[i] as Float64Array, X[j] as Float64Array)),
+    );
+
+    // Core distance: distance to the minSamples-th nearest sample, counting
+    // the point itself. The sorted row starts with the self-distance 0, so
+    // that neighbour sits at index minSamples - 1 (not minSamples).
+    const coreDist = new Float64Array(n);
+    const kth = Math.max(minSamples - 1, 0);
+    for (let i = 0; i < n; i++) {
+      const sorted = Float64Array.from(dists[i] as Float64Array).sort();
+      coreDist[i] = sorted[kth] ?? INF;
+    }
+
+    const processed = new Uint8Array(n);
+    // queued[j] = 1 once j has entered `seeds`; replaces an O(n) includes().
+    const queued = new Uint8Array(n);
+    const reachDist = new Float64Array(n).fill(INF);
+    const ordering: number[] = [];
+    const seeds: number[] = [];
+
+    // Lowers the reachability of every unprocessed sample within maxEps of `idx`.
+    const updateSeeds = (idx: number): void => {
+      const cd = coreDist[idx] ?? INF;
+      const row = dists[idx] as Float64Array;
+      for (let j = 0; j < n; j++) {
+        if (processed[j]) continue;
+        const dj = row[j] ?? INF;
+        // Samples beyond maxEps are outside the neighbourhood and stay unreached.
+        if (dj > maxEps) continue;
+        const newRD = Math.max(cd, dj);
+        if (newRD < (reachDist[j] ?? INF)) {
+          reachDist[j] = newRD;
+          if (!queued[j]) {
+            queued[j] = 1;
+            seeds.push(j);
+          }
+        }
+      }
+    };
+
+    for (let start = 0; start < n; start++) {
+      if (processed[start]) continue;
+      processed[start] = 1;
+      ordering.push(start);
+      if ((coreDist[start] ?? INF) > maxEps) continue;
+
+      updateSeeds(start);
+      while (seeds.length > 0) {
+        // Pick seed with minimum reachability distance
+        let minIdx = 0;
+        let minRD = INF;
+        for (let s = 0; s < seeds.length; s++) {
+          const rd = reachDist[seeds[s] ?? 0] ?? INF;
+          if (rd < minRD) {
+            minRD = rd;
+            minIdx = s;
+          }
+        }
+        const q = seeds[minIdx] ?? 0;
+        seeds.splice(minIdx, 1);
+        if (processed[q]) continue;
+        processed[q] = 1;
+        ordering.push(q);
+        if ((coreDist[q] ?? INF) <= maxEps) updateSeeds(q);
+      }
+    }
+
+    // Assign labels via xi-cluster extraction (simplified: threshold-based).
+    // Consecutive samples in the ordering whose reachability is at most
+    // xi * (largest finite reachability) share a cluster; anything above the
+    // threshold is noise and closes the current cluster.
+    const labels = new Int32Array(n).fill(-1);
+    let clusterId = 0;
+    const eps = this.xi * reachDist.reduce((mx, v) => Math.max(mx, Number.isFinite(v) ? v : 0), 0);
+    let currentCluster = -1;
+    for (const idx of ordering) {
+      const rd = reachDist[idx] ?? INF;
+      if (rd <= eps && (coreDist[idx] ?? INF) <= maxEps) {
+        if (currentCluster === -1) currentCluster = clusterId++;
+        labels[idx] = currentCluster;
+      } else {
+        currentCluster = -1;
+      }
+    }
+
+    this.labels_ = labels;
+    this.reachabilityDistances_ = reachDist;
+    this.coreDistances_ = coreDist;
+    this.ordering_ = Int32Array.from(ordering);
+    return this;
+  }
+
+  /** Fits the model and returns the labels. */
+  fitPredict(X: Float64Array[]): Int32Array {
+    this.fit(X);
+    return this.labels_ as Int32Array;
+  }
+}
diff --git a/src/cluster/ward.ts b/src/cluster/ward.ts
new file mode 100644
index 0000000..de0a6ad
--- /dev/null
+++ b/src/cluster/ward.ts
@@ -0,0 +1,186 @@
+/**
+ * Ward linkage and hierarchical clustering utilities.
+ * Mirrors scipy.cluster.hierarchy (linkage, fcluster, dendrogram helpers)
+ * as used within sklearn.cluster.AgglomerativeClustering.
+ */
+
+/**
+ * One merge step recorded by `wardLinkage`.
+ * Ids below the number of samples denote original samples; each merge creates
+ * the next id in sequence.
+ */
+export interface LinkageRow {
+ /** Id of the first cluster merged in this step. */
+ clusterA: number;
+ /** Id of the second cluster merged in this step. */
+ clusterB: number;
+ /** Ward distance between the two clusters when they were merged. */
+ distance: number;
+ /** Number of original samples contained in the merged cluster. */
+ size: number;
+}
+
+/**
+ * Compute the Ward linkage for a dataset (naive implementation: every merge
+ * rescans all pairs of active clusters).
+ *
+ * Each sample starts as its own cluster whose id is its row index. On every
+ * iteration the pair of active clusters with the smallest Ward distance is
+ * merged and the merged cluster receives the next free id (`n`, `n + 1`, ...).
+ *
+ * @param X Samples, one `Float64Array` per row. Centroids are computed with the
+ *   width of `X[0]`.
+ * @returns One row per merge, in merge order; empty when fewer than two
+ *   samples are given.
+ */
+export function wardLinkage(X: Float64Array[]): LinkageRow[] {
+  const n = X.length;
+  if (n < 2) return [];
+
+  // Member sample indices and centroid of every cluster id created so far.
+  const clusterPoints: Map<number, number[]> = new Map();
+  const centroids: Map<number, Float64Array> = new Map();
+  for (let i = 0; i < n; i++) {
+    clusterPoints.set(i, [i]);
+    centroids.set(i, new Float64Array(X[i]!));
+  }
+
+  let nextCluster = n;
+  const result: LinkageRow[] = [];
+  const activeClusters = new Set<number>(Array.from({ length: n }, (_, i) => i));
+
+  /** Mean of the samples listed in `indices`. */
+  function centroid(indices: number[]): Float64Array {
+    const d = X[0]!.length;
+    const c = new Float64Array(d);
+    for (const idx of indices) {
+      const pt = X[idx]!;
+      for (let j = 0; j < d; j++) c[j]! += pt[j] ?? 0;
+    }
+    for (let j = 0; j < d; j++) c[j]! /= indices.length;
+    return c;
+  }
+
+  /**
+   * Ward distance between clusters `a` and `b`:
+   * sqrt(na * nb / (na + nb) * ||ca - cb||^2).
+   * NOTE(review): scipy's "ward" method appears to report
+   * sqrt(2 * na * nb / (na + nb)) * ||ca - cb||, i.e. sqrt(2) times this
+   * value; the merge order is unaffected, but confirm whether callers expect
+   * scipy-compatible distances.
+   */
+  function wardDist(a: number, b: number): number {
+    const pa = clusterPoints.get(a)!;
+    const pb = clusterPoints.get(b)!;
+    const na = pa.length;
+    const nb = pb.length;
+    const ca = centroids.get(a)!;
+    const cb = centroids.get(b)!;
+    let sq = 0;
+    for (let j = 0; j < ca.length; j++) {
+      const diff = (ca[j] ?? 0) - (cb[j] ?? 0);
+      sq += diff * diff;
+    }
+    return Math.sqrt((na * nb) / (na + nb) * sq);
+  }
+
+  while (activeClusters.size > 1) {
+    // Find the closest pair among the active clusters (first minimum wins).
+    const active = [...activeClusters];
+    let minDist = Number.POSITIVE_INFINITY;
+    let bestA = -1;
+    let bestB = -1;
+    for (let i = 0; i < active.length; i++) {
+      for (let j = i + 1; j < active.length; j++) {
+        const d = wardDist(active[i]!, active[j]!);
+        if (d < minDist) {
+          minDist = d;
+          bestA = active[i]!;
+          bestB = active[j]!;
+        }
+      }
+    }
+
+    // Register the merged cluster and recompute its centroid from its members.
+    const pA = clusterPoints.get(bestA)!;
+    const pB = clusterPoints.get(bestB)!;
+    const merged = [...pA, ...pB];
+    clusterPoints.set(nextCluster, merged);
+    centroids.set(nextCluster, centroid(merged));
+
+    result.push({ clusterA: bestA, clusterB: bestB, distance: minDist, size: merged.length });
+    activeClusters.delete(bestA);
+    activeClusters.delete(bestB);
+    activeClusters.add(nextCluster);
+    nextCluster++;
+  }
+
+  return result;
+}
+
+/**
+ * Flatten a linkage into cluster labels (fcluster with criterion='maxclust').
+ *
+ * The first `nPoints - nClusters` merges of `linkage` are replayed with a
+ * union-find structure; the connected groups that remain are the flat
+ * clusters. This assumes `linkage` is in merge order, with row `k` creating
+ * cluster id `nPoints + k` (the layout `wardLinkage` produces).
+ *
+ * @param linkage Merge rows in merge order.
+ * @param nClusters Requested number of clusters. Values below 1 are treated
+ *   as 1; values of at least `nPoints` give every point its own label.
+ * @param nPoints Number of original samples.
+ * @returns Labels in `0 .. k-1`, numbered by first appearance in sample order.
+ */
+export function fcluster(linkage: LinkageRow[], nClusters: number, nPoints: number): Int32Array {
+  const labels = new Int32Array(nPoints);
+  if (nClusters >= nPoints) {
+    for (let i = 0; i < nPoints; i++) labels[i] = i;
+    return labels;
+  }
+
+  // Every merge removes one cluster, so undoing the last (target - 1) merges
+  // is the same as applying only the first (nPoints - target).
+  const target = nClusters >= 1 ? Math.floor(nClusters) : 1;
+  const mergesToApply = Math.max(0, Math.min(linkage.length, nPoints - target));
+
+  // Union-find over sample ids and the cluster ids created by applied merges.
+  const parent = new Int32Array(nPoints + mergesToApply);
+  for (let i = 0; i < parent.length; i++) parent[i] = i;
+  const find = (start: number): number => {
+    let root = start;
+    while ((parent[root] ?? root) !== root) root = parent[root] ?? root;
+    // Path compression keeps later lookups short.
+    let node = start;
+    while (node !== root) {
+      const next = parent[node] ?? root;
+      parent[node] = root;
+      node = next;
+    }
+    return root;
+  };
+
+  for (let step = 0; step < mergesToApply; step++) {
+    const row = linkage[step]!;
+    const mergedId = nPoints + step;
+    for (const child of [row.clusterA, row.clusterB]) {
+      // Ignore ids that do not refer to an already existing cluster.
+      if (Number.isInteger(child) && child >= 0 && child < mergedId) {
+        parent[find(child)] = mergedId;
+      }
+    }
+  }
+
+  const labelOfRoot = new Map<number, number>();
+  for (let i = 0; i < nPoints; i++) {
+    const root = find(i);
+    let label = labelOfRoot.get(root);
+    if (label === undefined) {
+      label = labelOfRoot.size;
+      labelOfRoot.set(root, label);
+    }
+    labels[i] = label;
+  }
+  return labels;
+}
+
+/**
+ * Compute cophenetic distances from a linkage.
+ *
+ * The cophenetic distance of two samples is the `distance` of the merge that
+ * first puts them in the same cluster. The linkage is walked once in merge
+ * order; at each merge, every pair made of one sample from each side receives
+ * that merge's distance. Row `k` is assumed to create cluster id `nPoints + k`.
+ *
+ * @param linkage Merge rows in merge order.
+ * @param nPoints Number of original samples.
+ * @returns Flat row-major `nPoints x nPoints` symmetric matrix with a zero
+ *   diagonal; pairs that are never merged stay 0.
+ */
+export function copheneticDistances(linkage: LinkageRow[], nPoints: number): Float64Array {
+  const n = nPoints;
+  const dist = new Float64Array(n * n);
+  // members[id] lists the samples in cluster `id`; merges append new entries,
+  // so the cluster created by row k lands at index nPoints + k.
+  const members: number[][] = Array.from({ length: n }, (_, i) => [i]);
+  for (const row of linkage) {
+    // Ids that do not refer to an existing cluster contribute no samples.
+    const left = members[row.clusterA] ?? [];
+    const right = members[row.clusterB] ?? [];
+    for (const a of left) {
+      for (const b of right) {
+        if (a === b) continue;
+        dist[a * n + b] = row.distance;
+        dist[b * n + a] = row.distance;
+      }
+    }
+    members.push([...left, ...right]);
+  }
+  return dist;
+}
+
+export type { LinkageRow as WardLinkageRow };
diff --git a/src/compose/column_selector.ts b/src/compose/column_selector.ts
new file mode 100644
index 0000000..7ef2980
--- /dev/null
+++ b/src/compose/column_selector.ts
@@ -0,0 +1,107 @@
+/**
+ * make_column_selector and related column-selection helpers for ColumnTransformer.
+ * Analogous to sklearn.compose._column.make_column_selector.
+ */
+
+/**
+ * Column selector predicate: receives a column's index and name and returns
+ * true for columns to include.
+ */
+export type ColumnSelectorFn = (colIndex: number, colName: string) => boolean;
+
+/** Options for makeColumnSelector. */
+export interface MakeColumnSelectorOptions {
+ /**
+ * String pattern or regex that column names must match. A string is matched
+ * as a substring; a RegExp is tested against the column name.
+ * Leave undefined to match all columns.
+ */
+ pattern?: string | RegExp;
+ /**
+ * If provided, only include columns whose dtype matches one of these strings.
+ * Uses the dtypes map passed to the returned selector; a column with no entry
+ * in that map is not filtered out by this option.
+ * Supported values: "number", "string", "boolean".
+ */
+ dtypeInclude?: string[];
+ /**
+ * If provided, exclude columns whose dtype matches one of these.
+ * A column with no entry in the dtypes map is not excluded.
+ */
+ dtypeExclude?: string[];
+}
+
+/**
+ * Returns a column-selector callable, analogous to sklearn's `make_column_selector`.
+ *
+ * The returned function accepts `(colNames: string[], dtypes?: Record<string, string>)`
+ * and returns the indices of the columns that pass every configured filter, in
+ * ascending order.
+ *
+ * - `pattern`: a string must occur in the column name; a RegExp must match it.
+ *   Matching does not depend on (or modify) the regex's `lastIndex`, so regexes
+ *   with the `g` or `y` flag behave the same for every column.
+ * - `dtypeInclude` / `dtypeExclude`: applied only to columns that have an entry
+ *   in `dtypes`; columns with an unknown dtype pass both filters.
+ *
+ * @param options Filter configuration; the default selects every column.
+ */
+export function makeColumnSelector(
+  options: MakeColumnSelectorOptions = {},
+): (colNames: string[], dtypes?: Record<string, string>) => number[] {
+  const { pattern, dtypeInclude, dtypeExclude } = options;
+
+  const matchesPattern = (name: string): boolean => {
+    if (pattern === undefined) return true;
+    if (typeof pattern === "string") return name.includes(pattern);
+    // `search` always starts at index 0 and restores `lastIndex`, unlike
+    // `test`, which resumes from `lastIndex` for global/sticky regexes.
+    return name.search(pattern) !== -1;
+  };
+
+  return (colNames: string[], dtypes?: Record<string, string>): number[] => {
+    const result: number[] = [];
+    for (let i = 0; i < colNames.length; i++) {
+      const name = colNames[i]!;
+      if (!matchesPattern(name)) continue;
+
+      const dtype = dtypes?.[name];
+      if (dtype !== undefined) {
+        if (dtypeInclude !== undefined && !dtypeInclude.includes(dtype)) continue;
+        if (dtypeExclude !== undefined && dtypeExclude.includes(dtype)) continue;
+      }
+
+      result.push(i);
+    }
+    return result;
+  };
+}
+
+/**
+ * Returns the indices of the columns whose dtype is "number".
+ * Convenience wrapper around makeColumnSelector with `dtypeInclude: ["number"]`.
+ *
+ * @param colNames Column names, in column order.
+ * @param dtypes Map from column name to dtype string.
+ */
+export function numericColumns(
+  colNames: string[],
+  dtypes: Record<string, string>,
+): number[] {
+  const selectNumeric = makeColumnSelector({ dtypeInclude: ["number"] });
+  return selectNumeric(colNames, dtypes);
+}
+
+/**
+ * Returns the indices of the columns whose dtype is "string" (treated as
+ * categorical).
+ * Convenience wrapper around makeColumnSelector with `dtypeInclude: ["string"]`.
+ *
+ * @param colNames Column names, in column order.
+ * @param dtypes Map from column name to dtype string.
+ */
+export function categoricalColumns(
+  colNames: string[],
+  dtypes: Record<string, string>,
+): number[] {
+  const selectCategorical = makeColumnSelector({ dtypeInclude: ["string"] });
+  return selectCategorical(colNames, dtypes);
+}
+
+/**
+ * Copies a subset of columns out of a flat row-major matrix.
+ *
+ * @param X Flat Float64Array of shape (nSamples × nColsIn).
+ * @param nSamples Number of rows.
+ * @param nColsIn Number of columns in X.
+ * @param cols Column indices to select, in output order.
+ * @returns New Float64Array of shape (nSamples × cols.length).
+ */
+export function selectColumns(
+  X: Float64Array,
+  nSamples: number,
+  nColsIn: number,
+  cols: number[],
+): Float64Array {
+  const width = cols.length;
+  const selected = new Float64Array(nSamples * width);
+  let write = 0;
+  for (let row = 0; row < nSamples; row++) {
+    const rowStart = row * nColsIn;
+    for (const col of cols) {
+      selected[write] = X[rowStart + col]!;
+      write++;
+    }
+  }
+  return selected;
+}
diff --git a/src/compose/column_transformer.ts b/src/compose/column_transformer.ts
new file mode 100644
index 0000000..aebbab1
--- /dev/null
+++ b/src/compose/column_transformer.ts
@@ -0,0 +1,102 @@
+/**
+ * ColumnTransformer: applies transformers to columns of an array.
+ * Mirrors sklearn.compose.ColumnTransformer.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Minimal transformer contract used by ColumnTransformer, which calls `fit`
+ * and `transform` with the sub-matrix of the columns assigned to the step.
+ */
+export interface Transformer {
+ fit(X: Float64Array[]): this;
+ transform(X: Float64Array[]): Float64Array[];
+ // Optional; ColumnTransformer itself does not call it.
+ fitTransform?(X: Float64Array[]): Float64Array[];
+}
+
+/** Column selection: a single column index, a list of indices, or "all" for every column. */
+export type ColumnSpec = number | number[] | "all";
+
+/**
+ * Applies a transformer to each configured group of columns and concatenates
+ * the results horizontally, in the order the steps were given.
+ *
+ * Each step is `[name, transformer, columns]` where the transformer is a
+ * `Transformer`, `"passthrough"` (copy the columns unchanged) or `"drop"`
+ * (omit them). Columns not named by any step, dropped ones included, are
+ * handled by `remainder`: `"drop"` (default) omits them, `"passthrough"`
+ * appends them after the step outputs.
+ */
+export class ColumnTransformer {
+  transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][];
+  remainder: "passthrough" | "drop";
+
+  /** Steps kept by the last `fit` (everything except "drop" steps). */
+  transformers_: [string, Transformer | "passthrough", ColumnSpec][] = [];
+  private _nFeatures = 0;
+  /** Columns claimed by any step, including "drop" steps. */
+  private _allCols = new Set<number>();
+  private _fitted = false;
+
+  constructor(
+    transformers: [string, Transformer | "passthrough" | "drop", ColumnSpec][],
+    options: { remainder?: "passthrough" | "drop" } = {},
+  ) {
+    this.transformers = transformers;
+    this.remainder = options.remainder ?? "drop";
+  }
+
+  /** Expands a column spec into an explicit list of column indices. */
+  private _getCols(spec: ColumnSpec, nFeatures: number): number[] {
+    if (spec === "all") return Array.from({ length: nFeatures }, (_, i) => i);
+    if (typeof spec === "number") return [spec];
+    return spec;
+  }
+
+  /** Copies the given columns of every row; missing entries become 0. */
+  private _subset(X: Float64Array[], cols: number[]): Float64Array[] {
+    return X.map((row) => new Float64Array(cols.map((c) => row[c] ?? 0)));
+  }
+
+  /**
+   * Fits every non-drop transformer on its own columns.
+   * The feature count is taken from the first row of X.
+   */
+  fit(X: Float64Array[]): this {
+    const n = (X[0] ?? new Float64Array(0)).length;
+    this._nFeatures = n;
+    this._allCols.clear();
+
+    this.transformers_ = [];
+    for (const [name, t, spec] of this.transformers) {
+      const cols = this._getCols(spec, n);
+      // Record the columns before skipping "drop" steps so that dropped
+      // columns are not picked up again by remainder="passthrough".
+      for (const c of cols) this._allCols.add(c);
+      if (t === "drop") continue;
+
+      if (t === "passthrough") {
+        this.transformers_.push([name, "passthrough", spec]);
+      } else {
+        t.fit(this._subset(X, cols));
+        this.transformers_.push([name, t, spec]);
+      }
+    }
+    this._fitted = true;
+    return this;
+  }
+
+  /**
+   * Transforms X with the fitted steps and concatenates the outputs row by row.
+   *
+   * @throws NotFittedError if `fit` has not been called.
+   */
+  transform(X: Float64Array[]): Float64Array[] {
+    if (!this._fitted) throw new NotFittedError("ColumnTransformer");
+    const n = (X[0] ?? new Float64Array(0)).length;
+    const parts: Float64Array[][] = [];
+
+    for (const [, t, spec] of this.transformers_) {
+      const Xsub = this._subset(X, this._getCols(spec, n));
+      parts.push(t === "passthrough" ? Xsub : t.transform(Xsub));
+    }
+
+    if (this.remainder === "passthrough") {
+      const remainderCols: number[] = [];
+      for (let c = 0; c < n; c++) {
+        if (!this._allCols.has(c)) remainderCols.push(c);
+      }
+      if (remainderCols.length > 0) parts.push(this._subset(X, remainderCols));
+    }
+
+    // Horizontally concatenate the per-step outputs.
+    return X.map((_, i) => {
+      const rowParts = parts.map((p) => p[i] ?? new Float64Array(0));
+      const total = rowParts.reduce((s, r) => s + r.length, 0);
+      const result = new Float64Array(total);
+      let offset = 0;
+      for (const part of rowParts) {
+        result.set(part, offset);
+        offset += part.length;
+      }
+      return result;
+    });
+  }
+
+  /** Equivalent to `fit(X)` followed by `transform(X)`. */
+  fitTransform(X: Float64Array[]): Float64Array[] {
+    return this.fit(X).transform(X);
+  }
+}
diff --git a/src/compose/index.ts b/src/compose/index.ts
new file mode 100644
index 0000000..855943e
--- /dev/null
+++ b/src/compose/index.ts
@@ -0,0 +1,3 @@
+export * from "./column_transformer.js";
+export * from "./transformed_target.js";
+export * from "./column_selector.js";
diff --git a/src/compose/transformed_target.ts b/src/compose/transformed_target.ts
new file mode 100644
index 0000000..e7b60a5
--- /dev/null
+++ b/src/compose/transformed_target.ts
@@ -0,0 +1,117 @@
+/**
+ * TransformedTargetRegressor.
+ * Mirrors sklearn.compose.TransformedTargetRegressor.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Target transformer contract used by TransformedTargetRegressor: it is fitted
+ * on y, used to transform y before training, and its `inverseTransform` is
+ * applied to the regressor's predictions.
+ */
+export interface TransformableTarget {
+ fit(y: Float64Array): this;
+ transform(y: Float64Array): Float64Array;
+ inverseTransform(y: Float64Array): Float64Array;
+}
+
+/** Regressor contract: trained with `fit(X, y)`, queried with `predict(X)`. */
+export interface FittableRegressor {
+ fit(X: Float64Array[], y: Float64Array): this;
+ predict(X: Float64Array[]): Float64Array;
+}
+
+/** Constructor options for TransformedTargetRegressor. */
+export interface TransformedTargetRegressorOptions {
+ // Regressor to train; a built-in default regressor is created when omitted.
+ regressor?: FittableRegressor;
+ // Transformer fitted on y; only used when `func` is not given.
+ transformer?: TransformableTarget;
+ // Function applied to y before training; takes precedence over `transformer`.
+ func?: (y: Float64Array) => Float64Array;
+ // Function applied to predictions; takes precedence over the transformer's inverse.
+ inverseFunc?: (y: Float64Array) => Float64Array;
+ // NOTE(review): accepted but not read by the TransformedTargetRegressor constructor.
+ checkInverse?: boolean;
+}
+
+/**
+ * Regressor that is trained on a transformed target and maps its predictions
+ * back to the original target space.
+ *
+ * Forward transform used by `fit`: `func` if set, otherwise the `transformer`
+ * option (fitted on y), otherwise the identity.
+ * Inverse used by `predict`: `inverseFunc` if set, otherwise the transformer
+ * fitted by the last `fit`, otherwise the identity.
+ */
+export class TransformedTargetRegressor {
+  /** Regressor trained by the last `fit`, or null before fitting. */
+  regressor_: FittableRegressor | null = null;
+  /** Transformer fitted by the last `fit`, or null if that fit did not use one. */
+  transformer_: TransformableTarget | null = null;
+  func: ((y: Float64Array) => Float64Array) | null;
+  inverseFunc: ((y: Float64Array) => Float64Array) | null;
+
+  private regressorOpt: FittableRegressor | null;
+  private transformerOpt: TransformableTarget | null;
+
+  constructor(opts: TransformedTargetRegressorOptions = {}) {
+    this.regressorOpt = opts.regressor ?? null;
+    this.transformerOpt = opts.transformer ?? null;
+    this.func = opts.func ?? null;
+    this.inverseFunc = opts.inverseFunc ?? null;
+  }
+
+  /** Transforms y, then trains the regressor on (X, transformed y). */
+  fit(X: Float64Array[], y: Float64Array): this {
+    // Forget the transformer of any earlier fit: if this fit takes the `func`
+    // or identity branch, `predict` must not invert with a stale transformer.
+    this.transformer_ = null;
+
+    let yTrans: Float64Array;
+    if (this.func) {
+      yTrans = this.func(y);
+    } else if (this.transformerOpt) {
+      this.transformer_ = this.transformerOpt;
+      this.transformer_.fit(y);
+      yTrans = this.transformer_.transform(y);
+    } else {
+      // Default: identity (copied so the regressor never sees the caller's array).
+      yTrans = Float64Array.from(y);
+    }
+
+    const reg = this.regressorOpt ?? createDefaultRegressor();
+    this.regressor_ = reg;
+    reg.fit(X, yTrans);
+    return this;
+  }
+
+  /**
+   * Predicts in the transformed space and maps the result back.
+   *
+   * @throws NotFittedError if `fit` has not been called.
+   */
+  predict(X: Float64Array[]): Float64Array {
+    if (!this.regressor_) throw new NotFittedError("TransformedTargetRegressor");
+    const predsTrans = this.regressor_.predict(X);
+
+    if (this.inverseFunc) return this.inverseFunc(predsTrans);
+    if (this.transformer_) return this.transformer_.inverseTransform(predsTrans);
+    return predsTrans;
+  }
+
+  /**
+   * Coefficient of determination (R^2) of the predictions on X against y.
+   * Returns 1 when y has zero total variance.
+   */
+  score(X: Float64Array[], y: Float64Array): number {
+    const preds = this.predict(X);
+    const mean = y.reduce((s, v) => s + v, 0) / y.length;
+    let ssRes = 0;
+    let ssTot = 0;
+    for (let i = 0; i < y.length; i++) {
+      ssRes += ((y[i] ?? 0) - (preds[i] ?? 0)) ** 2;
+      ssTot += ((y[i] ?? 0) - mean) ** 2;
+    }
+    return ssTot === 0 ? 1 : 1 - ssRes / ssTot;
+  }
+}
+
+/**
+ * Builds the fallback regressor used when no regressor option is supplied: a
+ * linear model trained by per-sample gradient steps on the squared error
+ * (learning rate 0.01, 200 passes over the data).
+ *
+ * Every `fit` starts from zero coefficients and a zero intercept, so refitting
+ * does not depend on earlier fits.
+ */
+function createDefaultRegressor(): FittableRegressor {
+  let coef: Float64Array | null = null;
+  let intercept = 0;
+  return {
+    fit(X: Float64Array[], y: Float64Array) {
+      const n = X.length;
+      const d = X[0]?.length ?? 0;
+      const weights = new Float64Array(d);
+      coef = weights;
+      // Reset together with the coefficients; otherwise a second fit would
+      // warm-start from the previous intercept.
+      intercept = 0;
+      const lr = 0.01;
+      for (let iter = 0; iter < 200; iter++) {
+        for (let i = 0; i < n; i++) {
+          const xi = X[i] as Float64Array;
+          let pred = intercept;
+          for (let j = 0; j < d; j++) pred += (weights[j] ?? 0) * (xi[j] ?? 0);
+          const err = (y[i] ?? 0) - pred;
+          intercept += lr * err;
+          for (let j = 0; j < d; j++) weights[j] = (weights[j] ?? 0) + lr * err * (xi[j] ?? 0);
+        }
+      }
+      return this;
+    },
+    predict(X: Float64Array[]) {
+      return Float64Array.from(X, (xi) => {
+        let pred = intercept;
+        for (let j = 0; j < xi.length; j++) pred += (coef![j] ?? 0) * (xi[j] ?? 0);
+        return pred;
+      });
+    },
+  };
+}
diff --git a/src/covariance/covariance.ts b/src/covariance/covariance.ts
new file mode 100644
index 0000000..534223f
--- /dev/null
+++ b/src/covariance/covariance.ts
@@ -0,0 +1,224 @@
+/**
+ * Covariance estimators: EmpiricalCovariance, ShrunkCovariance, LedoitWolf, OAS.
+ * Mirrors sklearn.covariance.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Column-wise mean of X.
+ * The number of columns is taken from the first row; entries missing from a
+ * shorter row count as 0.
+ */
+function colMeans(X: Float64Array[]): Float64Array {
+  const width = X[0]?.length ?? 0;
+  const rowCount = X.length;
+  const sums = new Float64Array(width);
+  for (let r = 0; r < rowCount; r++) {
+    const row = X[r]!;
+    for (let col = 0; col < width; col++) {
+      sums[col] = (sums[col] ?? 0) + (row[col] ?? 0);
+    }
+  }
+  return sums.map((total) => total / rowCount);
+}
+
+/**
+ * Biased empirical covariance matrix of X around `means` (divides by the
+ * number of samples). Only the upper triangle is accumulated; it is mirrored
+ * into the lower triangle after scaling.
+ */
+function empCov(X: Float64Array[], means: Float64Array): Float64Array[] {
+  const count = X.length;
+  const dim = means.length;
+  const cov = Array.from({ length: dim }, () => new Float64Array(dim));
+  const centered = new Float64Array(dim);
+
+  for (const sample of X) {
+    for (let k = 0; k < dim; k++) centered[k] = (sample[k] ?? 0) - (means[k] ?? 0);
+    for (let r = 0; r < dim; r++) {
+      const rowAcc = cov[r]!;
+      const dr = centered[r] ?? 0;
+      for (let c = r; c < dim; c++) {
+        rowAcc[c] = (rowAcc[c] ?? 0) + dr * (centered[c] ?? 0);
+      }
+    }
+  }
+
+  for (let r = 0; r < dim; r++) {
+    const rowAcc = cov[r]!;
+    for (let c = r; c < dim; c++) {
+      const scaled = (rowAcc[c] ?? 0) / count;
+      rowAcc[c] = scaled;
+      if (c !== r) cov[c]![r] = scaled;
+    }
+  }
+  return cov;
+}
+
+/**
+ * Maximum likelihood covariance estimator.
+ * Mirrors sklearn.covariance.EmpiricalCovariance.
+ *
+ * NOTE(review): `score` and `mahalanobis` only read the diagonal of the fitted
+ * covariance, so both behave as if the features were uncorrelated.
+ */
+export class EmpiricalCovariance {
+  assumeCentered: boolean;
+
+  /** Estimated location (all zeros when `assumeCentered` is set); null before `fit`. */
+  location_: Float64Array | null = null;
+  /** Estimated covariance matrix; null before `fit`. */
+  covariance_: Float64Array[] | null = null;
+
+  constructor(options: { assumeCentered?: boolean } = {}) {
+    this.assumeCentered = options.assumeCentered ?? false;
+  }
+
+  /** Estimates the location and the biased empirical covariance of X. */
+  fit(X: Float64Array[]): this {
+    const width = (X[0] ?? new Float64Array(0)).length;
+    const center = this.assumeCentered ? new Float64Array(width) : colMeans(X);
+    this.location_ = center;
+    this.covariance_ = empCov(X, center);
+    return this;
+  }
+
+  /**
+   * Log-likelihood style score of X computed from the diagonal of the fitted
+   * covariance: `-(n * sum(log|c_jj|) + sum of squared standardized residuals) / 2`.
+   *
+   * @throws NotFittedError if `fit` has not been called.
+   */
+  score(X: Float64Array[]): number {
+    const cov = this.covariance_;
+    const loc = this.location_;
+    if (cov === null || loc === null) throw new NotFittedError();
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+
+    // Sum of log variances (diagonal stand-in for the log-determinant).
+    let logdet = 0;
+    for (let i = 0; i < p; i++) {
+      logdet += Math.log(Math.abs(cov[i]![i] ?? 1) + 1e-12);
+    }
+
+    let trace = 0;
+    for (const xi of X) {
+      for (let j = 0; j < p; j++) {
+        const diff = (xi[j] ?? 0) - (loc[j] ?? 0);
+        const variance = cov[j]![j] ?? 1e-12;
+        // A zero variance is replaced by 1e-12 to avoid dividing by zero.
+        trace += diff ** 2 / (variance || 1e-12);
+      }
+    }
+    return -(n * logdet + trace) / 2;
+  }
+
+  /**
+   * Distance of each row of X to the fitted location, using the diagonal of
+   * the fitted covariance: `sqrt(sum_j (x_j - loc_j)^2 / c_jj)`.
+   *
+   * @throws NotFittedError if `fit` has not been called.
+   */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    const cov = this.covariance_;
+    const loc = this.location_;
+    if (cov === null || loc === null) throw new NotFittedError();
+    const p = (X[0] ?? new Float64Array(0)).length;
+    return Float64Array.from(X, (row) => {
+      const xi = row ?? new Float64Array(p);
+      let d = 0;
+      for (let j = 0; j < p; j++) {
+        const diff = (xi[j] ?? 0) - (loc[j] ?? 0);
+        const variance = cov[j]![j] ?? 1e-12;
+        d += diff ** 2 / (variance || 1e-12);
+      }
+      return Math.sqrt(d);
+    });
+  }
+}
+
+/**
+ * Covariance estimator with shrinkage.
+ * Mirrors sklearn.covariance.ShrunkCovariance.
+ *
+ * The empirical covariance S is replaced by
+ * `(1 - shrinkage) * S + shrinkage * mu * I` with `mu = trace(S) / p`:
+ * off-diagonal entries are scaled down and the diagonal is pulled toward the
+ * mean variance.
+ */
+export class ShrunkCovariance extends EmpiricalCovariance {
+  /** Shrinkage coefficient, expected in [0, 1]; defaults to 0.1. */
+  shrinkage: number;
+
+  constructor(options: { assumeCentered?: boolean; shrinkage?: number } = {}) {
+    super(options);
+    this.shrinkage = options.shrinkage ?? 0.1;
+  }
+
+  /** Fits the empirical covariance, then shrinks it in place. */
+  override fit(X: Float64Array[]): this {
+    super.fit(X);
+    const cov = this.covariance_;
+    if (cov !== null) {
+      const p = cov.length;
+      let trace = 0;
+      for (let i = 0; i < p; i++) trace += cov[i]?.[i] ?? 0;
+      const mu = p > 0 ? trace / p : 0;
+      for (let i = 0; i < p; i++) {
+        const row = cov[i]!;
+        for (let j = 0; j < p; j++) {
+          // The diagonal also receives the shrinkage target mu.
+          row[j] = (1 - this.shrinkage) * (row[j] ?? 0) + (i === j ? this.shrinkage * mu : 0);
+        }
+      }
+    }
+    return this;
+  }
+}
+
+/**
+ * Ledoit-Wolf style automatic covariance estimator.
+ * Mirrors sklearn.covariance.LedoitWolf.
+ *
+ * NOTE(review): the shrinkage coefficient below is a simplified closed form
+ * computed from the empirical covariance alone; it is not the full
+ * Ledoit-Wolf estimate that sklearn computes from the samples.
+ */
+export class LedoitWolf extends EmpiricalCovariance {
+  /** NOTE(review): stored but not read by `fit`. */
+  blockSize: number;
+
+  /** Shrinkage coefficient in [0, 1] chosen by the last `fit`; null before fitting. */
+  shrinkage_: number | null = null;
+
+  constructor(options: { assumeCentered?: boolean; blockSize?: number } = {}) {
+    super(options);
+    this.blockSize = options.blockSize ?? 1000;
+  }
+
+  /**
+   * Fits the empirical covariance S and replaces it by
+   * `(1 - alpha) * S + alpha * mu * I`, with `mu = trace(S) / p` and
+   * `alpha = clamp((||S||_F^2 - trace(S)^2 / p) / (n * p) / ||S||_F^2, 0, 1)`.
+   * When S is entirely zero (or has no features) the ratio is undefined and
+   * alpha is set to 0.
+   */
+  override fit(X: Float64Array[]): this {
+    super.fit(X);
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const cov = this.covariance_;
+    if (cov !== null) {
+      let traceS = 0;
+      for (let i = 0; i < p; i++) traceS += cov[i]![i] ?? 0;
+      const mu = p > 0 ? traceS / p : 0;
+
+      // Squared Frobenius norm of S.
+      let traceS2 = 0;
+      for (let i = 0; i < p; i++) {
+        for (let j = 0; j < p; j++) {
+          traceS2 += (cov[i]![j] ?? 0) ** 2;
+        }
+      }
+
+      const beta = (1 / (n * p)) * (traceS2 - traceS ** 2 / p);
+      // Guard the 0/0 case, which would otherwise turn every entry into NaN.
+      const alpha = traceS2 > 0 ? Math.max(0, Math.min(1, beta / traceS2)) : 0;
+      this.shrinkage_ = alpha;
+
+      for (let i = 0; i < p; i++) {
+        const row = cov[i]!;
+        for (let j = 0; j < p; j++) {
+          row[j] = (1 - alpha) * (row[j] ?? 0) + (i === j ? alpha * mu : 0);
+        }
+      }
+    }
+    return this;
+  }
+}
+
+/**
+ * Oracle Approximating Shrinkage estimator.
+ * Mirrors sklearn.covariance.OAS.
+ */
+export class OAS extends EmpiricalCovariance {
+  /** Shrinkage coefficient in [0, 1] chosen by the last `fit`; null before fitting. */
+  shrinkage_: number | null = null;
+
+  /**
+   * Fits the empirical covariance S and replaces it by
+   * `(1 - rho) * S + rho * mu * I`, with `mu = trace(S) / p` and
+   *
+   *   rho = ((1 - 2/p) * tr(S^2) + tr(S)^2) / ((n + 1 - 2/p) * (tr(S^2) - tr(S)^2 / p))
+   *
+   * clamped to [0, 1]. Degenerate inputs that make this ratio 0/0 are handled
+   * explicitly: a single feature (where S already equals mu * I) uses rho = 0,
+   * and a zero denominator uses rho = 1.
+   */
+  override fit(X: Float64Array[]): this {
+    super.fit(X);
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const cov = this.covariance_;
+    if (cov !== null) {
+      let trS = 0;
+      let trS2 = 0;
+      for (let i = 0; i < p; i++) {
+        trS += cov[i]![i] ?? 0;
+        for (let j = 0; j < p; j++) {
+          trS2 += (cov[i]![j] ?? 0) ** 2;
+        }
+      }
+      const mu = p > 0 ? trS / p : 0;
+
+      let rho: number;
+      if (p <= 1) {
+        // tr(S^2) equals tr(S)^2 here, so numerator and denominator are both 0.
+        rho = 0;
+      } else {
+        const numerator = (1 - 2 / p) * trS2 + trS ** 2;
+        const denominator = (n + 1 - 2 / p) * (trS2 - trS ** 2 / p);
+        rho = denominator === 0 ? 1 : Math.max(0, Math.min(1, numerator / denominator));
+      }
+      this.shrinkage_ = rho;
+
+      for (let i = 0; i < p; i++) {
+        const row = cov[i]!;
+        for (let j = 0; j < p; j++) {
+          row[j] = (1 - rho) * (row[j] ?? 0) + (i === j ? rho * mu : 0);
+        }
+      }
+    }
+    return this;
+  }
+}
diff --git a/src/covariance/covariance_ext.ts b/src/covariance/covariance_ext.ts
new file mode 100644
index 0000000..d245491
--- /dev/null
+++ b/src/covariance/covariance_ext.ts
@@ -0,0 +1,151 @@
+/**
+ * Covariance extensions: OAS estimator, LedoitWolf estimator, ShrunkCovariance.
+ */
+
+/**
+ * OAS-style shrinkage covariance estimator that also stores the inverse
+ * (precision) of the shrunk covariance.
+ */
+export class OASCovariance {
+ // Shrunk covariance matrix set by fit().
+ covariance_: Float64Array[] = [];
+ // Inverse of covariance_, computed by _invertMatrix().
+ precision_: Float64Array[] = [];
+ // Shrinkage coefficient used by the last fit().
+ shrinkage_ = 0;
+
+ /**
+ * Estimates the biased empirical covariance of X, shrinks it toward
+ * mu * I (mu = trace / p) and inverts the result.
+ * The feature count comes from the first row; it falls back to 1 when X is empty.
+ */
+ fit(X: Float64Array[]): this {
+ const n = X.length;
+ const p = X[0]?.length ?? 1;
+ const emp = this._empiricalCovariance(X);
+ // OAS shrinkage estimator
+ const trS = emp.reduce((s, row, i) => s + (row[i] ?? 0), 0); // trace of S
+ const trS2 = emp.reduce((s1, row) => s1 + row.reduce((s2, v) => s2 + v * v, 0), 0); // sum of squared entries of S
+ const mu = trS / p;
+ const rhoNum = (1 - 2 / p) * trS2 + trS ** 2;
+ const rhoDenom = (n + 1 - 2 / p) * (trS2 - trS ** 2 / p);
+ // The denominator is floored at 1e-10 to avoid dividing by zero; rho is capped at 1.
+ const rho = Math.min(1, rhoNum / Math.max(rhoDenom, 1e-10));
+ this.shrinkage_ = rho;
+ this.covariance_ = emp.map((row, i) => new Float64Array(row.map((v, j) => (1 - rho) * v + (i === j ? rho * mu : 0))));
+ this.precision_ = this._invertMatrix(this.covariance_);
+ return this;
+ }
+
+ /** Biased (divide-by-n) covariance of X around its column means. */
+ private _empiricalCovariance(X: Float64Array[]): Float64Array[] {
+ const n = X.length;
+ const p = X[0]?.length ?? 1;
+ const mean = new Float64Array(p);
+ for (const x of X) for (let f = 0; f < p; f++) mean[f] = (mean[f] ?? 0) + (x[f] ?? 0) / n;
+ const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
+ for (const x of X) {
+ for (let i = 0; i < p; i++) for (let j = 0; j < p; j++) {
+ cov[i]![j] = (cov[i]![j] ?? 0) + ((x[i] ?? 0) - (mean[i] ?? 0)) * ((x[j] ?? 0) - (mean[j] ?? 0)) / n;
+ }
+ }
+ return cov;
+ }
+
+ /**
+ * Inverts M by Gauss-Jordan elimination with partial (row) pivoting.
+ * M is copied and left untouched. A column whose pivot is smaller than 1e-10
+ * in absolute value is skipped, so the result is not a true inverse when M is
+ * (near-)singular.
+ */
+ private _invertMatrix(M: Float64Array[]): Float64Array[] {
+ const n = M.length;
+ const A = M.map((row) => new Float64Array(row));
+ // Start from the identity; every row operation on A is mirrored on inv.
+ const inv = Array.from({ length: n }, (_, i) => {
+ const row = new Float64Array(n);
+ row[i] = 1;
+ return row;
+ });
+ for (let col = 0; col < n; col++) {
+ // Partial pivoting: pick the row at or below `col` with the largest entry in this column.
+ let pivotRow = col;
+ for (let row = col + 1; row < n; row++) {
+ if (Math.abs(A[row]?.[col] ?? 0) > Math.abs(A[pivotRow]?.[col] ?? 0)) pivotRow = row;
+ }
+ [A[col], A[pivotRow]] = [A[pivotRow]!, A[col]!];
+ [inv[col], inv[pivotRow]] = [inv[pivotRow]!, inv[col]!];
+ const pivot = A[col]?.[col] ?? 1e-10;
+ if (Math.abs(pivot) < 1e-10) continue;
+ // Scale the pivot row so the pivot becomes 1.
+ for (let j = 0; j < n; j++) { A[col]![j] = (A[col]![j] ?? 0) / pivot; inv[col]![j] = (inv[col]![j] ?? 0) / pivot; }
+ // Eliminate this column from every other row.
+ for (let row = 0; row < n; row++) {
+ if (row === col) continue;
+ const factor = A[row]?.[col] ?? 0;
+ for (let j = 0; j < n; j++) {
+ A[row]![j] = (A[row]![j] ?? 0) - factor * (A[col]![j] ?? 0);
+ inv[row]![j] = (inv[row]![j] ?? 0) - factor * (inv[col]![j] ?? 0);
+ }
+ }
+ }
+ return inv;
+ }
+}
+
+/**
+ * Ledoit-Wolf style shrinkage covariance estimator that also stores the
+ * inverse (precision) of the shrunk covariance.
+ */
+export class LedoitWolfCovariance {
+  /** Shrunk covariance matrix set by `fit`. */
+  covariance_: Float64Array[] = [];
+  /** Shrinkage coefficient in [0, 1] used by the last `fit`. */
+  shrinkage_ = 0;
+  /** Inverse of `covariance_`. */
+  precision_: Float64Array[] = [];
+
+  /**
+   * Estimates the biased empirical covariance S of X and shrinks it toward
+   * `mu * I` (`mu = trace(S) / p`):
+   *
+   *   delta = ||S - mu * I||_F^2 / p
+   *   beta  = ||S||_F^2 / (n * p)
+   *   rho   = min(beta, delta) / delta   (0 when delta is 0)
+   *
+   * The coefficient is the ratio `beta / delta` limited to [0, 1], as in the
+   * Ledoit-Wolf construction, so the result is always a convex combination of
+   * S and the target.
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 1;
+    const emp = this._empiricalCovariance(X);
+    const trS = emp.reduce((s, row, i) => s + (row[i] ?? 0), 0);
+    const mu = trS / p;
+    const delta =
+      emp.reduce(
+        (s1, row, i) => s1 + row.reduce((s2, v, j) => s2 + (i === j ? (v - mu) ** 2 : v ** 2), 0),
+        0,
+      ) / p;
+    const beta =
+      (1 / (n * p)) * emp.reduce((s1, row) => s1 + row.reduce((s2, v) => s2 + v ** 2, 0), 0);
+    // delta is 0 (or NaN for empty input) when S already equals the target.
+    const rho = delta > 0 ? Math.max(0, Math.min(1, beta / delta)) : 0;
+    this.shrinkage_ = rho;
+    this.covariance_ = emp.map(
+      (row, i) => new Float64Array(row.map((v, j) => (1 - rho) * v + (i === j ? rho * mu : 0))),
+    );
+    this.precision_ = this._invertMatrix(this.covariance_);
+    return this;
+  }
+
+  /** Biased (divide-by-n) covariance of X around its column means. */
+  private _empiricalCovariance(X: Float64Array[]): Float64Array[] {
+    const n = X.length;
+    const p = X[0]?.length ?? 1;
+    const mean = new Float64Array(p);
+    for (const x of X) {
+      for (let f = 0; f < p; f++) mean[f] = (mean[f] ?? 0) + (x[f] ?? 0) / n;
+    }
+    const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
+    for (const x of X) {
+      for (let i = 0; i < p; i++) {
+        for (let j = 0; j < p; j++) {
+          cov[i]![j] =
+            (cov[i]![j] ?? 0) + ((x[i] ?? 0) - (mean[i] ?? 0)) * ((x[j] ?? 0) - (mean[j] ?? 0)) / n;
+        }
+      }
+    }
+    return cov;
+  }
+
+  /**
+   * Inverts M by Gauss-Jordan elimination with partial (row) pivoting.
+   * M is copied and left untouched. A column whose pivot is smaller than 1e-10
+   * in absolute value is skipped, so the result is not a true inverse when M
+   * is (near-)singular.
+   */
+  private _invertMatrix(M: Float64Array[]): Float64Array[] {
+    const n = M.length;
+    const A = M.map((row) => new Float64Array(row));
+    const inv = Array.from({ length: n }, (_, i) => {
+      const row = new Float64Array(n);
+      row[i] = 1;
+      return row;
+    });
+    for (let col = 0; col < n; col++) {
+      // Bring the largest remaining entry of this column onto the diagonal so
+      // a zero diagonal entry does not stall the elimination.
+      let pivotRow = col;
+      for (let row = col + 1; row < n; row++) {
+        if (Math.abs(A[row]?.[col] ?? 0) > Math.abs(A[pivotRow]?.[col] ?? 0)) pivotRow = row;
+      }
+      [A[col], A[pivotRow]] = [A[pivotRow]!, A[col]!];
+      [inv[col], inv[pivotRow]] = [inv[pivotRow]!, inv[col]!];
+
+      const pivot = A[col]?.[col] ?? 1e-10;
+      if (Math.abs(pivot) < 1e-10) continue;
+      for (let j = 0; j < n; j++) {
+        A[col]![j] = (A[col]![j] ?? 0) / pivot;
+        inv[col]![j] = (inv[col]![j] ?? 0) / pivot;
+      }
+      for (let row = 0; row < n; row++) {
+        if (row === col) continue;
+        const f = A[row]?.[col] ?? 0;
+        for (let j = 0; j < n; j++) {
+          A[row]![j] = (A[row]![j] ?? 0) - f * (A[col]![j] ?? 0);
+          inv[row]![j] = (inv[row]![j] ?? 0) - f * (inv[col]![j] ?? 0);
+        }
+      }
+    }
+    return inv;
+  }
+}
+
+/**
+ * Fixed-coefficient shrinkage covariance: the biased empirical covariance S is
+ * replaced by `(1 - shrinkage) * S + shrinkage * mu * I` with `mu = trace(S) / p`.
+ * `precision_` is a diagonal approximation: it holds only the reciprocals of
+ * the shrunk variances.
+ */
+export class ShrunkCovariance {
+  covariance_: Float64Array[] = [];
+  precision_: Float64Array[] = [];
+
+  constructor(private readonly shrinkage = 0.1) {}
+
+  /** Estimates the shrunk covariance of X and its diagonal precision. */
+  fit(X: Float64Array[]): this {
+    const count = X.length;
+    const dim = X[0]?.length ?? 1;
+
+    // Column means, accumulated as a running sum of x / n.
+    const center = new Float64Array(dim);
+    for (const sample of X) {
+      for (let f = 0; f < dim; f++) center[f] = (center[f] ?? 0) + (sample[f] ?? 0) / count;
+    }
+
+    // Biased empirical covariance.
+    const emp: Float64Array[] = Array.from({ length: dim }, () => new Float64Array(dim));
+    for (const sample of X) {
+      for (let r = 0; r < dim; r++) {
+        const acc = emp[r]!;
+        const dr = (sample[r] ?? 0) - (center[r] ?? 0);
+        for (let c = 0; c < dim; c++) {
+          const dc = (sample[c] ?? 0) - (center[c] ?? 0);
+          acc[c] = (acc[c] ?? 0) + (dr * dc) / count;
+        }
+      }
+    }
+
+    let diagSum = 0;
+    for (let d = 0; d < dim; d++) diagSum += emp[d]?.[d] ?? 0;
+    const mu = diagSum / dim;
+
+    this.covariance_ = emp.map((row, r) => {
+      const shrunk = new Float64Array(dim);
+      for (let c = 0; c < dim; c++) {
+        shrunk[c] = (1 - this.shrinkage) * (row[c] ?? 0) + (r === c ? this.shrinkage * mu : 0);
+      }
+      return shrunk;
+    });
+
+    // Simple precision (diagonal approximation); variances are floored at 1e-10.
+    const precision: Float64Array[] = [];
+    for (let d = 0; d < dim; d++) {
+      const row = new Float64Array(dim);
+      row[d] = 1 / Math.max(this.covariance_[d]?.[d] ?? 1, 1e-10);
+      precision.push(row);
+    }
+    this.precision_ = precision;
+    return this;
+  }
+}
diff --git a/src/covariance/covariance_ext2.ts b/src/covariance/covariance_ext2.ts
new file mode 100644
index 0000000..8c100c5
--- /dev/null
+++ b/src/covariance/covariance_ext2.ts
@@ -0,0 +1,146 @@
+/**
+ * Extended covariance estimation: Oracle Approximating Shrinkage (OAS),
+ * Ledoit-Wolf analytical estimator, and covariance comparison utilities.
+ */
+
+/**
+ * Analytical shrinkage coefficient computed from the biased sample covariance S:
+ *
+ *   ((n - 2) / n * tr(S^2) + tr(S)^2) / ((n + 2) * (tr(S^2) - tr(S)^2 / p))
+ *
+ * clamped to [0, 1]. Returns 0 for fewer than two samples or no features, and
+ * 1 when the denominator is exactly 0.
+ * NOTE(review): this closed form resembles the Rao-Blackwellised Ledoit-Wolf
+ * coefficient rather than the classical Ledoit-Wolf estimate — confirm the name.
+ */
+export function ledoitWolfShrinkage(X: Float64Array[]): number {
+  const n = X.length;
+  const p = X[0]?.length ?? 0;
+  if (n <= 1 || p === 0) return 0;
+
+  // Column means.
+  const center = new Float64Array(p);
+  for (const sample of X) {
+    for (let j = 0; j < p; j++) center[j] = (center[j] ?? 0) + (sample[j] ?? 0);
+  }
+  for (let j = 0; j < p; j++) center[j] = (center[j] ?? 0) / n;
+
+  // Scatter matrix, scaled to the biased sample covariance below.
+  const S = Array.from({ length: p }, () => new Float64Array(p));
+  for (const sample of X) {
+    for (let r = 0; r < p; r++) {
+      const acc = S[r]!;
+      const dr = (sample[r] ?? 0) - (center[r] ?? 0);
+      for (let c = 0; c < p; c++) {
+        acc[c] = (acc[c] ?? 0) + dr * ((sample[c] ?? 0) - (center[c] ?? 0));
+      }
+    }
+  }
+
+  let trace = 0;
+  let sumSquares = 0;
+  for (let r = 0; r < p; r++) {
+    const acc = S[r]!;
+    for (let c = 0; c < p; c++) {
+      const scaled = (acc[c] ?? 0) / n;
+      acc[c] = scaled;
+      sumSquares += scaled ** 2;
+    }
+    trace += acc[r] ?? 0;
+  }
+  const traceSquared = trace ** 2;
+
+  const numerator = ((n - 2) / n) * sumSquares + traceSquared;
+  const denominator = (n + 2) * (sumSquares - traceSquared / p);
+  if (denominator === 0) return 1;
+  return Math.min(1, Math.max(0, numerator / denominator));
+}
+
+/**
+ * OAS (Oracle Approximating Shrinkage) coefficient computed from the biased
+ * sample covariance S:
+ *
+ *   ((1 - 2/p) * tr(S^2) + tr(S)^2) / ((n + 1 - 2/p) * (tr(S^2) - tr(S)^2 / p))
+ *
+ * clamped to [0, 1]. Returns 0 for fewer than two samples or no features, and
+ * 1 when the denominator is exactly 0.
+ */
+export function oasShrinkage(X: Float64Array[]): number {
+  const n = X.length;
+  const p = X[0]?.length ?? 0;
+  if (n <= 1 || p === 0) return 0;
+
+  // Column means.
+  const mean = new Float64Array(p);
+  for (const xi of X) {
+    for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (xi[j] ?? 0);
+  }
+  for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / n;
+
+  // Biased sample covariance.
+  const S = Array.from({ length: p }, () => new Float64Array(p));
+  for (const xi of X) {
+    for (let j = 0; j < p; j++) {
+      for (let k = 0; k < p; k++) {
+        S[j]![k] = (S[j]![k] ?? 0) + ((xi[j] ?? 0) - (mean[j] ?? 0)) * ((xi[k] ?? 0) - (mean[k] ?? 0));
+      }
+    }
+  }
+  for (let j = 0; j < p; j++) {
+    for (let k = 0; k < p; k++) S[j]![k] = (S[j]![k] ?? 0) / n;
+  }
+
+  let trS = 0;
+  let trS2 = 0;
+  for (let j = 0; j < p; j++) trS += S[j]![j] ?? 0;
+  for (let j = 0; j < p; j++) {
+    for (let k = 0; k < p; k++) trS2 += (S[j]![k] ?? 0) ** 2;
+  }
+
+  const numerator = (1 - 2 / p) * trS2 + trS ** 2;
+  // The (n + 1 - 2/p) factor belongs to the denominator exactly once.
+  const denominator = (n + 1 - 2 / p) * (trS2 - trS ** 2 / p);
+  return denominator === 0 ? 1 : Math.min(1, Math.max(0, numerator / denominator));
+}
+
+/**
+ * Shrinks the biased sample covariance S of X toward a scaled identity:
+ * `Sigma = (1 - shrinkage) * S + shrinkage * mu * I`, with `mu = trace(S) / p`.
+ *
+ * @param X Samples, one row per observation.
+ * @param shrinkage Weight of the identity target.
+ * @returns The shrunk covariance as `p` rows of length `p`.
+ */
+export function shrunkCovariance(
+  X: Float64Array[],
+  shrinkage: number,
+): Float64Array[] {
+  const n = X.length;
+  const p = X[0]?.length ?? 0;
+
+  // Column means.
+  const center = new Float64Array(p);
+  for (const sample of X) {
+    for (let j = 0; j < p; j++) center[j] = (center[j] ?? 0) + (sample[j] ?? 0);
+  }
+  for (let j = 0; j < p; j++) center[j] = (center[j] ?? 0) / n;
+
+  // Scatter matrix.
+  const S = Array.from({ length: p }, () => new Float64Array(p));
+  for (const sample of X) {
+    for (let r = 0; r < p; r++) {
+      const acc = S[r]!;
+      const dr = (sample[r] ?? 0) - (center[r] ?? 0);
+      for (let c = 0; c < p; c++) {
+        acc[c] = (acc[c] ?? 0) + dr * ((sample[c] ?? 0) - (center[c] ?? 0));
+      }
+    }
+  }
+
+  // Scale to the biased covariance and collect the trace on the way.
+  let trace = 0;
+  for (let r = 0; r < p; r++) {
+    const acc = S[r]!;
+    for (let c = 0; c < p; c++) acc[c] = (acc[c] ?? 0) / n;
+    trace += acc[r] ?? 0;
+  }
+  const mu = trace / p;
+
+  const result: Float64Array[] = [];
+  for (let r = 0; r < p; r++) {
+    const source = S[r]!;
+    const shrunk = new Float64Array(source.length);
+    for (let c = 0; c < source.length; c++) {
+      shrunk[c] = (1 - shrinkage) * (source[c] ?? 0) + (r === c ? shrinkage * mu : 0);
+    }
+    result.push(shrunk);
+  }
+  return result;
+}
+
+/**
+ * Frobenius distance between two matrices: the square root of the sum of
+ * squared entry-wise differences. Rows missing from either matrix are skipped
+ * and entries missing from a row of B count as 0.
+ */
+export function covarianceFrobeniusDistance(A: Float64Array[], B: Float64Array[]): number {
+  let squared = 0;
+  A.forEach((rowA, i) => {
+    const rowB = B[i];
+    if (rowA === undefined || rowB === undefined) return;
+    rowA.forEach((value, j) => {
+      squared += (value - (rowB[j] ?? 0)) ** 2;
+    });
+  });
+  return Math.sqrt(squared);
+}
+
+/**
+ * Log-determinant of a symmetric positive definite matrix via Cholesky:
+ * with `S = L L^T`, `log det S = 2 * sum(log L_ii)`.
+ * A non-positive pivot is replaced by 1e-10, so matrices that are not positive
+ * definite give a large negative value instead of NaN.
+ */
+export function logDetCovariance(S: Float64Array[]): number {
+  const size = S.length;
+  // Lower-triangular Cholesky factor, filled row by row.
+  const L = Array.from({ length: size }, () => new Float64Array(size));
+  for (let i = 0; i < size; i++) {
+    const rowI = L[i]!;
+    for (let j = 0; j <= i; j++) {
+      const rowJ = L[j]!;
+      let dot = 0;
+      for (let k = 0; k < j; k++) dot += (rowI[k] ?? 0) * (rowJ[k] ?? 0);
+      if (i !== j) {
+        rowI[j] = ((S[i]![j] ?? 0) - dot) / (rowJ[j] ?? 1e-10);
+        continue;
+      }
+      const pivot = (S[i]![i] ?? 0) - dot;
+      rowI[i] = pivot > 0 ? Math.sqrt(pivot) : 1e-10;
+    }
+  }
+  let halfLogDet = 0;
+  for (let i = 0; i < size; i++) {
+    halfLogDet += Math.log(Math.max(L[i]![i] ?? 1e-10, 1e-10));
+  }
+  return 2 * halfLogDet;
+}
diff --git a/src/covariance/covariance_ext3.ts b/src/covariance/covariance_ext3.ts
new file mode 100644
index 0000000..8d1c99d
--- /dev/null
+++ b/src/covariance/covariance_ext3.ts
@@ -0,0 +1,163 @@
+/**
+ * Additional covariance estimators: OAS, LedoitWolfExt, ShrunkCovariance.
+ * Mirrors sklearn.covariance extras.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Column means and sample covariance of the rows of `X`.
+ *
+ * The covariance is normalised by `n - 1` (by 1 when there are fewer than two rows).
+ * The column count is taken from the first row; missing entries are read as 0.
+ */
+function computeSampleCov(X: Float64Array[]): {
+  mean: Float64Array;
+  cov: Float64Array[];
+} {
+  const nRows = X.length;
+  const nCols = X[0]?.length ?? 0;
+
+  const totals = new Float64Array(nCols);
+  for (const sample of X) {
+    for (let c = 0; c < nCols; c++) totals[c] = (totals[c] ?? 0) + (sample[c] ?? 0);
+  }
+  const mean = totals.map((total) => total / nRows);
+
+  // Accumulate the upper triangle of the scatter matrix only; it is mirrored below.
+  const cov: Float64Array[] = Array.from({ length: nCols }, () => new Float64Array(nCols));
+  for (const sample of X) {
+    const dev = mean.map((m, c) => (sample[c] ?? 0) - m);
+    for (let a = 0; a < nCols; a++) {
+      const rowA = cov[a]!;
+      for (let b = a; b < nCols; b++) {
+        rowA[b] = (rowA[b] ?? 0) + (dev[a] ?? 0) * (dev[b] ?? 0);
+      }
+    }
+  }
+
+  const divisor = nRows > 1 ? nRows - 1 : 1;
+  for (let a = 0; a < nCols; a++) {
+    const rowA = cov[a]!;
+    for (let b = a; b < nCols; b++) {
+      const scaled = (rowA[b] ?? 0) / divisor;
+      rowA[b] = scaled;
+      cov[b]![a] = scaled;
+    }
+  }
+  return { mean, cov };
+}
+
+/**
+ * Oracle Approximating Shrinkage (OAS) covariance estimator.
+ *
+ * Shrinks the sample covariance `S` towards `mu * I` (`mu` = mean variance) with
+ *   rho = min(1, ((1 - 2/p) tr(S²) + tr(S)²) / ((n + 1 - 2/p) (tr(S²) - tr(S)²/p))),
+ * falling back to `rho = 1` when the denominator is not positive.
+ *
+ * After `fit`, `precision_` holds the inverse of `covariance_`, or stays `null` when the
+ * shrunk covariance is numerically singular.
+ */
+export class OAS {
+  location_: Float64Array | null = null;
+  covariance_: Float64Array[] | null = null;
+  precision_: Float64Array[] | null = null;
+  shrinkage_: number = 0;
+
+  /** Estimate location, shrinkage, covariance and precision from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    const { mean, cov } = computeSampleCov(X);
+    this.location_ = mean;
+
+    // tr(S²) equals the sum of squared entries because S is symmetric.
+    let frobSq = 0;
+    let trace = 0;
+    for (let i = 0; i < p; i++) {
+      for (let j = 0; j < p; j++) frobSq += (cov[i]?.[j] ?? 0) ** 2;
+      trace += cov[i]?.[i] ?? 0;
+    }
+    const traceSq = trace ** 2;
+
+    const num = (1 - 2 / p) * frobSq + traceSq;
+    const den = (n + 1 - 2 / p) * (frobSq - traceSq / p);
+    const rho = den > 0 ? Math.min(1, num / den) : 1;
+    this.shrinkage_ = rho;
+
+    const mu = trace / p;
+    const shrunk = Array.from({ length: p }, (_, i) =>
+      Float64Array.from({ length: p }, (_, j) =>
+        (1 - rho) * (cov[i]?.[j] ?? 0) + (i === j ? rho * mu : 0),
+      ),
+    );
+    this.covariance_ = shrunk;
+    // Previously never assigned, so callers always saw `null` here.
+    this.precision_ = OAS.invert(shrunk);
+
+    return this;
+  }
+
+  /** Gauss-Jordan inverse with partial pivoting; returns `null` for a singular matrix. */
+  private static invert(A: Float64Array[]): Float64Array[] | null {
+    const p = A.length;
+    const aug = A.map((row, i) => {
+      const r = new Float64Array(2 * p);
+      for (let j = 0; j < p; j++) r[j] = row[j] ?? 0;
+      r[p + i] = 1;
+      return r;
+    });
+    for (let col = 0; col < p; col++) {
+      let best = col;
+      for (let r = col + 1; r < p; r++) {
+        if (Math.abs(aug[r]?.[col] ?? 0) > Math.abs(aug[best]?.[col] ?? 0)) best = r;
+      }
+      const pivotRow = aug[best]!;
+      aug[best] = aug[col]!;
+      aug[col] = pivotRow;
+      const pivot = pivotRow[col] ?? 0;
+      // The negated comparison also rejects a NaN pivot.
+      if (!(Math.abs(pivot) >= 1e-12)) return null;
+      for (let j = 0; j < 2 * p; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
+      for (let r = 0; r < p; r++) {
+        if (r === col) continue;
+        const row = aug[r]!;
+        const f = row[col] ?? 0;
+        if (f === 0) continue;
+        for (let j = 0; j < 2 * p; j++) row[j] = (row[j] ?? 0) - f * (pivotRow[j] ?? 0);
+      }
+    }
+    return aug.map((row) => row.slice(p));
+  }
+}
+
+/**
+ * Ledoit-Wolf covariance estimator with analytical shrinkage towards `mu * I`.
+ *
+ * `shrinkage_` is `min(1, beta2 / delta2)` where `delta2 = ||S - mu I||_F²` and
+ * `beta2 = (1/n²) Σ_k ||x_k x_kᵀ - S||_F²` over the centred samples `x_k`.
+ * When `beta2` is zero (or undefined because there is no data) the shrinkage is 0.
+ */
+export class LedoitWolfExt {
+  location_: Float64Array | null = null;
+  covariance_: Float64Array[] | null = null;
+  shrinkage_: number = 0;
+
+  /** Estimate location, shrinkage and shrunk covariance from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    const { mean, cov } = computeSampleCov(X);
+    this.location_ = mean;
+
+    // Shrinkage target: mu * I, with mu the average variance.
+    let mu = 0;
+    for (let i = 0; i < p; i++) mu += cov[i]?.[i] ?? 0;
+    mu /= p;
+
+    // delta2: squared Frobenius distance from S to the target.
+    let delta2 = 0;
+    for (let i = 0; i < p; i++) {
+      for (let j = 0; j < p; j++) {
+        const target = i === j ? mu : 0;
+        delta2 += ((cov[i]?.[j] ?? 0) - target) ** 2;
+      }
+    }
+
+    // beta2: dispersion of the per-sample outer products around S.
+    let beta2 = 0;
+    const centered = new Float64Array(p);
+    for (const row of X) {
+      for (let j = 0; j < p; j++) centered[j] = (row[j] ?? 0) - (mean[j] ?? 0);
+      for (let i = 0; i < p; i++) {
+        for (let j = 0; j < p; j++) {
+          const outer = (centered[i] ?? 0) * (centered[j] ?? 0);
+          beta2 += (outer - (cov[i]?.[j] ?? 0)) ** 2;
+        }
+      }
+    }
+    beta2 /= n ** 2;
+
+    // Constant data gives beta2 = delta2 = 0; the bare ratio would be NaN and would
+    // poison both `shrinkage_` and `covariance_`. No dispersion means no shrinkage.
+    const rho = beta2 > 0 ? Math.min(1, beta2 / delta2) : 0;
+    this.shrinkage_ = rho;
+
+    this.covariance_ = Array.from({ length: p }, (_, i) =>
+      Float64Array.from({ length: p }, (_, j) =>
+        (1 - rho) * (cov[i]?.[j] ?? 0) + (i === j ? rho * mu : 0),
+      ),
+    );
+
+    return this;
+  }
+}
+
+/**
+ * Covariance estimator with a fixed, user-chosen shrinkage coefficient.
+ *
+ * The fitted covariance is `(1 - shrinkage) * S + shrinkage * mu * I`, where `S` is the
+ * sample covariance and `mu` its mean diagonal entry.
+ */
+export class ShrunkCovariance {
+  shrinkage: number;
+  location_: Float64Array | null = null;
+  covariance_: Float64Array[] | null = null;
+
+  constructor(shrinkage = 0.1) {
+    this.shrinkage = shrinkage;
+  }
+
+  /** Estimate the location and the shrunk covariance from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const dim = X[0]?.length ?? 0;
+    const { mean, cov } = computeSampleCov(X);
+    this.location_ = mean;
+
+    let traceSum = 0;
+    for (let d = 0; d < dim; d++) traceSum += cov[d]?.[d] ?? 0;
+    const target = traceSum / dim;
+
+    const alpha = this.shrinkage;
+    const shrunk: Float64Array[] = [];
+    for (let r = 0; r < dim; r++) {
+      const out = new Float64Array(dim);
+      for (let c = 0; c < dim; c++) {
+        const ridge = r === c ? alpha * target : 0;
+        out[c] = (1 - alpha) * (cov[r]?.[c] ?? 0) + ridge;
+      }
+      shrunk.push(out);
+    }
+    this.covariance_ = shrunk;
+    return this;
+  }
+
+  /** Returns `X` unchanged once the estimator has been fitted; throws otherwise. */
+  transform(X: Float64Array[]): Float64Array[] {
+    if (!this.covariance_) {
+      throw new NotFittedError("ShrunkCovariance is not fitted");
+    }
+    return X;
+  }
+}
diff --git a/src/covariance/covariance_ext4.ts b/src/covariance/covariance_ext4.ts
new file mode 100644
index 0000000..7689070
--- /dev/null
+++ b/src/covariance/covariance_ext4.ts
@@ -0,0 +1,161 @@
+/**
+ * Covariance extensions: OAS (Oracle Approximating Shrinkage) estimator,
+ * Gaussian log-likelihood and cross-validated covariance scoring.
+ * Port of sklearn.covariance extensions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Oracle Approximating Shrinkage (OAS) estimator.
+ *
+ * Works on the maximum-likelihood covariance (normalised by `n`) and shrinks it towards
+ * `mu * I`. Results are read through the `covariance`, `precision` and `shrinkage`
+ * getters, each of which throws `NotFittedError` before `fit` has been called.
+ */
+export class OASShrinkage {
+  private covariance_: Float64Array[] | null = null;
+  private precision_: Float64Array[] | null = null;
+  private shrinkage_: number | null = null;
+
+  /** Estimate the shrinkage coefficient, shrunk covariance and its inverse from `X`. */
+  fit(X: Float64Array[]): this {
+    const nSamples = X.length;
+    const dim = X[0]?.length ?? 0;
+
+    const colSums = new Float64Array(dim);
+    for (const sample of X) {
+      for (let c = 0; c < dim; c++) colSums[c] = (colSums[c] ?? 0) + (sample[c] ?? 0);
+    }
+    const center = colSums.map((total) => total / nSamples);
+
+    const scatter = Array.from({ length: dim }, () => new Float64Array(dim));
+    for (const sample of X) {
+      const dev = center.map((m, c) => (sample[c] ?? 0) - m);
+      scatter.forEach((out, r) => {
+        for (let c = 0; c < dim; c++) out[c] = (out[c] ?? 0) + (dev[r] ?? 0) * (dev[c] ?? 0);
+      });
+    }
+    const S = scatter.map((row) => row.map((v) => v / nSamples));
+
+    // Trace and squared Frobenius norm of S.
+    let traceS = 0;
+    S.forEach((row, r) => {
+      traceS += row[r] ?? 0;
+    });
+    let frobSq = 0;
+    for (const row of S) for (const v of row) frobSq += v ** 2;
+
+    // OAS shrinkage coefficient.
+    const numerator = (1 - 2 / dim) * frobSq + traceS * traceS;
+    const denominator = (nSamples + 1 - 2 / dim) * (frobSq - (traceS * traceS) / dim);
+    let rho = 1;
+    if (denominator !== 0) rho = Math.min(1, numerator / denominator);
+    this.shrinkage_ = rho;
+
+    const target = traceS / dim;
+    const shrunk = S.map((row, r) =>
+      row.map((v, c) => (1 - rho) * v + (r === c ? rho * target : 0)),
+    );
+    this.covariance_ = shrunk;
+    this.precision_ = invertMatrix(shrunk);
+    return this;
+  }
+
+  /** Shrunk covariance matrix. */
+  get covariance(): Float64Array[] {
+    const value = this.covariance_;
+    if (value === null) throw new NotFittedError("OASShrinkage is not fitted.");
+    return value;
+  }
+
+  /** Inverse of the shrunk covariance matrix. */
+  get precision(): Float64Array[] {
+    const value = this.precision_;
+    if (value === null) throw new NotFittedError("OASShrinkage is not fitted.");
+    return value;
+  }
+
+  /** Shrinkage coefficient used in the convex combination. */
+  get shrinkage(): number {
+    const value = this.shrinkage_;
+    if (value === null) throw new NotFittedError("OASShrinkage is not fitted.");
+    return value;
+  }
+}
+
+/**
+ * Total Gaussian log-likelihood of the rows of `X` under `N(mean, precision⁻¹)`:
+ *
+ *   Σ_k ½ [ log|det P| - p log(2π) - (x_k - mean)ᵀ P (x_k - mean) ]
+ *
+ * The log-determinant comes from an LU factorisation with partial pivoting, so it is
+ * correct for non-diagonal precision matrices as well. A singular precision yields
+ * `-Infinity`.
+ *
+ * @param X Samples, one row per observation.
+ * @param mean Location vector; its length defines the dimension `p`.
+ * @param precision Inverse covariance matrix (`p × p`). A missing diagonal entry is read
+ *   as 1 and a missing off-diagonal entry as 0 for the determinant.
+ * @returns The summed log-likelihood over all rows of `X`.
+ */
+export function gaussianLogLikelihood(
+  X: Float64Array[],
+  mean: Float64Array,
+  precision: Float64Array[],
+): number {
+  const n = X.length;
+  const p = mean.length;
+
+  // log|det(precision)| = Σ log|u_ii| over the pivots of an LU elimination.
+  const lu = Array.from({ length: p }, (_, i) =>
+    Float64Array.from({ length: p }, (_, j) => precision[i]?.[j] ?? (i === j ? 1 : 0)),
+  );
+  let logDet = 0;
+  for (let col = 0; col < p; col++) {
+    let best = col;
+    for (let r = col + 1; r < p; r++) {
+      if (Math.abs(lu[r]?.[col] ?? 0) > Math.abs(lu[best]?.[col] ?? 0)) best = r;
+    }
+    const pivotRow = lu[best]!;
+    lu[best] = lu[col]!;
+    lu[col] = pivotRow;
+    const pivot = pivotRow[col] ?? 0;
+    logDet += Math.log(Math.abs(pivot));
+    if (pivot === 0) break; // singular: logDet is already -Infinity
+    for (let r = col + 1; r < p; r++) {
+      const row = lu[r]!;
+      const f = (row[col] ?? 0) / pivot;
+      if (f === 0) continue;
+      for (let j = col + 1; j < p; j++) row[j] = (row[j] ?? 0) - f * (pivotRow[j] ?? 0);
+    }
+  }
+
+  let logLik = (n * (logDet - p * Math.log(2 * Math.PI))) / 2;
+  for (const row of X) {
+    const diff = new Float64Array(p).map((_, j) => (row[j] ?? 0) - (mean[j] ?? 0));
+    let quad = 0;
+    for (let a = 0; a < p; a++) {
+      let pda = 0;
+      for (let b = 0; b < p; b++) pda += (precision[a]?.[b] ?? 0) * (diff[b] ?? 0);
+      quad += (diff[a] ?? 0) * pda;
+    }
+    logLik -= quad / 2;
+  }
+  return logLik;
+}
+
+/**
+ * Cross-validated covariance score: mean held-out Gaussian log-likelihood per sample.
+ *
+ * `X` is split into contiguous folds (the last fold absorbs the remainder). For each fold
+ * the estimator is fitted on the remaining rows, its covariance is inverted, and the
+ * held-out rows are scored with `gaussianLogLikelihood` around the training mean.
+ * Higher is better.
+ *
+ * @param X Samples, one row per observation.
+ * @param estimator Object whose `fit` updates its `covariance` property.
+ * @param nFolds Requested number of folds; capped at the number of samples so that no
+ *   fold is empty.
+ * @returns The average over folds of the per-sample log-likelihood, or `NaN` when fewer
+ *   than two folds can be formed.
+ */
+export function covarianceCVScore(
+  X: Float64Array[],
+  estimator: { fit: (X: Float64Array[]) => unknown; covariance: Float64Array[] },
+  nFolds = 5,
+): number {
+  const n = X.length;
+  const p = X[0]?.length ?? 0;
+  // With more folds than samples the fold size used to floor to 0, leaving empty test
+  // folds (0/0) and an empty training set for the last fold.
+  const folds = Math.min(Math.floor(nFolds), n);
+  if (!(folds >= 2)) return Number.NaN;
+  const foldSize = Math.floor(n / folds);
+
+  let totalScore = 0;
+  for (let fold = 0; fold < folds; fold++) {
+    const testStart = fold * foldSize;
+    const testEnd = fold === folds - 1 ? n : testStart + foldSize;
+    const trainX = [...X.slice(0, testStart), ...X.slice(testEnd)];
+    const testX = X.slice(testStart, testEnd);
+
+    estimator.fit(trainX);
+    // The likelihood needs the inverse covariance, not the covariance itself.
+    const precision = invertMatrix(estimator.covariance);
+
+    const mean = new Float64Array(p);
+    for (const row of trainX) {
+      for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) + (row[j] ?? 0);
+    }
+    for (let j = 0; j < p; j++) mean[j] = (mean[j] ?? 0) / trainX.length;
+
+    totalScore += gaussianLogLikelihood(testX, mean, precision) / testX.length;
+  }
+  return totalScore / folds;
+}
+
+/**
+ * Gauss-Jordan inverse of a square matrix using partial pivoting on `[A | I]`.
+ *
+ * A column whose best pivot is below 1e-12 in magnitude is skipped rather than reported,
+ * so the result for a singular input is not a true inverse.
+ */
+function invertMatrix(A: Float64Array[]): Float64Array[] {
+  const dim = A.length;
+  const width = 2 * dim;
+
+  const work: Float64Array[] = [];
+  for (let r = 0; r < dim; r++) {
+    const src = A[r]!;
+    const wide = new Float64Array(width);
+    for (let c = 0; c < dim; c++) wide[c] = src[c] ?? 0;
+    wide[dim + r] = 1;
+    work.push(wide);
+  }
+
+  for (let c = 0; c < dim; c++) {
+    // Pick the row (at or below the diagonal) with the largest entry in this column.
+    let lead = c;
+    let leadMag = Math.abs(work[c]?.[c] ?? 0);
+    for (let r = c + 1; r < dim; r++) {
+      const mag = Math.abs(work[r]?.[c] ?? 0);
+      if (mag > leadMag) {
+        lead = r;
+        leadMag = mag;
+      }
+    }
+    [work[c], work[lead]] = [work[lead]!, work[c]!];
+
+    const pivotRow = work[c]!;
+    const pivot = pivotRow[c] ?? 1;
+    if (Math.abs(pivot) < 1e-12) continue;
+
+    for (let j = 0; j < width; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
+    for (let r = 0; r < dim; r++) {
+      if (r === c) continue;
+      const target = work[r]!;
+      const factor = target[c] ?? 0;
+      for (let j = 0; j < width; j++) target[j] = (target[j] ?? 0) - factor * (pivotRow[j] ?? 0);
+    }
+  }
+  return work.map((wide) => wide.slice(dim));
+}
diff --git a/src/covariance/covariance_ext5.ts b/src/covariance/covariance_ext5.ts
new file mode 100644
index 0000000..974defc
--- /dev/null
+++ b/src/covariance/covariance_ext5.ts
@@ -0,0 +1,204 @@
+/**
+ * Covariance extensions: OASCovariance, LedoitWolfExt, MinCovDetExt.
+ * Mirrors sklearn.covariance advanced estimators.
+ */
+
+import { BaseEstimator } from "../base.js";
+
+/**
+ * Oracle Approximating Shrinkage (OAS) covariance estimator.
+ *
+ * `fit` stores the column means in `location_`, the shrunk covariance in `covariance_`,
+ * its Gauss-Jordan inverse in `precision_` and the coefficient in `shrinkage_`.
+ */
+export class OASCovariance extends BaseEstimator {
+  covariance_: Float64Array[] = [];
+  precision_: Float64Array[] = [];
+  shrinkage_: number = 0;
+  location_: Float64Array = new Float64Array(0);
+
+  /** Estimate location, shrinkage, covariance and precision from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const nSamples = X.length;
+    const dim = X[0]?.length ?? 0;
+
+    const center = new Float64Array(dim);
+    for (const sample of X) {
+      for (let c = 0; c < dim; c++) center[c] = (center[c] ?? 0) + (sample[c] ?? 0);
+    }
+    for (let c = 0; c < dim; c++) center[c] = (center[c] ?? 0) / nSamples;
+    this.location_ = center;
+
+    // Maximum-likelihood sample covariance (normalised by n).
+    const S = Array.from({ length: dim }, () => new Float64Array(dim));
+    for (const sample of X) {
+      const dev = center.map((m, c) => (sample[c] ?? 0) - m);
+      for (let r = 0; r < dim; r++) {
+        const out = S[r]!;
+        for (let c = 0; c < dim; c++) out[c] = (out[c] ?? 0) + (dev[r] ?? 0) * (dev[c] ?? 0);
+      }
+    }
+    for (const out of S) {
+      for (let c = 0; c < dim; c++) out[c] = (out[c] ?? 0) / nSamples;
+    }
+
+    // OAS shrinkage coefficient.
+    let traceS = 0;
+    let frobSq = 0;
+    for (let r = 0; r < dim; r++) {
+      traceS += S[r]?.[r] ?? 0;
+      for (let c = 0; c < dim; c++) frobSq += (S[r]?.[c] ?? 0) ** 2;
+    }
+    const traceSq = traceS ** 2;
+    const numerator = (1 - 2 / dim) * frobSq + traceSq;
+    const denominator = (nSamples + 1 - 2 / dim) * (frobSq - traceSq / dim);
+    const rho = denominator !== 0 ? Math.min(1, numerator / denominator) : 1;
+    this.shrinkage_ = rho;
+
+    const target = traceS / dim;
+    this.covariance_ = S.map((row, r) =>
+      row.map((v, c) => (1 - rho) * v + (r === c ? rho * target : 0)),
+    );
+    this.precision_ = this._invert(this.covariance_, dim);
+    return this;
+  }
+
+  /**
+   * Gauss-Jordan inverse of the leading `p × p` part of `A` with partial pivoting.
+   * Columns whose pivot is below 1e-10 in magnitude are skipped.
+   */
+  private _invert(A: Float64Array[], p: number): Float64Array[] {
+    const width = 2 * p;
+    const work = A.map((src, r) => {
+      const wide = new Float64Array(width);
+      for (let c = 0; c < p; c++) wide[c] = src[c] ?? 0;
+      wide[p + r] = 1;
+      return wide;
+    });
+    for (let col = 0; col < p; col++) {
+      let lead = col;
+      for (let r = col + 1; r < p; r++) {
+        if (Math.abs(work[r]?.[col] ?? 0) > Math.abs(work[lead]?.[col] ?? 0)) lead = r;
+      }
+      [work[col], work[lead]] = [work[lead]!, work[col]!];
+      const pivot = work[col]?.[col] ?? 1e-10;
+      if (Math.abs(pivot) < 1e-10) continue;
+      const pivotRow = work[col]!;
+      for (let j = 0; j < width; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
+      for (let r = 0; r < p; r++) {
+        if (r === col) continue;
+        const factor = work[r]?.[col] ?? 0;
+        const target = work[r]!;
+        for (let j = 0; j < width; j++) target[j] = (target[j] ?? 0) - factor * (pivotRow[j] ?? 0);
+      }
+    }
+    return Array.from({ length: p }, (_, r) =>
+      new Float64Array(p).map((_, c) => work[r]?.[p + c] ?? 0),
+    );
+  }
+
+  /** Squared Mahalanobis distance of each row of `X` to `location_`, floored at 0. */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    const out = new Float64Array(X.length);
+    X.forEach((sample, idx) => {
+      const dev = new Float64Array(sample.length).map(
+        (_, c) => (sample[c] ?? 0) - (this.location_[c] ?? 0),
+      );
+      let acc = 0;
+      for (let r = 0; r < dev.length; r++) {
+        for (let c = 0; c < dev.length; c++) {
+          acc += (dev[r] ?? 0) * (this.precision_[r]?.[c] ?? 0) * (dev[c] ?? 0);
+        }
+      }
+      out[idx] = Math.max(acc, 0);
+    });
+    return out;
+  }
+}
+
+/**
+ * Ledoit-Wolf analytical covariance estimator.
+ *
+ * Shrinks the maximum-likelihood covariance `S` towards `mu * I`. `fit` also stores the
+ * inverse of the shrunk covariance in `precision_`.
+ */
+export class LedoitWolfExt extends BaseEstimator {
+  covariance_: Float64Array[] = [];
+  precision_: Float64Array[] = [];
+  shrinkage_: number = 0;
+  location_: Float64Array = new Float64Array(0);
+
+  /** Estimate location, shrinkage, covariance and precision from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const nSamples = X.length;
+    const dim = X[0]?.length ?? 0;
+
+    const center = new Float64Array(dim);
+    for (const sample of X) {
+      for (let c = 0; c < dim; c++) center[c] = (center[c] ?? 0) + (sample[c] ?? 0);
+    }
+    for (let c = 0; c < dim; c++) center[c] = (center[c] ?? 0) / nSamples;
+    this.location_ = center;
+
+    const centered = X.map((sample) => center.map((m, c) => (sample[c] ?? 0) - m));
+
+    const S = Array.from({ length: dim }, () => new Float64Array(dim));
+    for (const dev of centered) {
+      for (let r = 0; r < dim; r++) {
+        const out = S[r]!;
+        for (let c = 0; c < dim; c++) out[c] = (out[c] ?? 0) + (dev[r] ?? 0) * (dev[c] ?? 0);
+      }
+    }
+    for (const out of S) {
+      for (let c = 0; c < dim; c++) out[c] = (out[c] ?? 0) / nSamples;
+    }
+
+    // Ledoit-Wolf analytical formula.
+    let traceS = 0;
+    let frobSq = 0;
+    for (let r = 0; r < dim; r++) {
+      traceS += S[r]?.[r] ?? 0;
+      for (let c = 0; c < dim; c++) frobSq += (S[r]?.[c] ?? 0) ** 2;
+    }
+
+    // b2: mean squared deviation of the per-sample outer products from S, over n.
+    let b2 = 0;
+    for (const dev of centered) {
+      for (let r = 0; r < dim; r++) {
+        for (let c = 0; c < dim; c++) {
+          const gap = (dev[r] ?? 0) * (dev[c] ?? 0) - (S[r]?.[c] ?? 0);
+          b2 += gap ** 2;
+        }
+      }
+    }
+    b2 /= nSamples ** 2;
+
+    const spread = frobSq - traceS ** 2 / dim;
+    const ratio = Math.min(b2, frobSq) / (spread || 1);
+    const delta = Math.max(0, Math.min(1, ratio));
+    this.shrinkage_ = delta;
+
+    const target = traceS / dim;
+    this.covariance_ = S.map((row, r) =>
+      row.map((v, c) => (1 - delta) * v + (r === c ? delta * target : 0)),
+    );
+    this.precision_ = this._invert(this.covariance_, dim);
+    return this;
+  }
+
+  /**
+   * Gauss-Jordan inverse of the leading `p × p` part of `A`, without row exchanges.
+   * Columns whose diagonal pivot is below 1e-10 in magnitude are skipped.
+   */
+  private _invert(A: Float64Array[], p: number): Float64Array[] {
+    const width = 2 * p;
+    const work = A.map((src, r) => {
+      const wide = new Float64Array(width);
+      for (let c = 0; c < p; c++) wide[c] = src[c] ?? 0;
+      wide[p + r] = 1;
+      return wide;
+    });
+    for (let col = 0; col < p; col++) {
+      const pivot = work[col]?.[col] ?? 1e-10;
+      if (Math.abs(pivot) < 1e-10) continue;
+      const pivotRow = work[col]!;
+      for (let j = 0; j < width; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
+      for (let r = 0; r < p; r++) {
+        if (r === col) continue;
+        const factor = work[r]?.[col] ?? 0;
+        const target = work[r]!;
+        for (let j = 0; j < width; j++) target[j] = (target[j] ?? 0) - factor * (pivotRow[j] ?? 0);
+      }
+    }
+    return Array.from({ length: p }, (_, r) =>
+      new Float64Array(p).map((_, c) => work[r]?.[p + c] ?? 0),
+    );
+  }
+}
+
+/**
+ * MinCovDet: Minimum Covariance Determinant estimator (simplified C-step variant).
+ *
+ * Starting from all samples, each of 10 C-steps estimates location and covariance on the
+ * current support, ranks every sample by squared Mahalanobis distance and keeps the `h`
+ * closest ones, with `h = floor(n * supportFraction)` clamped to `[1, n]`.
+ *
+ * Distances are computed with the inverse covariance. When the covariance is singular a
+ * small ridge is added to its diagonal before inverting.
+ */
+export class MinCovDetExt extends BaseEstimator {
+  support_fraction_: number;
+  location_: Float64Array = new Float64Array(0);
+  covariance_: Float64Array[] = [];
+  /** Inverse of `covariance_` (ridge-regularised when the covariance is singular). */
+  precision_: Float64Array[] = [];
+  /** Squared Mahalanobis distance of every training sample to the fitted location. */
+  dist_: Float64Array = new Float64Array(0);
+
+  constructor(supportFraction = 0.75) {
+    super();
+    this.support_fraction_ = supportFraction;
+  }
+
+  /** Fit the robust location and covariance from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    // Never let the support shrink to zero points (that produced NaN estimates).
+    const h = Math.min(n, Math.max(1, Math.floor(n * this.support_fraction_)));
+
+    let support = Array.from({ length: n }, (_, i) => i);
+    for (let step = 0; step < 10; step++) {
+      const { loc, cov } = MinCovDetExt.moments(support.map((i) => X[i]!), p);
+      const prec = MinCovDetExt.precisionOf(cov);
+      support = X.map((xi, i) => ({ d: MinCovDetExt.sqDistance(xi, loc, prec), i }))
+        .sort((a, b) => a.d - b.d)
+        .slice(0, h)
+        .map((entry) => entry.i);
+    }
+
+    const { loc, cov } = MinCovDetExt.moments(support.map((i) => X[i]!), p);
+    const prec = MinCovDetExt.precisionOf(cov);
+    this.location_ = loc;
+    this.covariance_ = cov;
+    this.precision_ = prec;
+    this.dist_ = Float64Array.from(X, (xi) => MinCovDetExt.sqDistance(xi, loc, prec));
+    return this;
+  }
+
+  /** Squared Mahalanobis distance of each row of `X` to `location_`, floored at 0. */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    return Float64Array.from(X, (xi) =>
+      Math.max(MinCovDetExt.sqDistance(xi, this.location_, this.precision_), 0),
+    );
+  }
+
+  /** Column means and maximum-likelihood covariance (normalised by the row count). */
+  private static moments(
+    rows: Float64Array[],
+    p: number,
+  ): { loc: Float64Array; cov: Float64Array[] } {
+    const loc = new Float64Array(p);
+    for (const xi of rows) for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) + (xi[k] ?? 0);
+    for (let k = 0; k < p; k++) loc[k] = (loc[k] ?? 0) / rows.length;
+
+    const cov = Array.from({ length: p }, () => new Float64Array(p));
+    for (const xi of rows) {
+      const xc = loc.map((m, k) => (xi[k] ?? 0) - m);
+      for (let i = 0; i < p; i++) {
+        const out = cov[i]!;
+        for (let j = 0; j < p; j++) out[j] = (out[j] ?? 0) + (xc[i] ?? 0) * (xc[j] ?? 0);
+      }
+    }
+    for (const out of cov) for (let j = 0; j < p; j++) out[j] = (out[j] ?? 0) / rows.length;
+    return { loc, cov };
+  }
+
+  /** `(x - loc)ᵀ · prec · (x - loc)`; missing precision entries are read as 0. */
+  private static sqDistance(x: Float64Array, loc: Float64Array, prec: Float64Array[]): number {
+    const p = loc.length;
+    const xc = loc.map((m, k) => (x[k] ?? 0) - m);
+    let d = 0;
+    for (let i = 0; i < p; i++) {
+      let inner = 0;
+      for (let j = 0; j < p; j++) inner += (prec[i]?.[j] ?? 0) * (xc[j] ?? 0);
+      d += (xc[i] ?? 0) * inner;
+    }
+    return d;
+  }
+
+  /** Inverse of `cov`; a singular `cov` gets a small diagonal ridge before inverting. */
+  private static precisionOf(cov: Float64Array[]): Float64Array[] {
+    const direct = MinCovDetExt.invert(cov);
+    if (direct !== null) return direct;
+    const p = cov.length;
+    let trace = 0;
+    for (let i = 0; i < p; i++) trace += cov[i]?.[i] ?? 0;
+    const ridge = 1e-8 * (trace > 0 ? trace / p : 1);
+    const regularised = cov.map((row, i) => {
+      const copy = Float64Array.from(row);
+      copy[i] = (copy[i] ?? 0) + ridge;
+      return copy;
+    });
+    // If even the regularised matrix cannot be inverted, fall back to the identity.
+    return (
+      MinCovDetExt.invert(regularised) ??
+      Array.from({ length: p }, (_, i) => {
+        const e = new Float64Array(p);
+        e[i] = 1;
+        return e;
+      })
+    );
+  }
+
+  /** Gauss-Jordan inverse with partial pivoting; returns `null` for a singular matrix. */
+  private static invert(A: Float64Array[]): Float64Array[] | null {
+    const p = A.length;
+    const aug = A.map((row, i) => {
+      const r = new Float64Array(2 * p);
+      for (let j = 0; j < p; j++) r[j] = row[j] ?? 0;
+      r[p + i] = 1;
+      return r;
+    });
+    for (let col = 0; col < p; col++) {
+      let best = col;
+      for (let r = col + 1; r < p; r++) {
+        if (Math.abs(aug[r]?.[col] ?? 0) > Math.abs(aug[best]?.[col] ?? 0)) best = r;
+      }
+      const pivotRow = aug[best]!;
+      aug[best] = aug[col]!;
+      aug[col] = pivotRow;
+      const pivot = pivotRow[col] ?? 0;
+      // The negated comparison also rejects a NaN pivot.
+      if (!(Math.abs(pivot) >= 1e-12)) return null;
+      for (let j = 0; j < 2 * p; j++) pivotRow[j] = (pivotRow[j] ?? 0) / pivot;
+      for (let r = 0; r < p; r++) {
+        if (r === col) continue;
+        const row = aug[r]!;
+        const f = row[col] ?? 0;
+        if (f === 0) continue;
+        for (let j = 0; j < 2 * p; j++) row[j] = (row[j] ?? 0) - f * (pivotRow[j] ?? 0);
+      }
+    }
+    return aug.map((row) => row.slice(p));
+  }
+}
diff --git a/src/covariance/elliptic_envelope.ts b/src/covariance/elliptic_envelope.ts
new file mode 100644
index 0000000..22ad7f2
--- /dev/null
+++ b/src/covariance/elliptic_envelope.ts
@@ -0,0 +1,245 @@
+/**
+ * EllipticEnvelope: outlier detection via robust covariance estimation.
+ * Mirrors sklearn.covariance.EllipticEnvelope.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** Column-wise mean of the rows of `X`; the column count comes from the first row. */
+function colMeans(X: Float64Array[]): Float64Array {
+  const width = (X[0] ?? new Float64Array(0)).length;
+  const count = X.length;
+  const sums = new Float64Array(width);
+  for (const sample of X) {
+    for (let c = 0; c < width; c++) sums[c] = (sums[c] ?? 0) + (sample[c] ?? 0);
+  }
+  return sums.map((total) => total / count);
+}
+
+/**
+ * Maximum-likelihood covariance of the rows of `X` around the supplied `means`
+ * (normalised by the number of rows). Only the upper triangle is accumulated; the lower
+ * triangle is filled in by symmetry.
+ */
+function empCov(X: Float64Array[], means: Float64Array): Float64Array[] {
+  const count = X.length;
+  const dim = means.length;
+  const C = Array.from({ length: dim }, () => new Float64Array(dim));
+
+  for (const sample of X) {
+    const dev = means.map((m, c) => (sample[c] ?? 0) - m);
+    for (let a = 0; a < dim; a++) {
+      const rowA = C[a]!;
+      for (let b = a; b < dim; b++) rowA[b] = (rowA[b] ?? 0) + (dev[a] ?? 0) * (dev[b] ?? 0);
+    }
+  }
+
+  for (let a = 0; a < dim; a++) {
+    const rowA = C[a]!;
+    for (let b = a; b < dim; b++) {
+      const scaled = (rowA[b] ?? 0) / count;
+      rowA[b] = scaled;
+      if (b !== a) C[b]![a] = scaled;
+    }
+  }
+  return C;
+}
+
+/**
+ * Log-determinant of a positive-definite matrix from its Cholesky factor.
+ *
+ * Pivots are floored at 1e-12 before the square root, and divisors and diagonal entries
+ * are floored at 1e-12 as well, so a near-singular input gives a very negative but
+ * finite value.
+ */
+function logDet(M: Float64Array[]): number {
+  const dim = M.length;
+  const chol = Array.from({ length: dim }, () => new Float64Array(dim));
+  let acc = 0;
+  for (let r = 0; r < dim; r++) {
+    const rowR = chol[r]!;
+    for (let c = 0; c <= r; c++) {
+      const rowC = chol[c]!;
+      let rem = M[r]![c] ?? 0;
+      for (let k = 0; k < c; k++) rem -= (rowR[k] ?? 0) * (rowC[k] ?? 0);
+      rowR[c] =
+        c === r
+          ? Math.sqrt(Math.max(rem, 1e-12))
+          : rem / Math.max(rowC[c] ?? 1e-12, 1e-12);
+    }
+    acc += Math.log(Math.max(rowR[r] ?? 1e-12, 1e-12));
+  }
+  return 2 * acc;
+}
+
+/**
+ * Invert a square matrix by Gauss-Jordan elimination with partial pivoting.
+ * The input is not modified. Returns `null` as soon as a column has no pivot of
+ * magnitude at least 1e-12, i.e. when the matrix is (numerically) singular.
+ */
+function invertMatrix(M: Float64Array[]): Float64Array[] | null {
+  const dim = M.length;
+  const left = M.map((row) => Float64Array.from(row));
+  const right = left.map((_, r) => {
+    const unit = new Float64Array(dim);
+    unit[r] = 1;
+    return unit;
+  });
+
+  for (let c = 0; c < dim; c++) {
+    // Largest-magnitude entry in column c, at or below the diagonal.
+    let lead = -1;
+    let leadVal = 0;
+    for (let r = c; r < dim; r++) {
+      const cand = left[r]![c] ?? 0;
+      if (Math.abs(cand) > Math.abs(leadVal)) {
+        leadVal = cand;
+        lead = r;
+      }
+    }
+    if (lead === -1 || Math.abs(leadVal) < 1e-12) return null;
+
+    [left[c], left[lead]] = [left[lead]!, left[c]!];
+    [right[c], right[lead]] = [right[lead]!, right[c]!];
+
+    const pivL = left[c]!;
+    const pivR = right[c]!;
+    const scale = pivL[c] ?? 1;
+    for (let j = 0; j < dim; j++) {
+      pivL[j] = (pivL[j] ?? 0) / scale;
+      pivR[j] = (pivR[j] ?? 0) / scale;
+    }
+
+    for (let r = 0; r < dim; r++) {
+      if (r === c) continue;
+      const rowL = left[r]!;
+      const rowR = right[r]!;
+      const factor = rowL[c] ?? 0;
+      for (let j = 0; j < dim; j++) {
+        rowL[j] = (rowL[j] ?? 0) - factor * (pivL[j] ?? 0);
+        rowR[j] = (rowR[j] ?? 0) - factor * (pivR[j] ?? 0);
+      }
+    }
+  }
+  return right;
+}
+
+/**
+ * Squared Mahalanobis distance `(x - mean)ᵀ · precisionMat · (x - mean)` for each row of
+ * `X`. Missing entries of a row are read as 0.
+ */
+function mahalanobisDistSq(
+  X: Float64Array[],
+  mean: Float64Array,
+  precisionMat: Float64Array[],
+): Float64Array {
+  const dim = mean.length;
+  const out = new Float64Array(X.length);
+  for (let s = 0; s < X.length; s++) {
+    const sample = X[s] ?? new Float64Array(dim);
+    const dev = mean.map((m, c) => (sample[c] ?? 0) - m);
+    let total = 0;
+    for (let r = 0; r < dim; r++) {
+      const precRow = precisionMat[r]!;
+      let projected = 0;
+      for (let c = 0; c < dim; c++) projected += (precRow[c] ?? 0) * (dev[c] ?? 0);
+      total += (dev[r] ?? 0) * projected;
+    }
+    out[s] = total;
+  }
+  return out;
+}
+
+/**
+ * EllipticEnvelope: fits a robust covariance estimate to detect outliers.
+ * Uses a fast approximation of the minimum covariance determinant (MCD): several seeded
+ * random subsets of `h` samples are refined by C-steps and the subset whose covariance
+ * has the smallest log-determinant is kept.
+ * Mirrors sklearn.covariance.EllipticEnvelope.
+ *
+ * Options:
+ * - `contamination`: expected fraction of outliers (default 0.1); sets `threshold_`.
+ * - `supportFraction`: fraction of samples in the support; `null` uses `(n + p + 1) / 2`.
+ * - `randomState`: seed of the subset shuffles (default 42); truncated to an unsigned
+ *   32-bit integer.
+ */
+export class EllipticEnvelope {
+  contamination: number;
+  supportFraction: number | null;
+  randomState: number;
+
+  location_: Float64Array | null = null;
+  covariance_: Float64Array[] | null = null;
+  precision_: Float64Array[] | null = null;
+  threshold_: number = 0;
+  offset_: number = 0;
+
+  constructor(
+    options: {
+      contamination?: number;
+      supportFraction?: number | null;
+      randomState?: number;
+    } = {},
+  ) {
+    this.contamination = options.contamination ?? 0.1;
+    this.supportFraction = options.supportFraction ?? null;
+    this.randomState = options.randomState ?? 42;
+  }
+
+  /** Fit the robust location/covariance and the inlier threshold from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const requested = this.supportFraction !== null
+      ? Math.floor(this.supportFraction * n)
+      : Math.floor((n + p + 1) / 2);
+    // Keep the support between 1 and n points; an empty support gave NaN estimates.
+    const h = Math.min(n, Math.max(1, requested));
+
+    let bestDet = Number.POSITIVE_INFINITY;
+    let bestMean = new Float64Array(p);
+    let bestCov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
+
+    // Seeded 32-bit LCG. The state advances on every draw; previously it never changed,
+    // and a non-integer seed produced fractional (invalid) swap indices.
+    let state = this.randomState >>> 0;
+    const nextBelow = (bound: number): number => {
+      state = (Math.imul(state, 1664525) + 1013904223) >>> 0;
+      // Scale the whole word rather than taking `%`, whose low bits cycle quickly.
+      return Math.floor((state / 0x100000000) * bound);
+    };
+
+    const nTrials = 10;
+    for (let trial = 0; trial < nTrials; trial++) {
+      // Fisher-Yates shuffle, then take the first h indices as the starting subset.
+      const indices = Array.from({ length: n }, (_, i) => i);
+      for (let i = n - 1; i > 0; i--) {
+        const j = nextBelow(i + 1);
+        const tmp = indices[i]!;
+        indices[i] = indices[j]!;
+        indices[j] = tmp;
+      }
+      let curSubset = indices.slice(0, h).map((i) => X[i] ?? new Float64Array(p));
+
+      // C-steps: re-estimate, then keep the h samples closest to the new estimate.
+      for (let cstep = 0; cstep < 30; cstep++) {
+        const mean = colMeans(curSubset);
+        const cov = empCov(curSubset, mean);
+        const inv = invertMatrix(cov);
+        if (!inv) break;
+        const dists = mahalanobisDistSq(X, mean, inv);
+        const sortedIdx = Array.from({ length: n }, (_, i) => i).sort(
+          (a, b) => (dists[a] ?? 0) - (dists[b] ?? 0),
+        );
+        curSubset = sortedIdx.slice(0, h).map((i) => X[i] ?? new Float64Array(p));
+      }
+
+      const mean = colMeans(curSubset);
+      const cov = empCov(curSubset, mean);
+      const det = logDet(cov);
+      if (det < bestDet) {
+        bestDet = det;
+        bestMean = mean;
+        bestCov = cov;
+      }
+    }
+
+    // When the best covariance is singular the covariance itself stands in for the
+    // precision (behaviour kept from the original implementation).
+    const inv = invertMatrix(bestCov) ?? bestCov;
+    this.location_ = bestMean;
+    this.covariance_ = bestCov;
+    this.precision_ = inv;
+
+    // Threshold: the (1 - contamination) quantile of the training distances.
+    const dists = mahalanobisDistSq(X, bestMean, inv);
+    const sorted = Array.from(dists).sort((a, b) => a - b);
+    const threshIdx = Math.floor((1 - this.contamination) * n);
+    this.threshold_ = sorted[Math.min(threshIdx, n - 1)] ?? 0;
+    this.offset_ = -this.threshold_;
+    return this;
+  }
+
+  /** Squared Mahalanobis distance of each row of `X`; throws if the model is not fitted. */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    if (this.location_ === null || this.precision_ === null) {
+      throw new NotFittedError("EllipticEnvelope");
+    }
+    return mahalanobisDistSq(X, this.location_, this.precision_);
+  }
+
+  /** `threshold_ - distance` per row: non-negative for inliers, negative for outliers. */
+  decisionFunction(X: Float64Array[]): Float64Array {
+    const dists = this.mahalanobis(X);
+    return new Float64Array(dists.map((d) => -d - this.offset_));
+  }
+
+  /** Labels each row `1` (inlier) or `-1` (outlier) from the sign of `decisionFunction`. */
+  predict(X: Float64Array[]): Int32Array {
+    const scores = this.decisionFunction(X);
+    return new Int32Array(scores.map((s) => (s >= 0 ? 1 : -1)));
+  }
+
+  /** Fraction of rows whose predicted label equals the corresponding entry of `y`. */
+  score(X: Float64Array[], y: Int32Array): number {
+    const yPred = this.predict(X);
+    let correct = 0;
+    for (let i = 0; i < y.length; i++) {
+      if ((yPred[i] ?? 0) === (y[i] ?? 0)) correct++;
+    }
+    return correct / y.length;
+  }
+}
diff --git a/src/covariance/empirical.ts b/src/covariance/empirical.ts
new file mode 100644
index 0000000..54f2c8b
--- /dev/null
+++ b/src/covariance/empirical.ts
@@ -0,0 +1,152 @@
+/**
+ * Empirical covariance estimators.
+ * Mirrors scikit-learn's covariance.EmpiricalCovariance, LedoitWolf, OAS.
+ */
+
+/**
+ * Column means of `X`, accumulated as a running sum of `value / nSamples` over the first
+ * `nFeatures` columns of every row.
+ */
+function mean(X: Float64Array[], nSamples: number, nFeatures: number): Float64Array {
+  const acc = new Float64Array(nFeatures);
+  X.forEach((sample) => {
+    for (let c = 0; c < nFeatures; c++) {
+      const share = (sample[c] ?? 0) / nSamples;
+      acc[c] = (acc[c] ?? 0) + share;
+    }
+  });
+  return acc;
+}
+
+/**
+ * Covariance of the rows of `X` around `mu`, normalised by `nSamples`. Each product of
+ * deviations is divided by `nSamples` before being added to the running total.
+ */
+function covMatrix(
+  X: Float64Array[],
+  mu: Float64Array,
+  nSamples: number,
+  nFeatures: number,
+): Float64Array[] {
+  const C: Float64Array[] = [];
+  for (let r = 0; r < nFeatures; r++) C.push(new Float64Array(nFeatures));
+
+  const dev = new Float64Array(nFeatures);
+  for (const sample of X) {
+    for (let c = 0; c < nFeatures; c++) dev[c] = (sample[c] ?? 0) - (mu[c] ?? 0);
+    C.forEach((out, r) => {
+      for (let c = 0; c < nFeatures; c++) {
+        out[c] = (out[c] ?? 0) + ((dev[r] ?? 0) * (dev[c] ?? 0)) / nSamples;
+      }
+    });
+  }
+  return C;
+}
+
+/**
+ * Maximum-likelihood covariance estimator.
+ *
+ * With `assumeCentered` the location is taken to be the zero vector instead of the
+ * column means.
+ */
+export class EmpiricalCovariance {
+  location_: Float64Array | null = null;
+  covariance_: Float64Array[] | null = null;
+
+  constructor(readonly assumeCentered = false) {}
+
+  /** Estimate `location_` and `covariance_` from the rows of `X`. */
+  fit(X: Float64Array[]): this {
+    const nSamples = X.length;
+    const nFeatures = X[0]?.length ?? 0;
+    const center = this.assumeCentered
+      ? new Float64Array(nFeatures)
+      : mean(X, nSamples, nFeatures);
+    this.location_ = center;
+    this.covariance_ = covMatrix(X, center, nSamples, nFeatures);
+    return this;
+  }
+
+  /**
+   * Distance of each row of `X` to `location_`, using only the diagonal of the fitted
+   * covariance: `sqrt(Σ_j diff_j² / variance_j)`.
+   */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    const cov = this.covariance_;
+    const loc = this.location_;
+    if (cov === null || loc === null) {
+      throw new Error("EmpiricalCovariance must be fitted first");
+    }
+    // Simplified: diagonal approximation.
+    const variances = cov.map((row, i) => row[i] ?? 1);
+    const out = new Float64Array(X.length);
+    X.forEach((sample, idx) => {
+      let total = 0;
+      for (let j = 0; j < sample.length; j++) {
+        const diff = (sample[j] ?? 0) - (loc[j] ?? 0);
+        total += (diff * diff) / (variances[j] ?? 1);
+      }
+      out[idx] = Math.sqrt(total);
+    });
+    return out;
+  }
+
+  /**
+   * Negative Frobenius distance between the covariance of `XTest` (around its own mean)
+   * and the fitted covariance. `yTest` is ignored.
+   */
+  score(XTest: Float64Array[], yTest?: unknown): number {
+    void yTest;
+    const fitted = this.covariance_;
+    if (fitted === null) throw new Error("Not fitted");
+    const nSamples = XTest.length;
+    const nFeatures = XTest[0]?.length ?? 0;
+    const testCov = covMatrix(XTest, mean(XTest, nSamples, nFeatures), nSamples, nFeatures);
+    let sumSq = 0;
+    for (let i = 0; i < nFeatures; i++) {
+      for (let j = 0; j < nFeatures; j++) {
+        const gap = (testCov[i]?.[j] ?? 0) - (fitted[i]?.[j] ?? 0);
+        sumSq += gap * gap;
+      }
+    }
+    return -Math.sqrt(sumSq);
+  }
+}
+
+/**
+ * Ledoit-Wolf covariance estimator with analytic shrinkage.
+ *
+ * The maximum-likelihood covariance `S` is shrunk towards `mu * I` (`mu` = mean
+ * variance) with coefficient `min(b², d²) / d²`, where
+ *   d² = ||S - mu I||_F² = tr(S²) - tr(S)² / p
+ *   b² = (1/n²) Σ_k ||x_k x_kᵀ - S||_F² = ((1/n) Σ_k ||x_k||⁴ - tr(S²)) / n
+ * and `x_k` are the centred samples. The coefficient is 0 when `b²` or `d²` is not
+ * positive.
+ */
+export class LedoitWolf extends EmpiricalCovariance {
+  shrinkage_: number = 0;
+
+  /** Estimate `location_`, `shrinkage_` and the shrunk `covariance_` from `X`. */
+  override fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    const mu = mean(X, n, p);
+    this.location_ = mu;
+    const S = covMatrix(X, mu, n, p);
+
+    let trS = 0;
+    let trS2 = 0;
+    for (let i = 0; i < p; i++) {
+      trS += S[i]?.[i] ?? 0;
+      for (let j = 0; j < p; j++) trS2 += (S[i]?.[j] ?? 0) ** 2;
+    }
+    const mu1 = trS / p;
+    const d2 = trS2 - (trS * trS) / p;
+
+    // Σ_k ||x_k||⁴ over the centred samples. The previous expression never looked at
+    // the samples and collapsed to about 1/n for every data set.
+    let fourthMoment = 0;
+    for (const row of X) {
+      let sq = 0;
+      for (let j = 0; j < p; j++) {
+        const diff = (row[j] ?? 0) - (mu[j] ?? 0);
+        sq += diff * diff;
+      }
+      fourthMoment += sq * sq;
+    }
+    const b2 = Math.min(Math.max((fourthMoment / n - trS2) / n, 0), Math.max(d2, 0));
+    const shrinkage = b2 > 0 && d2 > 0 ? b2 / d2 : 0;
+    this.shrinkage_ = shrinkage;
+
+    this.covariance_ = S.map((row, i) =>
+      Float64Array.from(row, (v, j) =>
+        (1 - shrinkage) * v + (i === j ? shrinkage * mu1 : 0),
+      ),
+    );
+    return this;
+  }
+}
+
+/**
+ * Oracle Approximating Shrinkage (OAS) estimator.
+ *
+ * Shrinks the maximum-likelihood covariance towards `mu * I`; the denominator of the
+ * coefficient is offset by 1e-10 to avoid a division by zero.
+ */
+export class OAS extends EmpiricalCovariance {
+  shrinkage_: number = 0;
+
+  /** Estimate `location_`, `shrinkage_` and the shrunk `covariance_` from `X`. */
+  override fit(X: Float64Array[]): this {
+    const nSamples = X.length;
+    const dim = X[0]?.length ?? 0;
+    const center = mean(X, nSamples, dim);
+    this.location_ = center;
+    const S = covMatrix(X, center, nSamples, dim);
+
+    let traceS = 0;
+    let frobSq = 0;
+    for (let r = 0; r < dim; r++) {
+      traceS += S[r]?.[r] ?? 0;
+      for (let c = 0; c < dim; c++) frobSq += (S[r]?.[c] ?? 0) ** 2;
+    }
+
+    // OAS formula.
+    const numerator = (1 - 2 / dim) * frobSq + traceS * traceS;
+    const denominator = (nSamples + 1 - 2 / dim) * (frobSq - (traceS * traceS) / dim);
+    const rho = Math.min(1, numerator / (denominator + 1e-10));
+    this.shrinkage_ = rho;
+
+    const target = traceS / dim;
+    const shrunk: Float64Array[] = [];
+    S.forEach((row, r) => {
+      shrunk.push(
+        Float64Array.from(row, (v, c) => (1 - rho) * v + (r === c ? rho * target : 0)),
+      );
+    });
+    this.covariance_ = shrunk;
+    return this;
+  }
+}
diff --git a/src/covariance/graphical_lasso.ts b/src/covariance/graphical_lasso.ts
new file mode 100644
index 0000000..00bc9e0
--- /dev/null
+++ b/src/covariance/graphical_lasso.ts
@@ -0,0 +1,252 @@
+/**
+ * GraphicalLasso and MinCovDet (robust covariance).
+ * Mirrors sklearn.covariance.GraphicalLasso and MinCovDet.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Column-wise arithmetic mean of a row-major sample matrix.
+ * The column count is taken from the first row; missing entries count as 0.
+ */
+function colMeans(X: Float64Array[]): Float64Array {
+  const nRows = X.length;
+  const width = X[0]?.length ?? 0;
+  const totals = new Float64Array(width);
+  for (let r = 0; r < nRows; r++) {
+    const row = X[r]!;
+    for (let c = 0; c < width; c++) totals[c] = totals[c]! + (row[c] ?? 0);
+  }
+  return totals.map((sum) => sum / nRows);
+}
+
+/**
+ * Maximum-likelihood (divide-by-n) covariance matrix of the rows of `X`,
+ * centred on the column means.
+ */
+function empiricalCovariance(X: Float64Array[]): Float64Array[] {
+  const nRows = X.length;
+  const dim = X[0]?.length ?? 0;
+  const mu = colMeans(X);
+  const cov: Float64Array[] = Array.from({ length: dim }, () => new Float64Array(dim));
+  const dev = new Float64Array(dim);
+  for (const row of X) {
+    for (let a = 0; a < dim; a++) dev[a] = (row[a] ?? 0) - (mu[a] ?? 0);
+    // Accumulate the lower triangle and mirror it as we go.
+    for (let a = 0; a < dim; a++) {
+      for (let b = 0; b <= a; b++) {
+        const prod = dev[a]! * dev[b]!;
+        cov[a]![b] = cov[a]![b]! + prod;
+        if (b !== a) cov[b]![a] = cov[b]![a]! + prod;
+      }
+    }
+  }
+  for (const covRow of cov) {
+    for (let b = 0; b < dim; b++) covRow[b] = covRow[b]! / nRows;
+  }
+  return cov;
+}
+
+/**
+ * Dense matrix product `A * B` for row-major matrices.
+ * The inner dimension is `B.length`; entries missing from a row of `A` or `B`
+ * are treated as 0.
+ */
+function matMul(A: Float64Array[], B: Float64Array[]): Float64Array[] {
+  const inner = B.length;
+  const outCols = B[0]?.length ?? 0;
+  return A.map((_unused, i) => {
+    const rowA = A[i]!;
+    const out = new Float64Array(outCols);
+    for (let j = 0; j < outCols; j++) {
+      let acc = 0;
+      for (let l = 0; l < inner; l++) acc += (rowA[l] ?? 0) * (B[l]![j] ?? 0);
+      out[j] = acc;
+    }
+    return out;
+  });
+}
+
+/**
+ * Inverts a square matrix by Gauss-Jordan elimination with partial pivoting.
+ *
+ * @param A Square matrix as an array of rows; it is not modified.
+ * @returns A newly allocated inverse of `A`.
+ * @throws Error when a pivot is exactly zero or non-finite, i.e. the matrix is
+ *   singular or already contains NaN/Infinity. Previously this case divided by
+ *   the bad pivot and returned a matrix full of NaN/Infinity, which the
+ *   try/catch fallbacks around this helper could never observe.
+ */
+function invertMatrix(A: Float64Array[]): Float64Array[] {
+  const p = A.length;
+  const width = 2 * p;
+  // Augmented matrix [A | I]
+  const M: Float64Array[] = A.map((row, i) => {
+    const r = new Float64Array(width);
+    for (let j = 0; j < p; j++) r[j] = row[j] ?? 0;
+    r[p + i] = 1;
+    return r;
+  });
+
+  for (let col = 0; col < p; col++) {
+    // Partial pivoting: bring the largest remaining entry of this column up.
+    let pivot = col;
+    for (let row = col + 1; row < p; row++) {
+      if (Math.abs(M[row]![col]!) > Math.abs(M[pivot]![col]!)) pivot = row;
+    }
+    const tmp = M[col]!;
+    M[col] = M[pivot]!;
+    M[pivot] = tmp;
+
+    const pivotRow = M[col]!;
+    const denom = pivotRow[col]!;
+    if (denom === 0 || !Number.isFinite(denom)) {
+      throw new Error("invertMatrix: matrix is singular or contains non-finite values");
+    }
+    for (let j = 0; j < width; j++) pivotRow[j] = pivotRow[j]! / denom;
+
+    // Clear this column from every other row.
+    for (let row = 0; row < p; row++) {
+      if (row === col) continue;
+      const target = M[row]!;
+      const factor = target[col]!;
+      for (let j = 0; j < width; j++) target[j] = target[j]! - factor * pivotRow[j]!;
+    }
+  }
+
+  // The right half of the augmented matrix now holds the inverse.
+  return M.map((row) => row.slice(p, width));
+}
+
+/** Constructor options for GraphicalLasso; omitted fields use the defaults applied in its constructor. */
+export interface GraphicalLassoOptions {
+ /** L1 penalty weight; also added to the diagonal of the initial covariance estimate. Default 0.01. */
+ alpha?: number;
+ /** Maximum number of outer sweeps over all columns. Default 100. */
+ maxIter?: number;
+ /** The outer loop stops once the largest change to a covariance entry within one sweep is below this. Default 1e-4. */
+ tol?: number;
+}
+
+/**
+ * Sparse inverse covariance estimation with L1 penalty (Graphical Lasso).
+ * Mirrors sklearn.covariance.GraphicalLasso.
+ * Uses the block coordinate descent algorithm (GLASSO): for each column the
+ * off-diagonal covariance entries are re-estimated by solving a lasso
+ * problem against the remaining block of the current estimate.
+ */
+export class GraphicalLasso {
+  /** L1 penalty weight. */
+  alpha: number;
+  /** Maximum number of outer sweeps. */
+  maxIter: number;
+  /** Convergence tolerance on the largest per-sweep covariance change. */
+  tol: number;
+
+  /** Estimated (regularised) covariance matrix; null until fitted. */
+  covariance_: Float64Array[] | null = null;
+  /** Inverse of `covariance_`; null until fitted. */
+  precision_: Float64Array[] | null = null;
+  /** Number of outer sweeps executed by the last `fit`. */
+  nIter_: number = 0;
+  /** Column means of the training data; null until fitted. */
+  location_: Float64Array | null = null;
+
+  constructor(options: GraphicalLassoOptions = {}) {
+    this.alpha = options.alpha ?? 0.01;
+    this.maxIter = options.maxIter ?? 100;
+    this.tol = options.tol ?? 1e-4;
+  }
+
+  /**
+   * Fits the estimator.
+   *
+   * @param X Samples, one feature vector per entry.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array[]): this {
+    const p = (X[0] ?? new Float64Array(0)).length;
+    this.location_ = colMeans(X);
+    const S = empiricalCovariance(X);
+    // Reset so a fit with maxIter = 0 does not report a stale count.
+    this.nIter_ = 0;
+
+    // Initialize with S + alpha * I
+    const W: Float64Array[] = Array.from({ length: p }, (_, i) => {
+      const row = new Float64Array(p);
+      for (let j = 0; j < p; j++) row[j] = S[i]![j] ?? 0;
+      row[i] = (row[i] ?? 0) + this.alpha;
+      return row;
+    });
+
+    for (let iter = 0; iter < this.maxIter; iter++) {
+      let maxDelta = 0;
+      for (let j = 0; j < p; j++) {
+        // Partition: W11 is W without row/column j, s12 is column j of S without entry j.
+        const idx: number[] = [];
+        for (let k = 0; k < p; k++) if (k !== j) idx.push(k);
+        const W11: Float64Array[] = idx.map((r) => Float64Array.from(idx, (c) => W[r]![c] ?? 0));
+        const s12 = Float64Array.from(idx, (r) => S[r]![j] ?? 0);
+
+        // The lasso is solved by coordinate descent directly on W11; no
+        // explicit inverse of W11 is needed (the previous code computed one
+        // per column and discarded the result).
+        const beta = this.solveLasso(W11, s12);
+
+        // Update W: w12 = W11 * beta, written symmetrically.
+        for (let k = 0; k < idx.length; k++) {
+          let s = 0;
+          for (let l = 0; l < idx.length; l++) s += (W11[k]![l] ?? 0) * (beta[l] ?? 0);
+          const target = idx[k]!;
+          const delta = Math.abs(s - (W[target]![j] ?? 0));
+          if (delta > maxDelta) maxDelta = delta;
+          W[target]![j] = s;
+          W[j]![target] = s;
+        }
+      }
+      this.nIter_ = iter + 1;
+      if (maxDelta < this.tol) break;
+    }
+
+    this.covariance_ = W;
+    this.precision_ = invertMatrix(W);
+    return this;
+  }
+
+  /**
+   * Coordinate descent for
+   * `min_beta 0.5 * beta' W11 beta - s12' beta + alpha * |beta|_1`.
+   * Runs at most 100 passes and stops early once no coefficient moves by
+   * 1e-6 or more within a pass.
+   */
+  private solveLasso(W11: Float64Array[], s12: Float64Array): Float64Array {
+    const m = s12.length;
+    const beta = new Float64Array(m);
+    for (let pass = 0; pass < 100; pass++) {
+      let maxD = 0;
+      for (let k = 0; k < m; k++) {
+        // Partial residual with coordinate k left out.
+        let partial = 0;
+        for (let l = 0; l < m; l++) if (l !== k) partial += (W11[k]![l] ?? 0) * (beta[l] ?? 0);
+        const wkk = W11[k]![k] ?? 1;
+        const b = ((s12[k] ?? 0) - partial) / wkk;
+        const threshold = this.alpha / wkk;
+        // Soft-thresholding operator.
+        const newBeta = b > threshold ? b - threshold : b < -threshold ? b + threshold : 0;
+        maxD = Math.max(maxD, Math.abs(newBeta - (beta[k] ?? 0)));
+        beta[k] = newBeta;
+      }
+      if (maxD < 1e-6) break;
+    }
+    return beta;
+  }
+
+  /**
+   * Placeholder score.
+   *
+   * @param X Ignored by the current implementation.
+   * @returns Always 0; the Gaussian log-likelihood is not implemented yet.
+   * @throws NotFittedError when called before `fit`.
+   */
+  score(X: Float64Array[]): number {
+    if (!this.covariance_) throw new NotFittedError("GraphicalLasso is not fitted yet.");
+    return 0; // Placeholder: log-likelihood requires determinant
+  }
+}
+
+/** Constructor options for the MinCovDet estimator declared below. */
+export interface MinCovDetOptions {
+ /**
+ * Fraction of the samples kept in the support; `fit` uses `floor(support * n)` points.
+ * When null or omitted, `fit` uses `floor((n + p + 1) / 2)` points instead.
+ */
+ support?: number | null;
+ /**
+ * Seed value stored on the estimator. Default 0.
+ * NOTE(review): `fit` as written never draws random numbers, so this currently has no effect on the result.
+ */
+ randomState?: number;
+}
+
+/**
+ * Minimum Covariance Determinant robust estimator.
+ * Mirrors sklearn.covariance.MinCovDet.
+ * Uses a simplified, single concentration step: samples are ranked by their
+ * Mahalanobis distance under the full-sample estimate and the `h` closest
+ * ones define the robust location and covariance.
+ */
+export class MinCovDet {
+  /** Requested support fraction, or null for the default subset size. */
+  support: number | null;
+  /** Seed kept for API compatibility; the current algorithm is deterministic. */
+  randomState: number;
+
+  /** Robust location estimate; null until fitted. */
+  location_: Float64Array | null = null;
+  /** Robust covariance estimate; null until fitted. */
+  covariance_: Float64Array[] | null = null;
+  /** Inverse of `covariance_`, or null when it is singular or not yet fitted. */
+  precision_: Float64Array[] | null = null;
+  /** Fraction of samples that ended up in the support. */
+  supportFraction_: number = 0;
+  /** Indices (into the training data) of the support samples. */
+  supportIndices_: Int32Array | null = null;
+  /** Location of the support subset before any correction (same as `location_` here). */
+  rawLocation_: Float64Array | null = null;
+  /** Covariance of the support subset before any correction (same as `covariance_` here). */
+  rawCovariance_: Float64Array[] | null = null;
+
+  constructor(options: MinCovDetOptions = {}) {
+    this.support = options.support ?? null;
+    this.randomState = options.randomState ?? 0;
+  }
+
+  /**
+   * Fits the estimator.
+   *
+   * @param X Samples, one feature vector per entry.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const requested = this.support != null ? Math.floor(this.support * n) : Math.floor((n + p + 1) / 2);
+    // Keep the subset size inside [0, n] so supportFraction_ stays in [0, 1]
+    // and a negative value cannot turn slice() into "drop from the end".
+    const h = Math.min(n, Math.max(0, requested));
+
+    // Rank samples by Mahalanobis distance under the full empirical estimate.
+    // A singular covariance falls back to the identity (Euclidean distances).
+    const fullMeans = colMeans(X);
+    const precision =
+      MinCovDet.finiteInverse(empiricalCovariance(X)) ??
+      Array.from({ length: p }, (_, i) => {
+        const r = new Float64Array(p);
+        r[i] = 1;
+        return r;
+      });
+    const mDist = X.map((xi) => MinCovDet.quadraticForm(xi, fullMeans, precision));
+
+    // Select h points with smallest Mahalanobis distances
+    const sortedIdx = Array.from({ length: n }, (_, i) => i).sort((a, b) => mDist[a]! - mDist[b]!);
+    const supportIdx = new Int32Array(sortedIdx.slice(0, h));
+
+    const subset = Array.from(supportIdx).map((i) => X[i] ?? new Float64Array(p));
+    this.rawLocation_ = colMeans(subset);
+    this.rawCovariance_ = empiricalCovariance(subset);
+
+    this.location_ = this.rawLocation_;
+    this.covariance_ = this.rawCovariance_;
+    this.precision_ = MinCovDet.finiteInverse(this.covariance_);
+
+    this.supportFraction_ = n > 0 ? h / n : 0;
+    this.supportIndices_ = supportIdx;
+    return this;
+  }
+
+  /**
+   * Squared Mahalanobis distance of each row of `X` to the fitted location.
+   *
+   * @throws NotFittedError when the estimator is not fitted or the fitted
+   *   covariance could not be inverted.
+   */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    if (!this.location_ || !this.precision_) throw new NotFittedError("MinCovDet is not fitted yet.");
+    const location = this.location_;
+    const precision = this.precision_;
+    return Float64Array.from(X, (xi) => MinCovDet.quadraticForm(xi, location, precision));
+  }
+
+  /**
+   * Inverse of `cov`, or null when inversion throws or yields NaN/Infinity.
+   * The explicit finiteness check makes the fallback work whether or not the
+   * underlying inversion routine signals singular input by throwing.
+   */
+  private static finiteInverse(cov: Float64Array[]): Float64Array[] | null {
+    try {
+      const inv = invertMatrix(cov);
+      const allFinite = inv.every((row) => row.every((v) => Number.isFinite(v)));
+      return allFinite ? inv : null;
+    } catch {
+      return null;
+    }
+  }
+
+  /** Computes `(x - center)' * precision * (x - center)`; missing entries count as 0. */
+  private static quadraticForm(x: Float64Array, center: Float64Array, precision: Float64Array[]): number {
+    const p = center.length;
+    const diff = new Float64Array(p);
+    for (let j = 0; j < p; j++) diff[j] = (x[j] ?? 0) - (center[j] ?? 0);
+    let d = 0;
+    for (let j = 0; j < p; j++) {
+      for (let k = 0; k < p; k++) d += (diff[j] ?? 0) * (precision[j]![k] ?? 0) * (diff[k] ?? 0);
+    }
+    return d;
+  }
+}
diff --git a/src/covariance/index.ts b/src/covariance/index.ts
new file mode 100644
index 0000000..91a4185
--- /dev/null
+++ b/src/covariance/index.ts
@@ -0,0 +1,5 @@
+export * from "./covariance.js";
+export * from "./graphical_lasso.js";
+export * from "./elliptic_envelope.js";
+export * from "./precision.js";
+export * from "./shrinkage.js";
diff --git a/src/covariance/mcd.ts b/src/covariance/mcd.ts
new file mode 100644
index 0000000..5e5515e
--- /dev/null
+++ b/src/covariance/mcd.ts
@@ -0,0 +1,148 @@
+/**
+ * Minimum Covariance Determinant (MCD): robust covariance estimation
+ */
+
+/**
+ * Minimum Covariance Determinant estimator using random restarts.
+ *
+ * Each trial draws a random subset of `p + 1` samples, expands it to the `h`
+ * samples closest (in Mahalanobis distance) to that subset's estimate, and the
+ * expanded subset with the smallest covariance determinant wins.
+ * Results depend on `Math.random` and are therefore not reproducible.
+ */
+export class MinCovDet {
+  private support_fraction: number;
+  private nSubsets: number;
+  /** Robust location estimate; null until fitted. */
+  location_: Float64Array | null = null;
+  /** Robust covariance estimate; null until fitted. */
+  covariance_: Float64Array[] | null = null;
+  /** Inverse of `covariance_`; null until fitted. */
+  precision_: Float64Array[] | null = null;
+  /** 0/1 mask over the training samples marking the selected subset; null until fitted. */
+  support_: Int32Array | null = null;
+
+  /**
+   * @param support_fraction Fraction of samples to keep; omitted or <= 0
+   *   selects the default subset size `floor((n + p + 1) / 2)`.
+   * @param nSubsets Number of random restarts (capped at 500 in `fit`).
+   */
+  constructor(support_fraction?: number, nSubsets = 500) {
+    this.support_fraction = support_fraction ?? 0;
+    this.nSubsets = nSubsets;
+  }
+
+  /**
+   * Fits the estimator.
+   *
+   * @param X Samples, one feature vector per entry.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    const h = this.subsetSize(n, p);
+
+    // FastMCD approximation: multiple random subsets
+    let bestDet = Number.POSITIVE_INFINITY;
+    let bestSubset: number[] | null = null;
+    const nTrials = Math.min(this.nSubsets, 500);
+
+    for (let iter = 0; iter < nTrials; iter++) {
+      // Random initial subset of size p+1
+      const seed = this.randomSubset(n, Math.min(p + 1, n));
+      const expanded = this.expandSubset(X, seed, h);
+      const det = this.det(this.computeMeanCov(X, expanded).cov);
+      if (det < bestDet) {
+        bestDet = det;
+        bestSubset = expanded;
+      }
+    }
+
+    // Fall back to the first h samples when no trial produced a usable determinant.
+    const finalSubset = bestSubset ?? Array.from({ length: h }, (_, i) => i);
+    const { mean, cov } = this.computeMeanCov(X, finalSubset);
+
+    this.location_ = mean;
+    this.covariance_ = cov;
+    this.precision_ = this.invertMatrix(cov);
+    this.support_ = new Int32Array(n);
+    for (const idx of finalSubset) this.support_[idx] = 1;
+    return this;
+  }
+
+  /**
+   * Number of samples in the robust subset.
+   *
+   * The default is `floor((n + p + 1) / 2)`. The previous code multiplied that
+   * count by `n` again, which always exceeded `n` and silently turned the
+   * estimator into the plain empirical covariance. The result is kept within
+   * `[min(p + 1, n), n]`.
+   */
+  private subsetSize(n: number, p: number): number {
+    const requested =
+      this.support_fraction > 0
+        ? Math.floor(this.support_fraction * n)
+        : Math.floor((n + p + 1) / 2);
+    return Math.min(n, Math.max(p + 1, requested));
+  }
+
+  /** Returns `k` distinct indices from `0..n-1`, drawn via a Fisher-Yates shuffle. */
+  private randomSubset(n: number, k: number): number[] {
+    const indices = Array.from({ length: n }, (_, i) => i);
+    for (let i = n - 1; i > 0; i--) {
+      const j = Math.floor(Math.random() * (i + 1));
+      const tmp = indices[i]!;
+      indices[i] = indices[j]!;
+      indices[j] = tmp;
+    }
+    return indices.slice(0, k);
+  }
+
+  /** Returns the indices of the `h` samples closest to the estimate built from `subset`. */
+  private expandSubset(X: Float64Array[], subset: number[], h: number): number[] {
+    const { mean, cov } = this.computeMeanCov(X, subset);
+    const prec = this.invertMatrix(cov);
+    const dists = X.map((row, i) => ({ i, d: this.mahalanobis(row, mean, prec) }));
+    dists.sort((a, b) => a.d - b.d);
+    return dists.slice(0, h).map((d) => d.i);
+  }
+
+  /** Squared Mahalanobis distance `(x - mean)' * prec * (x - mean)`. */
+  private mahalanobis(x: Float64Array, mean: Float64Array, prec: Float64Array[]): number {
+    const p = x.length;
+    const diff = new Float64Array(p);
+    for (let i = 0; i < p; i++) diff[i] = (x[i] ?? 0) - (mean[i] ?? 0);
+    let dist = 0;
+    for (let i = 0; i < p; i++) {
+      for (let j = 0; j < p; j++) dist += (diff[i] ?? 0) * (prec[i]![j] ?? 0) * (diff[j] ?? 0);
+    }
+    return dist;
+  }
+
+  /** Mean and unbiased (divide-by-(m-1)) covariance of the rows of `X` listed in `indices`. */
+  private computeMeanCov(X: Float64Array[], indices: number[]): { mean: Float64Array; cov: Float64Array[] } {
+    const p = X[0]?.length ?? 0;
+    const m = indices.length;
+    const mean = new Float64Array(p);
+    for (const idx of indices) for (let j = 0; j < p; j++) mean[j] = mean[j]! + (X[idx]![j] ?? 0) / m;
+    // A single-sample subset would divide by zero; treat its covariance as 0.
+    const dof = Math.max(m - 1, 1);
+    const cov: Float64Array[] = Array.from({ length: p }, () => new Float64Array(p));
+    const diff = new Float64Array(p);
+    for (const idx of indices) {
+      for (let j = 0; j < p; j++) diff[j] = (X[idx]![j] ?? 0) - mean[j]!;
+      for (let i = 0; i < p; i++) {
+        for (let j = 0; j < p; j++) cov[i]![j] = cov[i]![j]! + (diff[i]! * diff[j]!) / dof;
+      }
+    }
+    return { mean, cov };
+  }
+
+  /** Determinant via Gaussian elimination with partial pivoting; pivots below 1e-10 yield 0. */
+  private det(A: Float64Array[]): number {
+    const n = A.length;
+    if (n === 1) return A[0]![0] ?? 0;
+    if (n === 2) return (A[0]![0] ?? 0) * (A[1]![1] ?? 0) - (A[0]![1] ?? 0) * (A[1]![0] ?? 0);
+    let result = 1;
+    const mat = A.map((row) => Float64Array.from(row));
+    for (let col = 0; col < n; col++) {
+      let maxRow = col;
+      for (let row = col + 1; row < n; row++) {
+        if (Math.abs(mat[row]![col] ?? 0) > Math.abs(mat[maxRow]![col] ?? 0)) maxRow = row;
+      }
+      if (maxRow !== col) {
+        const tmp = mat[col]!;
+        mat[col] = mat[maxRow]!;
+        mat[maxRow] = tmp;
+        result *= -1; // a row swap flips the sign of the determinant
+      }
+      const pivot = mat[col]![col] ?? 0;
+      if (Math.abs(pivot) < 1e-10) return 0;
+      result *= pivot;
+      for (let row = col + 1; row < n; row++) {
+        const factor = (mat[row]![col] ?? 0) / pivot;
+        for (let j = col; j < n; j++) mat[row]![j] = (mat[row]![j] ?? 0) - factor * (mat[col]![j] ?? 0);
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Gauss-Jordan inverse with partial pivoting. A zero pivot is replaced by 1
+   * instead of raising, so a singular input yields a finite but meaningless
+   * result rather than an error.
+   */
+  private invertMatrix(A: Float64Array[]): Float64Array[] {
+    const n = A.length;
+    const aug = A.map((row, i) => {
+      const r = new Float64Array(2 * n);
+      for (let j = 0; j < n; j++) r[j] = row[j] ?? 0;
+      r[n + i] = 1;
+      return r;
+    });
+    for (let col = 0; col < n; col++) {
+      let maxRow = col;
+      for (let row = col + 1; row < n; row++) {
+        if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row;
+      }
+      const tmp = aug[col]!;
+      aug[col] = aug[maxRow]!;
+      aug[maxRow] = tmp;
+      const pivot = aug[col]![col] ?? 1;
+      for (let j = 0; j < 2 * n; j++) aug[col]![j] = (aug[col]![j] ?? 0) / (pivot || 1);
+      for (let row = 0; row < n; row++) {
+        if (row === col) continue;
+        const factor = aug[row]![col] ?? 0;
+        for (let j = 0; j < 2 * n; j++) aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0);
+      }
+    }
+    return aug.map((row) => row.slice(n, 2 * n));
+  }
+
+  /**
+   * Squared Mahalanobis distance of each row of `X` to the fitted location.
+   *
+   * @throws Error when called before `fit`.
+   */
+  mahalanobisDistances(X: Float64Array[]): Float64Array {
+    if (!this.location_ || !this.precision_) throw new Error("Not fitted");
+    const location = this.location_;
+    const precision = this.precision_;
+    return Float64Array.from(X, (row) => this.mahalanobis(row, location, precision));
+  }
+}
diff --git a/src/covariance/precision.ts b/src/covariance/precision.ts
new file mode 100644
index 0000000..77b6e64
--- /dev/null
+++ b/src/covariance/precision.ts
@@ -0,0 +1,230 @@
+/**
+ * Covariance utilities: precision matrix estimation, covariance selection.
+ * ledoit_wolf() and oas() functional APIs, plus precision/correlation conversion.
+ * Mirrors sklearn.covariance functional API and utility functions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/**
+ * Per-column mean of a row-major sample matrix.
+ * The number of columns comes from the first row; missing entries count as 0.
+ */
+function colMeans(X: Float64Array[]): Float64Array {
+  const nRows = X.length;
+  const width = X[0]?.length ?? 0;
+  const totals = new Float64Array(width);
+  for (let r = 0; r < nRows; r++) {
+    const row = X[r]!;
+    for (let c = 0; c < width; c++) totals[c] = totals[c]! + (row[c] ?? 0);
+  }
+  for (let c = 0; c < width; c++) totals[c] = totals[c]! / nRows;
+  return totals;
+}
+
+/**
+ * Divide-by-n covariance of the rows of `X` around the supplied `means`.
+ * Only the upper triangle is accumulated; it is scaled and mirrored at the end.
+ */
+function empCovMatrix(X: Float64Array[], means: Float64Array): Float64Array[] {
+  const nRows = X.length;
+  const dim = means.length;
+  const cov = Array.from({ length: dim }, () => new Float64Array(dim));
+  const dev = new Float64Array(dim);
+  for (const row of X) {
+    for (let a = 0; a < dim; a++) dev[a] = (row[a] ?? 0) - (means[a] ?? 0);
+    for (let a = 0; a < dim; a++) {
+      const covRow = cov[a]!;
+      for (let b = a; b < dim; b++) covRow[b] = covRow[b]! + dev[a]! * dev[b]!;
+    }
+  }
+  for (let a = 0; a < dim; a++) {
+    const covRow = cov[a]!;
+    for (let b = a; b < dim; b++) {
+      const scaled = covRow[b]! / nRows;
+      covRow[b] = scaled;
+      if (b !== a) cov[b]![a] = scaled;
+    }
+  }
+  return cov;
+}
+
+/** Sum of the diagonal entries of `M`; a row too short to reach the diagonal contributes 0. */
+function matTrace(M: Float64Array[]): number {
+  let total = 0;
+  M.forEach((row, i) => {
+    total += row[i] ?? 0;
+  });
+  return total;
+}
+
+/** Squared Frobenius norm of `M`: the sum of the squares of all entries. */
+function matFrobSq(M: Float64Array[]): number {
+  let total = 0;
+  for (let r = 0; r < M.length; r++) {
+    const row = M[r]!;
+    for (const value of row) total += value ** 2;
+  }
+  return total;
+}
+
+/**
+ * Builds a diagonal "precision" matrix from `M`: each strictly positive
+ * diagonal entry is replaced by its reciprocal and every other entry
+ * (off-diagonal, or a non-positive diagonal value) becomes 0.
+ */
+function invertDiag(M: Float64Array[]): Float64Array[] {
+  return M.map((row, i) => {
+    const out = new Float64Array(row.length);
+    const diag = row[i];
+    if (diag !== undefined && diag > 0) out[i] = 1 / diag;
+    return out;
+  });
+}
+
+/**
+ * Functional API: Ledoit-Wolf analytical shrinkage.
+ * Mirrors sklearn.covariance.ledoit_wolf.
+ *
+ * With `S` the divide-by-n sample covariance and `mu = tr(S) / p`, the result
+ * is `(1 - shrinkage) * S + shrinkage * mu * I` where
+ * `shrinkage = min(beta, delta) / delta`,
+ * `delta = ||S - mu * I||_F^2 / p` and
+ * `beta = (1 / (n * p)) * sum_ik (mean_t(x_ti^2 * x_tk^2) - S_ik^2)`.
+ *
+ * @param X Samples, one feature vector per entry.
+ * @param options `assumeCentered`: skip mean removal and treat the data as zero-mean.
+ * @returns The shrunk covariance and the shrinkage coefficient in [0, 1].
+ */
+export function ledoitWolf(
+  X: Float64Array[],
+  options: { assumeCentered?: boolean } = {},
+): { covariance: Float64Array[]; shrinkage: number } {
+  const n = X.length;
+  const p = (X[0] ?? new Float64Array(0)).length;
+  const location = options.assumeCentered ? new Float64Array(p) : colMeans(X);
+  const S = empCovMatrix(X, location);
+  const trS = matTrace(S);
+  const trS2 = matFrobSq(S);
+  const trSsq = trS ** 2;
+
+  // Sum over all (i, k) of the fourth-moment term minus S_ik^2.
+  let momentGap = 0;
+  for (let i = 0; i < p; i++) {
+    for (let k = 0; k < p; k++) {
+      let fourth = 0;
+      for (let t = 0; t < n; t++) {
+        const xt = X[t] ?? new Float64Array(p);
+        fourth += ((xt[i] ?? 0) - (location[i] ?? 0)) ** 2 * ((xt[k] ?? 0) - (location[k] ?? 0)) ** 2;
+      }
+      fourth /= n;
+      momentGap += fourth - (S[i]![k] ?? 0) ** 2;
+    }
+  }
+  // Both quantities below are scaled by p relative to the usual definitions;
+  // the factor cancels in the ratio.
+  const beta = momentGap / n;
+  const delta = trS2 - trSsq / p;
+
+  // shrinkage = min(beta, delta) / delta, floored at 0. The earlier expression
+  // mixed in (n - 2) / n and (n + 2) terms that belong to a different
+  // estimator and did not reproduce the Ledoit-Wolf coefficient.
+  const shrinkage = delta > 0 ? Math.min(1, Math.max(0, beta / delta)) : 0;
+
+  const mu = trS / p;
+  const covariance = S.map((row, i) =>
+    new Float64Array(row.map((v, j) => (1 - shrinkage) * v + shrinkage * (i === j ? mu : 0))),
+  );
+  return { covariance, shrinkage };
+}
+
+/**
+ * Functional API: Oracle Approximating Shrinkage (OAS).
+ * Mirrors sklearn.covariance.oas.
+ *
+ * @param X Samples, one feature vector per entry.
+ * @param options `assumeCentered`: skip mean removal and treat the data as zero-mean.
+ * @returns The shrunk covariance and the shrinkage coefficient in [0, 1]
+ *   (0 when the denominator of the OAS ratio is not positive).
+ */
+export function oas(
+  X: Float64Array[],
+  options: { assumeCentered?: boolean } = {},
+): { covariance: Float64Array[]; shrinkage: number } {
+  const nSamples = X.length;
+  const nFeatures = X[0]?.length ?? 0;
+  const center = options.assumeCentered ? new Float64Array(nFeatures) : colMeans(X);
+  const S = empCovMatrix(X, center);
+  const traceS = matTrace(S);
+  const frobSq = matFrobSq(S);
+  const traceSq = traceS ** 2;
+
+  const numerator = (1 - 2 / nFeatures) * frobSq + traceSq;
+  const denominator = (nSamples + 1 - 2 / nFeatures) * (frobSq - traceSq / nFeatures);
+  let shrinkage = 0;
+  if (denominator > 0) shrinkage = Math.min(1, Math.max(0, numerator / denominator));
+
+  // Blend S with the scaled identity (tr(S) / p) * I.
+  const target = traceS / nFeatures;
+  const covariance = S.map((sRow, r) => {
+    const out = new Float64Array(sRow.length);
+    for (let c = 0; c < sRow.length; c++) {
+      out[c] = (1 - shrinkage) * sRow[c]! + shrinkage * (r === c ? target : 0);
+    }
+    return out;
+  });
+  return { covariance, shrinkage };
+}
+
+/**
+ * Convert a covariance matrix to a correlation matrix.
+ * Mirrors sklearn.covariance.cov_to_corr.
+ *
+ * Each entry is divided by the product of the two standard deviations, where
+ * variances are floored at 1e-12 before the square root to avoid dividing by 0.
+ */
+export function covToCorr(covariance: Float64Array[]): Float64Array[] {
+  const sd = Float64Array.from(covariance, (row, i) => Math.sqrt(Math.max(row[i] ?? 0, 1e-12)));
+  return covariance.map((row, i) => {
+    const out = new Float64Array(row.length);
+    for (let j = 0; j < row.length; j++) out[j] = row[j]! / ((sd[i] ?? 1) * (sd[j] ?? 1));
+    return out;
+  });
+}
+
+/**
+ * Total Gaussian log-likelihood of the rows of `X` under N(mean, covariance):
+ * `-0.5 * (n * (p * log(2 * pi) + log|Sigma|) + sum_t d_t' Sigma^-1 d_t)`
+ * with `d_t = x_t - mean`.
+ *
+ * The covariance is factorised as `L * L'` (Cholesky). Pivots are floored at
+ * 1e-12, so a covariance that is not positive definite produces an
+ * approximate value rather than NaN.
+ *
+ * The quadratic term uses the full inverse covariance (through `L`); the
+ * earlier version divided by the diagonal of `covariance` only, which is
+ * wrong as soon as features are correlated.
+ *
+ * @param X Samples, one feature vector per entry; missing entries count as 0.
+ * @param mean Mean vector; its length defines the number of features.
+ * @param covariance Square covariance matrix matching `mean.length`.
+ * @returns The log-likelihood summed over all samples.
+ */
+export function gaussianLogLikelihood(
+  X: Float64Array[],
+  mean: Float64Array,
+  covariance: Float64Array[],
+): number {
+  const n = X.length;
+  const p = mean.length;
+
+  // Cholesky factor L (lower triangular) of the covariance.
+  const L = Array.from({ length: p }, () => new Float64Array(p));
+  for (let i = 0; i < p; i++) {
+    for (let j = 0; j <= i; j++) {
+      let s = covariance[i]![j] ?? 0;
+      for (let k = 0; k < j; k++) s -= (L[i]![k] ?? 0) * (L[j]![k] ?? 0);
+      L[i]![j] = i === j ? Math.sqrt(Math.max(s, 1e-12)) : s / Math.max(L[j]![j] ?? 1, 1e-12);
+    }
+  }
+
+  // log|Sigma| = 2 * sum(log(diag(L)))
+  let logDet = 0;
+  for (let i = 0; i < p; i++) logDet += Math.log(Math.max(L[i]![i] ?? 1e-12, 1e-12));
+  logDet *= 2;
+
+  // d' Sigma^-1 d = ||y||^2 where L y = d (forward substitution).
+  let quadSum = 0;
+  const y = new Float64Array(p);
+  for (const xi of X) {
+    for (let i = 0; i < p; i++) {
+      const Li = L[i]!;
+      let s = (xi[i] ?? 0) - (mean[i] ?? 0);
+      for (let k = 0; k < i; k++) s -= Li[k]! * y[k]!;
+      const yi = s / Math.max(Li[i]!, 1e-12);
+      y[i] = yi;
+      quadSum += yi * yi;
+    }
+  }
+
+  return -0.5 * (n * (p * Math.log(2 * Math.PI) + logDet) + quadSum);
+}
+
+/**
+ * Sparse inverse covariance estimator (precision matrix selector).
+ * Builds a precision estimate from the empirical covariance and soft-thresholds
+ * its off-diagonal entries.
+ * Mirrors sklearn.covariance sparse precision concepts.
+ *
+ * NOTE(review): the precision estimate comes from `invertDiag`, whose
+ * off-diagonal entries are already 0, so the thresholding step cannot change
+ * anything and `threshold` has no effect on the result - confirm whether a
+ * full inverse was intended.
+ */
+export class SparsePrecision {
+  /** Soft-threshold applied to off-diagonal precision entries. */
+  threshold: number;
+  /** When true, the data is treated as zero-mean and no centring is done. */
+  assumeCentered: boolean;
+
+  /** Location used for centring; null until fitted. */
+  location_: Float64Array | null = null;
+  /** Empirical covariance of the training data; null until fitted. */
+  covariance_: Float64Array[] | null = null;
+  /** Thresholded precision estimate; null until fitted. */
+  precision_: Float64Array[] | null = null;
+
+  constructor(options: { threshold?: number; assumeCentered?: boolean } = {}) {
+    this.threshold = options.threshold ?? 0.1;
+    this.assumeCentered = options.assumeCentered ?? false;
+  }
+
+  /**
+   * Fits location, covariance and precision from `X`.
+   *
+   * @param X Samples, one feature vector per entry.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array[]): this {
+    const nFeatures = X[0]?.length ?? 0;
+    const center = this.assumeCentered ? new Float64Array(nFeatures) : colMeans(X);
+    this.location_ = center;
+    const S = empCovMatrix(X, center);
+    this.covariance_ = S;
+
+    // Diagonal precision estimate, then soft-threshold everything off the diagonal.
+    const cutoff = this.threshold;
+    this.precision_ = invertDiag(S).map((row, i) => {
+      const out = new Float64Array(row.length);
+      for (let j = 0; j < row.length; j++) {
+        const v = row[j]!;
+        if (i === j) {
+          out[j] = v;
+        } else if (Math.abs(v) > cutoff) {
+          out[j] = v - Math.sign(v) * cutoff;
+        } else {
+          out[j] = 0;
+        }
+      }
+      return out;
+    });
+    return this;
+  }
+
+  /**
+   * Squared Mahalanobis distance of each row of `X` under the fitted
+   * location and precision.
+   *
+   * @throws NotFittedError when called before `fit`.
+   */
+  mahalanobis(X: Float64Array[]): Float64Array {
+    const P = this.precision_;
+    const mu = this.location_;
+    if (P === null || mu === null) {
+      throw new NotFittedError("SparsePrecision");
+    }
+    const dim = mu.length;
+    const dev = new Float64Array(dim);
+    const result = new Float64Array(X.length);
+    X.forEach((xi, t) => {
+      for (let k = 0; k < dim; k++) dev[k] = (xi[k] ?? 0) - (mu[k] ?? 0);
+      let dist = 0;
+      for (let j = 0; j < dim; j++) {
+        let projected = 0;
+        for (let k = 0; k < dim; k++) projected += (P[j]![k] ?? 0) * dev[k]!;
+        dist += dev[j]! * projected;
+      }
+      result[t] = dist;
+    });
+    return result;
+  }
+}
diff --git a/src/covariance/shrinkage.ts b/src/covariance/shrinkage.ts
new file mode 100644
index 0000000..94d915a
--- /dev/null
+++ b/src/covariance/shrinkage.ts
@@ -0,0 +1,240 @@
+/**
+ * Covariance estimators: LedoitWolf, OAS, and ShrunkCovariance.
+ * Analogous to sklearn.covariance._shrunk_covariance and _ledoit_wolf.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** Result of a covariance estimate, with both matrices stored flat in row-major order. */
+export interface CovResult {
+ /** Estimated covariance matrix (flat, nFeatures x nFeatures). */
+ covariance: Float64Array;
+ /** Estimated precision matrix (inverse of covariance). */
+ precision: Float64Array;
+ /** Number of features, i.e. the side length of both matrices. */
+ nFeatures: number;
+}
+
+/**
+ * Computes the sample covariance matrix from a flat (nSamples x nFeatures) matrix X
+ * that has already been mean-centered. The result is flat, row-major,
+ * nFeatures x nFeatures, and uses the unbiased 1 / (nSamples - 1) scaling.
+ */
+function sampleCov(X: Float64Array, nSamples: number, nFeatures: number): Float64Array {
+  const cov = new Float64Array(nFeatures * nFeatures);
+  const scale = 1 / (nSamples - 1);
+  for (let s = 0; s < nSamples; s++) {
+    const base = s * nFeatures;
+    for (let a = 0; a < nFeatures; a++) {
+      const xa = X[base + a]!;
+      // Upper triangle only; mirror each off-diagonal contribution.
+      for (let b = a; b < nFeatures; b++) {
+        const term = xa * X[base + b]! * scale;
+        const upper = a * nFeatures + b;
+        cov[upper] = cov[upper]! + term;
+        if (b !== a) {
+          const lower = b * nFeatures + a;
+          cov[lower] = cov[lower]! + term;
+        }
+      }
+    }
+  }
+  return cov;
+}
+
+/**
+ * Subtracts each column's mean from the flat row-major matrix X (modifying X)
+ * and returns the vector of column means.
+ */
+function centerMatrix(X: Float64Array, nSamples: number, nFeatures: number): Float64Array {
+  const means = new Float64Array(nFeatures);
+  for (let s = 0; s < nSamples; s++) {
+    const base = s * nFeatures;
+    for (let c = 0; c < nFeatures; c++) means[c] = means[c]! + X[base + c]!;
+  }
+  for (let c = 0; c < nFeatures; c++) means[c] = means[c]! / nSamples;
+  for (let s = 0; s < nSamples; s++) {
+    const base = s * nFeatures;
+    for (let c = 0; c < nFeatures; c++) X[base + c] = X[base + c]! - means[c]!;
+  }
+  return means;
+}
+
+/**
+ * Applies a shrinkage factor alpha to the flat p x p matrix S:
+ * Sigma_shrunk = (1 - alpha) * S + alpha * (tr(S) / p) * I.
+ * Returns a new flat matrix; S is not modified.
+ */
+function shrinkCov(S: Float64Array, p: number, alpha: number): Float64Array {
+  let trace = 0;
+  for (let d = 0; d < p; d++) trace += S[d * p + d]!;
+  const target = trace / p;
+
+  const shrunk = new Float64Array(p * p);
+  for (let r = 0; r < p; r++) {
+    const base = r * p;
+    for (let c = 0; c < p; c++) shrunk[base + c] = (1 - alpha) * S[base + c]!;
+    shrunk[base + r] = shrunk[base + r]! + alpha * target;
+  }
+  return shrunk;
+}
+
+/**
+ * Inverts a symmetric positive-definite p x p matrix (flat, row-major) via
+ * Gauss-Jordan elimination with partial pivoting.
+ * A column whose pivot is smaller than 1e-14 in magnitude is skipped rather
+ * than reported, so a singular input yields a result that is not a true inverse.
+ */
+function invertPD(A: Float64Array, p: number): Float64Array {
+  const width = 2 * p;
+  // Augmented matrix [A | I], stored flat with `width` entries per row.
+  const aug = new Float64Array(p * width);
+  for (let r = 0; r < p; r++) {
+    const base = r * width;
+    for (let c = 0; c < p; c++) aug[base + c] = A[r * p + c]!;
+    aug[base + p + r] = 1;
+  }
+
+  for (let col = 0; col < p; col++) {
+    // Pick the row (at or below the diagonal) with the largest entry in this column.
+    let best = col;
+    for (let r = col + 1; r < p; r++) {
+      if (Math.abs(aug[r * width + col]!) > Math.abs(aug[best * width + col]!)) best = r;
+    }
+    const pivotBase = col * width;
+    if (best !== col) {
+      const bestBase = best * width;
+      for (let k = 0; k < width; k++) {
+        const held = aug[pivotBase + k]!;
+        aug[pivotBase + k] = aug[bestBase + k]!;
+        aug[bestBase + k] = held;
+      }
+    }
+
+    const pivot = aug[pivotBase + col]!;
+    if (Math.abs(pivot) < 1e-14) continue;
+    for (let k = 0; k < width; k++) aug[pivotBase + k] = aug[pivotBase + k]! / pivot;
+
+    // Eliminate this column from all other rows.
+    for (let r = 0; r < p; r++) {
+      if (r === col) continue;
+      const rowBase = r * width;
+      const factor = aug[rowBase + col]!;
+      for (let k = 0; k < width; k++) {
+        aug[rowBase + k] = aug[rowBase + k]! - factor * aug[pivotBase + k]!;
+      }
+    }
+  }
+
+  // Copy out the right-hand half, which now holds the inverse.
+  const inv = new Float64Array(p * p);
+  for (let r = 0; r < p; r++) {
+    for (let c = 0; c < p; c++) inv[r * p + c] = aug[r * width + p + c]!;
+  }
+  return inv;
+}
+
+// --- ShrunkCovariance ------------------------------------------------------
+
+/** Constructor options for ShrunkCovariance; omitted fields use the defaults noted below. */
+export interface ShrunkCovarianceOptions {
+ /** Shrinkage coefficient in [0, 1]. Default 0.1. */
+ shrinkage?: number;
+ /** Whether to store the precision matrix. Default true. */
+ storePrecision?: boolean;
+ /** Whether to assume the data is already centered. Default false. */
+ assumeCentered?: boolean;
+}
+
+/**
+ * Covariance estimator with manually set shrinkage (Ledoit-Wolf is automatic).
+ * All matrices are flat, row-major `Float64Array`s.
+ */
+export class ShrunkCovariance {
+  // `Required` needs its type argument; the bare form does not compile.
+  private opts: Required<ShrunkCovarianceOptions>;
+  /** Shrunk covariance (flat, nFeatures x nFeatures); undefined until fitted. */
+  covariance_: Float64Array | undefined;
+  /** Inverse of `covariance_`; only set when `storePrecision` is true. */
+  precision_: Float64Array | undefined;
+  /** Column means removed before estimation (zeros when `assumeCentered`). */
+  location_: Float64Array | undefined;
+
+  constructor(opts: ShrunkCovarianceOptions = {}) {
+    this.opts = {
+      shrinkage: opts.shrinkage ?? 0.1,
+      storePrecision: opts.storePrecision ?? true,
+      assumeCentered: opts.assumeCentered ?? false,
+    };
+  }
+
+  /**
+   * Fits the estimator.
+   *
+   * @param X Flat row-major data of size nSamples * nFeatures; it is copied, not modified.
+   * @param nSamples Number of rows in `X`.
+   * @param nFeatures Number of columns in `X`.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array, nSamples: number, nFeatures: number): this {
+    // Work on a copy because centring happens in place.
+    const Xc = new Float64Array(X);
+    const location = this.opts.assumeCentered
+      ? new Float64Array(nFeatures)
+      : centerMatrix(Xc, nSamples, nFeatures);
+    this.location_ = location;
+    const S = sampleCov(Xc, nSamples, nFeatures);
+    this.covariance_ = shrinkCov(S, nFeatures, this.opts.shrinkage);
+    if (this.opts.storePrecision) this.precision_ = invertPD(this.covariance_, nFeatures);
+    return this;
+  }
+
+  /**
+   * Average Gaussian log-likelihood of `X` under the fitted model.
+   *
+   * @throws NotFittedError when called before `fit`.
+   */
+  score(X: Float64Array, nSamples: number, nFeatures: number): number {
+    if (!this.covariance_ || !this.location_) throw new NotFittedError("ShrunkCovariance is not fitted");
+    return logLikelihood(X, nSamples, nFeatures, this.covariance_, this.location_);
+  }
+}
+
+// --- OAS -------------------------------------------------------------------
+
+/** Constructor options for the OAS estimator declared below. */
+export interface OASOptions {
+ /** Whether `fit` also computes and stores the precision matrix. Default true. */
+ storePrecision?: boolean;
+ /** Whether to treat the data as already centered and skip mean removal. Default false. */
+ assumeCentered?: boolean;
+}
+
+/**
+ * Oracle Approximating Shrinkage (OAS) covariance estimator.
+ * More accurate than Ledoit-Wolf for Gaussian data when n < p.
+ * All matrices are flat, row-major `Float64Array`s.
+ */
+export class OAS {
+  // `Required` needs its type argument; the bare form does not compile.
+  private opts: Required<OASOptions>;
+  /** Shrunk covariance (flat, nFeatures x nFeatures); undefined until fitted. */
+  covariance_: Float64Array | undefined;
+  /** Inverse of `covariance_`; only set when `storePrecision` is true. */
+  precision_: Float64Array | undefined;
+  /** Shrinkage coefficient in [0, 1] chosen by `fit`. */
+  shrinkage_: number | undefined;
+  /** Column means removed before estimation (zeros when `assumeCentered`). */
+  location_: Float64Array | undefined;
+
+  constructor(opts: OASOptions = {}) {
+    this.opts = { storePrecision: opts.storePrecision ?? true, assumeCentered: opts.assumeCentered ?? false };
+  }
+
+  /**
+   * Fits the estimator.
+   *
+   * @param X Flat row-major data of size nSamples * nFeatures; it is copied, not modified.
+   * @param nSamples Number of rows in `X`.
+   * @param nFeatures Number of columns in `X`.
+   * @returns This estimator, for chaining.
+   */
+  fit(X: Float64Array, nSamples: number, nFeatures: number): this {
+    const n = nSamples;
+    const p = nFeatures;
+    // Work on a copy because centring happens in place.
+    const Xc = new Float64Array(X);
+    const location = this.opts.assumeCentered ? new Float64Array(p) : centerMatrix(Xc, n, p);
+    this.location_ = location;
+    const S = sampleCov(Xc, n, p);
+
+    // tr(S) and tr(S^2)
+    let trS = 0;
+    for (let j = 0; j < p; j++) trS += S[j * p + j]!;
+    let trS2 = 0;
+    for (let i = 0; i < p; i++) for (let j = 0; j < p; j++) trS2 += S[i * p + j]! * S[j * p + i]!;
+
+    // OAS shrinkage estimate. The denominator vanishes when S is already a
+    // multiple of the identity (always the case for p = 1); dividing by it
+    // used to yield NaN and an all-NaN covariance. In that situation S equals
+    // the shrinkage target, so shrinking fully (alpha = 1) is exact.
+    const numerator = (1 - 2 / p) * trS2 + trS * trS;
+    const denominator = (n + 1 - 2 / p) * (trS2 - (trS * trS) / p);
+    const alpha = denominator > 0 ? Math.min(1, Math.max(0, numerator / denominator)) : 1;
+
+    this.shrinkage_ = alpha;
+    this.covariance_ = shrinkCov(S, p, alpha);
+    if (this.opts.storePrecision) this.precision_ = invertPD(this.covariance_, p);
+    return this;
+  }
+
+  /**
+   * Average Gaussian log-likelihood of `X` under the fitted model.
+   *
+   * @throws NotFittedError when called before `fit`.
+   */
+  score(X: Float64Array, nSamples: number, nFeatures: number): number {
+    if (!this.covariance_ || !this.location_) throw new NotFittedError("OAS is not fitted");
+    return logLikelihood(X, nSamples, nFeatures, this.covariance_, this.location_);
+  }
+}
+
+// --- Shared log-likelihood -------------------------------------------------
+
+/**
+ * Gaussian log-likelihood of X given a covariance estimate, averaged over the
+ * samples. `X` and `cov` are flat row-major; `loc` is the mean vector.
+ */
+function logLikelihood(
+  X: Float64Array,
+  nSamples: number,
+  nFeatures: number,
+  cov: Float64Array,
+  loc: Float64Array,
+): number {
+  const dim = nFeatures;
+  const precision = invertPD(cov, dim);
+  const dev = new Float64Array(dim);
+
+  // Quadratic term: -0.5 * (x - loc)' * precision * (x - loc) per sample.
+  let total = 0;
+  for (let s = 0; s < nSamples; s++) {
+    const base = s * dim;
+    for (let k = 0; k < dim; k++) dev[k] = X[base + k]! - loc[k]!;
+    let quad = 0;
+    for (let j = 0; j < dim; j++) {
+      let projected = 0;
+      for (let k = 0; k < dim; k++) projected += precision[j * dim + k]! * dev[k]!;
+      quad += dev[j]! * projected;
+    }
+    total -= 0.5 * quad;
+  }
+
+  // Subtract 0.5 * n * log|Sigma|, with log|Sigma| taken from the Cholesky
+  // diagonal (entries floored at 1e-15 before the logarithm).
+  const diag = choleskyDiag(cov, dim);
+  let logDet = 0;
+  for (let j = 0; j < dim; j++) logDet += 2 * Math.log(Math.max(diag[j]!, 1e-15));
+  total -= 0.5 * nSamples * logDet;
+
+  // Normalisation constant, then average over the samples.
+  total -= 0.5 * nSamples * dim * Math.log(2 * Math.PI);
+  return total / nSamples;
+}
+
+/**
+ * Returns only the diagonal of the lower Cholesky factor (for log-det).
+ * Negative pivots are clamped to 0 before the square root, and an
+ * off-diagonal entry is set to 0 when the pivot it would be divided by is not
+ * positive, so the routine never produces NaN for a non-positive-definite input.
+ */
+function choleskyDiag(A: Float64Array, p: number): Float64Array {
+  const factor = new Float64Array(p * p);
+  const diag = new Float64Array(p);
+  for (let r = 0; r < p; r++) {
+    const rowBase = r * p;
+    for (let c = 0; c <= r; c++) {
+      const colBase = c * p;
+      let acc = A[rowBase + c]!;
+      for (let k = 0; k < c; k++) acc -= factor[rowBase + k]! * factor[colBase + k]!;
+      if (r !== c) {
+        const pivot = factor[colBase + c]!;
+        factor[rowBase + c] = pivot > 0 ? acc / pivot : 0;
+      } else {
+        factor[rowBase + c] = Math.sqrt(Math.max(acc, 0));
+      }
+    }
+    diag[r] = factor[rowBase + r]!;
+  }
+  return diag;
+}
diff --git a/src/cross_decomposition/cca.ts b/src/cross_decomposition/cca.ts
new file mode 100644
index 0000000..90dbd41
--- /dev/null
+++ b/src/cross_decomposition/cca.ts
@@ -0,0 +1,260 @@
+/**
+ * Canonical Correlation Analysis (CCA).
+ * Mirrors sklearn.cross_decomposition.CCA.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+function colMeans(X: Float64Array[]): Float64Array {
+  // Column count comes from the first row; an empty X yields an empty result.
+  const width = X[0]?.length ?? 0;
+  const sums = new Float64Array(width);
+  for (const row of X) {
+    for (let c = 0; c < width; c++) sums[c] = (sums[c] ?? 0) + (row[c] ?? 0);
+  }
+  return sums.map((total) => total / X.length);
+}
+
+function centerMatrix(X: Float64Array[], means: Float64Array): Float64Array[] {
+  return X.map((row) => Float64Array.from(row, (value, col) => value - (means[col] ?? 0))); // fresh rows; X is left untouched
+}
+
+/** Cross-product X^T Y: a p x q matrix whose (j, k) entry sums X[i][j] * Y[i][k] over the rows i. */
+function crossProd(X: Float64Array[], Y: Float64Array[]): Float64Array[] {
+  const nx = X[0]?.length ?? 0;
+  const ny = Y[0]?.length ?? 0;
+  const out: Float64Array[] = [];
+  for (let r = 0; r < nx; r++) out.push(new Float64Array(ny));
+  // X drives the number of iterations; a row missing from either input contributes zeros.
+  for (let s = 0; s < X.length; s++) {
+    const xs = X[s] ?? new Float64Array(nx);
+    const ys = Y[s] ?? new Float64Array(ny);
+    out.forEach((acc, j) => {
+      for (let k = 0; k < ny; k++) acc[k] = (acc[k] ?? 0) + (xs[j] ?? 0) * (ys[k] ?? 0);
+    });
+  }
+  return out;
+}
+
+/** Power iteration with rank-one deflation for the leading singular triplets of M; S always gets nComponents entries. */
+function powerSVD(
+  M: Float64Array[],
+  nComponents: number,
+  maxIter = 200,
+): { U: Float64Array[]; S: Float64Array; Vt: Float64Array[] } {
+  const m = M.length;
+  const n = (M[0] ?? new Float64Array(0)).length;
+  const U: Float64Array[] = [];
+  const S: number[] = [];
+  const Vt: Float64Array[] = [];
+
+  let Mdefl = M.map((row) => new Float64Array(row));
+
+  for (let c = 0; c < nComponents; c++) {
+    let u = new Float64Array(m);
+    u[c % m] = 1;
+    let sigma = 0; // latest singular-value estimate; stays 0 when the iteration cannot get started
+    for (let iter = 0; iter < maxIter; iter++) {
+      // v = M^T u
+      const v = new Float64Array(n);
+      for (let i = 0; i < m; i++) {
+        const row = Mdefl[i] ?? new Float64Array(n);
+        for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) + (u[i] ?? 0) * (row[j] ?? 0);
+      }
+      // normalize v
+      let vnorm = 0;
+      for (let j = 0; j < n; j++) vnorm += (v[j] ?? 0) ** 2;
+      vnorm = Math.sqrt(vnorm);
+      if (vnorm < 1e-10) break;
+      for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) / vnorm;
+      // u = M v
+      const uNew = new Float64Array(m);
+      for (let i = 0; i < m; i++) {
+        const row = Mdefl[i] ?? new Float64Array(n);
+        for (let j = 0; j < n; j++) uNew[i] = (uNew[i] ?? 0) + (row[j] ?? 0) * (v[j] ?? 0);
+      }
+      let unorm = 0;
+      for (let i = 0; i < m; i++) unorm += (uNew[i] ?? 0) ** 2;
+      unorm = Math.sqrt(unorm);
+      if (unorm < 1e-10) break;
+      sigma = unorm;
+      for (let i = 0; i < m; i++) uNew[i] = (uNew[i] ?? 0) / unorm;
+      const diff = Math.sqrt(Array.from({ length: m }, (_, i) => ((uNew[i] ?? 0) - (u[i] ?? 0)) ** 2).reduce((a, b) => a + b, 0));
+      u = uNew;
+      if (diff < 1e-8) break;
+    }
+    // Record exactly one value per component so that S[c] always pairs with U[c] and Vt[c],
+    // including when the loop above exits early on a (near-)zero vector or maxIter is 0.
+    S.push(sigma);
+    // Deflate
+    const v = new Float64Array(n);
+    for (let i = 0; i < m; i++) {
+      const row = Mdefl[i] ?? new Float64Array(n);
+      for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) + (u[i] ?? 0) * (row[j] ?? 0);
+    }
+    let vnorm = 0;
+    for (let j = 0; j < n; j++) vnorm += (v[j] ?? 0) ** 2;
+    vnorm = Math.sqrt(vnorm);
+    if (vnorm > 1e-10) for (let j = 0; j < n; j++) v[j] = (v[j] ?? 0) / vnorm;
+
+    U.push(u);
+    Vt.push(v);
+    Mdefl = Mdefl.map((row, i) => {
+      const newRow = new Float64Array(row);
+      for (let j = 0; j < n; j++) {
+        newRow[j] = (newRow[j] ?? 0) - sigma * (u[i] ?? 0) * (v[j] ?? 0);
+      }
+      return newRow;
+    });
+  }
+
+  return { U, S: new Float64Array(S), Vt };
+}
+
+/**
+ * Canonical Correlation Analysis: weights are the leading singular vectors of X^T Y (centred, optionally scaled).
+ * Mirrors the sklearn.cross_decomposition.CCA interface; `tol` is stored but not read by fit/transform.
+ */
+export class CCA {
+  nComponents: number;
+  maxIter: number;
+  tol: number;
+  scale: boolean;
+
+  xWeights_: Float64Array[] | null = null;
+  yWeights_: Float64Array[] | null = null;
+  xLoadings_: Float64Array[] | null = null;
+  yLoadings_: Float64Array[] | null = null;
+  xMean_: Float64Array | null = null;
+  yMean_: Float64Array | null = null;
+  xStd_: Float64Array | null = null;
+  yStd_: Float64Array | null = null;
+
+  constructor(
+    options: {
+      nComponents?: number;
+      maxIter?: number;
+      tol?: number;
+      scale?: boolean;
+    } = {},
+  ) {
+    this.nComponents = options.nComponents ?? 2;
+    this.maxIter = options.maxIter ?? 500;
+    this.tol = options.tol ?? 1e-6;
+    this.scale = options.scale ?? true;
+  }
+
+  fit(X: Float64Array[], Y: Float64Array[]): this {
+    const n = X.length;
+    const p = (X[0] ?? new Float64Array(0)).length;
+    const q = (Y[0] ?? new Float64Array(0)).length;
+
+    this.xMean_ = colMeans(X);
+    this.yMean_ = colMeans(Y);
+
+    let Xc = centerMatrix(X, this.xMean_);
+    let Yc = centerMatrix(Y, this.yMean_);
+
+    this.xStd_ = null; this.yStd_ = null; // forget scaling from an earlier fit; recomputed below when scale is on
+    if (this.scale) {
+      const xStd = new Float64Array(p);
+      const yStd = new Float64Array(q);
+      for (const xi of Xc) for (let j = 0; j < p; j++) xStd[j] = (xStd[j] ?? 0) + (xi[j] ?? 0) ** 2;
+      for (const yi of Yc) for (let j = 0; j < q; j++) yStd[j] = (yStd[j] ?? 0) + (yi[j] ?? 0) ** 2;
+      for (let j = 0; j < p; j++) xStd[j] = Math.sqrt((xStd[j] ?? 0) / n);
+      for (let j = 0; j < q; j++) yStd[j] = Math.sqrt((yStd[j] ?? 0) / n);
+      this.xStd_ = xStd;
+      this.yStd_ = yStd;
+      Xc = Xc.map((xi) => new Float64Array(xi.map((v, j) => v / Math.max(xStd[j] ?? 1, 1e-10))));
+      Yc = Yc.map((yi) => new Float64Array(yi.map((v, j) => v / Math.max(yStd[j] ?? 1, 1e-10))));
+    }
+
+    // CCA via SVD of X^T Y
+    const Cxy = crossProd(Xc, Yc);
+    const k = Math.min(this.nComponents, p, q);
+    const { U, Vt } = powerSVD(Cxy, k, this.maxIter);
+
+    this.xWeights_ = U;
+    this.yWeights_ = Vt;
+
+    // Compute loadings: regression of each column on the component's score vector
+    this.xLoadings_ = Array.from({ length: k }, (_, c) => {
+      const w = U[c] ?? new Float64Array(p);
+      const t = new Float64Array(n);
+      for (let i = 0; i < n; i++) {
+        for (let j = 0; j < p; j++) t[i] = (t[i] ?? 0) + ((Xc[i] ?? new Float64Array(p))[j] ?? 0) * (w[j] ?? 0);
+      }
+      const load = new Float64Array(p);
+      let tNorm = 0; // ||t||^2 is the same for every feature, so it is computed once
+      for (let i = 0; i < n; i++) tNorm += (t[i] ?? 0) ** 2;
+      for (let j = 0; j < p; j++) {
+        let cov = 0;
+        for (let i = 0; i < n; i++) cov += ((Xc[i] ?? new Float64Array(p))[j] ?? 0) * (t[i] ?? 0);
+        load[j] = tNorm > 0 ? cov / tNorm : 0;
+      }
+      return load;
+    });
+
+    this.yLoadings_ = Array.from({ length: k }, (_, c) => {
+      const w = Vt[c] ?? new Float64Array(q);
+      const u = new Float64Array(n);
+      for (let i = 0; i < n; i++) {
+        for (let j = 0; j < q; j++) u[i] = (u[i] ?? 0) + ((Yc[i] ?? new Float64Array(q))[j] ?? 0) * (w[j] ?? 0);
+      }
+      const load = new Float64Array(q);
+      let uNorm = 0; // ||u||^2, hoisted out of the per-feature loop
+      for (let i = 0; i < n; i++) uNorm += (u[i] ?? 0) ** 2;
+      for (let j = 0; j < q; j++) {
+        let cov = 0;
+        for (let i = 0; i < n; i++) cov += ((Yc[i] ?? new Float64Array(q))[j] ?? 0) * (u[i] ?? 0);
+        load[j] = uNorm > 0 ? cov / uNorm : 0;
+      }
+      return load;
+    });
+
+    return this;
+  }
+
+  transform(X: Float64Array[], Y?: Float64Array[]): [Float64Array[], Float64Array[] | null] {
+    if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError("CCA");
+    const xMean = this.xMean_;
+    const xStd = this.xStd_;
+    const k = this.xWeights_.length; // one score per fitted component (fit caps the count at min(nComponents, p, q))
+
+    let Xc = X.map((xi) => new Float64Array(xi.map((v, j) => v - (xMean[j] ?? 0))));
+    if (xStd) Xc = Xc.map((xi) => new Float64Array(xi.map((v, j) => v / Math.max(xStd[j] ?? 1, 1e-10))));
+
+    const xScores = X.map((_, i) => {
+      const scores = new Float64Array(k);
+      for (let c = 0; c < k; c++) {
+        const w = this.xWeights_![c] ?? new Float64Array(0);
+        for (let j = 0; j < w.length; j++) scores[c] = (scores[c] ?? 0) + ((Xc[i] ?? new Float64Array(0))[j] ?? 0) * (w[j] ?? 0);
+      }
+      return scores;
+    });
+
+    if (Y === undefined) return [xScores, null];
+
+    const yMean = this.yMean_!;
+    const yStd = this.yStd_;
+    let Yc = Y.map((yi) => new Float64Array(yi.map((v, j) => v - (yMean[j] ?? 0))));
+    if (yStd) Yc = Yc.map((yi) => new Float64Array(yi.map((v, j) => v / Math.max(yStd[j] ?? 1, 1e-10))));
+
+    const yScores = Y.map((_, i) => {
+      const scores = new Float64Array(k);
+      for (let c = 0; c < k; c++) {
+        const w = this.yWeights_![c] ?? new Float64Array(0);
+        for (let j = 0; j < w.length; j++) scores[c] = (scores[c] ?? 0) + ((Yc[i] ?? new Float64Array(0))[j] ?? 0) * (w[j] ?? 0);
+      }
+      return scores;
+    });
+
+    return [xScores, yScores];
+  }
+
+  fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] {
+    this.fit(X, Y);
+    const [xS, yS] = this.transform(X, Y);
+    return [xS, yS!];
+  }
+}
diff --git a/src/cross_decomposition/cross_decomp_ext.ts b/src/cross_decomposition/cross_decomp_ext.ts
new file mode 100644
index 0000000..2041263
--- /dev/null
+++ b/src/cross_decomposition/cross_decomp_ext.ts
@@ -0,0 +1,159 @@
+/**
+ * Extended cross-decomposition: CCA extensions, PLSSVD utilities,
+ * and canonical correlation analysis helpers.
+ */
+
+/**
+ * Deflation step for PLS: returns new rows X[i][j] - xScores[i] * xLoadings[j] (rank-one outer product removed); X is not modified.
+ */
+export function deflate(
+  X: Float64Array[],
+  xScores: Float64Array,
+  xLoadings: Float64Array,
+): Float64Array[] {
+  return X.map((xi, i) => {
+    const t = xScores[i] ?? 0; // a missing score leaves the row unchanged
+    return xi.map((v, j) => v - t * (xLoadings[j] ?? 0));
+  });
+}
+
+/** Result of one NIPALS step (see nipalsStep): the first latent variable pair extracted from X and Y. */
+export interface NIPALSResult {
+ xWeights: Float64Array; // X^T u divided by (its norm + 1e-10); one entry per X column
+ yWeights: Float64Array; // Y^T t divided by (its norm + 1e-10); one entry per Y column
+ xScores: Float64Array; // t = X * xWeights; one entry per sample
+ yScores: Float64Array; // u = Y * yWeights from the last iteration; one entry per sample
+ xLoadings: Float64Array; // X^T t / (||t||^2 + 1e-10)
+ yLoadings: Float64Array; // Y^T u / (||u||^2 + 1e-10)
+}
+
+/**
+ * One NIPALS pass over X and Y: alternates weight and score updates until the Y scores settle or maxIter is hit.
+ */
+export function nipalsStep(
+  X: Float64Array[],
+  Y: Float64Array[],
+  maxIter = 500,
+  tol = 1e-6,
+): NIPALSResult {
+  const rows = X.length;
+  const xCols = X[0]?.length ?? 0;
+  const yCols = Y[0]?.length ?? 0;
+
+  /** M^T vec: entry j sums M[i][j] * vec[i] over the rows i. */
+  const transposeTimes = (M: Float64Array[], vec: Float64Array, cols: number): Float64Array => {
+    const out = new Float64Array(cols);
+    for (let j = 0; j < cols; j++) {
+      let acc = 0;
+      for (let i = 0; i < rows; i++) acc += (M[i]?.[j] ?? 0) * (vec[i] ?? 0);
+      out[j] = acc;
+    }
+    return out;
+  };
+
+  /** M vec: entry i sums M[i][j] * vec[j] over the first `cols` columns. */
+  const timesVec = (M: Float64Array[], vec: Float64Array, cols: number): Float64Array => {
+    const out = new Float64Array(rows);
+    for (let i = 0; i < rows; i++) {
+      let acc = 0;
+      for (let j = 0; j < cols; j++) acc += (M[i]?.[j] ?? 0) * (vec[j] ?? 0);
+      out[i] = acc;
+    }
+    return out;
+  };
+
+  /** Sum of squared entries, accumulated left to right. */
+  const sumSquares = (vec: Float64Array): number => {
+    let acc = 0;
+    for (const value of vec) acc += value * value;
+    return acc;
+  };
+
+  /** Divides every entry of vec in place by (its Euclidean norm + 1e-10). */
+  const scaleToUnit = (vec: Float64Array): Float64Array => {
+    const length = Math.sqrt(sumSquares(vec)) + 1e-10;
+    for (let j = 0; j < vec.length; j++) vec[j] = (vec[j] ?? 0) / length;
+    return vec;
+  };
+
+  // The Y scores start out as the first column of Y.
+  let yScores = new Float64Array(rows);
+  for (let i = 0; i < rows; i++) yScores[i] = Y[i]?.[0] ?? 0;
+  let xWeights = new Float64Array(xCols);
+  let yWeights = new Float64Array(yCols);
+
+  for (let iter = 0; iter < maxIter; iter++) {
+    // w = X^T u / ||X^T u||, then t = X w
+    xWeights = scaleToUnit(transposeTimes(X, yScores, xCols));
+    const t = timesVec(X, xWeights, xCols);
+    // q = Y^T t / ||Y^T t||, then u_new = Y q
+    yWeights = scaleToUnit(transposeTimes(Y, t, yCols));
+    const next = timesVec(Y, yWeights, yCols);
+
+    // Stop once u moves by less than tol (Euclidean distance).
+    let moved = 0;
+    for (let i = 0; i < rows; i++) moved += ((next[i] ?? 0) - (yScores[i] ?? 0)) ** 2;
+    yScores = next;
+    if (Math.sqrt(moved) < tol) break;
+  }
+
+  // Final X scores use the last weights; loadings are X^T t / ||t||^2 and Y^T u / ||u||^2,
+  // with 1e-10 added to each denominator.
+  const xScores = timesVec(X, xWeights, xCols);
+  const tNorm2 = sumSquares(xScores) + 1e-10;
+  const xLoadings = transposeTimes(X, xScores, xCols).map((v) => v / tNorm2);
+  const uNorm2 = sumSquares(yScores) + 1e-10;
+  const yLoadings = transposeTimes(Y, yScores, yCols).map((v) => v / uNorm2);
+
+  return { xWeights, yWeights, xScores, yScores, xLoadings, yLoadings };
+}
+
+/** Output of canonicalCorrelations: per-component weights, scores and score correlations. */
+export interface CCAResult {
+ xWeights: Float64Array[]; // one X weight vector per extracted component
+ yWeights: Float64Array[]; // one Y weight vector per extracted component
+ xScores: Float64Array[]; // one X score vector (entry per sample) per component
+ yScores: Float64Array[]; // one Y score vector (entry per sample) per component
+ correlations: Float64Array; // correlation between the X and Y scores of each component
+}
+
+/** Compute canonical correlations between X and Y (simplified): one NIPALS step plus deflation per component. */
+export function canonicalCorrelations(
+  X: Float64Array[],
+  Y: Float64Array[],
+  nComponents = 1,
+): CCAResult {
+  const count = Math.min(nComponents, X[0]?.length ?? 1, Y[0]?.length ?? 1);
+  const out: CCAResult = {
+    xWeights: [],
+    yWeights: [],
+    xScores: [],
+    yScores: [],
+    correlations: new Float64Array(count),
+  };
+  const mean = (vec: Float64Array): number => vec.reduce((s, v) => s + v, 0) / vec.length;
+  // Residual matrices: each extracted component is removed before the next step.
+  let xResidual = X;
+  let yResidual = Y;
+  for (let c = 0; c < count; c++) {
+    const { xWeights, yWeights, xScores: t, yScores: u, xLoadings, yLoadings } = nipalsStep(xResidual, yResidual);
+    out.xWeights.push(xWeights);
+    out.yWeights.push(yWeights);
+    out.xScores.push(t);
+    out.yScores.push(u);
+    // Correlation between the score vectors t and u (1e-10 keeps the denominator non-zero).
+    const tMean = mean(t);
+    const uMean = mean(u);
+    let cov = 0, st = 0, su = 0;
+    for (let i = 0; i < t.length; i++) {
+      const dt = (t[i] ?? 0) - tMean;
+      const du = (u[i] ?? 0) - uMean;
+      cov += dt * du; st += dt ** 2; su += du ** 2;
+    }
+    out.correlations[c] = cov / (Math.sqrt(st * su) + 1e-10);
+
+    xResidual = deflate(xResidual, t, xLoadings);
+    yResidual = deflate(yResidual, u, yLoadings);
+  }
+  return out;
+}
diff --git a/src/cross_decomposition/cross_decomp_ext2.ts b/src/cross_decomposition/cross_decomp_ext2.ts
new file mode 100644
index 0000000..5e17c08
--- /dev/null
+++ b/src/cross_decomposition/cross_decomp_ext2.ts
@@ -0,0 +1,149 @@
+/**
+ * Cross-decomposition extensions: PLSSVD, CCA extensions.
+ * Port of sklearn.cross_decomposition extensions.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+/** PLS Canonical (PLSC): symmetric variant of PLS; rotations are leading singular vectors of the deflated X^T Y. */
+export class PLSCanonical {
+  private xRotations_: Float64Array[] | null = null;
+  private yRotations_: Float64Array[] | null = null;
+  private xMean_: Float64Array | null = null;
+  private yMean_: Float64Array | null = null;
+  readonly nComponents: number;
+  readonly maxIter: number;
+  readonly tol: number;
+
+  constructor(
+    options: {
+      nComponents?: number;
+      maxIter?: number;
+      tol?: number;
+    } = {},
+  ) {
+    this.nComponents = options.nComponents ?? 2;
+    this.maxIter = options.maxIter ?? 500;
+    this.tol = options.tol ?? 1e-6;
+  }
+
+  fit(X: Float64Array[], Y: Float64Array[]): this {
+    const n = X.length;
+    const p = X[0]?.length ?? 0;
+    const q = Y[0]?.length ?? 0;
+    const k = Math.min(this.nComponents, Math.min(p, q));
+
+    const xMean = new Float64Array(p);
+    const yMean = new Float64Array(q);
+    for (let i = 0; i < n; i++) {
+      for (let j = 0; j < p; j++) xMean[j]! += X[i]?.[j] ?? 0;
+      for (let j = 0; j < q; j++) yMean[j]! += Y[i]?.[j] ?? 0;
+    }
+    for (let j = 0; j < p; j++) xMean[j]! /= n;
+    for (let j = 0; j < q; j++) yMean[j]! /= n;
+    this.xMean_ = xMean;
+    this.yMean_ = yMean;
+
+    const XResid = X.map((row) => new Float64Array(p).map((_, j) => (row[j] ?? 0) - (xMean[j] ?? 0))); // centred copy, deflated in place below
+    const YResid = Y.map((row) => new Float64Array(q).map((_, j) => (row[j] ?? 0) - (yMean[j] ?? 0)));
+
+    const xRotations: Float64Array[] = [];
+    const yRotations: Float64Array[] = [];
+
+    for (let comp = 0; comp < k; comp++) {
+      // Compute X^T * Y covariance
+      const Cxy = Array.from({ length: p }, (_, a) =>
+        new Float64Array(q).map((_, b) => {
+          let s = 0;
+          for (let i = 0; i < n; i++) s += (XResid[i]?.[a] ?? 0) * (YResid[i]?.[b] ?? 0);
+          return s;
+        }),
+      );
+      // Power iteration for first SVD component
+      // Seed v on Cxy's largest-norm column: an all-zero v makes Cxy * v vanish, so u and v could never leave zero.
+      const colNorms = Array.from({ length: q }, (_, b) => Cxy.reduce((s, row) => s + (row[b] ?? 0) ** 2, 0));
+      const seed = colNorms.reduce((best, s, b) => (s > (colNorms[best] ?? 0) ? b : best), 0);
+      let u = new Float64Array(p);
+      let v = new Float64Array(q);
+      v[seed] = 1;
+      for (let iter = 0; iter < this.maxIter; iter++) {
+        // u = Cxy * v
+        const newU = new Float64Array(p);
+        for (let a = 0; a < p; a++) {
+          for (let b = 0; b < q; b++) newU[a]! += (Cxy[a]?.[b] ?? 0) * (v[b] ?? 0);
+        }
+        let norm = 0;
+        for (let a = 0; a < p; a++) norm += (newU[a] ?? 0) ** 2;
+        norm = Math.sqrt(norm) || 1;
+        for (let a = 0; a < p; a++) newU[a]! /= norm;
+        // v = Cxy^T * u
+        const newV = new Float64Array(q);
+        for (let b = 0; b < q; b++) {
+          for (let a = 0; a < p; a++) newV[b]! += (Cxy[a]?.[b] ?? 0) * (newU[a] ?? 0);
+        }
+        let normV = 0;
+        for (let b = 0; b < q; b++) normV += (newV[b] ?? 0) ** 2;
+        normV = Math.sqrt(normV) || 1;
+        for (let b = 0; b < q; b++) newV[b]! /= normV;
+        let diff = 0; // squared change of u; compared to tol without a square root
+        for (let a = 0; a < p; a++) diff += ((newU[a] ?? 0) - (u[a] ?? 0)) ** 2;
+        u = newU;
+        v = newV;
+        if (diff < this.tol) break;
+      }
+      xRotations.push(u);
+      yRotations.push(v);
+      // Deflate: remove each residual's projection onto its rotation vector
+      const xt = new Float64Array(n).map((_, i) => {
+        let s = 0;
+        for (let a = 0; a < p; a++) s += (XResid[i]?.[a] ?? 0) * (u[a] ?? 0);
+        return s;
+      });
+      for (let i = 0; i < n; i++) {
+        for (let a = 0; a < p; a++) XResid[i]![a]! -= (xt[i] ?? 0) * (u[a] ?? 0);
+      }
+      const yt = new Float64Array(n).map((_, i) => {
+        let s = 0;
+        for (let b = 0; b < q; b++) s += (YResid[i]?.[b] ?? 0) * (v[b] ?? 0);
+        return s;
+      });
+      for (let i = 0; i < n; i++) {
+        for (let b = 0; b < q; b++) YResid[i]![b]! -= (yt[i] ?? 0) * (v[b] ?? 0);
+      }
+    }
+    this.xRotations_ = xRotations;
+    this.yRotations_ = yRotations;
+    return this;
+  }
+
+  transform(X: Float64Array[], Y?: Float64Array[]): { xScores: Float64Array[]; yScores?: Float64Array[] } {
+    if (this.xRotations_ === null || this.xMean_ === null) throw new NotFittedError("PLSCanonical is not fitted.");
+    const k = this.xRotations_.length; // number of components actually fitted
+    const xScores = X.map((row) => {
+      const scores = new Float64Array(k);
+      for (let c = 0; c < k; c++) {
+        for (let j = 0; j < row.length; j++) {
+          scores[c] += ((row[j] ?? 0) - (this.xMean_![j] ?? 0)) * (this.xRotations_![c]?.[j] ?? 0);
+        }
+      }
+      return scores;
+    });
+    if (!Y || !this.yRotations_ || !this.yMean_) return { xScores };
+    const yScores = Y.map((row) => {
+      const scores = new Float64Array(k);
+      for (let c = 0; c < k; c++) {
+        for (let j = 0; j < row.length; j++) {
+          scores[c] += ((row[j] ?? 0) - (this.yMean_![j] ?? 0)) * (this.yRotations_![c]?.[j] ?? 0);
+        }
+      }
+      return scores;
+    });
+    return { xScores, yScores };
+  }
+
+  fitTransform(X: Float64Array[], Y: Float64Array[]): { xScores: Float64Array[]; yScores: Float64Array[] } {
+    this.fit(X, Y);
+    const result = this.transform(X, Y);
+    return { xScores: result.xScores, yScores: result.yScores! };
+  }
+}
diff --git a/src/cross_decomposition/cross_decomp_ext3.ts b/src/cross_decomposition/cross_decomp_ext3.ts
new file mode 100644
index 0000000..54d9a04
--- /dev/null
+++ b/src/cross_decomposition/cross_decomp_ext3.ts
@@ -0,0 +1,188 @@
+/**
+ * Cross-decomposition extensions: NIPALS, PLS2.
+ * Mirrors sklearn.cross_decomposition advanced methods.
+ */
+
+import { BaseEstimator } from "../base.js";
+
+export interface NIPALSParams {
+ n_components?: number; // number of components to extract; the NIPALS constructor defaults it to 2
+ max_iter?: number; // cap on inner iterations per component; the constructor defaults it to 500
+ tol?: number; // convergence threshold on the change of the Y scores; the constructor defaults it to 1e-6
+}
+
+/** NIPALS: Nonlinear Iterative Partial Least Squares algorithm; extracts n_components weight/score/loading sets. */
+export class NIPALS extends BaseEstimator {
+  n_components: number;
+  max_iter: number;
+  tol: number;
+  x_weights_: Float64Array[] = [];
+  y_weights_: Float64Array[] = [];
+  x_loadings_: Float64Array[] = [];
+  y_loadings_: Float64Array[] = [];
+  x_scores_: Float64Array[] = [];
+  y_scores_: Float64Array[] = [];
+  x_mean_: Float64Array = new Float64Array(0);
+  y_mean_: Float64Array = new Float64Array(0);
+  n_features_in_ = 0;
+
+  constructor(params: NIPALSParams = {}) {
+    super();
+    this.n_components = params.n_components ?? 2;
+    this.max_iter = params.max_iter ?? 500;
+    this.tol = params.tol ?? 1e-6;
+  }
+
+  fit(X: Float64Array[], Y: Float64Array[]): this {
+    const n = X.length, px = X[0]?.length ?? 0, py = Y[0]?.length ?? 0;
+    this.x_weights_ = []; this.y_weights_ = []; this.x_loadings_ = []; this.y_loadings_ = []; this.x_scores_ = []; this.y_scores_ = []; // a refit must not keep components of an earlier fit
+    this.n_features_in_ = px;
+    this.x_mean_ = new Float64Array(px);
+    this.y_mean_ = new Float64Array(py);
+    for (let k = 0; k < px; k++) for (const xi of X) this.x_mean_[k] = (this.x_mean_[k] ?? 0) + (xi[k] ?? 0);
+    for (let k = 0; k < py; k++) for (const yi of Y) this.y_mean_[k] = (this.y_mean_[k] ?? 0) + (yi[k] ?? 0);
+    for (let k = 0; k < px; k++) this.x_mean_[k] = (this.x_mean_[k] ?? 0) / n;
+    for (let k = 0; k < py; k++) this.y_mean_[k] = (this.y_mean_[k] ?? 0) / n;
+    let Xr = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (this.x_mean_[k] ?? 0)));
+    let Yr = Y.map((yi) => new Float64Array(py).map((_, k) => (yi[k] ?? 0) - (this.y_mean_[k] ?? 0)));
+    for (let c = 0; c < this.n_components; c++) {
+      // NIPALS iteration, started from the first column of the Y residual
+      let u = Yr.map((yi) => yi[0] ?? 0);
+      let w = new Float64Array(px), q = new Float64Array(py), t = new Float64Array(n);
+      for (let iter = 0; iter < this.max_iter; iter++) {
+        // w = X'u / ||X'u||
+        for (let j = 0; j < px; j++) { let s = 0; for (let i = 0; i < n; i++) s += (Xr[i]?.[j] ?? 0) * (u[i] ?? 0); w[j] = s; }
+        let wn = 0; for (const v of w) wn += v * v; wn = Math.sqrt(wn); if (wn > 1e-10) for (let j = 0; j < px; j++) w[j] = (w[j] ?? 0) / wn;
+        // t = Xw
+        for (let i = 0; i < n; i++) { let s = 0; for (let j = 0; j < px; j++) s += (Xr[i]?.[j] ?? 0) * (w[j] ?? 0); t[i] = s; }
+        // q = Y't / ||Y't||
+        for (let j = 0; j < py; j++) { let s = 0; for (let i = 0; i < n; i++) s += (Yr[i]?.[j] ?? 0) * (t[i] ?? 0); q[j] = s; }
+        let qn = 0; for (const v of q) qn += v * v; qn = Math.sqrt(qn); if (qn > 1e-10) for (let j = 0; j < py; j++) q[j] = (q[j] ?? 0) / qn;
+        // u = Yq
+        const uNew = new Float64Array(n);
+        for (let i = 0; i < n; i++) { let s = 0; for (let j = 0; j < py; j++) s += (Yr[i]?.[j] ?? 0) * (q[j] ?? 0); uNew[i] = s; }
+        let diff = 0; for (let i = 0; i < n; i++) diff += ((uNew[i] ?? 0) - (u[i] ?? 0)) ** 2; // parenthesised: `??` binds looser than `-`
+        u = Array.from(uNew);
+        if (Math.sqrt(diff) < this.tol) break;
+      }
+      // Deflate: X by t * p' (p = X't / t't), Y by u * q'
+      const pLoading = new Float64Array(px);
+      const tn2 = t.reduce((s, v) => s + v * v, 0);
+      if (tn2 > 1e-10) {
+        for (let j = 0; j < px; j++) { let s = 0; for (let i = 0; i < n; i++) s += (Xr[i]?.[j] ?? 0) * (t[i] ?? 0); pLoading[j] = s / tn2; }
+      }
+      Xr = Xr.map((xi, i) => new Float64Array(px).map((_, j) => (xi[j] ?? 0) - (t[i] ?? 0) * (pLoading[j] ?? 0)));
+      Yr = Yr.map((yi, i) => new Float64Array(py).map((_, j) => (yi[j] ?? 0) - (q[j] ?? 0) * u[i]!));
+      this.x_weights_.push(w);
+      this.y_weights_.push(q);
+      this.x_loadings_.push(pLoading);
+      this.y_loadings_.push(new Float64Array(q)); // copy, so weights and loadings do not share one buffer
+      this.x_scores_.push(t);
+      this.y_scores_.push(new Float64Array(u));
+    }
+    return this;
+  }
+
+  transform(X: Float64Array[]): Float64Array[] {
+    const nc = this.n_components;
+    const px = this.n_features_in_;
+    const Xc = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (this.x_mean_[k] ?? 0)));
+    return Xc.map((xi) => new Float64Array(nc).map((_, c) => {
+      let s = 0;
+      const w = this.x_weights_[c]; // components without fitted weights keep a score of 0
+      if (w) for (let k = 0; k < px; k++) s += (w[k] ?? 0) * (xi[k] ?? 0);
+      return s;
+    }));
+  }
+
+  fit_transform(X: Float64Array[], Y: Float64Array[]): Float64Array[] {
+    return this.fit(X, Y).transform(X);
+  }
+}
+
+export interface CanonicalCorrelationExtParams {
+ n_components?: number; // number of canonical directions requested; the CCAExt constructor defaults it to 2
+ regularization?: number; // added to the diagonals of Sxx and Syy in CCAExt.fit; defaults to 1e-4
+}
+
+/** Canonical Correlation Analysis (CCA) extension: regularised power iteration using diagonal approximations of Sxx^-1 and Syy^-1. */
+export class CCAExt extends BaseEstimator {
+  n_components: number;
+  regularization: number;
+  x_weights_: Float64Array[] = [];
+  y_weights_: Float64Array[] = [];
+  x_mean_: Float64Array = new Float64Array(0);
+  y_mean_: Float64Array = new Float64Array(0);
+  n_features_in_ = 0;
+
+  constructor(params: CanonicalCorrelationExtParams = {}) {
+    super();
+    this.n_components = params.n_components ?? 2;
+    this.regularization = params.regularization ?? 1e-4;
+  }
+
+  fit(X: Float64Array[], Y: Float64Array[]): this {
+    const n = X.length, px = X[0]?.length ?? 0, py = Y[0]?.length ?? 0;
+    this.x_weights_ = []; this.y_weights_ = []; // a refit must not keep weights of an earlier fit
+    this.n_features_in_ = px;
+    this.x_mean_ = new Float64Array(px);
+    this.y_mean_ = new Float64Array(py);
+    for (let k = 0; k < px; k++) for (const xi of X) this.x_mean_[k] = (this.x_mean_[k] ?? 0) + (xi[k] ?? 0);
+    for (let k = 0; k < py; k++) for (const yi of Y) this.y_mean_[k] = (this.y_mean_[k] ?? 0) + (yi[k] ?? 0);
+    for (let k = 0; k < px; k++) this.x_mean_[k] = (this.x_mean_[k] ?? 0) / n;
+    for (let k = 0; k < py; k++) this.y_mean_[k] = (this.y_mean_[k] ?? 0) / n;
+    const Xc = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (this.x_mean_[k] ?? 0)));
+    const Yc = Y.map((yi) => new Float64Array(py).map((_, k) => (yi[k] ?? 0) - (this.y_mean_[k] ?? 0)));
+    // Covariance matrices (A^T B / n)
+    const cov = (A: Float64Array[], B: Float64Array[], pa: number, pb: number): Float64Array[] => {
+      const C = Array.from({ length: pa }, () => new Float64Array(pb));
+      for (let i = 0; i < n; i++) for (let a = 0; a < pa; a++) for (let b = 0; b < pb; b++) C[a]![b] = (C[a]![b] ?? 0) + (A[i]?.[a] ?? 0) * (B[i]?.[b] ?? 0);
+      for (let a = 0; a < pa; a++) for (let b = 0; b < pb; b++) C[a]![b] = (C[a]![b] ?? 0) / n;
+      return C;
+    };
+    const Sxx = cov(Xc, Xc, px, px);
+    const Syy = cov(Yc, Yc, py, py);
+    const Sxy = cov(Xc, Yc, px, py);
+    // Regularize diagonals
+    for (let i = 0; i < px; i++) Sxx[i]![i] = (Sxx[i]![i] ?? 0) + this.regularization;
+    for (let i = 0; i < py; i++) Syy[i]![i] = (Syy[i]![i] ?? 0) + this.regularization;
+    // Power iteration for canonical directions. NOTE(review): nothing deflates between components, they differ only by start vector — confirm intended.
+    const nc = Math.min(this.n_components, px, py);
+    for (let c = 0; c < nc; c++) {
+      let wx = new Float64Array(px).map((_, i) => i === c ? 1 : 0.01);
+      for (let iter = 0; iter < 50; iter++) {
+        // wx = Sxx^-1 * Sxy * Syy^-1 * Sxy' * wx, with both inverses approximated by their diagonals
+        const Sxy_wx = new Float64Array(py).map((_, j) => { let s = 0; for (let k = 0; k < px; k++) s += (Sxy[k]?.[j] ?? 0) * (wx[k] ?? 0); return s; });
+        const Syy_inv_v = new Float64Array(py).map((_, j) => (Sxy_wx[j] ?? 0) / (Syy[j]?.[j] ?? 1));
+        const Sxyt_v = new Float64Array(px).map((_, i) => { let s = 0; for (let j = 0; j < py; j++) s += (Sxy[i]?.[j] ?? 0) * (Syy_inv_v[j] ?? 0); return s; });
+        const newWx = new Float64Array(px).map((_, i) => (Sxyt_v[i] ?? 0) / (Sxx[i]?.[i] ?? 1));
+        let norm = 0; for (const v of newWx) norm += v * v; norm = Math.sqrt(norm);
+        if (norm > 1e-10) for (let i = 0; i < px; i++) newWx[i] = (newWx[i] ?? 0) / norm;
+        let diff = 0; for (let i = 0; i < px; i++) diff += ((newWx[i] ?? 0) - (wx[i] ?? 0)) ** 2;
+        wx = newWx;
+        if (Math.sqrt(diff) < 1e-8) break;
+      }
+      const wy = new Float64Array(py).map((_, j) => { let s = 0; for (let i = 0; i < px; i++) s += (Sxy[i]?.[j] ?? 0) * (wx[i] ?? 0); return s; });
+      let wyn = 0; for (const v of wy) wyn += v * v; wyn = Math.sqrt(wyn);
+      if (wyn > 1e-10) for (let j = 0; j < py; j++) wy[j] = (wy[j] ?? 0) / wyn;
+      this.x_weights_.push(wx);
+      this.y_weights_.push(wy);
+    }
+    return this;
+  }
+
+  transform(X: Float64Array[]): Float64Array[] {
+    const px = this.n_features_in_;
+    const Xc = X.map((xi) => new Float64Array(px).map((_, k) => (xi[k] ?? 0) - (this.x_mean_[k] ?? 0)));
+    return Xc.map((xi) => new Float64Array(this.n_components).map((_, c) => {
+      let s = 0;
+      const w = this.x_weights_[c]; // components without fitted weights keep a score of 0
+      if (w) for (let k = 0; k < px; k++) s += (w[k] ?? 0) * (xi[k] ?? 0);
+      return s;
+    }));
+  }
+
+  fit_transform(X: Float64Array[], Y: Float64Array[]): Float64Array[] {
+    return this.fit(X, Y).transform(X);
+  }
+}
diff --git a/src/cross_decomposition/index.ts b/src/cross_decomposition/index.ts
new file mode 100644
index 0000000..a7232c5
--- /dev/null
+++ b/src/cross_decomposition/index.ts
@@ -0,0 +1,3 @@
+export * from "./pls.js";
+export * from "./cca.js";
+export * from "./pls_svd.js";
diff --git a/src/cross_decomposition/pls.ts b/src/cross_decomposition/pls.ts
new file mode 100644
index 0000000..395c1a4
--- /dev/null
+++ b/src/cross_decomposition/pls.ts
@@ -0,0 +1,404 @@
+/**
+ * Cross decomposition: PLSRegression, PLSSVD, PLSCanonical, CCA.
+ * Mirrors sklearn.cross_decomposition.
+ */
+
+import { NotFittedError } from "../exceptions.js";
+
+ /**
+  * Per-column arithmetic mean of a row-major matrix.
+  * The column count is taken from the first row; entries missing from
+  * shorter rows count as 0. An empty matrix yields an empty result.
+  */
+ function colMeans(X: Float64Array[]): Float64Array {
+   const width = (X[0] ?? new Float64Array(0)).length;
+   const totals = new Float64Array(width);
+   for (const row of X) {
+     for (let c = 0; c < width; c++) totals[c] = (totals[c] ?? 0) + (row[c] ?? 0);
+   }
+   return totals.map((total) => total / X.length);
+ }
+
+ /**
+  * Subtract `means` from every row of `X`.
+  * Output rows always have `means.length` entries; entries missing from a
+  * row are treated as 0 before the subtraction.
+  */
+ function center(X: Float64Array[], means: Float64Array): Float64Array[] {
+   return X.map((row) => Float64Array.from(means, (mu, j) => (row[j] ?? 0) - mu));
+ }
+
+ /**
+  * Cross-product X^T Y of two row-major matrices with the same number of rows.
+  * Returns p rows of length q, where p and q are the widths of the first rows
+  * of `X` and `Y`. Rows missing from `Y` contribute nothing.
+  */
+ function Xtranspose_Y(X: Float64Array[], Y: Float64Array[]): Float64Array[] {
+   const nRowsOut = (X[0] ?? new Float64Array(0)).length;
+   const nColsOut = (Y[0] ?? new Float64Array(0)).length;
+   const product = Array.from({ length: nRowsOut }, () => new Float64Array(nColsOut));
+   // Accumulate one outer product x_s y_s^T per sample.
+   for (let s = 0; s < X.length; s++) {
+     const xRow = X[s] ?? new Float64Array(nRowsOut);
+     const yRow = Y[s] ?? new Float64Array(nColsOut);
+     product.forEach((acc, j) => {
+       const xj = xRow[j] ?? 0;
+       for (let k = 0; k < nColsOut; k++) acc[k] = (acc[k] ?? 0) + xj * (yRow[k] ?? 0);
+     });
+   }
+   return product;
+ }
+
+ /**
+  * Matrix-vector product M v for a row-major matrix.
+  * Only the first `v.length` entries of each row are used; entries missing
+  * from a row count as 0.
+  */
+ function matVec(M: Float64Array[], v: Float64Array): Float64Array {
+   const result = new Float64Array(M.length);
+   for (let r = 0; r < M.length; r++) {
+     const row = M[r] ?? new Float64Array(0);
+     let acc = 0;
+     v.forEach((vj, j) => {
+       acc += (row[j] ?? 0) * vj;
+     });
+     result[r] = acc;
+   }
+   return result;
+ }
+
+ /** Euclidean (L2) length of `v`: square root of the sum of squared entries. */
+ function norm(v: Float64Array): number {
+   const sumOfSquares = v.reduce((acc, x) => acc + x ** 2, 0);
+   return Math.sqrt(sumOfSquares);
+ }
+
+ /**
+  * Scale `v` in place to unit Euclidean length.
+  * Vectors whose length is not above 1e-15 are left untouched.
+  */
+ function normalize(v: Float64Array): void {
+   let sumOfSquares = 0;
+   for (const x of v) sumOfSquares += x ** 2;
+   const length = Math.sqrt(sumOfSquares);
+   // Written as a negated `>` so a NaN length also leaves `v` unchanged.
+   if (!(length > 1e-15)) return;
+   v.forEach((x, j) => {
+     v[j] = x / length;
+   });
+ }
+
+ /**
+  * Inner product of `a` and `b`, summed over the length of `a`.
+  * Entries missing from a shorter `b` count as 0.
+  */
+ function dot(a: Float64Array, b: Float64Array): number {
+   return a.reduce((acc, x, j) => acc + x * (b[j] ?? 0), 0);
+ }
+
+ /**
+  * Leading left/right singular vectors of `XtY` by alternating power iteration.
+  *
+  * Each sweep computes u <- M v / ||M v|| and v <- M^T u / ||M^T u||. Iteration
+  * stops when the summed change ||u_new - u|| + ||v_new - v|| drops below `tol`
+  * or after `maxIter` sweeps.
+  *
+  * The start vector is the unit vector of the first column of `XtY` that has a
+  * non-zero entry. Always starting from column 0 would give M v = 0 whenever
+  * that column is zero (for example a constant first target), collapsing both
+  * vectors to zero even though other columns carry signal. When column 0 is
+  * non-zero the start is unchanged.
+  *
+  * @param XtY p x q matrix given as p rows of length q.
+  * @param tol Convergence threshold on the summed change of u and v.
+  * @param maxIter Maximum number of sweeps.
+  * @returns `u` (length p) and `v` (length q). Both are all-zero when `XtY`
+  *   has no non-zero column (including the empty matrix).
+  */
+ function nipals(
+   XtY: Float64Array[],
+   tol = 1e-10,
+   maxIter = 500,
+ ): { u: Float64Array; v: Float64Array } {
+   const p = XtY.length;
+   const q = (XtY[0] ?? new Float64Array(0)).length;
+   let u: Float64Array = new Float64Array(p);
+   let v: Float64Array = new Float64Array(q);
+
+   // Locate the first column with any entry above the normalisation floor.
+   let startCol = -1;
+   for (let k = 0; k < q && startCol < 0; k++) {
+     for (let j = 0; j < p; j++) {
+       if (Math.abs(XtY[j]?.[k] ?? 0) > 1e-15) {
+         startCol = k;
+         break;
+       }
+     }
+   }
+   if (startCol < 0) return { u, v };
+   v[startCol] = 1;
+
+   for (let iter = 0; iter < maxIter; iter++) {
+     // u = XtY v / ||XtY v||
+     const uNew = matVec(XtY, v);
+     normalize(uNew);
+     // v = XtY^T u / ||XtY^T u||
+     const vNew = new Float64Array(q);
+     for (let k = 0; k < q; k++) {
+       for (let j = 0; j < p; j++) {
+         vNew[k] = (vNew[k] ?? 0) + (XtY[j]?.[k] ?? 0) * (uNew[j] ?? 0);
+       }
+     }
+     normalize(vNew);
+     const diff =
+       norm(Float64Array.from({ length: p }, (_, i) => (uNew[i] ?? 0) - (u[i] ?? 0))) +
+       norm(Float64Array.from({ length: q }, (_, i) => (vNew[i] ?? 0) - (v[i] ?? 0)));
+     u = uNew;
+     v = vNew;
+     if (diff < tol) break;
+   }
+   return { u, v };
+ }
+
+/**
+ * PLS regression via NIPALS algorithm.
+ * Mirrors sklearn.cross_decomposition.PLSRegression.
+ */
+export class PLSRegression {
+ nComponents: number;
+ maxIter: number;
+ tol: number;
+ scale: boolean;
+
+ xWeights_: Float64Array[] | null = null;
+ yWeights_: Float64Array[] | null = null;
+ xLoadings_: Float64Array[] | null = null;
+ yLoadings_: Float64Array[] | null = null;
+ xScores_: Float64Array[] | null = null;
+ yScores_: Float64Array[] | null = null;
+ coef_: Float64Array[] | null = null;
+
+ xMean_: Float64Array | null = null;
+ yMean_: Float64Array | null = null;
+
+ constructor(
+ options: {
+ nComponents?: number;
+ maxIter?: number;
+ tol?: number;
+ scale?: boolean;
+ } = {},
+ ) {
+ this.nComponents = options.nComponents ?? 2;
+ this.maxIter = options.maxIter ?? 500;
+ this.tol = options.tol ?? 1e-06;
+ this.scale = options.scale ?? true;
+ }
+
+ fit(X: Float64Array[], Y: Float64Array[]): this {
+ const n = X.length;
+ const p = (X[0] ?? new Float64Array(0)).length;
+ const q = (Y[0] ?? new Float64Array(0)).length;
+ const k = Math.min(this.nComponents, p, q);
+
+ this.xMean_ = colMeans(X);
+ this.yMean_ = colMeans(Y);
+ let Xc = center(X, this.xMean_);
+ let Yc = center(Y, this.yMean_);
+
+ this.xWeights_ = [];
+ this.yWeights_ = [];
+ this.xLoadings_ = [];
+ this.yLoadings_ = [];
+ this.xScores_ = Array.from({ length: n }, () => new Float64Array(k));
+ this.yScores_ = Array.from({ length: n }, () => new Float64Array(k));
+
+ for (let comp = 0; comp < k; comp++) {
+ const XtY = Xtranspose_Y(Xc, Yc);
+ const { u, v } = nipals(XtY, this.tol, this.maxIter);
+
+ // Scores: t = Xc u, s = Yc v
+ const t = new Float64Array(n);
+ const s = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const xi = Xc[i] ?? new Float64Array(p);
+ const yi = Yc[i] ?? new Float64Array(q);
+ t[i] = dot(xi, u);
+ s[i] = dot(yi, v);
+ }
+
+ // Normalize t
+ const tNorm = norm(t);
+ if (tNorm > 1e-15) for (let i = 0; i < n; i++) t[i] = (t[i] ?? 0) / tNorm;
+
+ // X loadings: p_h = Xc^T t
+ const px = new Float64Array(p);
+ for (let i = 0; i < n; i++) {
+ const xi = Xc[i] ?? new Float64Array(p);
+ for (let j = 0; j < p; j++) px[j] = (px[j] ?? 0) + (xi[j] ?? 0) * (t[i] ?? 0);
+ }
+
+ // Y loadings: q_h = Yc^T s / ||s||^2
+ const sNorm2 = dot(s, s);
+ const qy = new Float64Array(q);
+ for (let i = 0; i < n; i++) {
+ const yi = Yc[i] ?? new Float64Array(q);
+ for (let j = 0; j < q; j++) {
+ qy[j] = (qy[j] ?? 0) + (yi[j] ?? 0) * (s[i] ?? 0);
+ }
+ }
+ if (sNorm2 > 1e-15) for (let j = 0; j < q; j++) qy[j] = (qy[j] ?? 0) / sNorm2;
+
+ this.xWeights_[comp] = u;
+ this.yWeights_[comp] = v;
+ this.xLoadings_[comp] = px;
+ this.yLoadings_[comp] = qy;
+ for (let i = 0; i < n; i++) {
+ this.xScores_![i]![comp] = t[i] ?? 0;
+ this.yScores_![i]![comp] = s[i] ?? 0;
+ }
+
+ // Deflate
+ const tFull = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const xi = Xc[i] ?? new Float64Array(p);
+ tFull[i] = dot(xi, u);
+ }
+ Xc = Xc.map((xi, i) => {
+ const out = new Float64Array(p);
+ for (let j = 0; j < p; j++) out[j] = (xi[j] ?? 0) - (tFull[i] ?? 0) * (px[j] ?? 0);
+ return out;
+ });
+ Yc = Yc.map((yi, i) => {
+ const out = new Float64Array(q);
+ for (let j = 0; j < q; j++) out[j] = (yi[j] ?? 0) - (tFull[i] ?? 0) * (qy[j] ?? 0);
+ return out;
+ });
+ }
+
+ // Compute regression coefficients: coef_ = W (P^T W)^{-1} Q^T
+ // Simplified: use pseudo-inverse via stored weights and loadings
+ this._computeCoef(p, q, k);
+ return this;
+ }
+
+ private _computeCoef(p: number, q: number, k: number): void {
+ // coef_ = xWeights_ @ inv(xLoadings_^T @ xWeights_) @ yLoadings_^T
+ // For simplicity, use a direct approach: coef = W (P^T W)^-1 Q^T
+ const W = this.xWeights_!;
+ const P = this.xLoadings_!;
+ const Q = this.yLoadings_!;
+
+ // PtW = P^T W (k x k)
+ const PtW = Array.from({ length: k }, () => new Float64Array(k));
+ for (let i = 0; i < k; i++) {
+ for (let j = 0; j < k; j++) {
+ PtW[i]![j] = dot(P[i] ?? new Float64Array(0), W[j] ?? new Float64Array(0));
+ }
+ }
+
+ // Invert PtW (simple LU for small k)
+ const inv = this._invertSmall(PtW, k);
+
+ // coef_ (p x q) = W @ inv @ Q^T
+ this.coef_ = Array.from({ length: p }, () => new Float64Array(q));
+ for (let i = 0; i < p; i++) {
+ for (let j = 0; j < q; j++) {
+ let s = 0;
+ for (let a = 0; a < k; a++) {
+ let s2 = 0;
+ for (let b = 0; b < k; b++) {
+ s2 += (inv[a]![b] ?? 0) * (Q[b]![j] ?? 0);
+ }
+ s += (W[a]![i] ?? 0) * s2;
+ }
+ this.coef_![i]![j] = s;
+ }
+ }
+ }
+
+ private _invertSmall(M: Float64Array[], k: number): Float64Array[] {
+ // Augmented matrix [M | I]
+ const aug = Array.from({ length: k }, (_, i) => {
+ const row = new Float64Array(2 * k);
+ for (let j = 0; j < k; j++) row[j] = M[i]![j] ?? 0;
+ row[k + i] = 1;
+ return row;
+ });
+ for (let col = 0; col < k; col++) {
+ // Find pivot
+ let maxRow = col;
+ for (let row = col + 1; row < k; row++) {
+ if (Math.abs(aug[row]![col] ?? 0) > Math.abs(aug[maxRow]![col] ?? 0)) maxRow = row;
+ }
+ const tmpPls = aug[col]!; aug[col] = aug[maxRow]!; aug[maxRow] = tmpPls;
+ const pivot = aug[col]![col] ?? 1e-12;
+ if (Math.abs(pivot) < 1e-15) continue;
+ for (let j = 0; j < 2 * k; j++) aug[col]![j] = (aug[col]![j] ?? 0) / pivot;
+ for (let row = 0; row < k; row++) {
+ if (row === col) continue;
+ const factor = aug[row]![col] ?? 0;
+ for (let j = 0; j < 2 * k; j++) {
+ aug[row]![j] = (aug[row]![j] ?? 0) - factor * (aug[col]![j] ?? 0);
+ }
+ }
+ }
+ return aug.map((row) => Float64Array.from({ length: k }, (_, j) => row[k + j] ?? 0));
+ }
+
+ predict(X: Float64Array[]): Float64Array[] {
+ if (this.coef_ === null || this.xMean_ === null || this.yMean_ === null) {
+ throw new NotFittedError();
+ }
+ const p = this.xMean_.length;
+ const q = this.yMean_.length;
+ return X.map((xi) => {
+ const xc = new Float64Array(p);
+ for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0);
+ const out = new Float64Array(q);
+ for (let j = 0; j < q; j++) {
+ let s = 0;
+ for (let k = 0; k < p; k++) s += (xc[k] ?? 0) * (this.coef_![k]![j] ?? 0);
+ out[j] = s + (this.yMean_![j] ?? 0);
+ }
+ return out;
+ });
+ }
+
+ transform(X: Float64Array[]): Float64Array[] {
+ if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError();
+ const k = this.xWeights_.length;
+ const p = this.xMean_.length;
+ return X.map((xi) => {
+ const xc = new Float64Array(p);
+ for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0);
+ const out = new Float64Array(k);
+ for (let i = 0; i < k; i++) {
+ out[i] = dot(xc, this.xWeights_![i] ?? new Float64Array(0));
+ }
+ return out;
+ });
+ }
+
+ fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] {
+ this.fit(X, Y);
+ return [this.xScores_!, this.yScores_!];
+ }
+}
+
+/**
+ * Partial Least Squares SVD.
+ * Mirrors sklearn.cross_decomposition.PLSSVD.
+ */
+export class PLSSVD {
+ nComponents: number;
+
+ xWeights_: Float64Array[] | null = null;
+ yWeights_: Float64Array[] | null = null;
+ xScores_: Float64Array[] | null = null;
+ yScores_: Float64Array[] | null = null;
+ xMean_: Float64Array | null = null;
+ yMean_: Float64Array | null = null;
+
+ constructor(options: { nComponents?: number } = {}) {
+ this.nComponents = options.nComponents ?? 2;
+ }
+
+ fit(X: Float64Array[], Y: Float64Array[]): this {
+ const n = X.length;
+ const p = (X[0] ?? new Float64Array(0)).length;
+ const q = (Y[0] ?? new Float64Array(0)).length;
+ const k = Math.min(this.nComponents, p, q);
+
+ this.xMean_ = colMeans(X);
+ this.yMean_ = colMeans(Y);
+ const Xc = center(X, this.xMean_);
+ const Yc = center(Y, this.yMean_);
+
+ this.xWeights_ = [];
+ this.yWeights_ = [];
+ this.xScores_ = Array.from({ length: n }, () => new Float64Array(k));
+ this.yScores_ = Array.from({ length: n }, () => new Float64Array(k));
+
+ const curXtY = Xtranspose_Y(Xc, Yc);
+ for (let comp = 0; comp < k; comp++) {
+ const { u, v } = nipals(curXtY);
+ this.xWeights_[comp] = u;
+ this.yWeights_[comp] = v;
+ for (let i = 0; i < n; i++) {
+ const xi = Xc[i] ?? new Float64Array(p);
+ const yi = Yc[i] ?? new Float64Array(q);
+ this.xScores_![i]![comp] = dot(xi, u);
+ this.yScores_![i]![comp] = dot(yi, v);
+ }
+ }
+ return this;
+ }
+
+ transform(X: Float64Array[]): Float64Array[] {
+ if (this.xWeights_ === null || this.xMean_ === null) throw new NotFittedError();
+ const k = this.xWeights_.length;
+ const p = this.xMean_.length;
+ return X.map((xi) => {
+ const xc = new Float64Array(p);
+ for (let j = 0; j < p; j++) xc[j] = (xi[j] ?? 0) - (this.xMean_![j] ?? 0);
+ const out = new Float64Array(k);
+ for (let i = 0; i < k; i++) out[i] = dot(xc, this.xWeights_![i] ?? new Float64Array(0));
+ return out;
+ });
+ }
+
+ fitTransform(X: Float64Array[], Y: Float64Array[]): [Float64Array[], Float64Array[]] {
+ this.fit(X, Y);
+ return [this.xScores_!, this.yScores_!];
+ }
+}
diff --git a/src/cross_decomposition/pls_svd.ts b/src/cross_decomposition/pls_svd.ts
new file mode 100644
index 0000000..0b3a156
--- /dev/null
+++ b/src/cross_decomposition/pls_svd.ts
@@ -0,0 +1,170 @@
+/**
+ * Extended PLS utilities: PLSSVDExt.
+ * Mirrors sklearn.cross_decomposition.PLSSVD.
+ */
+
+ /** Constructor options for {@link PLSSVDExt}. */
+ export interface PLSSVDOptions {
+   /** Number of singular pairs to extract (default 2; capped at min(features, targets)). */
+   nComponents?: number;
+   /** Divide each column by its sample standard deviation before fitting (default true). */
+   scale?: boolean;
+   /** Accepted in the options type; the class does not read it. */
+   copyData?: boolean;
+ }
+
+ /**
+  * Partial Least Squares SVD.
+  * Finds the directions of maximum covariance between X and Y.
+  *
+  * The data are centred (and, with `scale`, divided by the n-1 sample standard
+  * deviation). Weight vectors come from a fixed 10-sweep power iteration on the
+  * cross-product X^T Y, started from a deterministic pseudo-random vector and
+  * orthogonalised against the X weights already found.
+  */
+ export class PLSSVDExt {
+   /** Requested number of components. */
+   nComponents: number;
+   /** Whether columns are standardised before fitting. */
+   scale: boolean;
+
+   /** X weight vectors (length nFeatures), one per component. */
+   xWeights_: Float64Array[] | null = null;
+   /** Y weight vectors (length nTargets), one per component. */
+   yWeights_: Float64Array[] | null = null;
+   /** Training X scores, one row per sample. */
+   xScores_: Float64Array[] | null = null;
+   /** Training Y scores, one row per sample. */
+   yScores_: Float64Array[] | null = null;
+   /** Column means of the training X. */
+   xMean_: Float64Array | null = null;
+   /** Column means of the training Y. */
+   yMean_: Float64Array | null = null;
+   /** Column scale factors for X (all 1 when `scale` is false). */
+   xStd_: Float64Array | null = null;
+   /** Column scale factors for Y (all 1 when `scale` is false). */
+   yStd_: Float64Array | null = null;
+   /** Feature count seen by `fit`. */
+   nFeaturesFit_: number = 0;
+   /** Target count seen by `fit`. */
+   nTargetsFit_: number = 0;
+
+   constructor(options: PLSSVDOptions = {}) {
+     this.nComponents = options.nComponents ?? 2;
+     this.scale = options.scale ?? true;
+   }
+
+   /**
+    * Fit weights and training scores.
+    * @param X Training inputs, one row per sample.
+    * @param Y Training targets, one row per sample.
+    * @returns This instance.
+    */
+   fit(X: Float64Array[], Y: Float64Array[]): this {
+     const nSamples = X.length;
+     const nFeatures = X[0]?.length ?? 0;
+     const nTargets = Y[0]?.length ?? 0;
+     this.nFeaturesFit_ = nFeatures;
+     this.nTargetsFit_ = nTargets;
+
+     // Column means (both blocks are divided by the X row count).
+     const columnMeans = (rows: Float64Array[], width: number): Float64Array => {
+       const mean = new Float64Array(width);
+       for (const row of rows) for (let j = 0; j < width; j++) mean[j] = (mean[j] ?? 0) + (row[j] ?? 0);
+       for (let j = 0; j < width; j++) mean[j] = (mean[j] ?? 0) / nSamples;
+       return mean;
+     };
+     // Sample standard deviation per column. The squared deviations are summed
+     // in a zero-initialised buffer; accumulating into the 1-filled default
+     // would inflate every std by the extra 1 inside the square root.
+     const columnStd = (rows: Float64Array[], mean: Float64Array): Float64Array => {
+       const std = new Float64Array(mean.length).fill(1);
+       if (!this.scale) return std;
+       const sumSq = new Float64Array(mean.length);
+       for (const row of rows) for (let j = 0; j < mean.length; j++) {
+         sumSq[j] = (sumSq[j] ?? 0) + ((row[j] ?? 0) - (mean[j] ?? 0)) ** 2;
+       }
+       // `|| 1` keeps constant columns (std 0) and the n <= 1 case (NaN) unscaled.
+       for (let j = 0; j < mean.length; j++) std[j] = Math.sqrt((sumSq[j] ?? 0) / (nSamples - 1)) || 1;
+       return std;
+     };
+     const standardise = (rows: Float64Array[], mean: Float64Array, std: Float64Array): Float64Array[] =>
+       rows.map((row) => new Float64Array(mean.length).map((_, j) => ((row[j] ?? 0) - (mean[j] ?? 0)) / (std[j] ?? 1)));
+     const unitScale = (a: Float64Array): number => Math.sqrt(a.reduce((acc, x) => acc + x ** 2, 0)) || 1;
+
+     const xMean = columnMeans(X, nFeatures);
+     const yMean = columnMeans(Y, nTargets);
+     const xStd = columnStd(X, xMean);
+     const yStd = columnStd(Y, yMean);
+     this.xMean_ = xMean;
+     this.yMean_ = yMean;
+     this.xStd_ = xStd;
+     this.yStd_ = yStd;
+
+     const Xc = standardise(X, xMean, xStd);
+     const Yc = standardise(Y, yMean, yStd);
+
+     // Cross-product C = Xc^T Yc
+     const C: Float64Array[] = Array.from({ length: nFeatures }, () => new Float64Array(nTargets));
+     for (let i = 0; i < nSamples; i++) {
+       for (let j = 0; j < nFeatures; j++) {
+         const cRow = C[j];
+         if (!cRow) continue;
+         for (let l = 0; l < nTargets; l++) {
+           cRow[l] = (cRow[l] ?? 0) + (Xc[i]?.[j] ?? 0) * (Yc[i]?.[l] ?? 0);
+         }
+       }
+     }
+     // v = C^T u, scaled to unit length
+     const rightFromLeft = (u: Float64Array): Float64Array => {
+       const v = new Float64Array(nTargets);
+       for (let j = 0; j < nFeatures; j++) for (let l = 0; l < nTargets; l++) v[l] = (v[l] ?? 0) + (C[j]?.[l] ?? 0) * (u[j] ?? 0);
+       const scale = unitScale(v);
+       for (let l = 0; l < nTargets; l++) v[l] = (v[l] ?? 0) / scale;
+       return v;
+     };
+
+     const k = Math.min(this.nComponents, nFeatures, nTargets);
+     const xWeights: Float64Array[] = [];
+     const yWeights: Float64Array[] = [];
+
+     // Fixed-seed LCG so repeated fits give identical start vectors.
+     let seed = 42;
+     const rand = (): number => {
+       seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+       return ((seed >>> 0) / 0xffffffff) * 2 - 1;
+     };
+
+     for (let comp = 0; comp < k; comp++) {
+       let u: Float64Array = new Float64Array(nFeatures).map(() => rand());
+       const startScale = unitScale(u);
+       for (let j = 0; j < nFeatures; j++) u[j] = (u[j] ?? 0) / startScale;
+
+       for (let iter = 0; iter < 10; iter++) {
+         const v = rightFromLeft(u);
+
+         // u = C v
+         const uNew = new Float64Array(nFeatures);
+         for (let j = 0; j < nFeatures; j++) for (let l = 0; l < nTargets; l++) uNew[j] = (uNew[j] ?? 0) + (C[j]?.[l] ?? 0) * (v[l] ?? 0);
+
+         // Remove the parts of u that lie along earlier X weights.
+         for (const prev of xWeights) {
+           let overlap = 0;
+           for (let j = 0; j < nFeatures; j++) overlap += (uNew[j] ?? 0) * (prev[j] ?? 0);
+           for (let j = 0; j < nFeatures; j++) uNew[j] = (uNew[j] ?? 0) - overlap * (prev[j] ?? 0);
+         }
+
+         const uScale = unitScale(uNew);
+         u = uNew.map((x) => x / uScale);
+       }
+
+       xWeights.push(u);
+       yWeights.push(rightFromLeft(u));
+     }
+
+     this.xWeights_ = xWeights;
+     this.yWeights_ = yWeights;
+
+     // Training scores: standardised rows dotted with each weight vector.
+     const project = (rows: Float64Array[], weights: Float64Array[], width: number): Float64Array[] =>
+       rows.map((row) => new Float64Array(weights.map((w) => {
+         let acc = 0;
+         for (let j = 0; j < width; j++) acc += (row[j] ?? 0) * (w[j] ?? 0);
+         return acc;
+       })));
+     this.xScores_ = project(Xc, xWeights, nFeatures);
+     this.yScores_ = project(Yc, yWeights, nTargets);
+
+     return this;
+   }
+
+   /**
+    * Project new data with the fitted centring, scaling and weights.
+    * @param X Rows to project onto the X weights.
+    * @param Y Optional rows to project onto the Y weights.
+    * @returns `xScores`, plus `yScores` when `Y` is given.
+    * @throws Error when called before `fit`.
+    */
+   transform(X: Float64Array[], Y?: Float64Array[]): { xScores: Float64Array[]; yScores?: Float64Array[] } {
+     if (!this.xWeights_ || !this.xMean_) throw new Error("PLSSVDExt not fitted");
+     const nFeatures = this.nFeaturesFit_;
+     const xScores = X.map(row => new Float64Array(this.xWeights_!.map(w => {
+       let acc = 0;
+       for (let j = 0; j < nFeatures; j++) acc += ((row[j] ?? 0) - (this.xMean_![j] ?? 0)) / (this.xStd_![j] ?? 1) * (w[j] ?? 0);
+       return acc;
+     })));
+
+     if (Y) {
+       const nTargets = this.nTargetsFit_;
+       const yScores = Y.map(row => new Float64Array(this.yWeights_!.map(w => {
+         let acc = 0;
+         for (let j = 0; j < nTargets; j++) acc += ((row[j] ?? 0) - (this.yMean_![j] ?? 0)) / (this.yStd_![j] ?? 1) * (w[j] ?? 0);
+         return acc;
+       })));
+       return { xScores, yScores };
+     }
+     return { xScores };
+   }
+
+   /** Fit and return the training scores of both blocks. */
+   fitTransform(X: Float64Array[], Y: Float64Array[]): { xScores: Float64Array[]; yScores: Float64Array[] } {
+     this.fit(X, Y);
+     return { xScores: this.xScores_!, yScores: this.yScores_! };
+   }
+ }
diff --git a/src/datasets/california.ts b/src/datasets/california.ts
new file mode 100644
index 0000000..0b9c6f8
--- /dev/null
+++ b/src/datasets/california.ts
@@ -0,0 +1,73 @@
+/**
+ * California Housing dataset utilities.
+ * Port of sklearn.datasets._california_housing
+ */
+
+ /** Container returned by the California housing generators in this file. */
+ export interface CaliforniaHousingData {
+   /** One row per sample; columns follow the order of `featureNames`. */
+   data: Float64Array[];
+   /** Regression target, one value per sample. */
+   target: Float64Array;
+   /** Column labels for `data`. */
+   featureNames: string[];
+   /** Label(s) for `target`. */
+   targetNames: string[];
+   /** Free-text description of the dataset. */
+   description: string;
+ }
+
+/**
+ * Generate synthetic California housing-like data.
+ * Features: MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
+ */
+export function makeCaliforniaHousing(
+ nSamples = 100,
+ randomState = 42,
+): CaliforniaHousingData {
+ // Simple LCG random
+ let seed = randomState;
+ const rand = (): number => {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return ((seed >>> 0) / 0x100000000);
+ };
+ const featureNames = [
+ "MedInc", "HouseAge", "AveRooms", "AveBedrms",
+ "Population", "AveOccup", "Latitude", "Longitude",
+ ];
+ const data: Float64Array[] = [];
+ const target = new Float64Array(nSamples);
+ for (let i = 0; i < nSamples; i++) {
+ const medInc = 0.5 + rand() * 10;
+ const houseAge = 1 + rand() * 52;
+ const aveRooms = 2 + rand() * 8;
+ const aveBedrms = 0.5 + rand() * 2;
+ const population = 100 + rand() * 3000;
+ const aveOccup = 1 + rand() * 5;
+ const latitude = 32 + rand() * 10;
+ const longitude = -124 + rand() * 10;
+ data.push(new Float64Array([medInc, houseAge, aveRooms, aveBedrms, population, aveOccup, latitude, longitude]));
+ // Simplified price model
+ target[i] = 0.5 + 0.4 * medInc - 0.001 * population + rand() * 0.5;
+ }
+ return {
+ data,
+ target,
+ featureNames,
+ targetNames: ["MedHouseVal"],
+ description: "Synthetic California Housing dataset (generated). " +
+ "Original from StatLib repository. 8 features, regression target is median house value.",
+ };
+}
+
+ /**
+  * Options accepted by `fetchCaliforniaHousing`.
+  * The implementation in this file discards the whole object, so none of
+  * these fields currently changes the result.
+  */
+ export interface FetchCaliforniaHousingOptions {
+   dataHome?: string;
+   download?: boolean;
+   returnXy?: boolean;
+   asFrame?: boolean;
+ }
+
+/**
+ * Fetch (or generate) the California Housing dataset.
+ * In browser/Bun environments, returns generated data.
+ */
+export function fetchCaliforniaHousing(
+ opts: FetchCaliforniaHousingOptions = {},
+): CaliforniaHousingData {
+ void opts;
+ return makeCaliforniaHousing(20640);
+}
diff --git a/src/datasets/datasets_ext.ts b/src/datasets/datasets_ext.ts
new file mode 100644
index 0000000..ed2075a
--- /dev/null
+++ b/src/datasets/datasets_ext.ts
@@ -0,0 +1,165 @@
+/**
+ * Extended datasets: makeMultilabelClassification, makeMultivariateNormal, makeCheckerboard, makeS_curve
+ */
+
+ /** Result of `makeMultilabelClassification`. */
+ export interface MultilabelDataset {
+   /** Feature rows, one per sample. */
+   X: Float64Array[];
+   /** Indicator rows of length `nClasses`; 1 marks an assigned label. */
+   Y: Int32Array[];
+   /** Number of label columns in each row of `Y`. */
+   nClasses: number;
+ }
+
+ /**
+  * Generate a random multilabel classification problem.
+  *
+  * Features are uniform in [-1, 1]. Each sample receives exactly
+  * min(nLabels, nClasses) distinct labels, chosen uniformly at random and
+  * encoded as a 0/1 indicator row.
+  *
+  * @param nSamples Number of samples.
+  * @param nFeatures Number of feature columns.
+  * @param nClasses Number of label columns.
+  * @param nLabels Labels per sample; values above `nClasses` are capped,
+  *   since that many distinct labels cannot exist.
+  * @param randomState Seed for reproducible output; `Math.random` is used when omitted.
+  */
+ export function makeMultilabelClassification(
+   nSamples = 100,
+   nFeatures = 20,
+   nClasses = 5,
+   nLabels = 2,
+   randomState?: number
+ ): MultilabelDataset {
+   const rng = randomState !== undefined ? seededRng(randomState) : Math.random;
+   // Without this cap the rejection loop below can never finish.
+   const labelsPerSample = Math.min(nLabels, nClasses);
+   const X: Float64Array[] = [];
+   const Y: Int32Array[] = [];
+   for (let i = 0; i < nSamples; i++) {
+     const row = new Float64Array(nFeatures);
+     for (let j = 0; j < nFeatures; j++) row[j] = rng() * 2 - 1;
+     X.push(row);
+     const labels = new Int32Array(nClasses);
+     const selected = new Set<number>();
+     while (selected.size < labelsPerSample) {
+       // The seeded generator can return exactly 1, which would index one past
+       // the last class; clamp to the valid range.
+       selected.add(Math.min(nClasses - 1, Math.floor(rng() * nClasses)));
+     }
+     for (const l of selected) labels[l] = 1;
+     Y.push(labels);
+   }
+   return { X, Y, nClasses };
+ }
+
+ /**
+  * Create a deterministic pseudo-random generator from `seed`.
+  * Each call advances a 32-bit linear congruential state and returns it
+  * divided by 0xffffffff, i.e. a value in [0, 1] with both ends attainable.
+  */
+ function seededRng(seed: number): () => number {
+   let state = seed;
+   const next = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+   return next;
+ }
+
+ /** Result of `makeMultivariateNormal`. */
+ export interface MultivariateNormalDataset {
+   /** Generated samples, one row per sample. */
+   X: Float64Array[];
+   /** The mean vector that was passed in. */
+   mean: Float64Array;
+   /** The covariance matrix that was passed in. */
+   cov: Float64Array[];
+ }
+
+ /**
+  * Draw samples x = mean + L z, where L is the lower Cholesky factor of `cov`
+  * and z holds independent standard normal values (Box-Muller on `Math.random`).
+  *
+  * Negative pivots are clamped to 0 and divisions by a pivot below 1e-10 give 0,
+  * so a semi-definite `cov` does not produce NaN.
+  *
+  * @param nSamples Number of rows to draw.
+  * @param mean Mean vector; its length sets the dimension.
+  * @param cov Covariance matrix, row-major.
+  * @returns The samples together with the `mean` and `cov` that were passed in.
+  */
+ export function makeMultivariateNormal(
+   nSamples = 100,
+   mean: Float64Array,
+   cov: Float64Array[]
+ ): MultivariateNormalDataset {
+   const dim = mean.length;
+
+   // Lower-triangular Cholesky factor, filled row by row.
+   const chol: Float64Array[] = Array.from({ length: dim }, () => new Float64Array(dim));
+   for (let r = 0; r < dim; r++) {
+     for (let c = 0; c <= r; c++) {
+       let residual = cov[r]![c] ?? 0;
+       for (let m = 0; m < c; m++) residual -= (chol[r]![m] ?? 0) * (chol[c]![m] ?? 0);
+       if (r === c) {
+         chol[r]![c] = Math.sqrt(Math.max(0, residual));
+       } else if ((chol[c]![c] ?? 1) < 1e-10) {
+         chol[r]![c] = 0;
+       } else {
+         chol[r]![c] = residual / (chol[c]![c] ?? 1);
+       }
+     }
+   }
+
+   const X: Float64Array[] = [];
+   for (let n = 0; n < nSamples; n++) {
+     // Standard normal draws
+     const z = new Float64Array(dim);
+     for (let d = 0; d < dim; d++) {
+       const radiusSeed = Math.random();
+       const angleSeed = Math.random();
+       z[d] = Math.sqrt(-2 * Math.log(radiusSeed + 1e-10)) * Math.cos(2 * Math.PI * angleSeed);
+     }
+     // x = mean + L z
+     const sample = new Float64Array(dim);
+     for (let r = 0; r < dim; r++) {
+       let value = mean[r] ?? 0;
+       for (let c = 0; c <= r; c++) value += (chol[r]![c] ?? 0) * (z[c] ?? 0);
+       sample[r] = value;
+     }
+     X.push(sample);
+   }
+   return { X, mean, cov };
+ }
+
+ /** Result of this file's `makeCheckerboard`. */
+ export interface CheckerboardDataset {
+   /** 2-D points, one `[x0, x1]` row per sample. */
+   X: Float64Array[];
+   /** Cell colour (0 or 1) of each point. */
+   y: Int32Array;
+   /** Number of cells along each axis, as passed in. */
+   nSquares: number;
+ }
+
+ /**
+  * Sample points uniformly from the unit square and label each with the
+  * colour of the checkerboard cell it falls in.
+  *
+  * The square is cut into `nSquares` x `nSquares` cells; a point's label is
+  * the parity of (column index + row index) of its cell.
+  *
+  * @param nSamples Number of points.
+  * @param nSquares Cells per axis.
+  */
+ export function makeCheckerboard(
+   nSamples = 200,
+   nSquares = 4
+ ): CheckerboardDataset {
+   const points: Float64Array[] = [];
+   const colours = new Int32Array(nSamples);
+   for (let idx = 0; idx < nSamples; idx++) {
+     const point = Float64Array.of(Math.random(), Math.random());
+     points.push(point);
+     const cellX = Math.floor((point[0] ?? 0) * nSquares);
+     const cellY = Math.floor((point[1] ?? 0) * nSquares);
+     colours[idx] = (cellX + cellY) % 2;
+   }
+   return { X: points, y: colours, nSquares };
+ }
+
+ /** Result of `makeS_curve`. */
+ export interface SCurveDataset {
+   /** 3-D points, one row per sample. */
+   X: Float64Array[];
+   /** Curve parameter used to place each point. */
+   t: Float64Array;
+ }
+
+ /**
+  * Generate points on an S-shaped curve in 3-D.
+  *
+  * The curve parameter t is uniform in [-1.5*pi, 1.5*pi]. Each point is
+  * `[sin(t), sign(t) * (cos(t) - 1), h]` with h uniform in [0, 2). The sign
+  * flip at t = 0 joins two opposite arcs into the "S"; a parameter range that
+  * never changes sign would trace a circle instead.
+  *
+  * @param nSamples Number of points.
+  * @param noise When positive, each coordinate gets uniform noise in
+  *   [-noise/2, noise/2).
+  * @returns The points and the `t` value of each one.
+  */
+ export function makeS_curve(nSamples = 100, noise = 0.0): SCurveDataset {
+   const t = new Float64Array(nSamples);
+   const X: Float64Array[] = [];
+   const jitter = (): number => (noise > 0 ? (Math.random() - 0.5) * noise : 0);
+   for (let i = 0; i < nSamples; i++) {
+     const ti = 3 * Math.PI * (Math.random() - 0.5);
+     t[i] = ti;
+     const x = Math.sin(ti) + jitter();
+     const y = Math.sign(ti) * (Math.cos(ti) - 1) + jitter();
+     const z = 2 * Math.random() + jitter();
+     X.push(new Float64Array([x, y, z]));
+   }
+   return { X, t };
+ }
+
+ /**
+  * Generate a matrix U diag(s) V^T with a decaying singular-value profile.
+  *
+  * With n = min(nSamples, nFeatures), the profile is
+  * s_i = (1 - tailStrength) * exp(-i / effectiveRank)
+  *       + tailStrength * exp(-i / (n * tailStrength + 1e-10)).
+  * U and V are random matrices (entries from `Math.random`) whose columns are
+  * orthonormalised by Gram-Schmidt.
+  *
+  * @returns `nSamples` rows of `nFeatures` values.
+  */
+ export function makeLowRankMatrix(
+   nSamples = 100,
+   nFeatures = 50,
+   effectiveRank = 10,
+   tailStrength = 0.5
+ ): Float64Array[] {
+   const rank = Math.min(nSamples, nFeatures);
+
+   const spectrum = new Float64Array(rank);
+   for (let i = 0; i < rank; i++) {
+     const head = Math.exp(-i / effectiveRank);
+     const tail = tailStrength * Math.exp(-i / (rank * tailStrength + 1e-10));
+     spectrum[i] = (1 - tailStrength) * head + tail;
+   }
+
+   // Random rows x cols matrix whose columns are made orthonormal in place.
+   function randomOrthonormal(rows: number, cols: number): Float64Array[] {
+     const basis: Float64Array[] = [];
+     for (let r = 0; r < rows; r++) {
+       const fresh = new Float64Array(cols);
+       for (let c = 0; c < cols; c++) fresh[c] = Math.random() - 0.5;
+       basis.push(fresh);
+     }
+     for (let col = 0; col < cols; col++) {
+       // Strip the components along every earlier column.
+       for (let prev = 0; prev < col; prev++) {
+         let overlap = 0;
+         for (let r = 0; r < rows; r++) overlap += (basis[r]![col] ?? 0) * (basis[r]![prev] ?? 0);
+         for (let r = 0; r < rows; r++) basis[r]![col] = (basis[r]![col] ?? 0) - overlap * (basis[r]![prev] ?? 0);
+       }
+       let sumSq = 0;
+       for (let r = 0; r < rows; r++) sumSq += (basis[r]![col] ?? 0) ** 2;
+       const length = Math.sqrt(sumSq) || 1;
+       for (let r = 0; r < rows; r++) basis[r]![col] = (basis[r]![col] ?? 0) / length;
+     }
+     return basis;
+   }
+
+   const left = randomOrthonormal(nSamples, rank);
+   const right = randomOrthonormal(nFeatures, rank);
+
+   const result: Float64Array[] = [];
+   for (let i = 0; i < nSamples; i++) {
+     const outRow = new Float64Array(nFeatures);
+     for (let j = 0; j < nFeatures; j++) {
+       let acc = 0;
+       for (let k = 0; k < rank; k++) acc += (left[i]![k] ?? 0) * (spectrum[k] ?? 0) * (right[j]![k] ?? 0);
+       outRow[j] = acc;
+     }
+     result.push(outRow);
+   }
+   return result;
+ }
diff --git a/src/datasets/datasets_ext3.ts b/src/datasets/datasets_ext3.ts
new file mode 100644
index 0000000..5a4f4e9
--- /dev/null
+++ b/src/datasets/datasets_ext3.ts
@@ -0,0 +1,201 @@
+/**
+ * Additional dataset generators: make_moons, make_circles, make_blobs extensions.
+ * Mirrors sklearn.datasets extras.
+ */
+
+ /**
+  * Two interleaving half circles ("moons") in 2-D.
+  *
+  * The first floor(nSamples / 2) points lie on the upper half circle
+  * (label 0); the rest lie on the shifted lower half circle (label 1).
+  * Gaussian noise with standard deviation `noise` is added to both
+  * coordinates. Output is fully determined by `randomState`.
+  */
+ export function makeMoons(
+   nSamples = 100,
+   noise = 0.1,
+   randomState = 0,
+ ): { X: Float64Array[]; y: Int32Array } {
+   let state = randomState;
+   // 32-bit LCG, uniform in [0, 1)
+   const uniform = (): number => {
+     state = (state * 1664525 + 1013904223) >>> 0;
+     return state / 4294967296;
+   };
+   // Box-Muller transform; consumes two uniforms per call.
+   const gaussian = (): number => {
+     const a = uniform();
+     const b = uniform();
+     return Math.sqrt(-2 * Math.log(a + 1e-10)) * Math.cos(2 * Math.PI * b);
+   };
+
+   const upperCount = Math.floor(nSamples / 2);
+   const lowerCount = nSamples - upperCount;
+   const X: Float64Array[] = [];
+   const labels: number[] = [];
+
+   for (let i = 0; i < upperCount; i++) {
+     const theta = (Math.PI * i) / upperCount;
+     const px = Math.cos(theta) + noise * gaussian();
+     const py = Math.sin(theta) + noise * gaussian();
+     X.push(Float64Array.of(px, py));
+     labels.push(0);
+   }
+   for (let i = 0; i < lowerCount; i++) {
+     const theta = (Math.PI * i) / lowerCount;
+     const px = 1 - Math.cos(theta) + noise * gaussian();
+     const py = 1 - Math.sin(theta) - 0.5 + noise * gaussian();
+     X.push(Float64Array.of(px, py));
+     labels.push(1);
+   }
+
+   return { X, y: Int32Array.from(labels) };
+ }
+
+ /**
+  * Two concentric circles in 2-D.
+  *
+  * The first floor(nSamples / 2) points are evenly spaced on the unit circle
+  * (label 0); the remaining points are evenly spaced on a circle of radius
+  * `factor` (label 1). Gaussian noise with standard deviation `noise` is added
+  * to both coordinates. Output is fully determined by `randomState`.
+  */
+ export function makeCircles(
+   nSamples = 100,
+   noise = 0.1,
+   factor = 0.8,
+   randomState = 0,
+ ): { X: Float64Array[]; y: Int32Array } {
+   let state = randomState;
+   // 32-bit LCG, uniform in [0, 1)
+   const uniform = (): number => {
+     state = (state * 1664525 + 1013904223) >>> 0;
+     return state / 4294967296;
+   };
+   // Box-Muller transform; consumes two uniforms per call.
+   const gaussian = (): number => {
+     const a = uniform();
+     const b = uniform();
+     return Math.sqrt(-2 * Math.log(a + 1e-10)) * Math.cos(2 * Math.PI * b);
+   };
+
+   const outerCount = Math.floor(nSamples / 2);
+   const innerCount = nSamples - outerCount;
+   const X: Float64Array[] = [];
+   const labels: number[] = [];
+
+   // Append `count` evenly spaced noisy points of one ring.
+   const addRing = (count: number, radius: number, label: number): void => {
+     for (let i = 0; i < count; i++) {
+       const theta = (2 * Math.PI * i) / count;
+       const px = radius * Math.cos(theta) + noise * gaussian();
+       const py = radius * Math.sin(theta) + noise * gaussian();
+       X.push(Float64Array.of(px, py));
+       labels.push(label);
+     }
+   };
+   addRing(outerCount, 1, 0);
+   addRing(innerCount, factor, 1);
+
+   return { X, y: Int32Array.from(labels) };
+ }
+
+ /**
+  * Swiss-roll point cloud in 3-D.
+  *
+  * For each sample a roll parameter t is drawn uniformly from
+  * [1.5*pi, 4*pi) and a height from [0, 21). The point is
+  * `[t cos t, height, t sin t]` plus Gaussian noise of standard deviation
+  * `noise` on every coordinate. Output is fully determined by `randomState`.
+  *
+  * @returns The points and the `t` value of each one.
+  */
+ export function makeSwissRoll(
+   nSamples = 100,
+   noise = 0.0,
+   randomState = 0,
+ ): { X: Float64Array[]; t: Float64Array } {
+   let state = randomState;
+   // 32-bit LCG, uniform in [0, 1)
+   const uniform = (): number => {
+     state = (state * 1664525 + 1013904223) >>> 0;
+     return state / 4294967296;
+   };
+   // Box-Muller transform; consumes two uniforms per call.
+   const gaussian = (): number => {
+     const a = uniform();
+     const b = uniform();
+     return Math.sqrt(-2 * Math.log(a + 1e-10)) * Math.cos(2 * Math.PI * b);
+   };
+
+   const t = new Float64Array(nSamples);
+   const X: Float64Array[] = [];
+   for (let idx = 0; idx < nSamples; idx++) {
+     // Draw order (parameter, height, then three noise terms) fixes the sequence.
+     const param = (1.5 + 2.5 * uniform()) * Math.PI;
+     t[idx] = param;
+     const height = 21 * uniform();
+     const px = param * Math.cos(param) + noise * gaussian();
+     const py = height + noise * gaussian();
+     const pz = param * Math.sin(param) + noise * gaussian();
+     X.push(Float64Array.of(px, py, pz));
+   }
+
+   return { X, t };
+ }
+
+ /**
+  * Sample cells of a checkerboard-patterned grid.
+  *
+  * The `shape` grid is cut into sqrt(nClusters) bands per axis. Each sample
+  * picks a random integer cell (r, c) and is emitted as
+  * `[r + noise term, c + noise term, value]`, where value is 1 when the sum of
+  * the row-band and column-band indices is even and 0 otherwise. The noise
+  * terms are uniform in [-noise/2, noise/2). Output is fully determined by
+  * `randomState`.
+  *
+  * @returns The sample rows plus the integer row and column index of each.
+  */
+ export function makeCheckerboard(
+   shape: [number, number] = [10, 10],
+   nClusters = 4,
+   nSamples = 100,
+   noise = 0.0,
+   randomState = 0,
+ ): { X: Float64Array[]; rows: Int32Array; cols: Int32Array } {
+   let state = randomState;
+   // 32-bit LCG, uniform in [0, 1)
+   const uniform = (): number => {
+     state = (state * 1664525 + 1013904223) >>> 0;
+     return state / 4294967296;
+   };
+
+   const gridRows = shape[0];
+   const gridCols = shape[1];
+   const bandsPerAxis = Math.sqrt(nClusters);
+   const rowBandHeight = gridRows / bandsPerAxis;
+   const colBandWidth = gridCols / bandsPerAxis;
+
+   const X: Float64Array[] = [];
+   const rowIdx: number[] = [];
+   const colIdx: number[] = [];
+
+   for (let n = 0; n < nSamples; n++) {
+     const r = Math.floor(uniform() * gridRows);
+     const c = Math.floor(uniform() * gridCols);
+     const bandSum = Math.floor(r / rowBandHeight) + Math.floor(c / colBandWidth);
+     const cellValue = bandSum % 2 === 0 ? 1.0 : 0.0;
+     // Two further uniforms are consumed here even when noise is 0.
+     const noisyR = r + noise * (uniform() - 0.5);
+     const noisyC = c + noise * (uniform() - 0.5);
+     X.push(Float64Array.of(noisyR, noisyC, cellValue));
+     rowIdx.push(r);
+     colIdx.push(c);
+   }
+
+   return { X, rows: Int32Array.from(rowIdx), cols: Int32Array.from(colIdx) };
+ }
+
+ /**
+  * Generate signals as sparse combinations of random dictionary atoms.
+  *
+  * The dictionary holds `nComponents` unit-length atoms of `nFeatures`
+  * entries. Each code row assigns a value to min(nNonzeroCoefs, nComponents)
+  * distinct positions and leaves the rest at 0. The signals are
+  * X = code @ dictionary. Output is fully determined by `randomState`.
+  *
+  * @param nSamples Number of signals.
+  * @param nComponents Number of dictionary atoms.
+  * @param nFeatures Length of each atom and of each signal.
+  * @param nNonzeroCoefs Active atoms per signal; capped at `nComponents`
+  *   because that many distinct positions cannot exist otherwise.
+  * @param randomState Seed of the generator.
+  */
+ export function makeSparseCoded(
+   nSamples = 100,
+   nComponents = 10,
+   nFeatures = 20,
+   nNonzeroCoefs = 3,
+   randomState = 0,
+ ): { X: Float64Array[]; dictionary: Float64Array[]; code: Float64Array[] } {
+   let rng = randomState;
+   // 32-bit LCG mapped to [-1, 1)
+   const nextRand = (): number => {
+     rng = (rng * 1664525 + 1013904223) >>> 0;
+     return (rng / 4294967296) * 2 - 1;
+   };
+
+   // Generate random dictionary with unit-length atoms
+   const dictionary: Float64Array[] = Array.from({ length: nComponents }, () => {
+     const v = new Float64Array(nFeatures);
+     for (let j = 0; j < nFeatures; j++) v[j] = nextRand();
+     let norm = 0;
+     for (const vj of v) norm += vj ** 2;
+     norm = Math.sqrt(norm);
+     if (norm > 0) for (let j = 0; j < nFeatures; j++) v[j] = (v[j] ?? 0) / norm;
+     return v;
+   });
+
+   // Once every position is taken, the linear probe below would cycle forever;
+   // cap the request at the number of available positions.
+   const activePerSample = Math.min(nNonzeroCoefs, nComponents);
+
+   // Generate sparse codes
+   const code: Float64Array[] = [];
+   for (let i = 0; i < nSamples; i++) {
+     const c = new Float64Array(nComponents);
+     const indices: number[] = [];
+     for (let k = 0; k < activePerSample; k++) {
+       let idx = Math.floor(Math.abs(nextRand()) * nComponents);
+       // Probe forward to the next unused position.
+       while (indices.includes(idx)) idx = (idx + 1) % nComponents;
+       indices.push(idx);
+       c[idx] = nextRand();
+     }
+     code.push(c);
+   }
+
+   // Generate X = code @ dictionary
+   const X = code.map((c) => {
+     const x = new Float64Array(nFeatures);
+     for (let k = 0; k < nComponents; k++) {
+       const ck = c[k] ?? 0;
+       if (ck === 0) continue;
+       for (let j = 0; j < nFeatures; j++) {
+         x[j] = (x[j] ?? 0) + ck * (dictionary[k]?.[j] ?? 0);
+       }
+     }
+     return x;
+   });
+
+   return { X, dictionary, code };
+ }
diff --git a/src/datasets/datasets_ext4.ts b/src/datasets/datasets_ext4.ts
new file mode 100644
index 0000000..482d75b
--- /dev/null
+++ b/src/datasets/datasets_ext4.ts
@@ -0,0 +1,156 @@
+/**
+ * Datasets extensions: makeTimeSeries, makeAnomalyDetection, makeRankingData, makeMultiLabelData, makeGraphData
+ * Port of sklearn.datasets extensions
+ */
+
+ /**
+  * Create a deterministic pseudo-random generator backed by a 32-bit linear
+  * congruential recurrence.
+  *
+  * @param seed Initial generator state.
+  * @returns A function that yields the next value in the half-open range [0, 1).
+  */
+ function seededRng(seed: number): () => number {
+   let state = seed;
+   return () => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     // Divide by 2^32 rather than 2^32 - 1: the result then stays strictly below 1,
+     // so callers doing Math.floor(rng() * k) never get the out-of-range index k.
+     return (state >>> 0) / 0x100000000;
+   };
+ }
+
+ /**
+  * Build synthetic multivariate time series plus one scalar target per series.
+  *
+  * Every value is the sum of an optional randomly scaled linear trend, an optional
+  * period-12 sine term with a random amplitude, and a random noise term scaled by
+  * `noise`. The target of a series is the mean of its first feature over time.
+  *
+  * @param opts Generation options. Defaults: nSamples 100, nFeatures 1, nTimesteps 50,
+  *   noise 0.1, randomState 42, trend true, seasonality true.
+  * @returns `X` indexed as [sample][timestep][feature] and the targets `y`.
+  */
+ export function makeTimeSeries(opts: {
+   nSamples?: number;
+   nFeatures?: number;
+   nTimesteps?: number;
+   noise?: number;
+   randomState?: number;
+   trend?: boolean;
+   seasonality?: boolean;
+ }): { X: Float64Array[][]; y: Float64Array } {
+   const sampleCount = opts.nSamples ?? 100;
+   const featureCount = opts.nFeatures ?? 1;
+   const stepCount = opts.nTimesteps ?? 50;
+   const noiseScale = opts.noise ?? 0.1;
+   const withTrend = opts.trend ?? true;
+   const withSeasonality = opts.seasonality ?? true;
+   const rng = seededRng(opts.randomState ?? 42);
+
+   // One observation at timestep t. rng() is consumed in the order trend, season, noise.
+   const drawValue = (t: number): number => {
+     let value = 0;
+     if (withTrend) value += (t / stepCount) * (rng() * 2 - 1);
+     if (withSeasonality) value += Math.sin((2 * Math.PI * t) / 12) * (rng() + 0.5);
+     value += (rng() * 2 - 1) * noiseScale;
+     return value;
+   };
+
+   // All features observed at timestep t.
+   const buildStep = (t: number): Float64Array => {
+     const features = new Float64Array(featureCount);
+     for (let j = 0; j < featureCount; j++) features[j] = drawValue(t);
+     return features;
+   };
+
+   const X: Float64Array[][] = Array.from({ length: sampleCount }, () =>
+     Array.from({ length: stepCount }, (_, t) => buildStep(t)),
+   );
+
+   // Target: average of feature 0 across the series.
+   const y = Float64Array.from(X, (series) => {
+     let total = 0;
+     for (const step of series) total += step[0] ?? 0;
+     return total / stepCount;
+   });
+   return { X, y };
+ }
+
+ /**
+  * Generate a shuffled point cloud containing inliers and wide-range outliers.
+  *
+  * The first `floor(nSamples * contamination)` rows are drawn from a wide range
+  * (each coordinate is `rng() * 10 - 5` shifted by +5 or -5), the remaining rows from
+  * [-2, 2) per coordinate; rows are then shuffled.
+  *
+  * Labels are derived from position after the shuffle, not from the branch that
+  * generated the row: a row is labelled -1 when its squared norm exceeds
+  * `4 * nFeatures`, otherwise 1. Wide-range rows that land close to the origin are
+  * therefore labelled 1, so `anomalyIndices` may hold fewer entries than were generated.
+  *
+  * @param opts Generation options. Defaults: nSamples 200, nFeatures 2,
+  *   contamination 0.1, randomState 0.
+  * @returns The rows `X`, labels `y` (1 or -1) and the indices labelled -1.
+  */
+ export function makeAnomalyDetection(opts: {
+   nSamples?: number;
+   nFeatures?: number;
+   contamination?: number;
+   randomState?: number;
+ }): { X: Float64Array[]; y: Int32Array; anomalyIndices: number[] } {
+   const n = opts.nSamples ?? 200;
+   const p = opts.nFeatures ?? 2;
+   const contamination = opts.contamination ?? 0.1;
+   const rng = seededRng(opts.randomState ?? 0);
+   const nAnomalies = Math.floor(n * contamination);
+
+   const X: Float64Array[] = Array.from({ length: n }, (_, i) => {
+     const row = new Float64Array(p);
+     const isAnomaly = i < nAnomalies;
+     for (let j = 0; j < p; j++) {
+       row[j] = isAnomaly ? (rng() * 10 - 5) + (rng() > 0.5 ? 5 : -5) : rng() * 4 - 2;
+     }
+     return row;
+   });
+
+   // Fisher-Yates shuffle. The swap partner is clamped to i so that a draw of exactly 1
+   // cannot select slot i + 1, which would write `undefined` into X and grow the array.
+   for (let i = n - 1; i > 0; i--) {
+     const j = Math.min(i, Math.floor(rng() * (i + 1)));
+     const tmp = X[i]!;
+     X[i] = X[j]!;
+     X[j] = tmp;
+   }
+
+   const anomalyIndices: number[] = [];
+   const y = new Int32Array(n).fill(1);
+   for (let i = 0; i < n; i++) {
+     // Squared Euclidean norm of the row.
+     const squaredNorm = X[i]!.reduce((s, v) => s + (v ?? 0) ** 2, 0);
+     if (squaredNorm > p * 4) {
+       y[i] = -1;
+       anomalyIndices.push(i);
+     }
+   }
+   return { X, y, anomalyIndices };
+ }
+
+ /**
+  * Generate a learning-to-rank style dataset.
+  *
+  * Features are drawn from `rng() * 2 - 1`. Samples are assigned to consecutive groups
+  * of `ceil(nSamples / nGroups)` rows. The relevance score of a sample is the dot
+  * product of its features with one random weight vector, and the integer label is
+  * `floor((score + 3) / 2)` clamped to the range 0..4.
+  *
+  * @param opts Generation options. Defaults: nSamples 100, nFeatures 10, nGroups 10,
+  *   randomState 0.
+  * @returns Features `X`, graded labels `y`, group ids and the raw relevance scores.
+  */
+ export function makeRankingData(opts: {
+   nSamples?: number;
+   nFeatures?: number;
+   nGroups?: number;
+   randomState?: number;
+ }): { X: Float64Array[]; y: Int32Array; groups: Int32Array; relevanceScores: Float64Array } {
+   const sampleCount = opts.nSamples ?? 100;
+   const featureCount = opts.nFeatures ?? 10;
+   const groupCount = opts.nGroups ?? 10;
+   const rng = seededRng(opts.randomState ?? 0);
+
+   // A vector of featureCount values, each computed as rng() * 2 - 1.
+   const symmetricVector = (): Float64Array =>
+     Float64Array.from({ length: featureCount }, () => rng() * 2 - 1);
+
+   // Feature rows are drawn before the weight vector.
+   const X: Float64Array[] = Array.from({ length: sampleCount }, symmetricVector);
+
+   const groupSize = Math.ceil(sampleCount / groupCount);
+   const groups = Int32Array.from(X, (_, i) => Math.floor(i / groupSize));
+
+   const weights = symmetricVector();
+   const relevanceScores = Float64Array.from(X, (row) => {
+     let score = 0;
+     for (let j = 0; j < featureCount; j++) score += (weights[j] ?? 0) * (row[j] ?? 0);
+     return score;
+   });
+
+   const y = Int32Array.from(relevanceScores, (score) =>
+     Math.min(4, Math.max(0, Math.floor((score + 3) / 2))),
+   );
+   return { X, y, groups, relevanceScores };
+ }
+
+ /**
+  * Generate a multi-label dataset from random linear scorers.
+  *
+  * Each class has its own random weight vector. A sample receives label k when its
+  * score for class k is positive and a further random draw falls below
+  * `density + 0.5`; that draw is only consumed for positive scores.
+  *
+  * @param opts Generation options. Defaults: nSamples 100, nFeatures 20, nClasses 5,
+  *   density 0.2, randomState 42.
+  * @returns Features `X` and one 0/1 indicator vector per sample in `y`.
+  */
+ export function makeMultiLabelData(opts: {
+   nSamples?: number;
+   nFeatures?: number;
+   nClasses?: number;
+   density?: number;
+   randomState?: number;
+ }): { X: Float64Array[]; y: Int32Array[] } {
+   const sampleCount = opts.nSamples ?? 100;
+   const featureCount = opts.nFeatures ?? 20;
+   const classCount = opts.nClasses ?? 5;
+   const density = opts.density ?? 0.2;
+   const rng = seededRng(opts.randomState ?? 42);
+
+   // A vector of featureCount values, each computed as rng() * 2 - 1.
+   const symmetricVector = (): Float64Array =>
+     Float64Array.from({ length: featureCount }, () => rng() * 2 - 1);
+
+   // Samples are drawn first, then one weight vector per class.
+   const X: Float64Array[] = Array.from({ length: sampleCount }, symmetricVector);
+   const weights: Float64Array[] = Array.from({ length: classCount }, symmetricVector);
+
+   const acceptance = density + 0.5;
+   const y: Int32Array[] = X.map((sample) => {
+     const indicator = new Int32Array(classCount);
+     for (let k = 0; k < classCount; k++) {
+       const classWeights = weights[k]!;
+       let score = 0;
+       for (let j = 0; j < classWeights.length; j++) {
+         score += (classWeights[j] ?? 0) * (sample[j] ?? 0);
+       }
+       // Short-circuit on purpose: rng() is only consumed when the score is positive.
+       if (score > 0 && rng() < acceptance) indicator[k] = 1;
+     }
+     return indicator;
+   });
+   return { X, y };
+ }
+
+ /**
+  * Generate a random undirected graph with node features and degree-based labels.
+  *
+  * Each unordered node pair is connected when a random draw falls below
+  * `edgeProbability`. The adjacency matrix is symmetric with a zero diagonal. A node
+  * is labelled 1 when its degree exceeds `nNodes * edgeProbability`, otherwise 0.
+  *
+  * @param opts Generation options. Defaults: nNodes 50, nFeatures 8,
+  *   edgeProbability 0.3, randomState 0.
+  * @returns Node features, the dense adjacency matrix and the node labels.
+  */
+ export function makeGraphData(opts: {
+   nNodes?: number;
+   nFeatures?: number;
+   edgeProbability?: number;
+   randomState?: number;
+ }): { nodeFeatures: Float64Array[]; adjacency: Float64Array[]; labels: Int32Array } {
+   const n = opts.nNodes ?? 50;
+   const p = opts.nFeatures ?? 8;
+   const edgeProb = opts.edgeProbability ?? 0.3;
+   const rng = seededRng(opts.randomState ?? 0);
+
+   const nodeFeatures: Float64Array[] = Array.from({ length: n }, () =>
+     Float64Array.from({ length: p }, () => rng() * 2 - 1),
+   );
+
+   // Sample the strict upper triangle only. The mirrored entries are written in a
+   // separate pass: `adjacency` must not be read inside its own initializer, because a
+   // `const` binding is in its temporal dead zone until the initializer has finished.
+   const adjacency: Float64Array[] = Array.from({ length: n }, (_, i) => {
+     const row = new Float64Array(n);
+     for (let j = i + 1; j < n; j++) {
+       if (rng() < edgeProb) row[j] = 1;
+     }
+     return row;
+   });
+
+   // Copy the upper triangle onto the lower one to make the matrix symmetric.
+   for (let i = 0; i < n; i++) {
+     for (let j = 0; j < i; j++) {
+       if ((adjacency[j]![i] ?? 0) > 0) adjacency[i]![j] = 1;
+     }
+   }
+
+   const labels = Int32Array.from(adjacency, (row) => {
+     let degree = 0;
+     for (const weight of row) if (weight > 0) degree++;
+     return degree > n * edgeProb ? 1 : 0;
+   });
+   return { nodeFeatures, adjacency, labels };
+ }
diff --git a/src/datasets/datasets_ext5.ts b/src/datasets/datasets_ext5.ts
new file mode 100644
index 0000000..0640310
--- /dev/null
+++ b/src/datasets/datasets_ext5.ts
@@ -0,0 +1,164 @@
+/**
+ * Datasets extensions: synthetic datasets for benchmarking.
+ * Port of sklearn.datasets extensions.
+ */
+
+ /**
+  * Generate 3-D points on a Swiss-roll shaped surface.
+  *
+  * All roll positions `t` are drawn first; each point is then
+  * `(t cos t, 21 * rand(), t sin t)` with `noise * (rand() - 0.5)` added per coordinate.
+  *
+  * @param nSamples Number of points.
+  * @param noise Scale of the additive per-coordinate noise.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns The points `X` (three coordinates each) and their roll positions `t`.
+  */
+ export function makeSwissRoll(
+   nSamples = 100,
+   noise = 0.0,
+   randomState = 0,
+ ): { X: Float64Array[]; t: Float64Array } {
+   let state = randomState;
+   const rand = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+   const jitter = (): number => noise * (rand() - 0.5);
+
+   const t = new Float64Array(nSamples);
+   for (let i = 0; i < t.length; i++) t[i] = 1.5 * Math.PI * (1 + 2 * rand());
+
+   const X: Float64Array[] = [];
+   for (const ti of t) {
+     // Draw order matters for reproducibility: x noise, height, y noise, z noise.
+     const x = ti * Math.cos(ti) + jitter();
+     const height = 21 * rand();
+     const y = height + jitter();
+     const z = ti * Math.sin(ti) + jitter();
+     X.push(new Float64Array([x, y, z]));
+   }
+   return { X, t };
+ }
+
+ /**
+  * Generate 3-D points on an S-shaped curve.
+  *
+  * All curve positions `t` are drawn first; each point is then
+  * `(sin t, 2 * rand(), sign(t) * (cos t - 1))` with `noise * (rand() - 0.5)` added
+  * per coordinate.
+  *
+  * @param nSamples Number of points.
+  * @param noise Scale of the additive per-coordinate noise.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns The points `X` (three coordinates each) and their curve positions `t`.
+  */
+ export function makeSCurve(
+   nSamples = 100,
+   noise = 0.0,
+   randomState = 0,
+ ): { X: Float64Array[]; t: Float64Array } {
+   let state = randomState;
+   const rand = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+   const jitter = (): number => noise * (rand() - 0.5);
+
+   const t = new Float64Array(nSamples);
+   for (let i = 0; i < t.length; i++) t[i] = 3 * Math.PI * (rand() - 0.5);
+
+   const X: Float64Array[] = [];
+   for (const ti of t) {
+     // Draw order matters for reproducibility: x noise, width, y noise, z noise.
+     const x = Math.sin(ti) + jitter();
+     const width = 2 * rand();
+     const y = width + jitter();
+     const z = Math.sign(ti) * (Math.cos(ti) - 1) + jitter();
+     X.push(new Float64Array([x, y, z]));
+   }
+   return { X, t };
+ }
+
+ /**
+  * Generate 2-D points labelled by a checkerboard pattern.
+  *
+  * Both coordinates of a point come from `rand()`; each axis is cut into `nSquares`
+  * cells and the label is the parity of the sum of the two cell indices.
+  *
+  * @param nSamples Number of points.
+  * @param nSquares Number of cells along each axis.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns The points `X` and their labels `y`.
+  */
+ export function makeCheckerboardData(
+   nSamples = 200,
+   nSquares = 4,
+   randomState = 0,
+ ): { X: Float64Array[]; y: Int32Array } {
+   let state = randomState;
+   const rand = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+
+   const X: Float64Array[] = Array.from({ length: nSamples }, () => new Float64Array([rand(), rand()]));
+
+   const y = new Int32Array(nSamples);
+   X.forEach((point, i) => {
+     const cellA = Math.floor((point[0] ?? 0) * nSquares);
+     const cellB = Math.floor((point[1] ?? 0) * nSquares);
+     y[i] = (cellA + cellB) % 2;
+   });
+   return { X, y };
+ }
+
+ /**
+  * Generate 2-D points labelled by an XOR pattern.
+  *
+  * Each coordinate is `rand() * 2 - 1`. The label is computed from a perturbed copy of
+  * the point (`noise * (rand() - 0.5)` added to each coordinate): 1 when the two
+  * perturbed coordinates share a sign, otherwise 0. The returned points themselves
+  * are not perturbed.
+  *
+  * @param nSamples Number of points.
+  * @param noise Scale of the perturbation used when labelling.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns The points `X` and their labels `y`.
+  */
+ export function makeXOR(
+   nSamples = 200,
+   noise = 0.1,
+   randomState = 0,
+ ): { X: Float64Array[]; y: Int32Array } {
+   let state = randomState;
+   const rand = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+
+   // All points are drawn before any labelling noise.
+   const X: Float64Array[] = Array.from({ length: nSamples }, () => {
+     const first = rand() * 2 - 1;
+     const second = rand() * 2 - 1;
+     return new Float64Array([first, second]);
+   });
+
+   const y = new Int32Array(nSamples);
+   X.forEach((point, i) => {
+     const noisyFirst = (point[0] ?? 0) + noise * (rand() - 0.5);
+     const noisySecond = (point[1] ?? 0) + noise * (rand() - 0.5);
+     y[i] = noisyFirst * noisySecond > 0 ? 1 : 0;
+   });
+   return { X, y };
+ }
+
+ /**
+  * Generate a matrix as a weighted product of two random factor matrices.
+  *
+  * The inner dimension is `min(nSamples, nFeatures, 2 * effectiveRank)`. Component i is
+  * weighted by `exp(-i / effectiveRank) * (1 - tailStrength) + tailStrength / k`, where
+  * k is that inner dimension. The factors hold values of `rand() * 2 - 1` and are not
+  * orthogonalised.
+  *
+  * @param nSamples Number of rows.
+  * @param nFeatures Number of columns.
+  * @param effectiveRank Decay scale of the component weights.
+  * @param tailStrength Share of the constant tail in every component weight.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns The generated matrix, one Float64Array per row.
+  */
+ export function makeLowRankMatrix(
+   nSamples = 100,
+   nFeatures = 50,
+   effectiveRank = 10,
+   tailStrength = 0.5,
+   randomState = 0,
+ ): Float64Array[] {
+   let state = randomState;
+   const rand = (): number => {
+     state = (state * 1664525 + 1013904223) & 0xffffffff;
+     return (state >>> 0) / 0xffffffff;
+   };
+   const symmetric = (): number => rand() * 2 - 1;
+
+   const innerDim = Math.min(nSamples, nFeatures, effectiveRank * 2);
+
+   // Left factor (nSamples x innerDim) is drawn before the right one (innerDim x nFeatures).
+   const left: Float64Array[] = Array.from({ length: nSamples }, () =>
+     new Float64Array(innerDim).map(symmetric),
+   );
+   const right: Float64Array[] = Array.from({ length: innerDim }, () =>
+     new Float64Array(nFeatures).map(symmetric),
+   );
+
+   // Decaying component weights plus a constant tail.
+   const tail = tailStrength / innerDim;
+   const spectrum = new Float64Array(innerDim).map(
+     (_, i) => Math.exp(-i / effectiveRank) * (1 - tailStrength) + tail,
+   );
+
+   return left.map((coefficients) => {
+     const row = new Float64Array(nFeatures);
+     for (let c = 0; c < innerDim; c++) {
+       const weight = (coefficients[c] ?? 0) * (spectrum[c] ?? 0);
+       const basis = right[c];
+       for (let j = 0; j < nFeatures; j++) {
+         row[j] = (row[j] ?? 0) + weight * (basis?.[j] ?? 0);
+       }
+     }
+     return row;
+   });
+ }
+
+ /**
+  * Generate a multilabel classification dataset.
+  *
+  * Features are values of `rand()`. Every sample gets exactly
+  * `min(nLabels, nClasses)` distinct labels, chosen at random.
+  *
+  * @param nSamples Number of samples.
+  * @param nFeatures Number of features per sample.
+  * @param nClasses Number of classes (length of each indicator vector).
+  * @param nLabels Number of labels assigned to each sample.
+  * @param randomState Seed of the internal linear congruential generator.
+  * @returns Features `X` and one 0/1 indicator vector per sample in `Y`.
+  */
+ export function makeMultilabelClassification(
+   nSamples = 100,
+   nFeatures = 20,
+   nClasses = 5,
+   nLabels = 2,
+   randomState = 0,
+ ): { X: Float64Array[]; Y: Int32Array[] } {
+   let rng = randomState;
+   const rand = (): number => {
+     rng = (rng * 1664525 + 1013904223) & 0xffffffff;
+     // Divide by 2^32 so the value stays strictly below 1; otherwise
+     // Math.floor(rand() * nClasses) could produce the invalid class index nClasses,
+     // which counts as "chosen" but sets no label.
+     return (rng >>> 0) / 0x100000000;
+   };
+   const X: Float64Array[] = Array.from({ length: nSamples }, () =>
+     new Float64Array(nFeatures).map(() => rand()),
+   );
+   const labelsPerSample = Math.min(nLabels, nClasses);
+   const Y: Int32Array[] = Array.from({ length: nSamples }, () => {
+     const labels = new Int32Array(nClasses);
+     // Typed as Set<number> so that its members are usable as array indices.
+     const chosen = new Set<number>();
+     // Rejection sampling until enough distinct classes have been drawn.
+     while (chosen.size < labelsPerSample) {
+       chosen.add(Math.floor(rand() * nClasses));
+     }
+     for (const c of chosen) labels[c] = 1;
+     return labels;
+   });
+   return { X, Y };
+ }
diff --git a/src/datasets/digits.ts b/src/datasets/digits.ts
new file mode 100644
index 0000000..1fb39fb
--- /dev/null
+++ b/src/datasets/digits.ts
@@ -0,0 +1,124 @@
+/**
+ * Toy datasets: loadDigits and loadLinnerud — analogous to sklearn.datasets._base.
+ */
+
+ /** Container for the 8×8 hand-written digit image dataset returned by `loadDigits`. */
+ export interface DigitsDataset {
+ /** Pixel data: nSamples × 64, stored flat in row-major order (flattened 8×8 images, values 0–16). */
+ data: Float64Array;
+ /** Target digit labels (0–9), one per sample. */
+ target: Int32Array;
+ /** Number of samples. */
+ nSamples: number;
+ /** Feature names: "pixel_0_0" … "pixel_7_7". */
+ featureNames: string[];
+ /** Target names: ["0","1",…,"9"]. */
+ targetNames: string[];
+ /** Description string. */
+ DESCR: string;
+ }
+
+ /** The Linnerud multivariate exercise dataset returned by `loadLinnerud`. */
+ export interface LinnerudDataset {
+ /** Exercise data: 20 × 3 (Chins, Situps, Jumps), stored flat in row-major order. */
+ data: Float64Array;
+ /** Physiological measurements: 20 × 3 (Weight, Waist, Pulse), stored flat in row-major order. */
+ target: Float64Array;
+ /** Number of samples. */
+ nSamples: number;
+ /** Column names of `data`. */
+ featureNames: string[];
+ /** Column names of `target`. */
+ targetNames: string[];
+ /** Description string. */
+ DESCR: string;
+ }
+
+/**
+ * Generates a minimal synthetic digits dataset.
+ * Returns nSamples per class (default 10 per digit) arranged as 8Γ8 pixel blocks.
+ */
+export function loadDigits(options: { nClass?: number; samplesPerClass?: number } = {}): DigitsDataset {
+ const nClass = options.nClass ?? 10;
+ const samplesPerClass = options.samplesPerClass ?? 10;
+ const nSamples = nClass * samplesPerClass;
+ const nFeatures = 64;
+ const data = new Float64Array(nSamples * nFeatures);
+ const target = new Int32Array(nSamples);
+ const rng = mulberry32(42);
+
+ for (let cls = 0; cls < nClass; cls++) {
+ // Build a prototype 8Γ8 pattern for this digit using a seeded pattern
+ const proto = new Float64Array(nFeatures);
+ const seed = cls * 17;
+ for (let px = 0; px < nFeatures; px++) {
+ const r = (seed * 6364136223846793005n + BigInt(px) * 2862933555777941757n) & 0xffffffffffffn;
+ proto[px] = Number(r % 17n); // 0-16
+ }
+
+ for (let s = 0; s < samplesPerClass; s++) {
+ const row = cls * samplesPerClass + s;
+ target[row] = cls;
+ for (let px = 0; px < nFeatures; px++) {
+ // Add small noise
+ const noise = (rng() - 0.5) * 2;
+ const val = Math.max(0, Math.min(16, (proto[px]!) + noise));
+ data[row * nFeatures + px] = Math.round(val);
+ }
+ }
+ }
+
+ const featureNames: string[] = [];
+ for (let r = 0; r < 8; r++) for (let c = 0; c < 8; c++) featureNames.push(`pixel_${r}_${c}`);
+ const targetNames = Array.from({ length: nClass }, (_, i) => String(i));
+
+ return {
+ data, target, nSamples,
+ featureNames,
+ targetNames,
+ DESCR: "Optical recognition of handwritten digits (synthetic).",
+ };
+}
+
+ /**
+  * Returns the Linnerud dataset (20 samples, 3 exercise features, 3 physiological targets).
+  *
+  * @returns `data` and `target` as flat row-major arrays of 20 × 3 values each, together
+  *   with the column names and a description.
+  */
+ export function loadLinnerud(): LinnerudDataset {
+   const nSamples = 20;
+   // Transcribed from sklearn reference data — TODO confirm against the upstream CSV.
+   // One (Chins, Situps, Jumps) triple per subject, four subjects per line.
+   const exerciseRaw = [
+     5, 162, 60, 2, 110, 60, 12, 101, 101, 12, 105, 37,
+     13, 155, 58, 4, 101, 42, 8, 101, 38, 6, 125, 40,
+     15, 200, 40, 17, 251, 250, 17, 120, 38, 13, 210, 115,
+     14, 215, 105, 1, 50, 50, 6, 70, 31, 12, 210, 120,
+     4, 60, 25, 11, 230, 80, 15, 225, 73, 2, 110, 43,
+   ];
+   // One (Weight, Waist, Pulse) triple per subject, four subjects per line.
+   const physiologicalRaw = [
+     191, 36, 50, 189, 37, 52, 193, 38, 58, 162, 35, 62,
+     189, 35, 46, 182, 36, 56, 211, 38, 56, 167, 34, 60,
+     176, 31, 74, 154, 33, 56, 169, 34, 50, 166, 33, 52,
+     154, 34, 64, 247, 46, 50, 193, 36, 46, 202, 37, 62,
+     176, 37, 54, 157, 32, 52, 156, 33, 54, 138, 33, 68,
+   ];
+
+   // Both tables hold exactly nSamples * 3 values, so they can be copied as they are.
+   const data = Float64Array.from(exerciseRaw);
+   const target = Float64Array.from(physiologicalRaw);
+
+   return {
+     data, target, nSamples,
+     featureNames: ["Chins", "Situps", "Jumps"],
+     targetNames: ["Weight", "Waist", "Pulse"],
+     // \u00d7 is the multiplication sign, written as an escape so it survives re-encoding.
+     DESCR: "Linnerud physical exercise dataset (20 middle-aged men, 3 exercise \u00d7 3 physiological).",
+   };
+ }
+
+// --- helpers ---
+
+ /**
+  * Create a seeded pseudo-random generator using 32-bit integer mixing.
+  *
+  * @param seed Initial state; coerced to a 32-bit integer.
+  * @returns A function yielding the next value in [0, 1).
+  */
+ function mulberry32(seed: number): () => number {
+   let state = seed | 0;
+   const next = (): number => {
+     state = (state + 0x6d2b79f5) | 0;
+     const first = Math.imul(state ^ (state >>> 15), state | 1);
+     const second = first ^ (first + Math.imul(first ^ (first >>> 7), first | 61));
+     // >>> 0 reinterprets the mixed value as unsigned before scaling by 2^32.
+     return ((second ^ (second >>> 14)) >>> 0) / 2 ** 32;
+   };
+   return next;
+ }
diff --git a/src/datasets/fetch_datasets.ts b/src/datasets/fetch_datasets.ts
new file mode 100644
index 0000000..bb59acd
--- /dev/null
+++ b/src/datasets/fetch_datasets.ts
@@ -0,0 +1,226 @@
+/**
+ * Dataset fetch utilities: California housing, Covtype, KDDCup99, LFW, Olivetti faces, 20 Newsgroups.
+ * Mirrors sklearn.datasets.fetch_* functions.
+ */
+
+ /** Common shape of the datasets returned by the `fetch*` functions in this module. */
+ export interface FetchedDataset {
+ /** Feature matrix: one Float64Array of length `nFeatures` per sample. */
+ data: Float64Array[];
+ /** One target value per sample (class indices are stored as numbers). */
+ target: Float64Array;
+ /** Name of every feature column. */
+ featureNames: string[];
+ /** Class names indexed by target value; not set by every fetch function. */
+ targetNames?: string[];
+ /** Short human-readable description of the dataset. */
+ description: string;
+ /** Number of samples in `data`. */
+ nSamples: number;
+ /** Number of features per sample. */
+ nFeatures: number;
+ }
+
+/**
+ * Synthetic version of the California Housing dataset.
+ * Real dataset: 20,640 samples, 8 features.
+ */
+export function fetchCaliforniaHousing(options: {
+ nSamples?: number;
+ seed?: number;
+} = {}): FetchedDataset {
+ const n = options.nSamples ?? 100;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const featureNames = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"];
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+
+ for (let i = 0; i < n; i++) {
+ const medInc = rand() * 15;
+ const houseAge = rand() * 52;
+ const aveRooms = 3 + rand() * 10;
+ const aveBedrms = 1 + rand() * 3;
+ const population = 100 + rand() * 35000;
+ const aveOccup = 1 + rand() * 10;
+ const latitude = 32 + rand() * 10;
+ const longitude = -124 + rand() * 10;
+
+ data.push(new Float64Array([medInc, houseAge, aveRooms, aveBedrms, population, aveOccup, latitude, longitude]));
+ target[i] = 0.5 + medInc * 0.3 + rand() * 0.5;
+ }
+
+ return { data, target, featureNames, description: "California Housing dataset (synthetic)", nSamples: n, nFeatures: 8 };
+}
+
+/**
+ * Synthetic version of the Forest Cover Type dataset.
+ * Real dataset: 581,012 samples, 54 features, 7 classes.
+ */
+export function fetchCovtype(options: { nSamples?: number; seed?: number } = {}): FetchedDataset {
+ const n = options.nSamples ?? 100;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const nFeatures = 54;
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+
+ for (let i = 0; i < n; i++) {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) row[j] = rand() * 100;
+ data.push(row);
+ target[i] = (rand() * 7) | 0;
+ }
+
+ return {
+ data, target,
+ featureNames: Array.from({ length: nFeatures }, (_, j) => `feature_${j}`),
+ targetNames: ["Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine", "Cottonwood/Willow", "Aspen", "Douglas-fir", "Krummholz"],
+ description: "Forest Cover Type dataset (synthetic)",
+ nSamples: n, nFeatures
+ };
+}
+
+/**
+ * Synthetic version of the KDD Cup 1999 dataset.
+ */
+export function fetchKddcup99(options: {
+ subset?: "http" | "smtp" | "SF" | "SA" | null;
+ nSamples?: number;
+ seed?: number;
+} = {}): FetchedDataset {
+ const n = options.nSamples ?? 100;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const nFeatures = 41;
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+
+ for (let i = 0; i < n; i++) {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) row[j] = rand() * 1000;
+ data.push(row);
+ target[i] = rand() > 0.8 ? 1 : 0;
+ }
+
+ return {
+ data, target,
+ featureNames: Array.from({ length: nFeatures }, (_, j) => `feature_${j}`),
+ targetNames: ["normal", "attack"],
+ description: `KDD Cup 99 dataset${options.subset ? ` (${options.subset} subset)` : ""} (synthetic)`,
+ nSamples: n, nFeatures
+ };
+}
+
+/**
+ * Synthetic version of the Labeled Faces in the Wild (LFW) dataset.
+ */
+export function fetchLfw(options: {
+ minFacesPerPerson?: number;
+ nComponents?: number;
+ nSamples?: number;
+ seed?: number;
+} = {}): FetchedDataset {
+ const n = options.nSamples ?? 50;
+ const nFeatures = options.nComponents ?? 50 * 37;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+ const nPersons = 5;
+
+ for (let i = 0; i < n; i++) {
+ const row = new Float64Array(nFeatures);
+ const person = (rand() * nPersons) | 0;
+ for (let j = 0; j < nFeatures; j++) row[j] = rand() + person * 0.1;
+ data.push(row);
+ target[i] = person;
+ }
+
+ return {
+ data, target,
+ featureNames: Array.from({ length: nFeatures }, (_, j) => `pixel_${j}`),
+ targetNames: Array.from({ length: nPersons }, (_, i) => `person_${i}`),
+ description: "Labeled Faces in the Wild dataset (synthetic)",
+ nSamples: n, nFeatures
+ };
+}
+
+/**
+ * Synthetic version of the Olivetti Faces dataset.
+ * Real dataset: 400 samples, 4096 features (64x64), 40 classes.
+ */
+export function fetchOlivettiFaces(options: { seed?: number } = {}): FetchedDataset {
+ const n = 40;
+ const nFeatures = 4096;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) row[j] = rand();
+ data.push(row);
+ target[i] = i % 40;
+ }
+
+ return {
+ data, target,
+ featureNames: Array.from({ length: nFeatures }, (_, j) => `pixel_${j}`),
+ description: "Olivetti Faces dataset (synthetic)",
+ nSamples: n, nFeatures
+ };
+}
+
+/**
+ * Fetch a sample of the 20 Newsgroups dataset.
+ * Returns feature vectors (TF-IDF like) for text classification.
+ */
+export function fetch20Newsgroups(options: {
+ nSamples?: number;
+ nFeatures?: number;
+ seed?: number;
+ categories?: string[] | null;
+} = {}): FetchedDataset {
+ const n = options.nSamples ?? 100;
+ const nFeatures = options.nFeatures ?? 100;
+ const categories = options.categories ?? [
+ "alt.atheism", "comp.graphics", "sci.med", "soc.religion.christian", "talk.politics.guns"
+ ];
+ const nClasses = categories.length;
+ let seed = options.seed ?? 42;
+ function rand(): number {
+ seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+ return (seed >>> 0) / 0xffffffff;
+ }
+
+ const data: Float64Array[] = [];
+ const target = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const cls = (rand() * nClasses) | 0;
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) row[j] = rand() > 0.8 ? rand() : 0;
+ data.push(row);
+ target[i] = cls;
+ }
+
+ return {
+ data, target,
+ featureNames: Array.from({ length: nFeatures }, (_, j) => `word_${j}`),
+ targetNames: categories,
+ description: "20 Newsgroups dataset (synthetic TF-IDF)",
+ nSamples: n, nFeatures
+ };
+}
diff --git a/src/datasets/generator_ext.ts b/src/datasets/generator_ext.ts
new file mode 100644
index 0000000..f89ecfb
--- /dev/null
+++ b/src/datasets/generator_ext.ts
@@ -0,0 +1,262 @@
+/**
+ * Additional dataset generators — ported from sklearn.datasets
+ * make_low_rank_matrix, make_sparse_coded_signal, make_biclusters, make_checkerboard
+ */
+
+ /** Options accepted by `makeLowRankMatrix`. */
+ export interface LowRankMatrixOptions {
+ /** Number of rows of the generated matrix (default 100). */
+ nSamples?: number;
+ /** Number of columns of the generated matrix (default 100). */
+ nFeatures?: number;
+ /** Index around which the component weights peak (default 10). */
+ effectiveRank?: number;
+ /** Share of the constant tail in every component weight (default 0.5). */
+ tailStrength?: number;
+ /** Seed of the internal generator; `null` or omitted falls back to 42. */
+ randomState?: number | null;
+ }
+
+ /** Result of `makeLowRankMatrix`. */
+ export interface LowRankMatrixResult {
+ /** Generated matrix: one Float64Array of length `nFeatures` per sample. */
+ X: Float64Array[];
+ }
+
+/**
+ * Generate a mostly low-rank matrix with bell-shaped singular values.
+ * Useful for testing matrix decomposition algorithms.
+ */
+export function makeLowRankMatrix(options: LowRankMatrixOptions = {}): LowRankMatrixResult {
+ const nSamples = options.nSamples ?? 100;
+ const nFeatures = options.nFeatures ?? 100;
+ const effectiveRank = options.effectiveRank ?? 10;
+ const tailStrength = options.tailStrength ?? 0.5;
+
+ let seed = options.randomState ?? 42;
+ function randn(): number {
+ seed = (1664525 * seed + 1013904223) & 0x7fffffff;
+ const u1 = seed / 0x7fffffff;
+ seed = (1664525 * seed + 1013904223) & 0x7fffffff;
+ const u2 = seed / 0x7fffffff;
+ return Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
+ }
+
+ const n = Math.min(nSamples, nFeatures);
+
+ // Singular values: bell-shaped around effectiveRank
+ const singularValues = new Float64Array(n);
+ for (let i = 0; i < n; i++) {
+ const x = (i - effectiveRank) / (effectiveRank / 2);
+ singularValues[i] = Math.exp(-0.5 * x * x) * (1 - tailStrength) + tailStrength / n;
+ }
+
+ // Random orthonormal U (nSamples x n) and V (nFeatures x n)
+ // Simplified: just use random Gaussian matrices (not fully orthogonal)
+ const U: Float64Array[] = Array.from({ length: nSamples }, () => {
+ const row = new Float64Array(n);
+ for (let j = 0; j < n; j++) row[j] = randn();
+ return row;
+ });
+
+ const V: Float64Array[] = Array.from({ length: nFeatures }, () => {
+ const row = new Float64Array(n);
+ for (let j = 0; j < n; j++) row[j] = randn();
+ return row;
+ });
+
+ // X = U @ diag(singularValues) @ V.T
+ const X: Float64Array[] = Array.from({ length: nSamples }, (_, i) => {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ let val = 0;
+ for (let k = 0; k < n; k++) {
+ val += (U[i]![k] ?? 0) * (singularValues[k] ?? 0) * (V[j]![k] ?? 0);
+ }
+ row[j] = val;
+ }
+ return row;
+ });
+
+ return { X };
+}
+
+ /** Options accepted by `makeSparseCodedSignal`. */
+ export interface SparseCodingOptions {
+ /** Number of signals to generate (default 100). */
+ nSamples?: number;
+ /** Number of dictionary atoms (default 40). */
+ nComponents?: number;
+ /** Length of every atom and signal (default 64). */
+ nFeatures?: number;
+ /** Number of non-zero coefficients per signal (default 3). */
+ nNonzeroCoefs?: number;
+ /** Seed of the internal generator; `null` or omitted falls back to 0. */
+ randomState?: number | null;
+ }
+
+ /** Result of `makeSparseCodedSignal`. */
+ export interface SparseCodingResult {
+ /** Signals: nSamples rows of length nFeatures, equal to `code @ dictionary`. */
+ X: Float64Array[];
+ /** Dictionary: nComponents atoms of length nFeatures. */
+ dictionary: Float64Array[];
+ /** Sparse codes: nSamples rows of length nComponents. */
+ code: Float64Array[];
+ }
+
+/**
+ * Generate a sparse signal using a fixed dictionary.
+ * Useful for testing dictionary learning algorithms.
+ */
+export function makeSparseCodedSignal(options: SparseCodingOptions = {}): SparseCodingResult {
+ const nSamples = options.nSamples ?? 100;
+ const nComponents = options.nComponents ?? 40;
+ const nFeatures = options.nFeatures ?? 64;
+ const nNonzeroCoefs = options.nNonzeroCoefs ?? 3;
+
+ let seed = options.randomState ?? 0;
+ function rand(): number {
+ seed = (1664525 * seed + 1013904223) & 0x7fffffff;
+ return seed / 0x7fffffff;
+ }
+ function randn(): number {
+ const u1 = rand() + 1e-10;
+ const u2 = rand();
+ return Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+ }
+
+ // Random dictionary (nComponents x nFeatures), normalized atoms
+ const dictionary: Float64Array[] = Array.from({ length: nComponents }, () => {
+ const atom = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) atom[j] = randn();
+ let norm = 0;
+ for (let j = 0; j < nFeatures; j++) norm += (atom[j] ?? 0) ** 2;
+ norm = Math.sqrt(norm);
+ if (norm > 0) for (let j = 0; j < nFeatures; j++) atom[j]! /= norm;
+ return atom;
+ });
+
+ // Sparse codes (nSamples x nComponents)
+ const code: Float64Array[] = Array.from({ length: nSamples }, () => {
+ const row = new Float64Array(nComponents);
+ // Pick nNonzeroCoefs random non-zero positions
+ const positions: number[] = [];
+ const available = Array.from({ length: nComponents }, (_, i) => i);
+ for (let k = 0; k < nNonzeroCoefs && available.length > 0; k++) {
+ const idx = Math.floor(rand() * available.length);
+ positions.push(available[idx]!);
+ available.splice(idx, 1);
+ }
+ for (const pos of positions) {
+ row[pos] = randn();
+ }
+ return row;
+ });
+
+ // X = code @ dictionary
+ const X: Float64Array[] = Array.from({ length: nSamples }, (_, i) => {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ let val = 0;
+ for (let k = 0; k < nComponents; k++) {
+ val += (code[i]![k] ?? 0) * (dictionary[k]![j] ?? 0);
+ }
+ row[j] = val;
+ }
+ return row;
+ });
+
+ return { X, dictionary, code };
+}
+
+ /** Options accepted by `makeBiclusters`. */
+ export interface BiclustersOptions {
+ /** [rows, columns] of the generated matrix (default [100, 100]). */
+ shape?: [number, number];
+ /** Number of clusters rows and columns are assigned to (default 5). */
+ nClusters?: number;
+ /** Scale of the additive noise; 0 (the default) disables it. */
+ noise?: number;
+ /** NOTE(review): declared here but not read by `makeBiclusters`. */
+ minsize?: number;
+ /** Seed of the internal generator; `null` or omitted falls back to 0. */
+ randomState?: number | null;
+ }
+
+ /** Result of `makeBiclusters`. */
+ export interface BiclustersResult {
+ /** Generated matrix, one Float64Array per row. */
+ X: Float64Array[];
+ /** `rows[c][i]` is true when row i belongs to cluster c. */
+ rows: boolean[][];
+ /** `columns[c][j]` is true when column j belongs to cluster c. */
+ columns: boolean[][];
+ }
+
+/**
+ * Generate a 2D array with planted biclusters.
+ * Useful for testing biclustering algorithms.
+ */
+export function makeBiclusters(options: BiclustersOptions = {}): BiclustersResult {
+ const [nRows, nCols] = options.shape ?? [100, 100];
+ const nClusters = options.nClusters ?? 5;
+ const noise = options.noise ?? 0.0;
+
+ let seed = options.randomState ?? 0;
+ function rand(): number {
+ seed = (1664525 * seed + 1013904223) & 0x7fffffff;
+ return seed / 0x7fffffff;
+ }
+
+ // Assign rows and columns to clusters
+ const rowAssignments = new Int32Array(nRows);
+ const colAssignments = new Int32Array(nCols);
+ for (let i = 0; i < nRows; i++) rowAssignments[i] = Math.floor(rand() * nClusters);
+ for (let j = 0; j < nCols; j++) colAssignments[j] = Math.floor(rand() * nClusters);
+
+ const X: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
+ const row = new Float64Array(nCols);
+ for (let j = 0; j < nCols; j++) {
+ const sameBicluster = (rowAssignments[i] ?? 0) === (colAssignments[j] ?? 0) ? 1 : 0;
+ const noiseVal = noise > 0 ? (rand() - 0.5) * noise : 0;
+ row[j] = sameBicluster + noiseVal;
+ }
+ return row;
+ });
+
+ // Build membership arrays
+ const rows: boolean[][] = Array.from({ length: nClusters }, (_, c) =>
+ Array.from({ length: nRows }, (__, i) => (rowAssignments[i] ?? 0) === c)
+ );
+ const columns: boolean[][] = Array.from({ length: nClusters }, (_, c) =>
+ Array.from({ length: nCols }, (__, j) => (colAssignments[j] ?? 0) === c)
+ );
+
+ return { X, rows, columns };
+}
+
+ /** Options accepted by `makeCheckerboard`. */
+ export interface CheckerboardOptions {
+ /** [rows, columns] of the generated matrix (default [100, 100]). */
+ shape?: [number, number];
+ /** [row clusters, column clusters] of the checkerboard (default [4, 4]). */
+ nClusters?: [number, number];
+ /** Scale of the additive noise; 0 (the default) disables it. */
+ noise?: number;
+ /** Seed of the internal generator; `null` or omitted falls back to 0. */
+ randomState?: number | null;
+ }
+
+ /** Result of `makeCheckerboard`. */
+ export interface CheckerboardResult {
+ /** Generated matrix, one Float64Array per row. */
+ X: Float64Array[];
+ /** `rows[rc][i]` is true when row i belongs to row cluster rc. */
+ rows: boolean[][];
+ /** `columns[cc][j]` is true when column j belongs to column cluster cc. */
+ columns: boolean[][];
+ }
+
+/**
+ * Generate a checkerboard pattern dataset for testing biclustering.
+ */
+export function makeCheckerboard(options: CheckerboardOptions = {}): CheckerboardResult {
+ const [nRows, nCols] = options.shape ?? [100, 100];
+ const [nRowClusters, nColClusters] = options.nClusters ?? [4, 4];
+ const noise = options.noise ?? 0.0;
+
+ let seed = options.randomState ?? 0;
+ function rand(): number {
+ seed = (1664525 * seed + 1013904223) & 0x7fffffff;
+ return seed / 0x7fffffff;
+ }
+
+ const X: Float64Array[] = Array.from({ length: nRows }, (_, i) => {
+ const row = new Float64Array(nCols);
+ const rowCluster = Math.floor(i / Math.ceil(nRows / nRowClusters));
+ for (let j = 0; j < nCols; j++) {
+ const colCluster = Math.floor(j / Math.ceil(nCols / nColClusters));
+ const val = ((rowCluster + colCluster) % 2 === 0) ? 1 : 0;
+ const noiseVal = noise > 0 ? (rand() - 0.5) * noise : 0;
+ row[j] = val + noiseVal;
+ }
+ return row;
+ });
+
+ const rows: boolean[][] = Array.from({ length: nRowClusters }, (_, rc) =>
+ Array.from({ length: nRows }, (__, i) =>
+ Math.floor(i / Math.ceil(nRows / nRowClusters)) === rc
+ )
+ );
+ const columns: boolean[][] = Array.from({ length: nColClusters }, (_, cc) =>
+ Array.from({ length: nCols }, (__, j) =>
+ Math.floor(j / Math.ceil(nCols / nColClusters)) === cc
+ )
+ );
+
+ return { X, rows, columns };
+}
diff --git a/src/datasets/index.ts b/src/datasets/index.ts
new file mode 100644
index 0000000..a559672
--- /dev/null
+++ b/src/datasets/index.ts
@@ -0,0 +1,11 @@
+export * from "./make_datasets.js";
+export * from "./load_datasets.js";
+export * from "./svmlight.js";
+export * from "./openml.js";
+export * from "./samples_generator.js";
+export * from "./rcv1.js";
+export * from "./real_datasets.js";
+export * from "./digits.js";
+export * from "./newsgroups.js";
+export * from "./generator_ext.js";
+export * from "./fetch_datasets.js";
diff --git a/src/datasets/kddcup.ts b/src/datasets/kddcup.ts
new file mode 100644
index 0000000..afeaea0
--- /dev/null
+++ b/src/datasets/kddcup.ts
@@ -0,0 +1,88 @@
+/**
+ * KDD Cup datasets: synthetic versions of network intrusion data.
+ */
+
+/** Shape of the object returned by the KDD Cup generators in this module. */
+export interface KDDCupDataset {
+ /** One feature vector per sample. */
+ data: Float64Array[];
+ /** Class index of each sample (position in `targetNames`). */
+ target: Int32Array;
+ /** Column names, in the same order as the entries of each `data` row. */
+ featureNames: string[];
+ /** Class names, indexed by the values stored in `target`. */
+ targetNames: string[];
+ /** Sample count reported by the generator. */
+ nSamples: number;
+ /** Number of columns per sample. */
+ nFeatures: number;
+ /** Human-readable summary of the dataset. */
+ description: string;
+}
+
+/**
+ * Names of the 41 feature columns. The position of a name in this tuple is
+ * the column index used when `makeKDDCupSynthetic` writes class-specific
+ * values (e.g. index 4 is "src_bytes", index 22 is "count").
+ */
+export const KDD_FEATURE_NAMES = [
+ "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
+ "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
+ "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations",
+ "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
+ "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
+ "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
+ "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
+ "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
+ "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
+] as const;
+
+/** Class names; the index of a name is the class label stored in `target`. */
+export const KDD_TARGET_NAMES = ["normal", "dos", "probe", "r2l", "u2r"] as const;
+
+/**
+ * Generate a synthetic dataset with the column layout of KDD Cup 1999.
+ *
+ * Every sample gets a uniformly chosen class, baseline features of
+ * `rng() * 100 + cls * 5`, and then a few columns overwritten with a
+ * class-specific pattern. The same `seed` always yields the same dataset.
+ *
+ * @param nSamples Number of rows to generate.
+ * @param seed Seed for the deterministic generator.
+ * @returns The generated rows, labels and column / class names.
+ */
+export function makeKDDCupSynthetic(nSamples = 500, seed = 42): KDDCupDataset {
+  const rng = seededRng(seed);
+  const nFeatures = KDD_FEATURE_NAMES.length;
+  const nClasses = KDD_TARGET_NAMES.length;
+  const data: Float64Array[] = [];
+  const target: number[] = [];
+
+  for (let i = 0; i < nSamples; i++) {
+    // Clamp so a draw of exactly 1.0 cannot produce the out-of-range label `nClasses`.
+    const cls = Math.min(Math.floor(rng() * nClasses), nClasses - 1);
+    const x = new Float64Array(nFeatures);
+    // Baseline: uniform noise shifted upwards by the class index.
+    for (let f = 0; f < nFeatures; f++) {
+      x[f] = rng() * 100 + cls * 5;
+    }
+    // Overwrite a few columns with a class-specific signature.
+    switch (cls) {
+      case 0: // normal
+        x[0] = rng() * 10; // short duration
+        x[5] = rng() * 1000; // some dst_bytes
+        break;
+      case 1: // dos
+        x[4] = rng() * 10000 + 5000; // high src_bytes
+        x[22] = rng() * 200 + 100; // high count
+        break;
+      case 2: // probe
+        x[22] = rng() * 100; // count
+        x[24] = rng(); // serror_rate
+        break;
+      case 3: // r2l
+        x[11] = 0; // not logged in
+        x[9] = rng() * 5; // low hot
+        break;
+      case 4: // u2r
+        x[14] = 1; // su_attempted
+        x[13] = 1; // root_shell
+        break;
+    }
+    data.push(x);
+    target.push(cls);
+  }
+
+  return {
+    data,
+    target: new Int32Array(target),
+    featureNames: [...KDD_FEATURE_NAMES],
+    targetNames: [...KDD_TARGET_NAMES],
+    // Report the number of rows actually produced so it always matches `data`.
+    nSamples: data.length,
+    nFeatures,
+    description: "Synthetic KDD Cup 1999 network intrusion dataset. Each row is a network connection with class labels: normal, dos, probe, r2l, u2r.",
+  };
+}
+
+/**
+ * Create a deterministic uniform generator (32-bit linear congruential).
+ *
+ * @param seed Initial state.
+ * @returns A function yielding values in [0, 1).
+ */
+function seededRng(seed: number): () => number {
+  let s = seed;
+  return () => {
+    s = (s * 1664525 + 1013904223) & 0xffffffff;
+    // Divide by 2^32 (not 2^32 - 1) so the result is always strictly below 1;
+    // callers use Math.floor(rng() * n) as an index.
+    return (s >>> 0) / 4294967296;
+  };
+}
+
+/**
+ * Return a synthetic stand-in for KDD Cup 99.
+ * The requested size is capped at 10000 rows before generation.
+ */
+export function loadKDDCup99(nSamples = 494021, seed = 42): KDDCupDataset {
+  const rowCap = 10000;
+  const rowCount = Math.min(nSamples, rowCap);
+  return makeKDDCupSynthetic(rowCount, seed);
+}
diff --git a/src/datasets/load_datasets.ts b/src/datasets/load_datasets.ts
new file mode 100644
index 0000000..49a77c0
--- /dev/null
+++ b/src/datasets/load_datasets.ts
@@ -0,0 +1,276 @@
+/**
+ * Built-in datasets loader.
+ * Mirrors sklearn.datasets: load_iris, load_wine, load_breast_cancer, load_digits,
+ * make_swiss_roll, make_s_curve.
+ */
+
+/** Classification dataset returned by the `load*` functions in this module. */
+export interface Dataset {
+ /** One feature vector per sample. */
+ data: Float64Array[];
+ /** Class index of each sample (position in `targetNames`). */
+ target: Int32Array;
+ /** Column names, in the order of the entries of each `data` row. */
+ featureNames: string[];
+ /** Class names, indexed by the values stored in `target`. */
+ targetNames: string[];
+ /** Number of samples. */
+ nSamples: number;
+ /** Number of columns per sample. */
+ nFeatures: number;
+}
+
+/** Dataset shape with a continuous (Float64Array) target instead of class indices. */
+export interface RegressionDataset {
+ /** One feature vector per sample. */
+ data: Float64Array[];
+ /** Target value of each sample. */
+ target: Float64Array;
+ /** Column names, in the order of the entries of each `data` row. */
+ featureNames: string[];
+ /** Number of samples. */
+ nSamples: number;
+ /** Number of columns per sample. */
+ nFeatures: number;
+}
+
+/**
+ * Deterministic uniform generator: a 32-bit linear congruential sequence
+ * whose unsigned state is scaled into [0, 1).
+ */
+function seededRng(seed: number): () => number {
+  const MULTIPLIER = 1664525;
+  const INCREMENT = 1013904223;
+  const TWO_POW_32 = 4294967296;
+  let state = seed;
+  return function next(): number {
+    // `| 0` wraps the updated state to a signed 32-bit integer.
+    state = (state * MULTIPLIER + INCREMENT) | 0;
+    return (state >>> 0) / TWO_POW_32;
+  };
+}
+
+/**
+ * Build an Iris-shaped dataset: 150 samples, 4 features, 3 classes of 50 rows.
+ *
+ * The rows are NOT the measured Fisher Iris values. Each feature is sampled
+ * from a Gaussian using the per-class mean / standard-deviation tables below
+ * and a generator seeded with 42, so every call returns the same data.
+ */
+export function loadIris(): Dataset {
+ // Synthetic stand-in for Fisher Iris (150 samples, 4 features, 3 classes),
+ // sampled from the per-class statistics listed below.
+ const rng = seededRng(42);
+ const nSamples = 150;
+ // means[cls][feature]
+ const means = [
+ [5.006, 3.428, 1.462, 0.246],
+ [5.936, 2.77, 4.26, 1.326],
+ [6.588, 2.974, 5.552, 2.026],
+ ];
+ // stds[cls][feature]
+ const stds = [
+ [0.352, 0.379, 0.174, 0.105],
+ [0.516, 0.314, 0.470, 0.198],
+ [0.636, 0.322, 0.552, 0.275],
+ ];
+
+ const data: Float64Array[] = [];
+ const target: number[] = [];
+
+ for (let cls = 0; cls < 3; cls++) {
+ for (let i = 0; i < 50; i++) {
+ const row = new Float64Array(4);
+ for (let j = 0; j < 4; j++) {
+ // Box-Muller; the 1e-10 offset keeps Math.log away from log(0).
+ const u1 = rng();
+ const u2 = rng();
+ const z = Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
+ row[j] = (means[cls]![j] ?? 0) + (stds[cls]![j] ?? 1) * z;
+ }
+ data.push(row);
+ target.push(cls);
+ }
+ }
+
+ return {
+ data,
+ target: new Int32Array(target),
+ featureNames: [
+ "sepal length (cm)",
+ "sepal width (cm)",
+ "petal length (cm)",
+ "petal width (cm)",
+ ],
+ targetNames: ["setosa", "versicolor", "virginica"],
+ nSamples,
+ nFeatures: 4,
+ };
+}
+
+/**
+ * Build a Wine-shaped dataset: 178 samples, 13 features, 3 classes with
+ * 59 / 71 / 48 rows.
+ *
+ * Values are synthetic: each feature is its per-class mean scaled by
+ * `1 + 0.15 * z` with `z` a Box-Muller Gaussian draw from a generator seeded
+ * with 123, so the output is identical on every call.
+ */
+export function loadWine(): Dataset {
+ const rng = seededRng(123);
+ const nSamples = 178;
+ const nFeatures = 13;
+ const data: Float64Array[] = [];
+ const target: number[] = [];
+
+ // Rows per class; the sizes sum to nSamples.
+ const classSizes = [59, 71, 48];
+ // classMeans[cls][feature], in the order of `featureNames` below.
+ const classMeans = [
+ [13.74, 2.01, 2.46, 17.0, 106.3, 2.84, 2.98, 0.29, 1.90, 5.53, 1.05, 3.33, 1115.7],
+ [12.28, 1.93, 2.24, 20.2, 94.5, 2.26, 2.08, 0.36, 1.47, 5.09, 0.99, 2.85, 519.5],
+ [13.15, 3.33, 2.44, 21.2, 99.3, 1.69, 0.78, 0.45, 1.15, 7.40, 0.68, 1.72, 629.9],
+ ];
+
+ for (let cls = 0; cls < 3; cls++) {
+ for (let i = 0; i < (classSizes[cls] ?? 50); i++) {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ // Box-Muller; u1 is floored at 1e-10 so Math.log never sees 0.
+ const u1 = Math.max(rng(), 1e-10);
+ const u2 = rng();
+ const z = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+ // Multiplicative noise: spread is 15% of the class mean.
+ row[j] = (classMeans[cls]![j] ?? 0) * (1 + 0.15 * z);
+ }
+ data.push(row);
+ target.push(cls);
+ }
+ }
+
+ const featureNames = [
+ "alcohol", "malic_acid", "ash", "alcalinity_of_ash", "magnesium",
+ "total_phenols", "flavanoids", "nonflavanoid_phenols", "proanthocyanins",
+ "color_intensity", "hue", "od280/od315_of_diluted_wines", "proline",
+ ];
+
+ return {
+ data,
+ target: new Int32Array(target),
+ featureNames,
+ targetNames: ["class_0", "class_1", "class_2"],
+ nSamples,
+ nFeatures,
+ };
+}
+
+/**
+ * Build a breast-cancer-shaped dataset: 569 samples, 30 features, 2 classes
+ * (212 malignant rows followed by 357 benign rows).
+ *
+ * Values are synthetic: each feature is its per-class mean scaled by
+ * `1 + 0.2 * z` (z from Box-Muller, generator seeded with 456) and clipped
+ * at zero, so the output is identical on every call.
+ */
+export function loadBreastCancer(): Dataset {
+ const rng = seededRng(456);
+ const nSamples = 569;
+ const nFeatures = 30;
+ const data: Float64Array[] = [];
+ const target: number[] = [];
+
+ // 0=malignant (212), 1=benign (357)
+ const classSizes = [212, 357];
+ // classMeans[cls][feature], in the order of `featureNames` below.
+ const classMeans = [
+ [17.46, 21.60, 115.4, 978.4, 0.103, 0.145, 0.161, 0.088, 0.192, 0.063,
+ 0.609, 1.210, 4.324, 72.67, 0.007, 0.032, 0.042, 0.015, 0.020, 0.004,
+ 21.13, 29.32, 141.4, 1422.3, 0.145, 0.374, 0.455, 0.182, 0.324, 0.091],
+ [12.15, 17.92, 78.1, 462.8, 0.092, 0.080, 0.046, 0.025, 0.174, 0.062,
+ 0.284, 1.220, 2.001, 20.01, 0.007, 0.013, 0.014, 0.006, 0.021, 0.004,
+ 13.38, 23.52, 87.0, 558.9, 0.124, 0.182, 0.167, 0.074, 0.271, 0.079],
+ ];
+
+ for (let cls = 0; cls < 2; cls++) {
+ for (let i = 0; i < (classSizes[cls] ?? 100); i++) {
+ const row = new Float64Array(nFeatures);
+ for (let j = 0; j < nFeatures; j++) {
+ // Box-Muller; u1 is floored at 1e-10 so Math.log never sees 0.
+ const u1 = Math.max(rng(), 1e-10);
+ const u2 = rng();
+ const z = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+ // Multiplicative noise (20% of the mean), clipped so values stay non-negative.
+ row[j] = Math.max(0, (classMeans[cls]![j] ?? 0) * (1 + 0.2 * z));
+ }
+ data.push(row);
+ target.push(cls);
+ }
+ }
+
+ const featureNames = [
+ "mean radius", "mean texture", "mean perimeter", "mean area",
+ "mean smoothness", "mean compactness", "mean concavity",
+ "mean concave points", "mean symmetry", "mean fractal dimension",
+ "radius error", "texture error", "perimeter error", "area error",
+ "smoothness error", "compactness error", "concavity error",
+ "concave points error", "symmetry error", "fractal dimension error",
+ "worst radius", "worst texture", "worst perimeter", "worst area",
+ "worst smoothness", "worst compactness", "worst concavity",
+ "worst concave points", "worst symmetry", "worst fractal dimension",
+ ];
+
+ return {
+ data,
+ target: new Int32Array(target),
+ featureNames,
+ targetNames: ["malignant", "benign"],
+ nSamples,
+ nFeatures,
+ };
+}
+
+/** Result of `makeSwissRoll`. */
+export interface SwissRollResult {
+ /** One 3-component point per sample. */
+ X: Float64Array[];
+ /** Position of each sample along the roll (the angle used to build the point). */
+ t: Float64Array;
+}
+
+/**
+ * Sample points on a swiss roll.
+ *
+ * Each point is `[t*cos(t), height, t*sin(t)]` with `t` uniform in
+ * [1.5*pi, 4.5*pi) and `height` uniform in [0, 21); Gaussian noise of
+ * standard deviation `noise` is added per coordinate when `noise > 0`.
+ *
+ * @param nSamples Number of points.
+ * @param noise Standard deviation of the added Gaussian noise.
+ * @param randomState Seed; 42 is used when omitted.
+ */
+export function makeSwissRoll(
+  nSamples: number = 100,
+  noise: number = 0.0,
+  randomState?: number,
+): SwissRollResult {
+  const rng = seededRng(randomState ?? 42);
+
+  // One Box-Muller draw scaled by `noise`; consumes no random numbers when noise is off.
+  const jitter = (): number => {
+    if (!(noise > 0)) return 0;
+    const u1 = Math.max(rng(), 1e-10);
+    const u2 = rng();
+    return noise * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+  };
+
+  const X: Float64Array[] = [];
+  const t = new Float64Array(nSamples);
+
+  for (let sample = 0; sample < nSamples; sample++) {
+    const angle = 1.5 * Math.PI * (1 + 2 * rng());
+    const height = 21 * rng();
+    t[sample] = angle;
+
+    // Draw order (x, y, z) matters for reproducibility.
+    const dx = jitter();
+    const dy = jitter();
+    const dz = jitter();
+
+    const point = new Float64Array(3);
+    point[0] = angle * Math.cos(angle) + dx;
+    point[1] = height + dy;
+    point[2] = angle * Math.sin(angle) + dz;
+    X.push(point);
+  }
+
+  return { X, t };
+}
+
+/** Result of `makeScurve`. */
+export interface SCurveResult {
+ /** One 3-component point per sample. */
+ X: Float64Array[];
+ /** Curve parameter of each sample (the angle used to build the point). */
+ t: Float64Array;
+}
+
+/**
+ * Sample points on an S-curve, following sklearn.datasets.make_s_curve.
+ *
+ * Each point is `[sin(t), height, sign(t) * (cos(t) - 1)]` with `t` uniform
+ * in [-1.5*pi, 1.5*pi) and `height` uniform in [0, 2); Gaussian noise of
+ * standard deviation `noise` is added per coordinate when `noise > 0`.
+ *
+ * @param nSamples Number of points.
+ * @param noise Standard deviation of the added Gaussian noise.
+ * @param randomState Seed; 42 is used when omitted.
+ */
+export function makeScurve(
+  nSamples: number = 100,
+  noise: number = 0.0,
+  randomState?: number,
+): SCurveResult {
+  const rng = seededRng(randomState ?? 42);
+  const X: Float64Array[] = [];
+  const t = new Float64Array(nSamples);
+
+  // One Box-Muller draw scaled by `noise`; consumes no random numbers when noise is off.
+  const gaussianNoise = (): number => {
+    if (!(noise > 0)) return 0;
+    const u1 = Math.max(rng(), 1e-10);
+    const u2 = rng();
+    return noise * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
+  };
+
+  for (let i = 0; i < nSamples; i++) {
+    const ti = 3 * Math.PI * (rng() - 0.5);
+    const height = 2 * rng();
+    t[i] = ti;
+
+    const nx = gaussianNoise();
+    const ny = gaussianNoise();
+    const nz = gaussianNoise();
+
+    X.push(
+      new Float64Array([
+        Math.sin(ti) + nx,
+        // The second axis is the free "height" of the sheet.
+        height + ny,
+        // sign(t) flips the lower half so the two arcs join into an "S".
+        Math.sign(ti) * (Math.cos(ti) - 1) + nz,
+      ]),
+    );
+  }
+
+  return { X, t };
+}
diff --git a/src/datasets/make_datasets.ts b/src/datasets/make_datasets.ts
new file mode 100644
index 0000000..e0241df
--- /dev/null
+++ b/src/datasets/make_datasets.ts
@@ -0,0 +1,216 @@
+/**
+ * Synthetic dataset generators.
+ * Mirrors sklearn.datasets: make_classification, make_regression, make_blobs,
+ * make_moons, make_circles.
+ */
+
+/** Feature matrix / target pair returned by the `make*` generators in this module. */
+export interface DatasetResult {
+ /** One feature vector per sample. */
+ X: Float64Array[];
+ /** One target value per sample (class labels are stored as floats). */
+ y: Float64Array;
+}
+
+/** Draw one standard-normal value with the Box-Muller transform (uses Math.random). */
+function randn(): number {
+  // Redraw zeros so Math.log never receives 0.
+  const drawNonZero = (): number => {
+    let value = 0;
+    do {
+      value = Math.random();
+    } while (value === 0);
+    return value;
+  };
+  const radiusSource = drawNonZero();
+  const angleSource = drawNonZero();
+  return Math.sqrt(-2.0 * Math.log(radiusSource)) * Math.cos(2.0 * Math.PI * angleSource);
+}
+
+/**
+ * Shuffle an array in place with Fisher-Yates (uses Math.random).
+ *
+ * @param arr Array to permute; it is mutated.
+ * @returns The same array instance, for chaining.
+ */
+function shuffle<T>(arr: T[]): T[] {
+  for (let i = arr.length - 1; i > 0; i--) {
+    const j = Math.floor(Math.random() * (i + 1));
+    const tmp = arr[i] as T;
+    arr[i] = arr[j] as T;
+    arr[j] = tmp;
+  }
+  return arr;
+}
+
+/**
+ * Generate a synthetic classification problem.
+ *
+ * Samples are assigned to classes round-robin (`i % nClasses`). The first
+ * `nInformative` features are drawn around a per-class Gaussian center; the
+ * remaining features are pure standard-normal noise.
+ *
+ * @param options.nSamples Number of rows (default 100).
+ * @param options.nFeatures Number of columns (default 20).
+ * @param options.nClasses Number of classes (default 2).
+ * @param options.nInformative Informative columns (default 2, capped at nFeatures).
+ * @param options.nRedundant Accepted for API parity; currently has no effect.
+ * @param options.noise Extra Gaussian noise scale on informative columns.
+ * @param options.randomState When given, makes the output reproducible.
+ *   When omitted, values come from the module's Math.random-based `randn`.
+ */
+export function makeClassification(
+  options: {
+    nSamples?: number;
+    nFeatures?: number;
+    nClasses?: number;
+    nInformative?: number;
+    nRedundant?: number;
+    noise?: number;
+    randomState?: number;
+  } = {},
+): DatasetResult {
+  const nSamples = options.nSamples ?? 100;
+  const nFeatures = options.nFeatures ?? 20;
+  const nClasses = options.nClasses ?? 2;
+  const nInformative = Math.min(options.nInformative ?? 2, nFeatures);
+  const noise = options.noise ?? 0.0;
+
+  // Seeded standard-normal source: mulberry32 uniforms fed through Box-Muller.
+  const seededGaussian = (seed: number): (() => number) => {
+    let state = seed | 0;
+    const uniform = (): number => {
+      state = (state + 0x6d2b79f5) | 0;
+      let t = Math.imul(state ^ (state >>> 15), 1 | state);
+      t ^= t + Math.imul(t ^ (t >>> 7), 61 | t);
+      return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
+    };
+    // 1 - uniform() lies in (0, 1], so Math.log never receives 0.
+    return () => Math.sqrt(-2.0 * Math.log(1 - uniform())) * Math.cos(2.0 * Math.PI * uniform());
+  };
+  const gauss = options.randomState === undefined ? randn : seededGaussian(options.randomState);
+
+  const X: Float64Array[] = Array.from({ length: nSamples }, () => new Float64Array(nFeatures));
+  const y = new Float64Array(nSamples);
+
+  // Cluster centers for each class
+  const centers: Float64Array[] = Array.from({ length: nClasses }, () => {
+    const center = new Float64Array(nInformative);
+    for (let j = 0; j < nInformative; j++) center[j] = gauss() * 2;
+    return center;
+  });
+
+  for (let i = 0; i < nSamples; i++) {
+    const cls = i % nClasses;
+    y[i] = cls;
+    const xi = X[i] ?? new Float64Array(nFeatures);
+    const center = centers[cls] ?? new Float64Array(nInformative);
+
+    for (let j = 0; j < nInformative; j++) {
+      xi[j] = (center[j] ?? 0) + gauss() * 0.5 + gauss() * noise;
+    }
+    for (let j = nInformative; j < nFeatures; j++) {
+      xi[j] = gauss();
+    }
+  }
+
+  return { X, y };
+}
+
+/**
+ * Generate a linear regression problem: standard-normal features, a
+ * coefficient vector whose first `nInformative` entries are non-zero, and
+ * `y = bias + X·coef + noise * N(0, 1)`.
+ *
+ * @returns The features, targets and the coefficients used to build them.
+ */
+export function makeRegression(
+  options: {
+    nSamples?: number;
+    nFeatures?: number;
+    nInformative?: number;
+    noise?: number;
+    bias?: number;
+  } = {},
+): DatasetResult & { coef: Float64Array } {
+  const sampleCount = options.nSamples ?? 100;
+  const featureCount = options.nFeatures ?? 100;
+  const informativeCount = Math.min(options.nInformative ?? 10, featureCount);
+  const noiseScale = options.noise ?? 0.0;
+  const intercept = options.bias ?? 0.0;
+
+  // Only the leading `informativeCount` coefficients are non-zero.
+  const coef = new Float64Array(featureCount);
+  let filled = 0;
+  while (filled < informativeCount) {
+    coef[filled] = randn() * 10;
+    filled += 1;
+  }
+
+  const X: Float64Array[] = Array.from({ length: sampleCount }, () => {
+    const features = new Float64Array(featureCount);
+    for (let col = 0; col < featureCount; col++) features[col] = randn();
+    return features;
+  });
+
+  const y = new Float64Array(sampleCount);
+  for (let row = 0; row < sampleCount; row++) {
+    const features = X[row] ?? new Float64Array(featureCount);
+    let response = intercept;
+    for (let col = 0; col < featureCount; col++) {
+      response += (features[col] ?? 0) * (coef[col] ?? 0);
+    }
+    y[row] = response + randn() * noiseScale;
+  }
+
+  return { X, y, coef };
+}
+
+/**
+ * Generate isotropic Gaussian blobs.
+ *
+ * `centers` may be a count (random centers in [-10, 10) per coordinate,
+ * default 3) or explicit center vectors. Samples are assigned to centers
+ * round-robin and returned in shuffled order.
+ */
+export function makeBlobs(
+  options: {
+    nSamples?: number;
+    nFeatures?: number;
+    centers?: number | Float64Array[];
+    clusterStd?: number;
+  } = {},
+): DatasetResult {
+  const nSamples = options.nSamples ?? 100;
+  const nFeatures = options.nFeatures ?? 2;
+  const clusterStd = options.clusterStd ?? 1.0;
+
+  // Draw `count` random centers, each coordinate uniform in [-10, 10).
+  const randomCenters = (count: number): Float64Array[] =>
+    Array.from({ length: count }, () => {
+      const point = new Float64Array(nFeatures);
+      for (let d = 0; d < nFeatures; d++) point[d] = (Math.random() - 0.5) * 20;
+      return point;
+    });
+
+  const spec = options.centers;
+  const centers: Float64Array[] =
+    spec === undefined || typeof spec === "number" ? randomCenters(spec ?? 3) : spec;
+
+  const nCenters = centers.length;
+  const points: Float64Array[] = [];
+  const labels: number[] = [];
+
+  for (let s = 0; s < nSamples; s++) {
+    const label = s % nCenters;
+    const origin = centers[label] ?? new Float64Array(nFeatures);
+    const point = new Float64Array(nFeatures);
+    for (let d = 0; d < nFeatures; d++) {
+      point[d] = (origin[d] ?? 0) + randn() * clusterStd;
+    }
+    points.push(point);
+    labels.push(label);
+  }
+
+  // Shuffle indices once and apply the same permutation to points and labels.
+  const permutation = shuffle(Array.from({ length: nSamples }, (_, idx) => idx));
+  const X = permutation.map((idx) => points[idx] ?? new Float64Array(nFeatures));
+  const y = new Float64Array(permutation.map((idx) => labels[idx] ?? 0));
+  return { X, y };
+}
+
+/**
+ * Generate two interleaving half circles ("moons").
+ * Label 0 is the upper arc, label 1 the shifted lower arc; samples are
+ * returned in shuffled order.
+ */
+export function makeMoons(
+  options: { nSamples?: number; noise?: number } = {},
+): DatasetResult {
+  const total = options.nSamples ?? 100;
+  const sigma = options.noise ?? 0.0;
+  const upperCount = Math.floor(total / 2);
+  const lowerCount = total - upperCount;
+
+  const points: Float64Array[] = [];
+  const labels: number[] = [];
+
+  for (let k = 0; k < upperCount; k++) {
+    const theta = (Math.PI * k) / upperCount;
+    const px = Math.cos(theta) + randn() * sigma;
+    const py = Math.sin(theta) + randn() * sigma;
+    points.push(new Float64Array([px, py]));
+    labels.push(0);
+  }
+  for (let k = 0; k < lowerCount; k++) {
+    const theta = (Math.PI * k) / lowerCount;
+    const px = 1 - Math.cos(theta) + randn() * sigma;
+    const py = 1 - Math.sin(theta) - 0.5 + randn() * sigma;
+    points.push(new Float64Array([px, py]));
+    labels.push(1);
+  }
+
+  // Apply one shared permutation to points and labels.
+  const permutation = shuffle(Array.from({ length: total }, (_, idx) => idx));
+  const X = permutation.map((idx) => points[idx] ?? new Float64Array(2));
+  const y = new Float64Array(permutation.map((idx) => labels[idx] ?? 0));
+  return { X, y };
+}
+
+/**
+ * Generate two concentric circles.
+ * Label 0 is the unit circle, label 1 the inner circle of radius `factor`
+ * (default 0.8); samples are returned in shuffled order.
+ */
+export function makeCircles(
+  options: { nSamples?: number; noise?: number; factor?: number } = {},
+): DatasetResult {
+  const total = options.nSamples ?? 100;
+  const sigma = options.noise ?? 0.0;
+  const innerRadius = options.factor ?? 0.8;
+  const outerCount = Math.floor(total / 2);
+  const innerCount = total - outerCount;
+
+  const points: Float64Array[] = [];
+  const labels: number[] = [];
+
+  for (let k = 0; k < outerCount; k++) {
+    const theta = (2 * Math.PI * k) / outerCount;
+    const px = Math.cos(theta) + randn() * sigma;
+    const py = Math.sin(theta) + randn() * sigma;
+    points.push(new Float64Array([px, py]));
+    labels.push(0);
+  }
+  for (let k = 0; k < innerCount; k++) {
+    const theta = (2 * Math.PI * k) / innerCount;
+    const px = innerRadius * Math.cos(theta) + randn() * sigma;
+    const py = innerRadius * Math.sin(theta) + randn() * sigma;
+    points.push(new Float64Array([px, py]));
+    labels.push(1);
+  }
+
+  // Apply one shared permutation to points and labels.
+  const permutation = shuffle(Array.from({ length: total }, (_, idx) => idx));
+  const X = permutation.map((idx) => points[idx] ?? new Float64Array(2));
+  const y = new Float64Array(permutation.map((idx) => labels[idx] ?? 0));
+  return { X, y };
+}
diff --git a/src/datasets/newsgroups.ts b/src/datasets/newsgroups.ts
new file mode 100644
index 0000000..0340927
--- /dev/null
+++ b/src/datasets/newsgroups.ts
@@ -0,0 +1,121 @@
+/**
+ * Fetch 20 Newsgroups text dataset (simulated/stub).
+ * Mirrors sklearn.datasets.fetch_20newsgroups and fetch_20newsgroups_vectorized.
+ */
+
+/**
+ * The 20 newsgroup category names.
+ * `fetch20Newsgroups` uses this list as its default `categories` and drops
+ * any requested category that is not in it.
+ */
+export const NEWSGROUPS_CATEGORIES: string[] = [
+ "alt.atheism",
+ "comp.graphics",
+ "comp.os.ms-windows.misc",
+ "comp.sys.ibm.pc.hardware",
+ "comp.sys.mac.hardware",
+ "comp.windows.x",
+ "misc.forsale",
+ "rec.autos",
+ "rec.motorcycles",
+ "rec.sport.baseball",
+ "rec.sport.hockey",
+ "sci.crypt",
+ "sci.electronics",
+ "sci.med",
+ "sci.space",
+ "soc.religion.christian",
+ "talk.politics.guns",
+ "talk.politics.mideast",
+ "talk.politics.misc",
+ "talk.religion.misc",
+];
+
+/** Result of `fetch20Newsgroups`. */
+export interface NewsgroupsDataset {
+ /** Text of each post. */
+ data: string[];
+ /** Category index of each post (position in `targetNames`). */
+ target: Int32Array;
+ /** Category names, indexed by the values stored in `target`. */
+ targetNames: string[];
+ /** Human-readable summary of the dataset. */
+ description: string;
+ /** Pseudo file name of each post, formatted as `<category>/<number>`. */
+ filenames: string[];
+}
+
+/**
+ * Simulate fetching the 20 Newsgroups text dataset.
+ * Nothing is downloaded: posts are generated synthetically from a seeded
+ * generator, so equal options always produce equal output.
+ * Mirrors sklearn.datasets.fetch_20newsgroups.
+ *
+ * @param options.subset "test" offsets the seed by 1000 so it differs from "train".
+ * @param options.categories Categories to sample from; unknown names are dropped.
+ * @param options.shuffle When true, the generated posts are permuted.
+ * @param options.randomState Seed (default 42).
+ * @param options.nSamples Number of posts (default: 5 per requested category).
+ * @returns Posts, their category indices and pseudo file names.
+ */
+export function fetch20Newsgroups(options: {
+  subset?: "train" | "test" | "all";
+  categories?: string[];
+  shuffle?: boolean;
+  randomState?: number;
+  removeHeaders?: boolean;
+  removeFooters?: boolean;
+  removeQuotes?: boolean;
+  nSamples?: number;
+} = {}): NewsgroupsDataset {
+  const categories = options.categories ?? NEWSGROUPS_CATEGORIES;
+  const nSamples = options.nSamples ?? categories.length * 5;
+  const subset = options.subset ?? "train";
+
+  const targetNames = categories.filter((c) => NEWSGROUPS_CATEGORIES.includes(c));
+  const data: string[] = [];
+  const targetArr: number[] = [];
+  const filenames: string[] = [];
+
+  const rng = mulberry32((options.randomState ?? 42) + (subset === "test" ? 1000 : 0));
+
+  for (let i = 0; i < nSamples; i++) {
+    const catIdx = Math.floor(rng() * targetNames.length);
+    const catName = targetNames[catIdx] ?? "misc.forsale";
+    data.push(syntheticPost(catName, i, rng));
+    targetArr.push(catIdx);
+    filenames.push(`${catName}/${1000 + i}`);
+  }
+
+  // Identity order by default; Fisher-Yates with the seeded generator when
+  // shuffling. (Sorting with a random comparator is biased and its result
+  // depends on the engine's sort algorithm.)
+  const order = data.map((_, i) => i);
+  if (options.shuffle ?? false) {
+    for (let i = order.length - 1; i > 0; i--) {
+      const j = Math.floor(rng() * (i + 1));
+      const tmp = order[i]!;
+      order[i] = order[j]!;
+      order[j] = tmp;
+    }
+  }
+
+  return {
+    data: order.map((i) => data[i]!),
+    target: new Int32Array(order.map((i) => targetArr[i] ?? 0)),
+    targetNames,
+    description: "20 Newsgroups text dataset (synthetic stub)",
+    filenames: order.map((i) => filenames[i]!),
+  };
+}
+
+/** Seeded mulberry32 generator; each call of the returned function yields a value in [0, 1). */
+function mulberry32(seed: number): () => number {
+  let state = seed | 0;
+  const next = (): number => {
+    state = (state + 0x6d2b79f5) | 0;
+    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
+    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
+    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
+  };
+  return next;
+}
+
+/**
+ * Topic vocabulary per category, used to fill synthetic posts.
+ * Categories without an entry fall back to generic words in `syntheticPost`.
+ */
+const categoryWords: Record<string, string[]> = {
+  "comp.graphics": ["pixel", "image", "render", "texture", "OpenGL", "3D", "graphics", "polygon"],
+  "rec.sport.baseball": ["pitcher", "batter", "home run", "inning", "MLB", "baseball", "score"],
+  "rec.sport.hockey": ["puck", "goal", "NHL", "skate", "hockey", "ice", "player", "team"],
+  "sci.space": ["orbit", "NASA", "rocket", "satellite", "planet", "launch", "mission", "moon"],
+  "sci.med": ["drug", "patient", "doctor", "treatment", "clinical", "disease", "medicine"],
+  "sci.crypt": ["encryption", "RSA", "key", "cipher", "algorithm", "cryptography", "secure"],
+  "talk.politics.guns": ["gun", "NRA", "Second Amendment", "firearm", "rights", "ban", "crime"],
+};
+
+/**
+ * Build one synthetic post for `category`: a From/Subject header followed by
+ * five words drawn (with replacement) from the category vocabulary and a
+ * random "related post" number.
+ */
+function syntheticPost(category: string, seed: number, rng: () => number): string {
+  const vocabulary = categoryWords[category] ?? ["news", "article", "post", "discussion"];
+  const picks: string[] = [];
+  for (let k = 0; k < 5; k++) {
+    picks.push(vocabulary[Math.floor(rng() * vocabulary.length)] ?? "news");
+  }
+  const header = `From: user${seed}@example.com\nSubject: Re: ${picks[0]}\n\n`;
+  // Drawn after the five word picks so the random sequence is consumed in the same order.
+  const relatedId = Math.floor(rng() * 10000);
+  return `${header}${picks.join(" ")} is an interesting topic in ${category}.\nSee related post #${relatedId}.`;
+}
diff --git a/src/datasets/openml.ts b/src/datasets/openml.ts
new file mode 100644
index 0000000..e8fe23b
--- /dev/null
+++ b/src/datasets/openml.ts
@@ -0,0 +1,210 @@
+/**
+ * OpenML dataset utilities.
+ * Mirrors sklearn.datasets.fetch_openml.
+ */
+
+/** Dataset parsed from an OpenML ARFF file. */
+export interface OpenMLDataset {
+  /** One feature vector per row (all columns except the target). */
+  data: Float64Array[];
+  /** Float64Array for numeric targets, Int32Array of encoded labels for nominal ones. */
+  target: Float64Array | Int32Array;
+  /** Names of the feature columns, in the order of each `data` row. */
+  featureNames: string[];
+  /** Name of the target attribute (empty when the target column was not found). */
+  targetNames: string[];
+  /** Free-text description of the dataset. */
+  description: string;
+  /** Raw metadata that accompanied the dataset. */
+  details: Record<string, unknown>;
+}
+
+/**
+ * Options accepted by `fetchOpenML`.
+ * NOTE(review): `fetchOpenML` only reads `name`, `version`, `dataId` and
+ * `targetColumn`; the remaining fields look like sklearn-parity placeholders.
+ */
+export interface FetchOpenMLOptions {
+ /** Dataset name to look up when `dataId` is not given. */
+ name?: string;
+ /** Requested version; read by `fetchOpenML` but not used to filter results. */
+ version?: number | "active";
+ /** OpenML dataset id; takes precedence over `name`. */
+ dataId?: number;
+ dataHome?: string;
+ /** Target column; defaults to the dataset's declared target, then "class". */
+ targetColumn?: string | string[] | null;
+ cacheDir?: string;
+ returnX_y?: boolean;
+ asFrame?: boolean;
+ nRetries?: number;
+ delay?: number;
+ parser?: "auto" | "pandas" | "liac-arff";
+}
+
+/** Root of the OpenML JSON REST API; every request URL in this module is built from it. */
+const OPENML_BASE_URL = "https://api.openml.org/api/v1/json";
+
+/**
+ * Fetch a dataset from OpenML by name or ID and parse its ARFF data file.
+ *
+ * Steps: resolve the dataset id (only when looked up by name), fetch the
+ * dataset description, download the file it points to, parse it.
+ *
+ * `options.version` is accepted but not used to filter results. When
+ * `options.targetColumn` is an array only its first entry is used, because
+ * the parser handles a single target column.
+ *
+ * @param options Lookup options; `name` or `dataId` is required.
+ * @returns The parsed dataset.
+ * @throws Error when neither `name` nor `dataId` is given, on network
+ *   failure, on a non-2xx response, or when the dataset cannot be found.
+ */
+export async function fetchOpenML(
+  options: FetchOpenMLOptions
+): Promise<OpenMLDataset> {
+  const { name, dataId } = options;
+
+  // Shared request helper so every call reports failures the same way.
+  const request = async (url: string, context: string): Promise<Response> => {
+    let response: Response;
+    try {
+      response = await fetch(url);
+    } catch (e) {
+      throw new Error(`fetchOpenML: network error — ${String(e)}`);
+    }
+    if (!response.ok) {
+      throw new Error(`fetchOpenML: HTTP ${response.status} ${context}`);
+    }
+    return response;
+  };
+
+  // Resolve the dataset ID; a lookup request is only needed when searching by name.
+  let actualDataId = dataId;
+  if (actualDataId == null) {
+    if (name == null) {
+      throw new Error("fetchOpenML: must specify name or dataId");
+    }
+    const listUrl = `${OPENML_BASE_URL}/data/list/data_name/${encodeURIComponent(name)}/status/active/limit/1`;
+    const listResponse = await request(listUrl, `for ${listUrl}`);
+    const json = (await listResponse.json()) as Record<string, unknown>;
+    const datasets = json["data"] as { dataset?: { did?: number }[] } | undefined;
+    const did = datasets?.dataset?.[0]?.did;
+    if (did == null) throw new Error(`fetchOpenML: dataset "${name}" not found`);
+    actualDataId = did;
+  }
+
+  // Fetch dataset description
+  const descResponse = await request(
+    `${OPENML_BASE_URL}/data/${actualDataId}`,
+    `fetching dataset ${actualDataId}`
+  );
+  const descJson = (await descResponse.json()) as {
+    data_set_description?: {
+      name?: string;
+      description?: string;
+      url?: string;
+      row_id_attribute?: string;
+      ignore_attribute?: string | string[];
+      default_target_attribute?: string;
+      feature?: Array<{ name: string; data_type: string }>;
+    };
+  };
+
+  const desc = descJson.data_set_description ?? {};
+  const description = desc.description ?? "";
+  const requestedTarget = options.targetColumn;
+  const targetCol: string =
+    (Array.isArray(requestedTarget) ? requestedTarget[0] : requestedTarget) ??
+    desc.default_target_attribute ??
+    "class";
+
+  // Fetch the actual data file
+  const dataUrl = desc.url;
+  if (!dataUrl) throw new Error("fetchOpenML: no data URL in dataset description");
+
+  const dataResponse = await request(dataUrl, "fetching data file");
+  const text = await dataResponse.text();
+  return parseArff(text, targetCol, description, desc as Record<string, unknown>);
+}
+
+/**
+ * Parse dense ARFF text into an OpenMLDataset.
+ *
+ * - `%` comment lines and blank lines are skipped.
+ * - `@attribute` names may be bare, single-quoted or double-quoted; quoted
+ *   names may contain spaces.
+ * - Feature cells that do not parse as a number (including the ARFF missing
+ *   marker `?`) become 0.
+ * - A numeric / real / integer target is returned as Float64Array; any other
+ *   target type is label-encoded into an Int32Array in order of first appearance.
+ *
+ * @param arffText Full text of the ARFF file.
+ * @param targetColumn Name of the target attribute (case-insensitive).
+ * @param description Description copied into the result.
+ * @param details Metadata copied into the result.
+ */
+export function parseArff(
+  arffText: string,
+  targetColumn: string,
+  description = "",
+  details: Record<string, unknown> = {}
+): OpenMLDataset {
+  // Name alternatives: 'single quoted' | "double quoted" | bare token.
+  const attributePattern = /^@attribute\s+(?:'([^']*)'|"([^"]*)"|(\S+))\s+(.*)$/i;
+
+  const lines = arffText.split(/\r?\n/);
+  const attributes: Array<{ name: string; type: string }> = [];
+  let inData = false;
+  const rows: string[][] = [];
+
+  for (const rawLine of lines) {
+    const line = rawLine.trim();
+    if (line.startsWith("%") || line === "") continue;
+    const lowered = line.toLowerCase();
+    if (lowered.startsWith("@attribute")) {
+      const match = attributePattern.exec(line);
+      if (match) {
+        const name = match[1] ?? match[2] ?? match[3] ?? "";
+        attributes.push({ name: name.trim(), type: (match[4] ?? "").trim() });
+      }
+    } else if (lowered.startsWith("@data")) {
+      inData = true;
+    } else if (inData) {
+      rows.push(line.split(",").map((s) => s.trim()));
+    }
+  }
+
+  const wanted = targetColumn.toLowerCase();
+  const targetIdx = attributes.findIndex((a) => a.name.toLowerCase() === wanted);
+  const featureIdxs = attributes
+    .map((_, i) => i)
+    .filter((i) => i !== targetIdx);
+
+  const featureNames = featureIdxs.map((i) => attributes[i]?.name ?? `f${i}`);
+  const data: Float64Array[] = rows.map((row) =>
+    new Float64Array(featureIdxs.map((i) => Number.parseFloat(row[i] ?? "0") || 0))
+  );
+
+  const targetAttr = targetIdx >= 0 ? attributes[targetIdx] : null;
+  const targetType = (targetAttr?.type ?? "NUMERIC").toUpperCase();
+  const isNumericTarget = ["NUMERIC", "REAL", "INTEGER"].some((prefix) =>
+    targetType.startsWith(prefix)
+  );
+  let target: Float64Array | Int32Array;
+
+  if (isNumericTarget) {
+    target = new Float64Array(
+      rows.map((row) => Number.parseFloat(row[targetIdx] ?? "0") || 0)
+    );
+  } else {
+    // Nominal: encode each distinct value as an integer, in order of first appearance.
+    const vals = new Set(rows.map((row) => row[targetIdx] ?? ""));
+    const valMap = new Map<string, number>([...vals].map((v, i): [string, number] => [v, i]));
+    target = new Int32Array(
+      rows.map((row) => valMap.get(row[targetIdx] ?? "") ?? 0)
+    );
+  }
+
+  return {
+    data,
+    target,
+    featureNames,
+    targetNames: targetAttr ? [targetAttr.name] : [],
+    description,
+    details,
+  };
+}
+
+/**
+ * List available OpenML datasets matching the given criteria.
+ *
+ * `tag` is sent to the server as a filter; `offset` and `limit` are applied
+ * to the returned list on the client (defaults: offset 0, limit 100).
+ *
+ * @returns Id, name, version and status of each listed dataset.
+ * @throws Error on a non-2xx response.
+ */
+export async function listOpenMLDatasets(options: {
+  tag?: string;
+  limit?: number;
+  offset?: number;
+} = {}): Promise<Array<{ id: number; name: string; version: number; status: string }>> {
+  let url = `${OPENML_BASE_URL}/data/list`;
+  const params: string[] = [];
+  if (options.tag) params.push(`tag/${encodeURIComponent(options.tag)}`);
+  if (params.length > 0) url += "/" + params.join("/");
+
+  const response = await fetch(url);
+  if (!response.ok) throw new Error(`listOpenMLDatasets: HTTP ${response.status}`);
+
+  const json = (await response.json()) as {
+    data?: {
+      dataset?: Array<{ did: number; name: string; version: number; status: string }>;
+    };
+  };
+
+  const start = Math.max(0, options.offset ?? 0);
+  return (json.data?.dataset ?? [])
+    .slice(start, start + (options.limit ?? 100))
+    .map((d) => ({
+      id: d.did,
+      name: d.name,
+      version: d.version,
+      status: d.status,
+    }));
+}
diff --git a/src/datasets/rcv1.ts b/src/datasets/rcv1.ts
new file mode 100644
index 0000000..f75106d
--- /dev/null
+++ b/src/datasets/rcv1.ts
@@ -0,0 +1,157 @@
+/**
+ * RCV1 dataset utilities and sparse text dataset helpers.
+ * Mirrors sklearn.datasets.rcv1 and related sparse dataset loaders.
+ */
+import type { SparseMatrix } from "../utils/sparsefuncs.js";
+
+/** Summary statistics describing a text corpus (see `RCV1_INFO`). */
+export interface RCV1DatasetInfo {
+ /** Number of documents. */
+ nSamples: number;
+ /** Vocabulary size. */
+ nFeatures: number;
+ /** Number of topic categories. */
+ nCategories: number;
+ /** Human-readable summary. */
+ description: string;
+}
+
+/** Metadata about the RCV1 corpus (sizes only; no documents are bundled). */
+export const RCV1_INFO: RCV1DatasetInfo = {
+  nSamples: 804414,
+  nFeatures: 47236,
+  nCategories: 103,
+  description:
+    "RCV1 — Reuters Corpus Volume 1. A collection of 804,414 news articles " +
+    "annotated with 103 topic categories. Features are TF-IDF weighted bag-of-words.",
+};
+
+/** A labelled text dataset stored as a sparse document-term matrix. */
+export interface TextDataset {
+ /** Sparse document-term matrix. */
+ data: SparseMatrix;
+ /** Category index per document (presumably a position in `targetNames`; confirm with callers). */
+ target: Int32Array;
+ /** Category names. */
+ targetNames: string[];
+ /** Term name per matrix column. */
+ featureNames: string[];
+ /** Human-readable summary. */
+ description: string;
+}
+
+/**
+ * Build a sparse TF-IDF matrix (CSR layout) from tokenized documents.
+ * Each document is an array of term strings.
+ *
+ * - Vocabulary: terms ordered by document frequency (descending), optionally
+ *   truncated to `maxFeatures`; a term's rank is its column index.
+ * - IDF: `ln((s + nDocs) / (s + df)) + 1`, with `s = 1` when `smoothIdf`.
+ * - TF: `count / docLength`, or `1 + ln(count)` when `sublinearTf`.
+ *
+ * @returns The matrix, the term-to-column map and the IDF weight per column.
+ */
+export function buildTfIdf(
+  documents: string[][],
+  options: { maxFeatures?: number; sublinearTf?: boolean; smoothIdf?: boolean } = {}
+): { matrix: SparseMatrix; vocabulary: Map<string, number>; idf: Float64Array } {
+  const { maxFeatures, sublinearTf = false, smoothIdf = true } = options;
+  const nDocs = documents.length;
+
+  // Document frequency: number of documents containing each term.
+  const df = new Map<string, number>();
+  for (const doc of documents) {
+    for (const term of new Set<string>(doc)) {
+      df.set(term, (df.get(term) ?? 0) + 1);
+    }
+  }
+
+  // Sort by df descending, take top maxFeatures
+  let vocab = [...df.entries()].sort((a, b) => b[1] - a[1]);
+  if (maxFeatures !== undefined) vocab = vocab.slice(0, maxFeatures);
+  const termToIdx = new Map<string, number>(
+    vocab.map(([term], i): [string, number] => [term, i])
+  );
+  const nTerms = termToIdx.size;
+
+  // IDF
+  const smoothing = smoothIdf ? 1 : 0;
+  const idf = new Float64Array(nTerms);
+  for (const [term, idx] of termToIdx) {
+    const dfi = df.get(term) ?? 0;
+    idf[idx] = Math.log((smoothing + nDocs) / (smoothing + dfi)) + 1;
+  }
+
+  // Build CSR TF-IDF matrix
+  const dataArr: number[] = [];
+  const indicesArr: number[] = [];
+  const indptrArr: number[] = [0];
+
+  for (const doc of documents) {
+    // Term counts restricted to the retained vocabulary.
+    const tf = new Map<number, number>();
+    for (const term of doc) {
+      const idx = termToIdx.get(term);
+      if (idx !== undefined) tf.set(idx, (tf.get(idx) ?? 0) + 1);
+    }
+    const docLen = doc.length;
+    // Column indices must be ascending within each CSR row.
+    const entries = [...tf.entries()].sort((a, b) => a[0] - b[0]);
+    for (const [idx, count] of entries) {
+      const tfVal = sublinearTf ? 1 + Math.log(count) : count / docLen;
+      const val = tfVal * (idf[idx] ?? 0);
+      if (val !== 0) {
+        dataArr.push(val);
+        indicesArr.push(idx);
+      }
+    }
+    indptrArr.push(dataArr.length);
+  }
+
+  const matrix: SparseMatrix = {
+    data: new Float64Array(dataArr),
+    indices: new Int32Array(indicesArr),
+    indptr: new Int32Array(indptrArr),
+    shape: [nDocs, nTerms],
+  };
+
+  return { matrix, vocabulary: termToIdx, idf };
+}
+
+/**
+ * Generate a synthetic sparse bag-of-words dataset for testing.
+ *
+ * Every document gets a uniformly chosen category and a random number of
+ * term draws; about 30% of the draws come from the vocabulary band belonging
+ * to the document's category, the rest from the first band. The result is a
+ * CSR matrix of raw term counts.
+ *
+ * @param options.nSamples Number of documents (default 200).
+ * @param options.nFeatures Vocabulary size (default 500).
+ * @param options.nCategories Number of categories (default 5).
+ * @param options.avgTermsPerDoc Mean number of term draws per document (default 20).
+ * @param options.randomState Seed (default 42); any value, including 0, is valid.
+ */
+export function makeSparseTextDataset(options: {
+  nSamples?: number;
+  nFeatures?: number;
+  nCategories?: number;
+  avgTermsPerDoc?: number;
+  randomState?: number;
+} = {}): { X: SparseMatrix; y: Int32Array; featureNames: string[]; categoryNames: string[] } {
+  const {
+    nSamples = 200,
+    nFeatures = 500,
+    nCategories = 5,
+    avgTermsPerDoc = 20,
+    randomState = 42,
+  } = options;
+
+  // xorshift32. State 0 is a fixed point (every draw would be 0), so a zero
+  // seed is replaced by a non-zero constant.
+  let seed = (randomState | 0) >>> 0;
+  if (seed === 0) seed = 0x9e3779b9;
+  const rng = (): number => {
+    seed = (seed ^ (seed << 13)) >>> 0;
+    seed = (seed ^ (seed >>> 17)) >>> 0;
+    seed = (seed ^ (seed << 5)) >>> 0;
+    // Divide by 2^32 so the result stays strictly below 1 and
+    // Math.floor(rng() * k) is always a valid index.
+    return seed / 4294967296;
+  };
+
+  const featureNames = Array.from({ length: nFeatures }, (_, i) => `word_${i}`);
+  const categoryNames = Array.from({ length: nCategories }, (_, i) => `category_${i}`);
+
+  // Width of the vocabulary band owned by each category (at least one term).
+  const bandWidth = Math.max(1, Math.floor(nFeatures / nCategories));
+
+  const data: number[] = [];
+  const indices: number[] = [];
+  const indptr: number[] = [0];
+  const y = new Int32Array(nSamples);
+
+  for (let i = 0; i < nSamples; i++) {
+    const cat = Math.floor(rng() * nCategories);
+    y[i] = cat;
+    const nTerms = Math.max(1, Math.round(avgTermsPerDoc * (0.5 + rng())));
+    const tfMap = new Map<number, number>();
+    for (let t = 0; t < nTerms; t++) {
+      // Category-biased term selection
+      const bias = rng() < 0.3 ? cat * bandWidth : 0;
+      const termIdx = (Math.floor(rng() * bandWidth) + bias) % nFeatures;
+      tfMap.set(termIdx, (tfMap.get(termIdx) ?? 0) + 1);
+    }
+    // Column indices must be ascending within each CSR row.
+    const entries = [...tfMap.entries()].sort((a, b) => a[0] - b[0]);
+    for (const [idx, count] of entries) {
+      data.push(count);
+      indices.push(idx);
+    }
+    indptr.push(data.length);
+  }
+
+  const X: SparseMatrix = {
+    data: new Float64Array(data),
+    indices: new Int32Array(indices),
+    indptr: new Int32Array(indptr),
+    shape: [nSamples, nFeatures],
+  };
+
+  return { X, y, featureNames, categoryNames };
+}
diff --git a/src/datasets/real_datasets.ts b/src/datasets/real_datasets.ts
new file mode 100644
index 0000000..6cf4f44
--- /dev/null
+++ b/src/datasets/real_datasets.ts
@@ -0,0 +1,344 @@
+/**
+ * Real-world dataset generators and synthetic alternatives.
+ * Mirrors sklearn.datasets (california_housing, covtype, kddcup99, etc.)
+ */
+
+export interface RealDataset { // bundle returned by the dataset generators in this file
+ data: Float64Array[]; // feature matrix: one Float64Array row per sample
+ target: Float64Array; // one target value per sample, index-aligned with `data`
+ featureNames: string[]; // column names for the entries of each `data` row
+ targetNames?: string[]; // optional display names for the target classes
+ description: string; // short human-readable summary of the dataset
+}
+
+export interface RealClassificationDataset extends RealDataset { // classification variant
+ target: Float64Array; // integer class labels as floats
+ classes: Int32Array; // the label values `target` can take (1..7 or 0/1 for the generators in this file)
+}
+
+/**
+ * Build a synthetic stand-in for the California Housing regression dataset.
+ * The real dataset has 20,640 instances and 8 features; this generator only
+ * imitates its feature ranges and draws every value from a seeded LCG.
+ *
+ * Features: MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude
+ * Target: median house value (in $100k), clamped to the range [0.15, 5.0]
+ */
+export function makeCaliforniaHousing(options: {
+  nSamples?: number;
+  noise?: number;
+  seed?: number;
+} = {}): RealDataset {
+  const { nSamples = 1000, noise = 0.1, seed = 42 } = options;
+
+  // 32-bit linear congruential generator; `state` is kept as a signed int32.
+  let state = seed;
+  const uniform = (): number => {
+    state = (state * 1664525 + 1013904223) | 0;
+    return (state >>> 0) / 0xffffffff;
+  };
+  // Box-Muller transform; a zero draw is replaced by 1e-10 so log() stays finite.
+  const gauss = (): number => {
+    const radius = Math.sqrt(-2 * Math.log(uniform() || 1e-10));
+    return radius * Math.cos(2 * Math.PI * (uniform() || 1e-10));
+  };
+  const clamp = (value: number, lo: number, hi: number): number => Math.max(lo, Math.min(hi, value));
+
+  const featureNames = [
+    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
+    "Population", "AveOccup", "Latitude", "Longitude",
+  ];
+  const data: Float64Array[] = [];
+  const target = new Float64Array(nSamples);
+
+  for (let i = 0; i < nSamples; i++) {
+    // Draw order matters: every gauss() call advances the shared generator.
+    const medInc = Math.max(0.5, 3.0 + gauss() * 2.0);
+    const houseAge = clamp(28 + gauss() * 12, 1, 52);
+    const aveRooms = Math.max(1, 5.4 + gauss() * 2.0);
+    const aveBedrms = Math.max(0.5, 1.1 + gauss() * 0.4);
+    const population = Math.max(10, 1400 + gauss() * 1100);
+    const aveOccup = Math.max(1, 3.0 + gauss() * 1.5);
+    const latitude = 35.6 + gauss() * 2.1;
+    const longitude = -119.6 + gauss() * 2.0;
+    data.push(Float64Array.of(
+      medInc, houseAge, aveRooms, aveBedrms,
+      population, aveOccup, latitude, longitude,
+    ));
+    // Linear score; only MedInc, HouseAge and Latitude carry non-zero weights.
+    const score =
+      0.4524 * medInc
+      - 0.0104 * houseAge
+      + 0.0 * aveRooms
+      - 0.0 * aveBedrms
+      - 0.0 * population / 1000
+      - 0.0 * aveOccup
+      - 0.042 * latitude
+      + 0.0 * longitude
+      + 2.1 + gauss() * noise;
+    target[i] = clamp(score, 0.15, 5.0);
+  }
+
+  return {
+    data,
+    target,
+    featureNames,
+    description: "Synthetic California Housing dataset (sklearn-compatible)",
+  };
+}
+
+/**
+ * Generate a synthetic version of the Forest Covertype dataset.
+ * The real dataset has 581,012 instances and 54 features with 7 cover types.
+ *
+ * Returns integer class labels 1-7 for cover type (stored as floats in `target`).
+ */
+export function makeCovtype(options: {
+  nSamples?: number;
+  seed?: number;
+} = {}): RealClassificationDataset {
+  const { nSamples = 500, seed = 42 } = options;
+  let rng = seed;
+  // Uniform draw in [0, 1): dividing by 2^32 keeps 1.0 out, so one-hot indices stay in range.
+  const rand = () => {
+    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
+    return (rng >>> 0) / 4294967296;
+  };
+  const randn = () => {
+    const u = rand() || 1e-10;
+    const v = rand() || 1e-10;
+    return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
+  };
+
+  // 54 features: 10 continuous, 4 binary wilderness areas, 40 binary soil types
+  const continuousFeatureNames = [
+    "Elevation", "Aspect", "Slope",
+    "Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
+    "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon",
+    "Hillshade_3pm", "Horizontal_Distance_To_Fire_Points",
+  ];
+  const wildernessNames = ["Wilderness_Area1", "Wilderness_Area2", "Wilderness_Area3", "Wilderness_Area4"];
+  const soilNames = Array.from({ length: 40 }, (_, i) => `Soil_Type${i + 1}`);
+  const featureNames = [...continuousFeatureNames, ...wildernessNames, ...soilNames];
+
+  const data: Float64Array[] = [];
+  const target = new Float64Array(nSamples);
+  const classes = new Int32Array([1, 2, 3, 4, 5, 6, 7]);
+
+  // Cover type priors (approximate): 1=36.5%, 2=48.7%, 3=6.2%, 4=0.5%, 5=1.6%, 6=2.9%, 7=3.5%
+  // They add up to 0.999, so a draw above that keeps the default label 1 set below.
+  const priors = [0.365, 0.487, 0.062, 0.005, 0.016, 0.029, 0.035];
+  // Running sum of the priors. The array is typed up front: seeding `reduce` with a bare
+  // `[]` infers `never[]`, which rejects `push` under `strict`.
+  const cdf: number[] = [];
+  for (const p of priors) cdf.push((cdf[cdf.length - 1] ?? 0) + p);
+
+  for (let i = 0; i < nSamples; i++) {
+    // Sample class label
+    const u = rand();
+    let cls = 1;
+    for (let c = 0; c < cdf.length; c++) {
+      if (u <= (cdf[c] ?? 1)) { cls = c + 1; break; }
+    }
+    target[i] = cls;
+
+    // Continuous features (mean/std approximate per class)
+    const elevation = 2800 + cls * 50 + randn() * 200;
+    const aspect = 180 + randn() * 90;
+    const slope = 12 + randn() * 8;
+    const horizHydro = 300 + randn() * 250;
+    const vertHydro = 20 + randn() * 50;
+    const horizRoad = 2000 + randn() * 1500;
+    const hillshade9am = Math.max(0, Math.min(255, 200 + randn() * 40));
+    const hillshadeNoon = Math.max(0, Math.min(255, 220 + randn() * 30));
+    const hillshade3pm = Math.max(0, Math.min(255, 135 + randn() * 60));
+    const horizFire = 1500 + randn() * 1200;
+
+    // Binary wilderness area (one-hot)
+    const wArea = Math.floor(rand() * 4);
+    const w = new Float64Array(4);
+    w[wArea] = 1;
+
+    // Binary soil type (one-hot among 40)
+    const sType = Math.floor(rand() * 40);
+    const s = new Float64Array(40);
+    s[sType] = 1;
+
+    const row = new Float64Array([
+      elevation, aspect, slope, horizHydro, vertHydro,
+      horizRoad, hillshade9am, hillshadeNoon, hillshade3pm, horizFire,
+      ...w, ...s,
+    ]);
+    data.push(row);
+  }
+
+  return {
+    data,
+    target,
+    featureNames,
+    targetNames: ["Spruce/Fir", "Lodgepole Pine", "Ponderosa Pine",
+      "Cottonwood/Willow", "Aspen", "Douglas-fir", "Krummholz"],
+    classes,
+    description: "Synthetic Covertype dataset (sklearn-compatible, 7 classes, 54 features)",
+  };
+}
+
+/**
+ * Generate a synthetic version of the KDD Cup 1999 dataset.
+ * Returns a simplified intrusion detection dataset: the first
+ * `floor(nSamples * percentAnomalies)` rows are generated as anomalies
+ * (label 1), the remaining rows as normal traffic (label 0), and rows and
+ * labels are then shuffled together with the same seeded generator.
+ *
+ * @param subset - 'SA' (small) or 'SF' (larger subset), or '10percent'.
+ *   Accepted for API compatibility only; the generator currently ignores it.
+ */
+export function makeKddcup99(options: {
+  nSamples?: number;
+  subset?: "SA" | "SF" | "10percent";
+  percentAnomalies?: number;
+  seed?: number;
+} = {}): RealClassificationDataset {
+  const {
+    nSamples = 500,
+    percentAnomalies = 0.2,
+    seed = 42,
+  } = options;
+
+  let rng = seed;
+  // Uniform draw in [0, 1). Dividing by 2^32 (not 2^32 - 1) keeps 1.0 unreachable, so the
+  // shuffle index `j` below can never point one past the rows that exist.
+  const rand = () => {
+    rng = (rng * 1664525 + 1013904223) & 0xffffffff;
+    return (rng >>> 0) / 4294967296;
+  };
+  const randn = () => {
+    const u = rand() || 1e-10;
+    const v = rand() || 1e-10;
+    return Math.sqrt(-2 * Math.log(u)) * Math.cos(2 * Math.PI * v);
+  };
+
+  const featureNames = [
+    "duration", "protocol_type", "service", "flag",
+    "src_bytes", "dst_bytes", "land", "wrong_fragment",
+    "urgent", "hot", "num_failed_logins", "logged_in",
+    "num_compromised", "root_shell", "su_attempted",
+    "num_root", "num_file_creations", "num_shells",
+    "num_access_files", "num_outbound_cmds", "is_host_login",
+    "is_guest_login", "count", "srv_count",
+    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
+    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
+    "dst_host_count", "dst_host_srv_count",
+    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
+    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
+    "dst_host_serror_rate", "dst_host_srv_serror_rate",
+    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
+  ];
+
+  const nAnomalies = Math.floor(nSamples * percentAnomalies);
+
+  const data: Float64Array[] = [];
+  const target = new Float64Array(nSamples);
+  // 0 = normal, 1 = anomaly
+  const classes = new Int32Array([0, 1]);
+
+  for (let i = 0; i < nSamples; i++) {
+    const isAnomaly = i < nAnomalies;
+    target[i] = isAnomaly ? 1 : 0;
+
+    const row = new Float64Array(featureNames.length);
+    if (isAnomaly) {
+      // Anomaly pattern: high src_bytes, high error rates
+      row[0] = Math.max(0, randn() * 2);
+      row[4] = Math.max(0, 100000 + randn() * 50000);
+      row[5] = Math.max(0, randn() * 100);
+      row[24] = Math.max(0, Math.min(1, 0.8 + randn() * 0.2));
+      row[26] = Math.max(0, Math.min(1, 0.7 + randn() * 0.2));
+    } else {
+      // Normal: small transfers, low error
+      row[0] = Math.max(0, randn() * 5);
+      row[4] = Math.max(0, 500 + randn() * 1000);
+      row[5] = Math.max(0, 2000 + randn() * 3000);
+      row[24] = Math.max(0, Math.min(1, 0.02 + randn() * 0.05));
+      row[26] = Math.max(0, Math.min(1, 0.01 + randn() * 0.03));
+    }
+    row[22] = Math.max(0, Math.min(511, Math.abs(randn() * 50 + 10)));
+    row[31] = Math.max(0, Math.min(255, Math.abs(randn() * 50 + 100)));
+    data.push(row);
+  }
+
+  // Fisher-Yates shuffle; rows and labels are swapped in lockstep so they stay paired.
+  for (let i = nSamples - 1; i > 0; i--) {
+    const j = Math.floor(rand() * (i + 1));
+    const tmp = data[i]!;
+    data[i] = data[j]!;
+    data[j] = tmp;
+    const ttmp = target[i]!;
+    target[i] = target[j]!;
+    target[j] = ttmp;
+  }
+
+  return {
+    data,
+    target,
+    featureNames,
+    targetNames: ["normal", "anomaly"],
+    classes,
+    description: "Synthetic KDD Cup 1999 network intrusion detection dataset",
+  };
+}
+
+/**
+ * Build a synthetic stand-in for the Olivetti faces dataset.
+ * Defaults mimic the original layout: 400 samples, 64x64 pixel face images
+ * (4096 features), 40 subjects. Sample `i` belongs to subject `i % nSubjects`
+ * and is that subject's random prototype plus small per-pixel Gaussian noise.
+ */
+export function makeOlivettiFaces(options: {
+  nSamples?: number;
+  nSubjects?: number;
+  seed?: number;
+} = {}): RealDataset {
+  const { nSamples = 400, nSubjects = 40, seed = 42 } = options;
+  const nFeatures = 4096; // 64x64
+
+  // 32-bit linear congruential generator; `state` is kept as a signed int32.
+  let state = seed;
+  const uniform = (): number => {
+    state = (state * 1664525 + 1013904223) | 0;
+    return (state >>> 0) / 0xffffffff;
+  };
+  // Box-Muller transform; a zero draw is replaced by 1e-10 so log() stays finite.
+  const gauss = (): number => {
+    const radius = Math.sqrt(-2 * Math.log(uniform() || 1e-10));
+    return radius * Math.cos(2 * Math.PI * (uniform() || 1e-10));
+  };
+  // Fill a fresh pixel vector with `sample(f)` clamped into [0, 1].
+  const pixels = (sample: (f: number) => number): Float64Array => {
+    const out = new Float64Array(nFeatures);
+    for (let f = 0; f < nFeatures; f++) out[f] = Math.max(0, Math.min(1, sample(f)));
+    return out;
+  };
+
+  const data: Float64Array[] = [];
+  const target = new Float64Array(nSamples);
+  // Each subject has a "prototype" face: independent pixels around mid-gray.
+  const prototypes: Float64Array[] = Array.from({ length: nSubjects }, () =>
+    pixels(() => 0.5 + gauss() * 0.2),
+  );
+
+  for (let i = 0; i < nSamples; i++) {
+    const subject = i % nSubjects;
+    target[i] = subject;
+    const proto = prototypes[subject]!;
+    data.push(pixels((f) => proto[f]! + gauss() * 0.05));
+  }
+
+  return {
+    data,
+    target,
+    featureNames: Array.from({ length: nFeatures }, (_, i) => `pixel_${i}`),
+    targetNames: Array.from({ length: nSubjects }, (_, i) => `subject_${i}`),
+    description: `Synthetic Olivetti faces dataset (${nSubjects} subjects, ${nSamples} samples)`,
+  };
+}
diff --git a/src/datasets/sample_images.ts b/src/datasets/sample_images.ts
new file mode 100644
index 0000000..fbafb60
--- /dev/null
+++ b/src/datasets/sample_images.ts
@@ -0,0 +1,76 @@
+/**
+ * Sample image datasets.
+ * Mirrors scikit-learn's datasets.load_sample_image and load_sample_images.
+ */
+
+export interface SampleImage {
+ name: string; // image identifier (one of SAMPLE_IMAGE_NAMES when built by loadSampleImage)
+ data: Uint8Array; // height * width * channels bytes, row-major, channels interleaved per pixel
+ height: number; // number of pixel rows
+ width: number; // number of pixel columns
+ channels: number; // values per pixel (loadSampleImage always sets 3)
+}
+
+/** Available sample image names; these are the only values loadSampleImage accepts. */
+export const SAMPLE_IMAGE_NAMES = ["china", "flower"] as const;
+export type SampleImageName = (typeof SAMPLE_IMAGE_NAMES)[number]; // "china" | "flower"
+
+/**
+ * Generate a synthetic sample image for testing/demos: row-major, interleaved R, G, B bytes.
+ * Channels saturate at 255; a plain Uint8Array write would wrap modulo 256 and go dark.
+ */
+function generateSyntheticImage(name: SampleImageName, height: number, width: number): Uint8Array {
+  const data = new Uint8Array(height * width * 3);
+  let seed = name === "china" ? 1337 : 7331;
+  const rng = (): number => {
+    seed = (seed * 1664525 + 1013904223) & 0xffffffff;
+    return (seed >>> 0) / 0xffffffff;
+  };
+  const toByte = (value: number): number => Math.min(255, Math.floor(value));
+
+  for (let i = 0; i < height; i++) {
+    for (let j = 0; j < width; j++) {
+      const base = (i * width + j) * 3;
+      if (name === "china") {
+        // Sky gradient + random texture
+        const t = i / height;
+        data[base] = toByte(135 + 120 * (1 - t) + rng() * 20);
+        data[base + 1] = toByte(206 * (1 - t * 0.5) + rng() * 20);
+        data[base + 2] = toByte(235 * (1 - t * 0.3) + rng() * 20);
+      } else {
+        // Flower: radial gradient
+        const cx = 0.5, cy = 0.5;
+        const r = Math.sqrt((j / width - cx) ** 2 + (i / height - cy) ** 2);
+        const angle = Math.atan2(i / height - cy, j / width - cx);
+        const petal = Math.sin(angle * 6) > 0 ? 1 : 0;
+        const inFlower = r < 0.4 ? 1 : 0;
+        data[base] = toByte(255 * petal * inFlower + rng() * 30);
+        data[base + 1] = toByte(200 * (1 - r) * inFlower + rng() * 30);
+        data[base + 2] = toByte(50 * inFlower + rng() * 30);
+      }
+    }
+  }
+  return data;
+}
+
+/**
+ * Build the synthetic stand-in for one named sample image.
+ * Every image is 427 pixels tall with 3 channels; "china" is 640 pixels wide
+ * and any other name is 483 pixels wide.
+ */
+export function loadSampleImage(imageName: SampleImageName): SampleImage {
+  const height = 427;
+  let width = 483;
+  if (imageName === "china") {
+    width = 640;
+  }
+  const data = generateSyntheticImage(imageName, height, width);
+  return { name: imageName, data, height, width, channels: 3 };
+}
+
+/** Load all sample images, one per entry of SAMPLE_IMAGE_NAMES and in that order. */
+export function loadSampleImages(): SampleImage[] {
+  const images: SampleImage[] = [];
+  for (const imageName of SAMPLE_IMAGE_NAMES) images.push(loadSampleImage(imageName));
+  return images;
+}
diff --git a/src/datasets/samples_generator.ts b/src/datasets/samples_generator.ts
new file mode 100644
index 0000000..3023de0
--- /dev/null
+++ b/src/datasets/samples_generator.ts
@@ -0,0 +1,228 @@
+/**
+ * Additional synthetic dataset generators.
+ * Mirrors sklearn.datasets: make_hastie_10_2, make_friedman1/2/3,
+ * make_sparse_uncorrelated, make_checkerboard, make_multilabel_classification.
+ */
+
+/** Result type for generated datasets: a feature matrix plus one target per row. */
+export interface SamplesDatasetResult {
+ X: Float64Array[]; // feature matrix: one Float64Array row per sample
+ y: Float64Array | Int32Array; // one target value per row of X
+}
+
+/** Simple seeded Mulberry32 RNG for reproducibility; each call yields a float in [0, 1). */
+function makeRng(seed: number): () => number {
+  let state = seed >>> 0;
+  return (): number => {
+    state = (state + 0x6d2b79f5) >>> 0;
+    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
+    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
+    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
+  };
+}
+
+/** Box-Muller: turn two uniform draws from `rng` into one standard-normal sample. */
+function randn(rng: () => number): number {
+  const radius = Math.sqrt(-2 * Math.log(Math.max(rng(), 1e-14))); // floor avoids log(0)
+  return radius * Math.cos(2 * Math.PI * rng());
+}
+
+/**
+ * make_hastie_10_2 — 10-feature binary classification problem.
+ * X ~ N(0, 1); y = +1 when sum(X_i^2) > 9.34, otherwise -1.
+ *
+ * @param nSamples - Number of rows to generate.
+ * @param randomState - Seed passed to `makeRng`.
+ */
+export function makeHastie10_2(
+  nSamples = 12000,
+  randomState = 0,
+): { X: Float64Array[]; y: Int32Array } {
+  const rng = makeRng(randomState);
+  const drawRow = (): Float64Array => Float64Array.from({ length: 10 }, () => randn(rng));
+  const X: Float64Array[] = Array.from({ length: nSamples }, drawRow);
+  // Label every row by comparing its squared Euclidean norm with the threshold.
+  const y = Int32Array.from(X, (row) => {
+    const squaredNorm = row.reduce((acc, v) => acc + v * v, 0);
+    return squaredNorm > 9.34 ? 1 : -1;
+  });
+  return { X, y };
+}
+
+/**
+ * make_friedman1 — regression dataset from Friedman (1991).
+ * y = 10*sin(π*X0*X1) + 20*(X2-0.5)^2 + 10*X3 + 5*X4 + noise * N(0, 1)
+ * @param nSamples - Number of rows to generate.
+ * @param nFeatures - Columns per row, each uniform on [0, 1); only the first five enter y.
+ * @param noise - Scale of the Gaussian noise; no noise is drawn unless it is > 0.
+ * @param randomState - Seed passed to `makeRng`.
+ * @throws Error when `nFeatures` is below 5.
+ */
+export function makeFriedman1(
+  nSamples = 100,
+  nFeatures = 10,
+  noise = 0.0,
+  randomState = 0,
+): SamplesDatasetResult {
+  if (nFeatures < 5) throw new Error("makeFriedman1 requires at least 5 features");
+  const rng = makeRng(randomState);
+  // All feature rows are drawn first; the noise draws follow from the same generator.
+  const X: Float64Array[] = Array.from({ length: nSamples }, () => {
+    const features = new Float64Array(nFeatures);
+    for (let col = 0; col < nFeatures; col++) features[col] = rng();
+    return features;
+  });
+  const y = Float64Array.from(X, ([x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0]) => {
+    const signal =
+      10 * Math.sin(Math.PI * x0 * x1) +
+      20 * (x2 - 0.5) ** 2 +
+      10 * x3 +
+      5 * x4;
+    return signal + (noise > 0 ? noise * randn(rng) : 0);
+  });
+  return { X, y };
+}
+
+/**
+ * make_friedman2 — regression with nonlinear interactions.
+ * y = sqrt(X0^2 + (X1*X2 - 1/(X1*X3))^2) + noise * N(0, 1)
+ * with X0 ~ U[0, 100), X1 ~ U[40π, 560π), X2 ~ U[0, 1), X3 ~ U[1, 11).
+ *
+ * @param nSamples - Number of rows to generate.
+ * @param noise - Scale of the Gaussian noise; no noise is drawn unless it is > 0.
+ * @param randomState - Seed passed to `makeRng`.
+ */
+export function makeFriedman2(
+  nSamples = 100,
+  noise = 0.0,
+  randomState = 0,
+): SamplesDatasetResult {
+  const rng = makeRng(randomState);
+  // Per-feature [low, high) sampling intervals.
+  const bounds: [number, number][] = [[0, 100], [40 * Math.PI, 560 * Math.PI], [0, 1], [1, 11]];
+  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
+    Float64Array.from(bounds, ([lo, hi]) => lo + rng() * (hi - lo)),
+  );
+  // Noise draws happen only after every feature row has been generated.
+  const y = Float64Array.from(X, (row) => {
+    const [x0 = 0, x1 = 0, x2 = 0] = row;
+    const x3 = Math.max(row[3] ?? 1, 1e-6); // X3 is floored at 1e-6 before it is used as a divisor
+    const interaction = x1 * x2 - 1 / (x1 * x3);
+    return Math.sqrt(x0 ** 2 + interaction ** 2) + (noise > 0 ? noise * randn(rng) : 0);
+  });
+  return { X, y };
+}
+
+/**
+ * make_friedman3 — regression with arctan transformation.
+ * y = arctan((X1*X2 - 1/(X1*X3)) / X0) + noise * N(0, 1)
+ * with X0 ~ U[0, 100), X1 ~ U[40π, 560π), X2 ~ U[0, 1), X3 ~ U[1, 11).
+ *
+ * @param nSamples - Number of rows to generate.
+ * @param noise - Scale of the Gaussian noise; no noise is drawn unless it is > 0.
+ * @param randomState - Seed passed to `makeRng`.
+ */
+export function makeFriedman3(
+  nSamples = 100,
+  noise = 0.0,
+  randomState = 0,
+): SamplesDatasetResult {
+  const rng = makeRng(randomState);
+  // Per-feature [low, high) sampling intervals.
+  const bounds: [number, number][] = [[0, 100], [40 * Math.PI, 560 * Math.PI], [0, 1], [1, 11]];
+  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
+    Float64Array.from(bounds, ([lo, hi]) => lo + rng() * (hi - lo)),
+  );
+  const y = Float64Array.from(X, (row) => {
+    const x0 = Math.max(Math.abs(row[0] ?? 0), 1e-6); // |X0| is floored at 1e-6 before dividing
+    const [, x1 = 0, x2 = 0] = row;
+    const x3 = Math.max(row[3] ?? 1, 1e-6);
+    const interaction = x1 * x2 - 1 / (x1 * x3);
+    return Math.atan(interaction / x0) + (noise > 0 ? noise * randn(rng) : 0);
+  });
+  return { X, y };
+}
+
+/**
+ * make_sparse_uncorrelated — regression dataset with 4 informative features
+ * and `nFeatures - 4` noise features, following scikit-learn's definition:
+ * X ~ N(0, 1), y ~ N(X0 + 2*X1 - 2*X2 - 1.5*X3, 1).
+ */
+export function makeSparseUncorrelated(
+  nSamples = 100,
+  nFeatures = 10,
+  randomState = 0,
+): SamplesDatasetResult {
+  const rng = makeRng(randomState);
+  const X: Float64Array[] = Array.from({ length: nSamples }, () =>
+    Float64Array.from({ length: nFeatures }, () => randn(rng)),
+  );
+  const coef = [1, 2, -2, -1.5]; // weights of the four informative columns
+  const y = Float64Array.from(X, (row) => {
+    // Columns missing from a row narrower than four features contribute 0.
+    const mean = coef.reduce((acc, weight, j) => acc + weight * (row[j] ?? 0), 0);
+    return mean + randn(rng); // unit-variance Gaussian noise around the linear mean
+  });
+  return { X, y };
+}
+
+/**
+ * make_multilabel_classification — random multilabel dataset.
+ * Features are independent random bits; each sample's labels are distinct classes.
+ * @param nSamples - Number of samples.
+ * @param nFeatures - Number of features.
+ * @param nClasses - Number of classes (labels).
+ * @param nLabels - Average number of labels per sample.
+ * @param randomState - Random seed.
+ */
+export function makeMultilabelClassification(
+  nSamples = 100,
+  nFeatures = 20,
+  nClasses = 5,
+  nLabels = 2,
+  randomState = 0,
+): { X: Float64Array[]; y: Int32Array[] } {
+  const rng = makeRng(randomState);
+  const drawBits = (): Float64Array => Float64Array.from({ length: nFeatures }, () => (rng() > 0.5 ? 1 : 0));
+  const X: Float64Array[] = Array.from({ length: nSamples }, drawBits);
+  const y: Int32Array[] = Array.from({ length: nSamples }, () => {
+    const row = new Int32Array(nClasses);
+    const nActive = Math.max(1, Math.round(nLabels + (rng() - 0.5) * 2));
+    const pool = Array.from(row, (_, c) => c); // classes not picked yet: draws are without replacement
+    for (let k = 0; k < nActive && pool.length > 0; k++) {
+      row[pool.splice(Math.floor(rng() * pool.length), 1)[0]!] = 1;
+    }
+    return row;
+  });
+  return { X, y };
+}
+
+/**
+ * make_checkerboard — checkerboard pattern for biclustering.
+ * Labels are assigned round-robin; a cell is 1 when row and column labels share parity, else 0.
+ * @param shape - [n_rows, n_cols].
+ * @param nClusters - [n_row_clusters, n_col_clusters].
+ * @param noise - Noise standard deviation.
+ * @param randomState - Random seed.
+ */
+export function makeCheckerboard(
+  shape: [number, number] = [300, 300],
+  nClusters: [number, number] = [4, 3],
+  noise = 0.5,
+  randomState = 0,
+): { data: Float64Array[]; rowLabels: Int32Array; colLabels: Int32Array } {
+  const rng = makeRng(randomState);
+  const [nRows, nCols] = shape;
+  const [nRowC, nColC] = nClusters;
+  const rowLabels = Int32Array.from({ length: nRows }, (_, r) => r % nRowC);
+  const colLabels = Int32Array.from({ length: nCols }, (_, c) => c % nColC);
+  const data: Float64Array[] = Array.from({ length: nRows }, (_, r) => {
+    const cells = new Float64Array(nCols);
+    for (let c = 0; c < nCols; c++) {
+      const sameParity = rowLabels[r]! % 2 === colLabels[c]! % 2;
+      cells[c] = Number(sameParity) + noise * randn(rng);
+    }
+    return cells;
+  });
+  return { data, rowLabels, colLabels };
+}
diff --git a/src/datasets/svmlight.ts b/src/datasets/svmlight.ts
new file mode 100644
index 0000000..3fc6d3d
--- /dev/null
+++ b/src/datasets/svmlight.ts
@@ -0,0 +1,113 @@
+/**
+ * SVMLight format loading and saving utilities.
+ * Ports: load_svmlight_file, dump_svmlight_file
+ */
+
+export interface SVMLightDataset {
+ data: Float64Array[]; // presumably one dense feature row per parsed sample — confirm against the parser
+ target: Float64Array; // presumably one label per row of `data` — confirm
+ nFeatures: number; // presumably the width of each `data` row — confirm
+}
+
+/**
+ * Parse SVMLight / LibSVM format text.
+ * Format: